# Generate Clusters

In [1]:
# standard imports
import numpy as np
import os

# plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# data management
import pandas as pd


## Part 1: Hitters

In [2]:
import hittingpredictor as hp

In [17]:

# Seasons to query: range(2016, 2020), i.e. 2016 through 2019 inclusive.
years = range(2019-3, 2019+1)

# Scrape one batting table per season. A comprehension keeps the loop
# variable from leaking into the notebook namespace.
year_dfs = [hp.scrape_year(year=season,cat='bat',verbose=0) for season in years]

# Stack every season with a single concat call. Concatenating pairwise
# inside a loop re-copies the accumulated rows on every iteration; one call
# yields the same rows in the same order without the repeated copies.
all_year_data = pd.concat(year_dfs)
    



In [18]:
# Cluster count handed to the clustering routine.
nclusters = 12

# Cluster the multi-season batting table. Of the four returned objects, the
# cells visible in this notebook go on to use `df` and
# `hitter_cluster_centroid_df`.
# NOTE(review): min_pas is presumably a minimum plate-appearance cutoff --
# confirm against hittingpredictor.compute_cluster.
(
    year_df,
    df,
    stereotype_df,
    hitter_cluster_centroid_df,
) = hp.compute_cluster(
    all_year_data,
    years,
    nclusters,
    min_pas=150,
    verbose=0,
)



In [19]:
# Tunable prefactors for the hitter projections -- this is the cell to edit
# when experimenting.

# Weight given to each season, keyed by the season as a float.
year_weights = {
    2016.0: 0.075,
    2017.0: 0.075,
    2018.0: 0.35,
    2019.0: 0.5,
}

# Penalty applied per season when that season is missing.
year_weights_penalty = {
    2016.0: 0.00,
    2017.0: 0.00,
    2018.0: 0.05,
    2019.0: 0.05,
}

# How aggressively to bring players back to the cluster center.
regression_factor = 0.65
err_regression_factor = 1.2



In [22]:

# Every distinct name in the clustered table, in case you'd like to project
# them all...
arr = np.array(df['Name'].tolist())
pls = np.unique(arr)

# ...but this run overrides that list with a single player.
pls = ['J.D. Martinez']

# Column header for the comma-separated rows; in the recorded output one row
# per player follows this line.
print('Player, HR, eHR, H, eH, AB, eAB, SB, eSB, RBI, eRBI, R, eR, PA, Adj')

for player in pls:
    # A fresh AgeDict is passed on every call, as in the original loop.
    hp.generate_player_prediction(
        player,
        df,
        hitter_cluster_centroid_df,
        estimated_pas=600,
        year_weights=year_weights,
        year_weights_penalty=year_weights_penalty,
        regression_factor=regression_factor,
        err_regression_factor=err_regression_factor,
        AgeDict={},
        verbose=0,
    )

    

Player, HR, eHR, H, eH, AB, eAB, SB, eSB, RBI, eRBI, R, eR, PA, Adj
J.D. Martinez, 36.1, 5.64, 121.37, 25.62, 528.01, 5.33, 3.68, 2.92, 104.06, 13.85, 91.63, 11.69, 600, 1.0, 


## Part 2: Pitchers

In [7]:
import pitchingpredictor as pp

In [11]:

# Seasons to query: range(2017, 2020), i.e. 2017 through 2019 inclusive.
years = range(2019-2, 2019+1)

# Scrape one pitching table per season. A comprehension keeps the loop
# variable from leaking into the notebook namespace.
year_dfs = [pp.scrape_year(year=season,cat='pit',verbose=0) for season in years]

# Stack every season with a single concat call. Concatenating pairwise
# inside a loop re-copies the accumulated rows on every iteration; one call
# yields the same rows in the same order without the repeated copies.
all_year_data = pd.concat(year_dfs)
    




In [12]:
# Cluster count handed to the clustering routine.
nclusters = 12

# Cluster the multi-season pitching table. Of the four returned objects, the
# cells visible in this notebook go on to use `df` and `cluster_centroid_df`.
# NOTE(review): min_ip is presumably a minimum innings-pitched cutoff --
# confirm against pitchingpredictor.compute_cluster_pitching.
(
    year_df,
    df,
    stereotype_df,
    cluster_centroid_df,
) = pp.compute_cluster_pitching(
    all_year_data,
    years,
    nclusters,
    min_ip=10,
    verbose=0,
)




In [15]:
# Pitcher-specific tunings; these rebind the names used for the hitters.

# Weight given to each season, keyed by the season as a float.
year_weights = {
    2017.0: 0.12,
    2018.0: 0.33,
    2019.0: 0.55,
}
print(year_weights)

# Penalty applied per season when that season is missing.
# NOTE(review): the 2019 entry is negative here, whereas the hitter penalties
# are all >= 0 -- confirm the sign is intentional.
year_weights_penalty = {
    2017.0: 0.00,
    2018.0: 0.05,
    2019.0: -0.05,
}

# Regression settings passed to the pitcher projection call.
regression_factor = 0.8
err_regression_factor = 1.5




{2017.0: 0.12, 2018.0: 0.33, 2019.0: 0.55}


In [16]:

# Every distinct name in the clustered table, in case you'd like to project
# them all...
arr = np.array(df['Name'].tolist())
pls = np.unique(arr)

# ...but this run overrides that list with a single player.
pls = ['Max Scherzer']

# Column header for the comma-separated rows that follow in the output.
# NOTE(review): the recorded output row carries 15 fields while this header
# names 13 columns -- confirm the header against
# pitchingpredictor.generate_player_prediction.
print('Player, HR, eHR, ER, eER, BB, eBB, H, eH, SO, eSO, IP, Adj')

for player in pls:
    # A fresh AgeDict is passed on every call, as in the original loop.
    pp.generate_player_prediction(
        player,
        df,
        cluster_centroid_df,
        estimated_ips=200,
        year_weights=year_weights,
        year_weights_penalty=year_weights_penalty,
        regression_factor=regression_factor,
        err_regression_factor=err_regression_factor,
        AgeDict={},
        verbose=0,
    )

    

Player, HR, eHR, ER, eER, BB, eBB, H, eH, SO, eSO, IP, Adj
Max Scherzer, 21.1, 0.73, 61.04, 6.62, 45.91, 22.02, 129.46, 26.55, 270.29, 53.04, 0.0, 0.0, 200, 1.0, 


In [23]:
import pandas as pd

# Baseball Savant Statcast search, CSV export. The query string sets
# hfSea=2019, player_type=batter and group_by=name, sorted by pitches
# descending. Adjacent literals are joined by the parser, so `link` is the
# same single string as before, just wrapped for readability.
link = (
    'https://baseballsavant.mlb.com/statcast_search/csv'
    '?hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones='
    '&hfGT=R%7C&hfC=&hfSea=2019%7C&hfSit=&player_type=batter'
    '&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA='
    '&game_date_gt=&game_date_lt=&hfInfield=&team=&position='
    '&hfOutfield=&hfRO=&home_road=&hfFlag=&hfPull=&metric_1=&hfInn='
    '&min_pitches=0&min_results=0&group_by=name&sort_col=pitches'
    '&player_event_sort=h_launch_speed&sort_order=desc&min_pas=0#results'
)

# Read the CSV straight from the URL (network access required).
df_all = pd.read_csv(link, low_memory=False)

In [24]:
# Preview only the first rows instead of displaying the whole frame; this
# keeps the saved notebook output small and readable.
df_all.head(10)

Unnamed: 0,pitches,player_id,player_name,total_pitches,pitch_percent,ba,iso,babip,slg,woba,...,takes,eff_min_vel,release_extension,pos3_int_start_distance,pos4_int_start_distance,pos5_int_start_distance,pos6_int_start_distance,pos7_int_start_distance,pos8_int_start_distance,pos9_int_start_distance
0,3223,656555,Rhys Hoskins,3223,100.0,0.226,0.228,0.267,0.454,0.351,...,1948,-0.3,6.04,108.0,152.0,118.0,145.0,302.0,328.0,300.0
1,3048,660670,Ronald Acuna Jr.,3048,100.0,0.280,0.238,0.337,0.518,0.373,...,1652,-0.2,6.01,109.0,153.0,116.0,146.0,313.0,328.0,296.0
2,2965,543760,Marcus Semien,2965,100.0,0.285,0.237,0.294,0.522,0.377,...,1713,-0.4,5.95,108.0,152.0,115.0,145.0,301.0,321.0,291.0
3,2915,608324,Alex Bregman,2915,100.0,0.296,0.296,0.281,0.592,0.423,...,1883,-0.5,5.98,106.0,151.0,119.0,145.0,296.0,326.0,292.0
4,2909,605141,Mookie Betts,2909,100.0,0.295,0.229,0.309,0.524,0.384,...,1790,-0.4,6.00,107.0,152.0,119.0,145.0,293.0,327.0,300.0
5,2888,502671,Paul Goldschmidt,2888,100.0,0.260,0.216,0.303,0.476,0.350,...,1544,-0.3,6.02,108.0,151.0,120.0,146.0,311.0,328.0,301.0
6,2862,593428,Xander Bogaerts,2862,100.0,0.309,0.246,0.338,0.555,0.394,...,1684,-0.4,6.01,107.0,152.0,119.0,146.0,293.0,326.0,298.0
7,2843,542340,Jonathan Villar,2843,100.0,0.274,0.179,0.341,0.453,0.339,...,1423,-0.3,6.02,108.0,148.0,104.0,148.0,292.0,320.0,297.0
8,2805,656305,Matt Chapman,2805,100.0,0.249,0.257,0.270,0.506,0.359,...,1617,-0.4,5.95,106.0,151.0,119.0,145.0,304.0,325.0,297.0
9,2793,467793,Carlos Santana,2793,100.0,0.281,0.234,0.293,0.515,0.384,...,1688,-0.4,5.98,113.0,159.0,124.0,148.0,295.0,324.0,300.0
