In [4]:
import pandas
pandas.set_option('display.float_format', lambda x: '%.4f' % x)
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
import warnings
from sklearn import preprocessing

# Switch the kernel's working directory so the relative file names used below
# (the CSV read here and the CSV written later) resolve inside this folder.
# NOTE(review): this is a machine-specific absolute Windows path; the notebook
# will not run on another machine without editing it.
%cd "C:\Users\dbetc\Desktop\Baseball Analysis"

# Read the source data set into `data`. Later cells read the columns
# pitcher_id, pitcher_name, pitch_name, play_id, speed, spin_rate,
# movement_hor and movement_vert from it.
data = pandas.read_csv('baseball_tableau_dashboard.csv')

def set_value(row_number, assigned_value):
    """Return the entry of ``assigned_value`` stored under ``row_number``.

    Parameters
    ----------
    row_number
        Key or position to look up.
    assigned_value
        Any subscriptable object (for example a dict or a list).

    Returns
    -------
    The element ``assigned_value[row_number]``.

    Raises
    ------
    Whatever the container raises for a missing key or index
    (for example ``KeyError`` for a dict, ``IndexError`` for a list).
    """
    looked_up = assigned_value[row_number]
    return looked_up

# Lookup table that collapses the raw `pitch_name` labels into a smaller set
# of consolidated pitch families. Entries are listed by target family.
pitch_dict = {
    # -> 'Curve'
    'Curveball': 'Curve',
    'Knuckle Curve': 'Curve',
    # -> 'Cutter'
    'Cutter': 'Cutter',
    # -> 'Knuckle/Slider'
    'Knuckleball': 'Knuckle/Slider',
    'Knuckle Ball': 'Knuckle/Slider',
    'Slider': 'Knuckle/Slider',
    # -> 'Off-Speed'
    'Changeup': 'Off-Speed',
    'Screwball': 'Off-Speed',
    # -> 'Other'
    'Eephus': 'Other',
    'Pitch Out': 'Other',
    # -> 'Rising Fastball'
    '4-Seam Fastball': 'Rising Fastball',
    # -> 'Sinker'
    'Sinker': 'Sinker',
    '2-Seam Fastball': 'Sinker',
    'Forkball': 'Sinker',
    'Split Finger': 'Sinker',
    'Split-Finger': 'Sinker',
}

# Consolidated pitch family for every row; labels that are not keys of
# `pitch_dict` come out as NaN.
data['pitch_name_2'] = data['pitch_name'].map(pitch_dict)

# Magnitude of the movement columns (the sign is discarded).
data['movement_hor_2'] = data['movement_hor'].abs()
data['movement_vert_2'] = data['movement_vert'].abs()

C:\Users\dbetc\Desktop\Baseball Analysis


In [5]:
# Build one feature row per pitcher from the rows in `data`.
#
# Step 1: summarise every (pitcher, consolidated pitch family) pair.
pitch_keys = ['pitcher_id', 'pitcher_name', 'pitch_name_2']
pitcher_keys = ['pitcher_id', 'pitcher_name']

# Group once and reuse the grouper instead of rebuilding it for every metric.
by_pitch = data.groupby(pitch_keys)
speed = by_pitch['speed'].median()
spin = by_pitch['spin_rate'].median()
freq = by_pitch['play_id'].count()  # non-null play_id values per pair
move_hor = by_pitch['movement_hor_2'].median()
move_vert = by_pitch['movement_vert_2'].median()

# Pitch mix: each family's share (in percent) of the pitcher's counted rows.
# `transform('sum')` broadcasts the per-pitcher total back onto the original
# index, so the result always lines up with the other series. The previous
# `groupby(...).apply(lambda ...)` form gets the group keys prepended a second
# time on pandas >= 2.0, which breaks that alignment.
pitch_comp = 100 * freq / freq.groupby(level=pitcher_keys).transform('sum')

# Step 2: long table, one row per (pitcher, pitch family).
pitches_data = pandas.DataFrame(
    dict(speed=speed, spin=spin, count=pitch_comp,
         move_hor=move_hor, move_vert=move_vert)
).reset_index()

# Step 3: widen to one row per pitcher with columns named '<metric>_<family>'.
# Each (pitcher, family) pair occurs once, so 'first' simply picks that value.
new_pitch_data = pitches_data.pivot_table(
    values=['speed', 'spin', 'count', 'move_hor', 'move_vert'],
    index=pitcher_keys,
    columns='pitch_name_2',
    aggfunc='first',
)
new_pitch_data.columns = ['_'.join(col).strip() for col in new_pitch_data.columns.values]

# A missing (pitcher, family) combination is encoded as 0 for every metric.
new_pitch_data = new_pitch_data.fillna(0).reset_index()

# Step 4: numeric feature matrix, standardised column-wise.
# `columns=` replaces the positional `axis` argument (`drop([...], 1)`), which
# was deprecated in pandas 1.3 and is rejected by pandas 2.x.
new_pitch_data_drop = new_pitch_data.drop(columns=pitcher_keys)
data_standardized = preprocessing.scale(new_pitch_data_drop)

In [7]:
# Cluster the standardised per-pitcher feature matrix into 10 groups.
# `n_init` is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default gives different cluster
# assignments depending on the installed version.
kmeans = KMeans(n_clusters=10, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(data_standardized)

# Shift the labels from 0..9 to 1..10.
y_kmeans1 = y_kmeans + 1
cluster = pandas.DataFrame({'cluster': y_kmeans1})

# Attach the labels by row position. This relies on `new_pitch_data` carrying
# a default 0..n-1 index in the same row order as `data_standardized`.
joined = pandas.merge(new_pitch_data, cluster, left_index=True, right_index=True)

# Per-pitcher summary across pitch families.
# NOTE(review): these are medians of the per-family medians in `pitches_data`,
# not medians over the individual rows of `data`, so they are not weighted by
# how often each family is thrown. Confirm this is intended.
by_pitcher = pitches_data.groupby(['pitcher_id', 'pitcher_name'])
speed_tot = by_pitcher['speed'].median()
spin_tot = by_pitcher['spin'].median()
move_hor_tot = by_pitcher['move_hor'].median()
move_vert_tot = by_pitcher['move_vert'].median()

test = pandas.DataFrame(
    dict(speed_tot=speed_tot, spin_tot=spin_tot,
         move_hor_tot=move_hor_tot, move_vert_tot=move_vert_tot)
).reset_index()

# Write the combined table to the current working directory.
# NOTE(review): merging on 'pitcher_id' alone leaves both name columns in the
# output as pitcher_name_x / pitcher_name_y, and the row index is written as
# an unnamed first column. Both are kept unchanged so that existing consumers
# of pitcher_clusters.csv keep seeing the same schema.
pandas.merge(joined, test, on='pitcher_id').to_csv('pitcher_clusters.csv')

Unnamed: 0,pitcher_id,pitcher_name_x,count_Curve,count_Cutter,count_Knuckle/Slider,count_Off-Speed,count_Other,count_Rising Fastball,count_Sinker,move_hor_Curve,...,spin_Off-Speed,spin_Other,spin_Rising Fastball,spin_Sinker,cluster,pitcher_name_y,speed_tot,spin_tot,move_hor_tot,move_vert_tot
0,112526,Bartolo Colon,0.0000,2.7828,7.3609,11.1311,0.0000,16.3824,62.3429,0.0000,...,1651.5000,0.0000,2285.0000,2136.0000,6,Bartolo Colon,84.2000,2285.0000,0.7000,0.7000
1,279571,Matt Belisle,7.8534,0.0000,36.9983,0.1745,0.0000,46.0733,8.9005,1.1000,...,1364.0000,0.0000,2046.5000,2006.0000,2,Matt Belisle,86.5000,2046.5000,0.8000,0.7000
2,282332,CC Sabathia,0.0000,43.0160,30.6000,10.8409,0.0000,1.0887,14.4545,0.0000,...,1877.5000,0.0000,2077.0000,2038.0000,6,CC Sabathia,89.9000,2077.0000,0.7514,0.7000
3,407822,Jorge De La Rosa,4.2914,12.0758,0.0000,0.0000,0.0000,43.3134,40.3194,1.0000,...,0.0000,0.0000,2095.5000,1421.0000,7,Jorge De La Rosa,83.5000,2068.2500,1.0000,0.7500
4,407845,Fernando Rodney,0.0000,0.0000,1.4877,29.6606,0.0000,21.1530,47.6987,0.0000,...,1721.0000,0.0000,2178.0000,2032.0000,4,Fernando Rodney,89.5945,2105.0000,0.8000,0.8946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,680692,Jonathan Stiever,9.4891,0.0000,20.4380,16.0584,0.0000,54.0146,0.0000,0.1000,...,1796.0000,0.0000,2311.5000,0.0000,3,Jonathan Stiever,80.0000,2241.2500,0.5250,1.0000
1200,680702,Joey Gerber,0.0000,0.0000,34.7639,0.0000,0.0000,27.4678,37.7682,0.0000,...,0.0000,0.0000,2271.0000,2324.5000,10,Joey Gerber,93.0000,2271.0000,0.9000,1.3000
1201,681911,Alex Vesia,0.0000,0.0000,9.0226,18.0451,0.0000,72.9323,0.0000,0.0000,...,1990.5000,0.0000,2366.0000,0.0000,3,Alex Vesia,85.1000,2366.0000,0.7000,1.0000
1202,683232,Nick Mears,31.5789,0.0000,0.0000,0.0000,0.0000,67.5439,0.8772,0.3000,...,0.0000,0.0000,2386.0000,2315.0000,8,Nick Mears,94.0000,2386.0000,0.4000,1.4000
