In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
cutterDL = pd.read_csv('DraftLeagueCutter.csv')
cutterMLB = pd.read_csv('MLBCutter.csv')

In [3]:
cutterMLB = cutterMLB[cutterMLB['pitches'] > 50]
cutterMLB.drop('pitches', inplace=True, axis=1)

In [4]:
cutterDL['league'] = 'Draft League'
cutterMLB['league'] = 'MLB'

In [5]:
cutterDL = cutterDL.astype(str)
cutterMLB = cutterMLB.astype(str)

In [6]:
pitchersDL = list(set(cutterDL['pitcher']))

In [7]:
cutter = pd.concat([cutterDL, cutterMLB])

In [8]:
def combined_features(row):
    return row['avgvelo']+" "+row['avgspinrate']+" "+row['avghorzbreak']+" "+row['avgvertbreak']+" "+row['avgeffectvelo']

In [9]:
#fig, ax = plt.subplots(figsize=(100,100))
#sns.heatmap(cosine_sim,cmap='magma',ax=ax)

In [10]:
def get_index_from_pitcher(pitcher):
    return cutter[cutter.pitcher == pitcher].index[0]

In [11]:
def get_pitcher_from_index(index):
    return cutter[cutter.index == index].values[0][0]

In [12]:
mlb_draft_pitcher = []
mlb_comp = []
cos_value = []

for x in pitchersDL:
    
    mlb_draft_pitcher.append(x)
    
    try:

        cutter = pd.concat([cutterDL, cutterMLB])

        cutter["combined_features"] = cutter.apply(combined_features, axis =1)

        cv = CountVectorizer()
        count_matrix = cv.fit_transform(cutter["combined_features"])

        cosine_sim = cosine_similarity(count_matrix)

        pitcher_like = x
        cutter = cutter[(cutter['pitcher'] == pitcher_like) | (cutter['league'] == 'MLB')]

        player_index = get_index_from_pitcher(pitcher_like)

        similar_pitchers = list(enumerate(cosine_sim[player_index]))

        sorted_similar_pitchers = sorted(similar_pitchers, key=lambda x:x[1], reverse=True)

        mlb_comp.append(get_pitcher_from_index(sorted_similar_pitchers[1][0]))
        
        cos_value.append(sorted_similar_pitchers[1][1])
                
    except IndexError:
        mlb_comp.append('No Comp')
        cos_value.append('No Similarity')
        pass

In [13]:
cosine_comp = pd.DataFrame(list(zip(mlb_draft_pitcher, mlb_comp, cos_value)), columns=['MLB DL Pitcher', 'MLB Comp', "Cosine Value"])
cosine_comp = cosine_comp.sort_values('MLB DL Pitcher')
cosine_comp

Unnamed: 0,MLB DL Pitcher,MLB Comp,Cosine Value
6,"Blatter, Brock","Matzek, Tyler",0.363636
23,"Bortka, Josh","Pérez, Martín",0.381385
28,"Bosch, Carter","Wade, Konner",0.353553
17,"Braithwaite, Trey","Loup, Aaron",0.222222
35,"Brewer, Michael","Thompson, Keegan",0.251976
47,"Brown, Matt","Anderson, Tyler",0.381385
3,"Busse, Terry","Bumgarner, Madison",0.381385
24,"Carter, Alan","Canning, Griffin",0.235702
19,"Cherry, Derrick","Voth, Austin",0.251976
49,"Chiriboga, Chris","Bumgarner, Madison",0.381385


In [14]:
cosine_comp.to_csv('cutter_cosine_comp.csv', sep='\t')