In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
gca = pd.read_csv('/Users/amitmishra/epl_2425/data/player_gca_2025_cleaned.csv')
poss = pd.read_csv('/Users/amitmishra/epl_2425/data/player_possession_2025_cleaned.csv')
defense = pd.read_csv('/Users/amitmishra/epl_2425/data/player_defense_2025_cleaned.csv')
passes = pd.read_csv('/Users/amitmishra/epl_2425/data/player_pass_2025_cleaned.csv')
shots = pd.read_csv('/Users/amitmishra/epl_2425/data/player_shot_2025_cleaned.csv')

In [5]:
gca = gca.reset_index(drop = True)
poss = poss.reset_index(drop = True)
passes = passes.reset_index(drop = True)
defense = defense.reset_index(drop = True)
shots = shots.reset_index(drop = True)

In [6]:
gca = gca.drop(['Nation', 'Born', 'Matches'], axis = 1)
poss = poss.drop(['Nation', 'Born', 'Matches'], axis = 1)
passes = passes.drop(['Nation', 'Born', 'Matches'], axis = 1)
defense = defense.drop(['Nation', 'Born', 'Matches'], axis = 1)
shots = shots.drop(['Nation', 'Born', 'Matches'], axis = 1)

In [7]:
shots.head()

Unnamed: 0,Player,Pos,Squad,Age,90s,Goals,Shots,SoT,SoT%,Sh/90,...,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
0,Joshua Acheampong,DF,Chelsea,18,1.9,0,2,0,0.0,1.06,...,,8.9,0,0,0,0.2,0.2,0.11,-0.2,-0.2
1,Tyler Adams,MF,Bournemouth,25,21.8,0,9,2,22.2,0.41,...,0.0,16.9,0,0,0,1.6,1.6,0.18,-1.6,-1.6
2,Tosin Adarabioyo,DF,Chelsea,26,15.7,1,13,2,15.4,0.83,...,0.5,12.5,0,0,0,0.9,0.9,0.07,0.1,0.1
3,Simon Adingra,"FW,MF",Brighton,22,12.2,2,33,8,24.2,2.71,...,0.25,17.0,0,0,0,2.5,2.5,0.07,-0.5,-0.5
4,Emmanuel Agbadou,DF,Wolves,27,15.7,1,7,2,28.6,0.45,...,0.5,27.6,1,0,0,0.8,0.8,0.12,0.2,0.2


In [6]:
gca.dtypes

Player          object
Pos             object
Squad           object
Age             object
90s             object
SCA             object
SCA90           object
SCA_PassLive    object
SCA_PassDead    object
SCA_TO          object
SCA_Shot        object
SCA_Fld         object
SCA_DefAct      object
GCA             object
GCA90           object
GCA_PassLive    object
GCA_PassDead    object
GCA_TO          object
GCA_Shot        object
GCA_Fld         object
GCA_DefAct      object
dtype: object

In [9]:
text_cols = ['Player', 'Pos', 'Squad']

#Changing the datatypes for the gca dataframe

for col in gca.columns:
    if col not in text_cols:
        gca[col] = pd.to_numeric(gca[col], errors='coerce')

In [10]:
#Changing the datatypes for the possession dataframe

for poss_col in poss.columns:
    if poss_col not in text_cols:
        poss[poss_col] = pd.to_numeric(poss[poss_col], errors = 'coerce')
        
for pass_col in passes.columns:
    if pass_col not in text_cols:
        passes[pass_col] = pd.to_numeric(passes[pass_col], errors = 'coerce')
        
for def_col in defense.columns:
    if def_col not in text_cols:
        defense[def_col] = pd.to_numeric(defense[def_col], errors = 'coerce')
        
for shot_col in shots.columns:
    if shot_col not in text_cols:
        shots[shot_col] = pd.to_numeric(shots[shot_col], errors = 'coerce')

In [13]:
shots.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'Goals', 'Shots', 'SoT', 'SoT%',
       'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK', 'PKatt', 'xG',
       'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG'],
      dtype='object')

### Merging columns

In [14]:
#First let's specify only the features we want in each of the columns we want to merge

gca_features = ['Player', 'Pos', 'Squad', 'Age', '90s','SCA_TO', 'SCA_PassLive', 'SCA_Shot']
def_features = ['Player', 'Tkl_Mid', 'Tkl_Att', 'Interceptions']
poss_features = ['Player','Touches_Att3rd', 'Touches_AttPen', 'PrgC', 'Carry1/3', 'Carry_PA']
shot_features = ['Player', 'Goals', 'Shots', 'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'xG','npxG']

In [15]:
base_df = gca[gca_features]
df = base_df.merge(defense[def_features], on = 'Player', how = 'left')
df = df.merge(poss[poss_features], on = 'Player', how = 'left')
df = df.merge(shots[shot_features], on = 'Player', how = 'left')
df.head()

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
0,Joshua Acheampong,DF,Chelsea,18.0,1.9,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,1.06,0.0,0.0,,8.9,0.2,0.2
1,Tyler Adams,MF,Bournemouth,25.0,21.8,0.0,35.0,1.0,49.0,8.0,...,9.0,2.0,22.2,0.41,0.09,0.0,0.0,16.9,1.6,1.6
2,Tosin Adarabioyo,DF,Chelsea,26.0,15.7,0.0,8.0,6.0,6.0,0.0,...,13.0,2.0,15.4,0.83,0.13,0.08,0.5,12.5,0.9,0.9
3,Simon Adingra,"FW,MF",Brighton,22.0,12.2,6.0,33.0,8.0,8.0,5.0,...,33.0,8.0,24.2,2.71,0.66,0.06,0.25,17.0,2.5,2.5
4,Emmanuel Agbadou,DF,Wolves,27.0,15.7,0.0,15.0,0.0,9.0,0.0,...,7.0,2.0,28.6,0.45,0.13,0.14,0.5,27.6,0.8,0.8


In [16]:
df.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'SCA_TO', 'SCA_PassLive',
       'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Interceptions', 'Touches_Att3rd',
       'Touches_AttPen', 'PrgC', 'Carry1/3', 'Carry_PA', 'Goals', 'Shots',
       'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'xG',
       'npxG'],
      dtype='object')

### Choose features from the following to get similar players: 

#### 'SCA_TO', 'SCA_PassLive',
#### 'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Pass_blocked', 'Interceptions',
#### 'Touches_Att3rd', 'Touches_AttPen', 'PrgC', 'Carry_PA'
#### 'SoT%', 'G/SoT','xG','npxG'

### Implementing cosine similarity

In [17]:
similar_features = ['SCA_TO', 'SCA_PassLive',
       'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Interceptions', 'Touches_Att3rd',
       'Touches_AttPen', 'PrgC', 'Carry1/3', 'Carry_PA','SoT%', 'G/SoT','npxG']

def player_finder(df, target_player, features, top_n = 10):
    #Check if target player exists
    if target_player not in df['Player'].values:
        print(f'{target_player} not in the list')
        return None
    
    #Filter for more than 10 full games
    df_filtered = df[df['90s'] >10].copy()  
    
    #Select only the features we want to compare
    feature_data = df_filtered[features].copy()
    
    #Handle missing data
    feature_data = feature_data.fillna(0)
    
    #Standardize the features so the ones with greater magintude don't have a bigger say
    scaler = StandardScaler()
    feature_data_scaled = scaler.fit_transform(feature_data)
    
    #Find index of the target player
    target_idx = df_filtered[df_filtered['Player'] == target_player].index[0]
    target_player_idx = df_filtered.index.get_loc(target_idx)
    
    #Calculate cosine similarity
    similarities = cosine_similarity([feature_data_scaled[target_player_idx]], feature_data_scaled)[0]
    
    # Create results DataFrame
    results = df_filtered[['Player', 'Squad', 'Pos', 'Age'] + features].copy()
    results['Similarity'] = similarities
    
    # Sort by similarity (descending) and exclude the target player
    results = results[results['Player'] != target_player].sort_values('Similarity', ascending=False)
    
    return results.head(top_n) 

In [18]:
target_player = "Gabriel Martinelli"
similar_players = player_finder(df, target_player, similar_features, top_n=10)

In [19]:
print(f'Players similar to {target_player} are:')
similar_players[(similar_players['Pos'] == 'FW') | (similar_players['Pos'] == 'FW,MF')]

Players similar to Gabriel Martinelli are:


Unnamed: 0,Player,Squad,Pos,Age,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,Interceptions,Touches_Att3rd,Touches_AttPen,PrgC,Carry1/3,Carry_PA,SoT%,G/SoT,npxG,Similarity
128207,Noni Madueke,Chelsea,FW,22.0,10.0,57.0,7.0,12.0,5.0,10.0,595.0,184.0,154.0,50.0,93.0,37.5,0.23,9.6,0.957336
181618,Mohamed Salah,Liverpool,FW,32.0,22.0,123.0,14.0,5.0,9.0,9.0,1119.0,356.0,154.0,59.0,126.0,41.3,0.4,18.2,0.948022
10735,Harvey Barnes,Newcastle Utd,FW,26.0,7.0,44.0,6.0,6.0,6.0,6.0,462.0,128.0,93.0,48.0,47.0,31.7,0.45,7.2,0.945905
181617,Bukayo Saka,Arsenal,"FW,MF",22.0,12.0,71.0,10.0,11.0,9.0,3.0,715.0,162.0,96.0,32.0,51.0,33.3,0.23,6.0,0.927575
64132,Anthony Elanga,Nott'ham Forest,"FW,MF",22.0,5.0,56.0,7.0,8.0,3.0,6.0,679.0,110.0,102.0,45.0,48.0,54.5,0.25,4.5,0.90667
192274,Sávio,Manchester City,"FW,MF",20.0,15.0,80.0,4.0,10.0,8.0,6.0,774.0,154.0,137.0,45.0,75.0,38.2,0.05,5.0,0.902771
96180,Son Heung-min,Tottenham,FW,32.0,10.0,87.0,7.0,8.0,6.0,5.0,668.0,122.0,96.0,49.0,44.0,43.6,0.25,5.7,0.900741
138875,Bryan Mbeumo,Brentford,FW,24.0,11.0,98.0,8.0,15.0,16.0,14.0,998.0,168.0,130.0,69.0,75.0,43.0,0.44,7.5,0.900073
