In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
gca = pd.read_csv('/Users/amitmishra/epl_2425/data/player_gca_2025_cleaned.csv')
poss = pd.read_csv('/Users/amitmishra/epl_2425/data/player_possession_2025_cleaned.csv')
defense = pd.read_csv('/Users/amitmishra/epl_2425/data/player_defense_2025_cleaned.csv')
passes = pd.read_csv('/Users/amitmishra/epl_2425/data/player_pass_2025_cleaned.csv')
shots = pd.read_csv('/Users/amitmishra/epl_2425/data/player_shot_2025_cleaned.csv')

In [3]:
gca = gca.reset_index(drop = True)
poss = poss.reset_index(drop = True)
passes = passes.reset_index(drop = True)
defense = defense.reset_index(drop = True)
shots = shots.reset_index(drop = True)

In [4]:
gca = gca.drop(['Nation', 'Born', 'Matches'], axis = 1)
poss = poss.drop(['Nation', 'Born', 'Matches'], axis = 1)
passes = passes.drop(['Nation', 'Born', 'Matches'], axis = 1)
defense = defense.drop(['Nation', 'Born', 'Matches'], axis = 1)
shots = shots.drop(['Nation', 'Born', 'Matches'], axis = 1)

In [5]:
shots.head()

Unnamed: 0,Player,Pos,Squad,Age,90s,Goals,Shots,SoT,SoT%,Sh/90,...,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
0,Joshua Acheampong,DF,Chelsea,18,1.9,0,2,0,0.0,1.06,...,,8.9,0,0,0,0.2,0.2,0.11,-0.2,-0.2
1,Tyler Adams,MF,Bournemouth,25,21.8,0,9,2,22.2,0.41,...,0.0,16.9,0,0,0,1.6,1.6,0.18,-1.6,-1.6
2,Tosin Adarabioyo,DF,Chelsea,26,15.7,1,13,2,15.4,0.83,...,0.5,12.5,0,0,0,0.9,0.9,0.07,0.1,0.1
3,Simon Adingra,"FW,MF",Brighton,22,12.2,2,33,8,24.2,2.71,...,0.25,17.0,0,0,0,2.5,2.5,0.07,-0.5,-0.5
4,Emmanuel Agbadou,DF,Wolves,27,15.7,1,7,2,28.6,0.45,...,0.5,27.6,1,0,0,0.8,0.8,0.12,0.2,0.2


In [6]:
gca.dtypes

Player          object
Pos             object
Squad           object
Age             object
90s             object
SCA             object
SCA90           object
SCA_PassLive    object
SCA_PassDead    object
SCA_TO          object
SCA_Shot        object
SCA_Fld         object
SCA_DefAct      object
GCA             object
GCA90           object
GCA_PassLive    object
GCA_PassDead    object
GCA_TO          object
GCA_Shot        object
GCA_Fld         object
GCA_DefAct      object
dtype: object

In [7]:
text_cols = ['Player', 'Pos', 'Squad']

#Changing the datatypes for the gca dataframe

for col in gca.columns:
    if col not in text_cols:
        gca[col] = pd.to_numeric(gca[col], errors='coerce')

In [8]:
#Changing the datatypes for the possession dataframe

for poss_col in poss.columns:
    if poss_col not in text_cols:
        poss[poss_col] = pd.to_numeric(poss[poss_col], errors = 'coerce')
        
for pass_col in passes.columns:
    if pass_col not in text_cols:
        passes[pass_col] = pd.to_numeric(passes[pass_col], errors = 'coerce')
        
for def_col in defense.columns:
    if def_col not in text_cols:
        defense[def_col] = pd.to_numeric(defense[def_col], errors = 'coerce')
        
for shot_col in shots.columns:
    if shot_col not in text_cols:
        shots[shot_col] = pd.to_numeric(shots[shot_col], errors = 'coerce')

In [9]:
shots.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'Goals', 'Shots', 'SoT', 'SoT%',
       'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK', 'PKatt', 'xG',
       'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG'],
      dtype='object')

### Merging columns

In [10]:
#First let's specify only the features we want in each of the columns we want to merge

gca_features = ['Player', 'Pos', 'Squad', 'Age', '90s','SCA_TO', 'SCA_PassLive', 'SCA_Shot']
def_features = ['Player', 'Tkl_Mid', 'Tkl_Att', 'Interceptions']
poss_features = ['Player','Touches_Att3rd', 'Touches_AttPen', 'PrgC', 'Carry1/3', 'Carry_PA']
shot_features = ['Player', 'Goals', 'Shots', 'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'xG','npxG']

In [11]:
base_df = gca[gca_features]
df = base_df.merge(defense[def_features], on = 'Player', how = 'left')
df = df.merge(poss[poss_features], on = 'Player', how = 'left')
df = df.merge(shots[shot_features], on = 'Player', how = 'left')
df.head()

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
0,Joshua Acheampong,DF,Chelsea,18.0,1.9,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,1.06,0.0,0.0,,8.9,0.2,0.2
1,Tyler Adams,MF,Bournemouth,25.0,21.8,0.0,35.0,1.0,49.0,8.0,...,9.0,2.0,22.2,0.41,0.09,0.0,0.0,16.9,1.6,1.6
2,Tosin Adarabioyo,DF,Chelsea,26.0,15.7,0.0,8.0,6.0,6.0,0.0,...,13.0,2.0,15.4,0.83,0.13,0.08,0.5,12.5,0.9,0.9
3,Simon Adingra,"FW,MF",Brighton,22.0,12.2,6.0,33.0,8.0,8.0,5.0,...,33.0,8.0,24.2,2.71,0.66,0.06,0.25,17.0,2.5,2.5
4,Emmanuel Agbadou,DF,Wolves,27.0,15.7,0.0,15.0,0.0,9.0,0.0,...,7.0,2.0,28.6,0.45,0.13,0.14,0.5,27.6,0.8,0.8


In [12]:
df.columns

Index(['Player', 'Pos', 'Squad', 'Age', '90s', 'SCA_TO', 'SCA_PassLive',
       'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Interceptions', 'Touches_Att3rd',
       'Touches_AttPen', 'PrgC', 'Carry1/3', 'Carry_PA', 'Goals', 'Shots',
       'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist', 'xG',
       'npxG'],
      dtype='object')

### Choose features from the following to get similar players: 

#### 'SCA_TO', 'SCA_PassLive',
#### 'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Pass_blocked', 'Interceptions',
#### 'Touches_Att3rd', 'Touches_AttPen', 'PrgC', 'Carry_PA'
#### 'SoT%', 'G/SoT','xG','npxG'

### Implementing cosine similarity

In [13]:
similar_features = ['SCA_TO', 'SCA_PassLive',
       'SCA_Shot', 'Tkl_Mid', 'Tkl_Att', 'Interceptions', 'Touches_Att3rd',
       'Touches_AttPen', 'PrgC', 'Carry_PA', 'G/SoT','npxG']

def player_finder(df, target_player, features, top_n = 10):
    #Check if target player exists
    if target_player not in df['Player'].values:
        print(f'{target_player} not in the list')
        return None
    
    #Filter for more than 10 full games
    df_filtered = df[df['90s'] >10].copy()  
    
    #Select only the features we want to compare
    feature_data = df_filtered[features].copy()
    
    #Handle missing data
    feature_data = feature_data.fillna(0)
    
    #Standardize the features so the ones with greater magintude don't have a bigger say
    scaler = StandardScaler()
    feature_data_scaled = scaler.fit_transform(feature_data)
    
    #Find index of the target player
    target_idx = df_filtered[df_filtered['Player'] == target_player].index[0]
    target_player_idx = df_filtered.index.get_loc(target_idx)
    
    #Calculate cosine similarity
    similarities = cosine_similarity([feature_data_scaled[target_player_idx]], feature_data_scaled)[0]
    
    # Create results DataFrame
    results = df_filtered[['Player', 'Squad', 'Pos', 'Age'] + features].copy()
    results['Similarity'] = similarities*100
    
    # Sort by similarity (descending) and exclude the target player
    results = results[results['Player'] != target_player].sort_values('Similarity', ascending=False)
    
    return results.head(top_n) 

In [14]:
target_player = "Nicolas Jackson"
similar_players = player_finder(df, target_player, similar_features, top_n=10)

In [15]:
print(f'Players similar to {target_player}:')
similar_players[(similar_players['Pos'] == 'FW') | (similar_players['Pos'] == 'FW,MF')]

Players similar to Nicolas Jackson:


Unnamed: 0,Player,Squad,Pos,Age,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,Interceptions,Touches_Att3rd,Touches_AttPen,PrgC,Carry_PA,G/SoT,npxG,Similarity
106843,Alexander Isak,Newcastle Utd,FW,24.0,8.0,70.0,9.0,3.0,7.0,3.0,596.0,189.0,83.0,43.0,0.46,17.2,94.60403
85519,Erling Haaland,Manchester City,FW,24.0,8.0,51.0,10.0,3.0,5.0,5.0,384.0,190.0,24.0,23.0,0.35,18.8,93.715763
224322,Ollie Watkins,Aston Villa,FW,28.0,4.0,36.0,7.0,6.0,4.0,8.0,407.0,168.0,55.0,38.0,0.39,13.8,90.999451
234985,Yoane Wissa,Brentford,FW,27.0,4.0,40.0,13.0,8.0,8.0,8.0,432.0,163.0,60.0,34.0,0.46,18.5,90.064745
192277,Kevin Schade,Brentford,FW,22.0,6.0,35.0,4.0,10.0,12.0,5.0,507.0,146.0,61.0,38.0,0.41,8.2,88.13184
106861,Diogo Jota,Liverpool,FW,27.0,6.0,24.0,5.0,8.0,4.0,8.0,259.0,97.0,37.0,21.0,0.46,7.9,87.917879
53422,Liam Delap,Ipswich Town,FW,21.0,12.0,28.0,7.0,3.0,5.0,2.0,335.0,92.0,61.0,30.0,0.34,7.8,87.233244
74839,Cody Gakpo,Liverpool,FW,25.0,9.0,57.0,7.0,13.0,9.0,12.0,557.0,110.0,59.0,28.0,0.45,7.1,86.532152
106857,Brennan Johnson,Tottenham,FW,23.0,6.0,30.0,4.0,20.0,10.0,4.0,475.0,125.0,60.0,35.0,0.55,10.4,86.201532
224292,Jamie Vardy,Leicester City,FW,37.0,6.0,33.0,2.0,7.0,5.0,5.0,260.0,111.0,29.0,16.0,0.35,9.6,85.463001


In [16]:
df[df['Player'] == 'João Pedro']

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
106854,João Pedro,"FW,MF",Brighton,22.0,21.6,3.0,56.0,2.0,6.0,4.0,...,42.0,15.0,35.7,1.94,0.69,0.12,0.33,17.2,8.9,5.0


In [17]:
df[df['Player'] == 'Nicolas Jackson']

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
106845,Nicolas Jackson,FW,Chelsea,23.0,24.7,9.0,42.0,7.0,7.0,9.0,...,76.0,34.0,44.7,3.08,1.38,0.13,0.29,12.9,12.3,12.3


In [18]:
df[df['Player'] == 'Noni Madueke']

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
128207,Noni Madueke,FW,Chelsea,22.0,22.6,10.0,57.0,7.0,12.0,5.0,...,80.0,30.0,37.5,3.54,1.33,0.09,0.23,14.4,9.6,9.6


In [19]:
df[df['Player'] == 'Alejandro Garnacho']

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
74842,Alejandro Garnacho,"MF,FW",Manchester Utd,20.0,24.4,3.0,64.0,9.0,6.0,13.0,...,84.0,29.0,34.5,3.44,1.19,0.07,0.21,17.0,7.3,7.3


In [20]:
df[df['Player'] == 'Liam Delap']

Unnamed: 0,Player,Pos,Squad,Age,90s,SCA_TO,SCA_PassLive,SCA_Shot,Tkl_Mid,Tkl_Att,...,Shots,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,xG,npxG
53422,Liam Delap,FW,Ipswich Town,21.0,28.8,12.0,28.0,7.0,3.0,5.0,...,66.0,29.0,43.9,2.29,1.01,0.15,0.34,16.0,9.3,7.8


In [28]:
afc = df[df['Squad'] == 'Arsenal']

In [22]:
afc = afc[afc['Pos'] != 'GK']

In [32]:
import plotly.express as px


fig = px.scatter(afc, x=afc.Tkl_Mid/afc['90s'] , y= afc.Interceptions/afc['90s'], text='Player')

fig.update_layout(width = 1200, height = 650, xaxis_title = 'Tackles in midfield per 90', yaxis_title = 'Interceptions per 90', title = 'Tackles vs Midfield Interceptions (Stats via Fbref)')


fig.update_traces(marker_color = 'red',textposition='top center') 
fig.show()

In [24]:
df.Squad

0                Chelsea
1            Bournemouth
2                Chelsea
3               Brighton
4                 Wolves
               ...      
234992           Everton
234993       Bournemouth
234994           Arsenal
234995    Manchester Utd
234996           Arsenal
Name: Squad, Length: 234997, dtype: object

In [26]:
for k in df.Squad.unique():
    team_data = df[(df['Squad'] == k) & (df['90s'] >= 10)]
    fig = px.scatter(team_data, x=team_data.Tkl_Mid/team_data['90s'] , y= team_data.Interceptions/team_data['90s'], text='Player')

    fig.update_layout(width = 1200, height = 650, xaxis_title = 'Tackles in midfield per 90', yaxis_title = 'Interceptions per 90', title = f'Midfield Tackles and Interceptions - {k}')


    fig.update_traces(marker_color = 'black',textposition='top center')
    fig.show()

ValueError: Cannot accept list of column references or list of columns for both `x` and `y`.

In [27]:
# Calculate the stats first
df['Tkl_Mid_per_90'] = df['Tkl_Mid'] / df['90s']
df['Interceptions_per_90'] = df['Interceptions'] / df['90s']

for k in df.Squad.unique():
    team_data = df[(df['Squad'] == k) & (df['90s'] >= 10)]
    fig = px.scatter(team_data, 
                    x='Tkl_Mid_per_90', 
                    y='Interceptions_per_90', 
                    text='Player')

    fig.update_layout(width=1200, height=650, 
                     xaxis_title='Tackles in midfield per 90', 
                     yaxis_title='Interceptions per 90', 
                     title=f'Midfield Tackles and Interceptions - {k}')

    fig.update_traces(marker_color='black', textposition='top center')
    fig.show()