## **Player Recommender Tool**

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### **Data Retrieval**

In [2]:
# fbref table url
url = "https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats"

_standard stats_
- https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats
- https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats
- https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats

_defensive actions_
- https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats

_miscellaneous_
- https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats

_passing_
- https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats

_possession_
- https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats

In [3]:
big5_stats = pd.read_html(url)[0]
big5_stats.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Playing Time,Playing Time,...,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 37_level_0
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,23,2000,20,13,...,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,Matches
1,2,Brenden Aaronson,us USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14,...,0.14,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27,Matches
2,3,Paxten Aaronson,us USA,MF,Eint Frankfurt,de Bundesliga,19,2003,7,1,...,0.89,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19,Matches
3,4,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,17,2006,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
4,5,Yunis Abdelhamid,ma MAR,DF,Reims,fr Ligue 1,35,1987,31,31,...,0.0,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09,Matches


In [4]:
# creating a data with the same headers but without multi indexing
big5_stats.columns = [' '.join(col).strip() for col in big5_stats.columns]
big5_stats = big5_stats.reset_index(drop=True)
big5_stats.head(1)

Unnamed: 0,Unnamed: 0_level_0 Rk,Unnamed: 1_level_0 Player,Unnamed: 2_level_0 Nation,Unnamed: 3_level_0 Pos,Unnamed: 4_level_0 Squad,Unnamed: 5_level_0 Comp,Unnamed: 6_level_0 Age,Unnamed: 7_level_0 Born,Playing Time MP,Playing Time Starts,...,Per 90 Minutes Ast,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Unnamed: 37_level_0 Matches
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,23,2000,20,13,...,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,Matches


In [5]:
# creating a list with new names
new_columns = []
for col in big5_stats.columns:
  if 'level_0' in col:
      new_col = col.split()[-1]  # takes the last name
  else:
      new_col = col
  new_columns.append(new_col)

# rename columns
big5_stats.columns = new_columns
big5_stats = big5_stats.fillna(0)

In [6]:
# clean data
big5_stats['Age'] = big5_stats['Age'].str[:2]
big5_stats['Position'] = big5_stats['Pos'].str[:2]
big5_stats['Nation'] = big5_stats['Nation'].str.split(' ').str.get(1)
big5_stats['League'] = big5_stats['Comp'].str.split(' ').str.get(1)
big5_stats['League_'] = big5_stats['Comp'].str.split(' ').str.get(2)
big5_stats['League'] = big5_stats['League'] + ' ' + big5_stats['League_']
big5_stats = big5_stats.drop(columns=['League_', 'Comp', 'Rk', 'Pos','Matches'])

In [7]:
big5_stats.head()

Unnamed: 0,Player,Nation,Squad,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,Playing Time 90s,Performance Gls,...,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Position,League
0,Max Aarons,ENG,Bournemouth,23,2000,20,13,1237,13.7,0,...,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,DF,Premier League
1,Brenden Aaronson,USA,Union Berlin,22,2000,30,14,1267,14.1,2,...,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27,MF,
2,Paxten Aaronson,USA,Eint Frankfurt,19,2003,7,1,101,1.1,0,...,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19,MF,
3,Keyliane Abdallah,FRA,Marseille,17,2006,1,0,4,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FW,Ligue 1
4,Yunis Abdelhamid,MAR,Reims,35,1987,31,31,2781,30.9,4,...,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09,DF,Ligue 1


In [8]:
big5_stats.dtypes

Player                     object
Nation                     object
Squad                      object
Age                        object
Born                       object
Playing Time MP            object
Playing Time Starts        object
Playing Time Min           object
Playing Time 90s           object
Performance Gls            object
Performance Ast            object
Performance G+A            object
Performance G-PK           object
Performance PK             object
Performance PKatt          object
Performance CrdY           object
Performance CrdR           object
Expected xG                object
Expected npxG              object
Expected xAG               object
Expected npxG+xAG          object
Progression PrgC           object
Progression PrgP           object
Progression PrgR           object
Per 90 Minutes Gls         object
Per 90 Minutes Ast         object
Per 90 Minutes G+A         object
Per 90 Minutes G-PK        object
Per 90 Minutes G+A-PK      object
Per 90 Minutes

##### **Data Preparation**

In [9]:
# check for null values
print("Number of null values in each column:")
big5_stats.isna().sum()

Number of null values in each column:


Player                       0
Nation                     119
Squad                        0
Age                          5
Born                         0
Playing Time MP              0
Playing Time Starts          0
Playing Time Min             0
Playing Time 90s             0
Performance Gls              0
Performance Ast              0
Performance G+A              0
Performance G-PK             0
Performance PK               0
Performance PKatt            0
Performance CrdY             0
Performance CrdR             0
Expected xG                  0
Expected npxG                0
Expected xAG                 0
Expected npxG+xAG            0
Progression PrgC             0
Progression PrgP             0
Progression PrgR             0
Per 90 Minutes Gls           0
Per 90 Minutes Ast           0
Per 90 Minutes G+A           0
Per 90 Minutes G-PK          0
Per 90 Minutes G+A-PK        0
Per 90 Minutes xG            0
Per 90 Minutes xAG           0
Per 90 Minutes xG+xAG        0
Per 90 M

In [10]:
null_value_counts = big5_stats.isna().sum()
sorted_null_values = null_value_counts.sort_values(ascending=False)
sorted_null_values.head()

League                621
Nation                119
Age                     5
Position                0
Per 90 Minutes G+A      0
dtype: int64

In [11]:
#column_to_check = 'League' # all Bundesliga
#column_to_check = 'Nation' # in web, the main table is divided in multiple tables, so null rows are headers of each table
#column_to_check = 'Age'
column_to_check = 'Born'

null_rows = big5_stats.loc[big5_stats[column_to_check].isnull()]

print("Rows where '{}' is null:".format(column_to_check))
print(null_rows[['Player', 'Nation', 'Squad', 'Age', 'Born']])

Rows where 'Born' is null:
Empty DataFrame
Columns: [Player, Nation, Squad, Age, Born]
Index: []


In [12]:
# null 1
big5_stats['League'] = big5_stats['League'].fillna('Bundesliga')

In [13]:
# null 2
term_to_remove = 'Player'
print(big5_stats.shape)
print('')

big5_stats = big5_stats.loc[~big5_stats['Player'].str.contains(term_to_remove, na=False)]
print(big5_stats.shape)

(2966, 36)

(2852, 36)


In [14]:
# null 3

# Max Moerstedt - Hoffenheim
big5_stats.loc[1854, 'Age'] = 18
big5_stats.loc[1854, 'Born'] = 2006
# Marco Pellegrino - Salernitana
big5_stats.loc[2155, 'Nation'] = "ARG"
big5_stats.loc[2155, 'Age'] = 21
big5_stats.loc[2155, 'Born'] = 2002
# Marco Pellegrino - Milan
big5_stats.loc[2156, 'Nation'] = "ARG"
big5_stats.loc[2156, 'Age'] = 21
big5_stats.loc[2156, 'Born'] = 2002
# Pablo Saenz - Granada
big5_stats.loc[2399, 'Nation'] = "ESP"
big5_stats.loc[2399, 'Age'] = 23
big5_stats.loc[2399, 'Born'] = 2001
# Max Svensson - Osasuna
big5_stats.loc[2629, 'Age'] = 22
big5_stats.loc[2629, 'Born'] = 2001
# Santiago García - Getafe
big5_stats.loc[986, 'Nation'] = "ESP"
# Mahmut Kücüksahin - Augsburg
big5_stats.loc[1449, 'Nation'] = "TUR"

In [15]:
# convert from objects to numeric
columns_to_convert = ['Age', 'Born', 'Playing Time MP', 'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s', 'Performance Gls',
       'Performance Ast', 'Performance G+A', 'Performance G-PK', 'Performance PK', 'Performance PKatt', 'Performance CrdY',
       'Performance CrdR', 'Expected xG', 'Expected npxG', 'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC', 'Progression PrgP',
       'Progression PrgR', 'Per 90 Minutes Gls', 'Per 90 Minutes Ast', 'Per 90 Minutes G+A', 'Per 90 Minutes G-PK', 'Per 90 Minutes G+A-PK',
       'Per 90 Minutes xG', 'Per 90 Minutes xAG', 'Per 90 Minutes xG+xAG', 'Per 90 Minutes npxG', 'Per 90 Minutes npxG+xAG']

big5_stats[columns_to_convert] = big5_stats[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [16]:
duplicated_rows = big5_stats[big5_stats.duplicated(subset=['Player'])]

print("Rows where the value in column 'Player' is duplicated:")
print(duplicated_rows[['Player', 'Nation', 'Squad', 'Age', 'Born']]) # players that moved in this season

Rows where the value in column 'Player' is duplicated:
                  Player Nation          Squad  Age  Born
17    Bénie Adama Traore    CIV         Nantes   20  2002
36         Lucien Agoume    FRA        Sevilla   21  2002
55         Sergio Akieme    ESP          Reims   25  1997
59         Paul Akouokou    CIV           Lyon   25  1997
75       Mohamed Ali Cho    FRA           Nice   19  2004
...                  ...    ...            ...  ...   ...
2929        Duván Zapata    COL       Atalanta   32  1991
2933      Bryan Zaragoza    ESP  Bayern Munich   22  2001
2941      Alessio Zerbin    ITA         Napoli   24  1999
2955        Nadir Zortea    ITA       Atalanta   24  1999
2964         Milan Đurić    BIH          Monza   33  1990

[149 rows x 5 columns]


In [17]:
# find Acerbi
search_text = 'Acerbi'
filtered_acerbi = big5_stats[big5_stats['Player'].str.contains(search_text)]
filtered_acerbi

Unnamed: 0,Player,Nation,Squad,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,Playing Time 90s,Performance Gls,...,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Position,League
13,Francesco Acerbi,ITA,Inter,35,1988,29,26,2388,26.5,3,...,0.15,0.11,0.15,0.06,0.07,0.13,0.06,0.13,DF,Serie A


In [18]:
# save df
big5_stats.to_csv('../data/big5_stats.csv', index=False)

Filter dataframe to include only _defenders_

In [19]:
print(big5_stats.columns)
print('')
print(big5_stats['Position'].unique().tolist())

Index(['Player', 'Nation', 'Squad', 'Age', 'Born', 'Playing Time MP',
       'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s',
       'Performance Gls', 'Performance Ast', 'Performance G+A',
       'Performance G-PK', 'Performance PK', 'Performance PKatt',
       'Performance CrdY', 'Performance CrdR', 'Expected xG', 'Expected npxG',
       'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC',
       'Progression PrgP', 'Progression PrgR', 'Per 90 Minutes Gls',
       'Per 90 Minutes Ast', 'Per 90 Minutes G+A', 'Per 90 Minutes G-PK',
       'Per 90 Minutes G+A-PK', 'Per 90 Minutes xG', 'Per 90 Minutes xAG',
       'Per 90 Minutes xG+xAG', 'Per 90 Minutes npxG',
       'Per 90 Minutes npxG+xAG', 'Position', 'League'],
      dtype='object')

['DF', 'MF', 'FW', 'GK']


In [20]:
big5_def = big5_stats[big5_stats['Position'] == 'DF']
print(big5_def.shape)
print('')
print(big5_def[['Player']].head(10))

(1003, 36)

               Player
0          Max Aarons
4    Yunis Abdelhamid
6       Nabil Aberdin
9               Abner
11        Abdel Abqar
13   Francesco Acerbi
14  Joshua Acheampong
15       Marcos Acuña
22   Tosin Adarabioyo
27    Nathaniel Adjei


In [21]:
# save df
big5_def.to_csv('../data/big5_def.csv', index=False)

In [22]:
# create a dictionary
players = [f"{player} ({squad})" for player, squad in zip(big5_def['Player'], big5_def['Squad'])]
player_dict = {player: idx for idx, player in enumerate(players)}

In [23]:
player_dict

{'Max Aarons (Bournemouth)': 0,
 'Yunis Abdelhamid (Reims)': 1,
 'Nabil Aberdin (Getafe)': 2,
 'Abner (Betis)': 3,
 'Abdel Abqar (Alavés)': 4,
 'Francesco Acerbi (Inter)': 5,
 'Joshua Acheampong (Chelsea)': 6,
 'Marcos Acuña (Sevilla)': 7,
 'Tosin Adarabioyo (Fulham)': 8,
 'Nathaniel Adjei (Lorient)': 9,
 'Adryelson (Lyon)': 10,
 'Emmanuel Agbadou (Reims)': 11,
 'Felix Agu (Werder Bremen)': 12,
 'Nayef Aguerd (West Ham)': 13,
 'Ruben Aguilar (Lens)': 14,
 'Anel Ahmedhodžić (Sheffield Utd)': 15,
 'Joseph Aidoo (Celta Vigo)': 16,
 "Ola Aina (Nott'ham Forest)": 17,
 'Rayan Aït-Nouri (Wolves)': 18,
 'Kristoffer Ajer (Brentford)': 19,
 'Manuel Akanji (Manchester City)': 20,
 'Nathan Aké (Manchester City)': 21,
 'Sergio Akieme (Almería)': 22,
 'Sergio Akieme (Reims)': 23,
 'Kevin Akpoguma (Hoffenheim)': 24,
 'David Alaba (Real Madrid)': 25,
 'Raúl Albiol (Villarreal)': 26,
 'Omar Alderete (Getafe)': 27,
 'Trent Alexander-Arnold (Liverpool)': 28,
 'Marcos Alonso (Barcelona)': 29,
 'Adrià Alti

In [24]:
print(big5_stats.columns)

Index(['Player', 'Nation', 'Squad', 'Age', 'Born', 'Playing Time MP',
       'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s',
       'Performance Gls', 'Performance Ast', 'Performance G+A',
       'Performance G-PK', 'Performance PK', 'Performance PKatt',
       'Performance CrdY', 'Performance CrdR', 'Expected xG', 'Expected npxG',
       'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC',
       'Progression PrgP', 'Progression PrgR', 'Per 90 Minutes Gls',
       'Per 90 Minutes Ast', 'Per 90 Minutes G+A', 'Per 90 Minutes G-PK',
       'Per 90 Minutes G+A-PK', 'Per 90 Minutes xG', 'Per 90 Minutes xAG',
       'Per 90 Minutes xG+xAG', 'Per 90 Minutes npxG',
       'Per 90 Minutes npxG+xAG', 'Position', 'League'],
      dtype='object')


In [25]:
# select only data per 90min
big5_def_90min = big5_def[['Player', 'Nation', 'Squad', 'Age', 'Born', 'Playing Time MP',
       'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s',
       'Performance Gls', 'Performance Ast', 'Performance G+A',
       'Performance G-PK', 'Performance PK', 'Performance PKatt',
       'Performance CrdY', 'Performance CrdR', 'Expected xG', 'Expected npxG',
       'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC',
       'Progression PrgP', 'Progression PrgR', 'Position', 'League']]

In [26]:
# standardize data per 90 minutes
def scale_to_90(df, minutes_played_column, columns_to_scale):
    for column in columns_to_scale:
        df.loc[:, column] = (df[column] / df[minutes_played_column]) * 90
    return df

# Applying the function to scale specified columns
big5_def_90min = scale_to_90(big5_def_90min, 'Playing Time Min', columns_to_scale=['Performance Gls', 'Performance Ast', 'Performance G+A',
       'Performance G-PK', 'Performance PK', 'Performance PKatt', 'Performance CrdY', 'Performance CrdR', 'Expected xG', 'Expected npxG',
       'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC', 'Progression PrgP', 'Progression PrgR'])

In [27]:
new_order = ['Player', 'Nation', 'Squad', 'Age', 'Position', 'League',
             'Playing Time MP', 'Playing Time Starts', 'Playing Time Min',
             'Playing Time 90s', 'Performance Gls', 'Performance Ast',
             'Performance G+A', 'Performance G-PK', 'Performance PK',
             'Performance PKatt', 'Performance CrdY', 'Performance CrdR',
             'Expected xG', 'Expected npxG', 'Expected xAG',
             'Expected npxG+xAG', 'Progression PrgC', 'Progression PrgP',
             'Progression PrgR']

# Reindex the DataFrame with the new order of columns
big5_def_90min = big5_def_90min.reindex(columns=new_order)

In [28]:
big5_def_90min.columns

Index(['Player', 'Nation', 'Squad', 'Age', 'Position', 'League',
       'Playing Time MP', 'Playing Time Starts', 'Playing Time Min',
       'Playing Time 90s', 'Performance Gls', 'Performance Ast',
       'Performance G+A', 'Performance G-PK', 'Performance PK',
       'Performance PKatt', 'Performance CrdY', 'Performance CrdR',
       'Expected xG', 'Expected npxG', 'Expected xAG', 'Expected npxG+xAG',
       'Progression PrgC', 'Progression PrgP', 'Progression PrgR'],
      dtype='object')

In [29]:
big5_def_90min.rename(columns={'Performance Gls': 'Goals', 'Performance Ast': 'Assists', 'Performance G+A': 'Goals + Assists',
                                'Performance G-PK': 'Non-Penalty Goals', 'Performance PK': 'Penalty Goals',
                                'Performance PKatt': 'Penalty Kicked', 'Performance CrdY': 'Yellow Cards',
                                'Performance CrdR': 'Red Cards', 'Expected xG': 'xG',
                                'Expected npxG': 'Non-Penalty xG', 'Expected xAG': 'xAG',
                                'Expected npxG+xAG': 'npxG+xAG', 'Progression PrgC': 'Progressive Carries',
                                'Progression PrgP': 'Progressive Passes', 'Progression PrgR': 'Progressive Passes Received'}, inplace=True)

In [30]:
# save df
big5_def_90min.to_csv('../data/big5_def_90min.csv', index=False)

##### **Data Modeling**

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
big5_def_90min = pd.read_csv("../data/big5_def_90min.csv")
num_cols_90min = big5_def_90min.select_dtypes(include='number')

In [33]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(num_cols_90min)

In [34]:
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

In [35]:
print("Number of components selected by PCA:", pca.n_components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

Number of components selected by PCA: 11
Explained variance ratio: [0.20301446 0.17863613 0.11628544 0.09855554 0.09642692 0.05598553
 0.0553185  0.04861882 0.04559865 0.04145377 0.03419052]


In [36]:
# Similarity Calculation
similarity_matrix = cosine_similarity(X_pca)

In [37]:
# Set Target Player
target_player = 'Francesco Acerbi'

In [38]:
# Find Similar Players
player_index = big5_def_90min.index[big5_def_90min['Player'] == target_player].tolist()[0]
similar_players_indices = similarity_matrix[player_index].argsort()[::-1][1:6]  # Get top 5 similar players

similar_players_data = big5_def_90min.iloc[similar_players_indices]

print("Top 5 similar players to", target_player, ":")
for i, (_, row) in enumerate(similar_players_data.iterrows(), 1):
    similarity_score = similarity_matrix[player_index, similar_players_indices[i - 1]]
    print(f"{i}. Name: {row['Player']}, Age: {row['Age']}, Similarity Score: {similarity_score}")

Top 5 similar players to Francesco Acerbi :
1. Name: Thiago Silva, Age: 38, Similarity Score: 0.9775921613328956
2. Name: Matteo Darmian, Age: 33, Similarity Score: 0.964261279560727
3. Name: Sergio Ramos, Age: 37, Similarity Score: 0.9615094493512925
4. Name: Antonio Raillo, Age: 31, Similarity Score: 0.9380227696172011
5. Name: Axel Witsel, Age: 34, Similarity Score: 0.9248593403538459


##### **Data Visualization**

_Recommender Tool w/ Streamlit -> `recommender_tool.py`_