In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
df= pd.read_parquet('../data/players.parquet')
df.head()

Unnamed: 0,Joueur,Équipe,Équipe dans la période sélectionnée,Place,Âge,Valeur marchande,Minutes jouées,Buts,xG,Passes décisives,...,Passes pénétrantes par 90,"Passes en profondeur précises, %",Passes progressives par 90,"Passes progressives précises, %",League,id,90s,Duels aériens gagnés par 90,Dribbles réussis par 90,xG/Tir
0,P. Onuachu,Southampton,Genk,CF,28,17000000,1263,16,11.71,0,...,0.29,0.0,1.07,46.67,Jupiler Pro League,0.0,14.03,2.99376,1.142108,0.29275
1,A. Skov Olsen,Club Brugge,Club Brugge,"RWB, RAMF, RW",23,17000000,1681,7,5.79,7,...,0.64,50.0,7.01,87.79,Jupiler Pro League,0.0,18.68,0.215,4.38864,0.103393
2,Sergio Gómez,Manchester City,Anderlecht,LB,22,15000000,14,0,0.0,0,...,0.0,0.0,0.0,0.0,Jupiler Pro League,0.0,0.16,0.0,0.0,0.0
3,N. Lang,Club Brugge,Club Brugge,"LAMF, CF, LWF",23,15000000,2456,9,9.13,6,...,2.31,39.68,6.71,86.34,Jupiler Pro League,0.0,27.29,0.255884,3.663378,0.169074
4,Fábio Silva,PSV,Anderlecht,CF,20,13000000,1632,7,7.66,1,...,0.94,29.41,1.93,80.0,Jupiler Pro League,0.0,18.13,1.048662,1.819298,0.170222


## *Players metrics template*

In [6]:
templates = {
    "Striker": [
        '90s',
        'Joueur',
        'Équipe',
        'Place',
        'Âge',
        'xG par 90', 
        'Tirs par 90',
        'Touches de balle dans la surface de réparation sur 90',
        'Actions défensives réussies par 90',
        'Duels aériens gagnés par 90',
        'Dribbles réussis par 90',
        'xG/Tir',
        'Passes réceptionnées par 90'
        ],
    
    "Winger/Attacking Midfielder": [
        'xG par 90',
        'xA par 90',
        'Tirs par 90', 
        'Touches de balle dans la surface de réparation sur 90',
        'Сentres précises, %', 
        'Fautes subies par 90',
        'Dribbles réussis par 90', 
        'Tirs par 90',
        'Passes vers la surface de réparation précises, %', 
        'Interceptions PAdj'
        ],
    
    "Midfielder": [
        '90s',
        'Joueur',
        'Équipe',
        'Place',
        'Âge',
        'Passes précises, %',
        'Passes progressives par 90', 
        'xA par 90', 
        'Dribbles réussis par 90',
        'Fautes subies par 90', 
        'Interceptions PAdj', 
        'Courses progressives par 90'
        ],
    
    "Defender": [
        'Passes réceptionnées par 90',
        'Passes en avant précises, %',
        'Interceptions PAdj',
        'Tacles glissés PAdj',
        'Fautes par 90', 
        'Duels aériens par 90', 
        'Duels aériens gagnés, %'
        ],
    
    "Full Wing Back": [
        'Passes réceptionnées par 90',
        'Passes en avant précises, %',
        'Interceptions PAdj',
        'Tacles glissés PAdj',
        'Fautes par 90',
        'Duels aériens par 90',
        'Duels aériens gagnés, %',
        'Сentres précises, %',
        'xA par 90']
}

columns_to_keep = [col for template in templates.values() for col in template]

# Filter the DataFrame to keep only the specified columns
filtered_df = df[columns_to_keep]

filtered_df

Unnamed: 0,90s,Joueur,Équipe,Place,Âge,xG par 90,Tirs par 90,Touches de balle dans la surface de réparation sur 90,Actions défensives réussies par 90,Duels aériens gagnés par 90,...,"Duels aériens gagnés, %",Passes réceptionnées par 90,"Passes en avant précises, %",Interceptions PAdj,Tacles glissés PAdj,Fautes par 90,Duels aériens par 90,"Duels aériens gagnés, %.1","Сentres précises, %",xA par 90
0,14.03,P. Onuachu,Southampton,CF,28,0.83,2.85,4.42,3.28,2.993760,...,41.58,13.61,57.89,2.38,0.10,2.21,7.20,41.58,0.00,0.05
1,18.68,A. Skov Olsen,Club Brugge,"RWB, RAMF, RW",23,0.31,3.00,4.12,4.18,0.215000,...,25.00,27.52,65.52,3.45,0.08,0.64,0.86,25.00,42.68,0.26
2,0.16,Sergio Gómez,Manchester City,LB,22,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.00,0.00,22.50,0.00,0.00,0.00,0.00,0.00,0.00
3,27.29,N. Lang,Club Brugge,"LAMF, CF, LWF",23,0.33,1.98,3.88,3.44,0.255884,...,24.14,29.24,68.86,2.28,0.05,0.84,1.06,24.14,30.77,0.37
4,18.13,Fábio Silva,PSV,CF,20,0.42,2.48,5.68,2.81,1.048662,...,25.33,12.19,47.37,1.07,0.00,1.49,4.14,25.33,9.09,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5319,0.11,V. Stange,Hertha BSC U19,LCMF,19,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.00,0.00,12.00,0.00,0.00,0.00,0.00,0.00,0.00
5320,0.26,D. Gebuhr,Eintracht Frankfurt II,RCB,20,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
5321,8.61,N. N'Goumou,Borussia M'gladbach,"RAMF, CF, LAMF",23,0.09,1.39,2.67,5.23,0.696865,...,46.15,15.56,72.34,3.14,0.00,0.81,1.51,46.15,28.57,0.09
5322,0.59,S. Coulibaly,Borussia Dortmund II,LCB,19,0.00,0.00,0.00,3.40,1.700000,...,100.00,66.23,92.86,3.33,0.00,0.00,1.70,100.00,0.00,0.00


In [4]:
def scale_data(df, template):
    """
    Scale the data in the DataFrame based on the specified template using StandardScaler.

    Parameters:
        df (DataFrame): The input DataFrame.
        template (list): List of column names to be included in the scaled DataFrame.

    Returns:
        DataFrame: Scaled DataFrame based on the specified template.
    """
    # Columns that should not be scaled
    no_to_scale = ['90s', 'Joueur', 'Équipe', 'Place', 'Âge']

    # Select only the specified columns
    df = df[template]

    # Initialize the scaler
    scaler = StandardScaler()

    # Fit and transform the features, excluding non-scalable columns
    scaler.fit(df.drop(no_to_scale, axis=1))
    scaled_features = scaler.transform(df.drop(no_to_scale, axis=1))

    # Create a new DataFrame with scaled features and original column names
    scaled_feat_df = pd.DataFrame(scaled_features, columns=df.columns[5:])

    # Concatenate unscaled columns and scaled features, and return the result
    return pd.concat([df[no_to_scale], scaled_feat_df], axis=1)


In [5]:
# Define selected features for clustering (you can adjust this list)
selected_features = [
    'xG par 90', 'Tirs par 90', 'Touches de balle dans la surface de réparation sur 90',
    'Actions défensives réussies par 90', 'Duels aériens gagnés par 90',
    'Dribbles réussis par 90', 'xG/Tir', 'Passes réceptionnées par 90'
]

# Define weights for each metric (adjust these weights as needed)
metric_weights = {
    'xG par 90': 0.5,
    'Tirs par 90': 0.3,
    'Touches de balle dans la surface de réparation sur 90': 0.2,
    'Actions défensives réussies par 90': 0.4,
    'Duels aériens gagnés par 90': 0.3,
    'Dribbles réussis par 90': 0.2,
    'xG/Tir': 0.5,
    'Passes réceptionnées par 90': 0.4
}

# Scale the data
X = scale_data(df, templates["Striker"])[selected_features]

# Apply weights to the data
for metric, weight in metric_weights.items():
    X[metric] *= weight

# Calculate pairwise cosine similarities (or use an appropriate similarity metric)
cosine_similarities = 1 - pairwise_distances(X, metric='cosine')

# Define a function to find similar players based on cosine similarity (or an appropriate metric)
def find_similar_players(player_name, df, cosine_similarities):
    """
    Find similar players based on cosine similarity.

    Parameters:
        player_name (str): The name of the player to find similarities for.
        df (DataFrame): The input DataFrame containing player data.
        cosine_similarities (array): Pairwise cosine similarities between players.

    Returns:
        DataFrame: DataFrame containing similar players and their similarity percentages.
    """
    # Find the index of the specified player in the DataFrame
    player_index = df[df['Joueur'] == player_name].index[0]

    # Get similarity scores for the specified player and convert them to percentages
    similarities = cosine_similarities[player_index] * 100

    # Create a copy of the original DataFrame to avoid modifying the original data
    similar_players = df.copy()

    # Add a new column for similarity percentage to the DataFrame
    similar_players['Similarity Percentage'] = similarities

    # Sort the DataFrame by similarity percentage in descending order
    similar_players = round(similar_players.sort_values(by='Similarity Percentage', ascending=False), 1)

    # Return the DataFrame of similar players
    return similar_players

# Example usage:
player_name = 'S. Gnabry'  # Replace with the player name you want to find similar players for
similar_players = find_similar_players(player_name, df, cosine_similarities)
similar_players[['Joueur', 'Équipe', 'League', 'Place', 'Âge', '90s', 'Similarity Percentage']].reset_index(drop=True)[:10]


Unnamed: 0,Joueur,Équipe,League,Place,Âge,90s,Similarity Percentage
0,S. Gnabry,Bayern München,German Bundesliga,"CF, RAMF, LAMF",27,23.0,100.0
1,Ansu Fati,Barcelona,La Liga,"LWF, CF, LAMF",20,16.8,99.4
2,K. Mbappé,PSG,Ligue 1,CF,24,33.4,98.8
3,D. Berardi,Sassuolo,Italian Serie A,"RWF, RW, RAMF",28,22.3,98.6
4,M. Vlap,Twente,Eredevise,"AMF, RCMF, LCMF",26,29.6,98.5
5,N. Pépé,Nice,Ligue 1,"CF, RW, RWF",27,18.0,98.0
6,M. Tel,Bayern München,German Bundesliga,"CF, RAMF",18,5.4,97.7
7,S. Szymański,Fenerbahçe,Eredevise,"AMF, LCMF",24,23.5,97.4
8,P. Dybala,Roma,Italian Serie A,"CF, AMF, RWF",29,20.9,97.2
9,J. Ito,Reims,Jupiler Pro League,"RAMF, RW",30,1.1,97.1
