In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset with the proper delimiter
df = pd.read_csv("C:/Users/yacin/Desktop/Projet DS/Complete_Merged_Data.csv", sep=";")

# Exclure la catégorie "Coach" des données
df = df[df['position'] != 'Coach']

# Supprimer les colonnes dont le nom est 'Unnamed'
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Display shape and sample
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (15323, 53)


Unnamed: 0,full_name,age,league,position,Current Club,minutes_played_overall,nationality,appearances_overall,goals_overall,assists_overall,...,dribbles_successful_per90_percentile_overall,blocks_total_overall,ratings_total_overall,xg_per_90_overall,average_rating_percentile_overall,aerial_duels_won_total_overall,duels_per_game_overall,duels_won_percentage_overall,annual_salary_eur,annual_salary_eur_percentile
0,Aaron Cresswell,34,Premier League,Defender,West Ham United,431,England,11,0,0,...,9.0,2.0,59.98,0.0,23.0,6.0,1.89,58.82,3016000.0,52.0
1,Aaron Hickey,22,Premier League,Defender,Brentford,720,Scotland,9,0,0,...,9.0,2.0,7.32,0.0,93.0,0.0,6.0,83.33,1809600.0,29.0
2,Aaron Ramsdale,26,Premier League,Goalkeeper,Arsenal,540,England,6,0,0,...,9.0,0.0,39.22,0.0,17.0,0.0,0.17,100.0,7238400.0,83.0
3,Aaron Wan-Bissaka,26,Premier League,Defender,Manchester United,1782,England,22,0,2,...,64.0,14.0,147.98,0.01,71.0,21.0,6.62,62.59,5428800.0,73.0
4,Abdoulaye Doucouré,31,Premier League,Midfielder,Everton,2643,Mali,32,7,1,...,62.0,3.0,215.1,0.27,57.0,25.0,8.61,36.33,7841600.0,85.0


In [2]:
# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicates
df = df.drop_duplicates()


Number of duplicate rows: 720


In [3]:
# ❓ Step 2: Handle Missing Values

# Drop columns with more than 40% missing data
threshold = 0.4 * len(df)
df = df.dropna(thresh=threshold, axis=1)

# Fill missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype == 'object':
            # Fill with most frequent value
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            # Convert to numeric and fill with median
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(df[col].median())

# 🚨 Step 3: Remove Outliers (1st - 99th percentile)
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    upper_limit = df[col].quantile(0.99)
    lower_limit = df[col].quantile(0.01)
    df = df[(df[col] <= upper_limit) & (df[col] >= lower_limit)]

# ✅ Final shape after cleaning
print("Cleaned dataset shape:", df.shape)

# 👀 Preview cleaned data
df.head()

Cleaned dataset shape: (9714, 53)


Unnamed: 0,full_name,age,league,position,Current Club,minutes_played_overall,nationality,appearances_overall,goals_overall,assists_overall,...,dribbles_successful_per90_percentile_overall,blocks_total_overall,ratings_total_overall,xg_per_90_overall,average_rating_percentile_overall,aerial_duels_won_total_overall,duels_per_game_overall,duels_won_percentage_overall,annual_salary_eur,annual_salary_eur_percentile
0,Aaron Cresswell,34,Premier League,Defender,West Ham United,431,England,11,0,0,...,9.0,2.0,59.98,0.0,23.0,6.0,1.89,58.82,3016000.0,52.0
5,Adam Davies,32,Premier League,Goalkeeper,Sheffield United,0,Wales,0,0,0,...,21.0,0.0,66.795,0.03,20.0,4.0,4.035,43.82,0.0,52.0
7,Adam Smith,33,Premier League,Defender,AFC Bournemouth,2158,England,28,0,2,...,21.0,11.0,188.12,0.0,32.0,19.0,5.89,49.09,0.0,5.0
8,Adam Webster,29,Premier League,Defender,Brighton & Hove Albion,1144,England,15,0,0,...,9.0,8.0,97.43,0.02,59.0,31.0,6.07,56.47,3317600.0,56.0
9,Adam Wharton,20,Premier League,Defender,Crystal Palace,1305,England,16,0,3,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [64]:
def recommend_dynamic_features_by_position(player_name, position_target, df, top_n=10):
    """
    Recommande des joueurs similaires à player_name,
    et inclut le joueur cible même avec peu de données.
    """

    # 1. Features manuels
    position_features = {
        'goalkeeper': [
            'saves_per_game_overall', 'save_percentage_overall', 'clean_sheets_overall',
            'shots_faced_total_overall', 'conceded_overall', 'passes_completed_total_overall'
        ],
        'defender': [
            'interceptions_total_overall', 'tackles_total_overall',
            'aerial_duels_won_total_overall', 'blocks_total_overall',
            'duels_per_game_overall', 'pass_completion_rate_overall'
        ],
        'midfielder': [
            'assists_overall', 'key_passes_total_overall', 'passes_completed_total_overall',
            'dribbles_per_game_overall', 'tackles_total_overall',
            'interceptions_total_overall', 'average_rating_percentile_overall'
        ],
        'forward': [
            'goals_overall', 'xg_total_overall', 'shots_on_target_per_game_overall',
            'dribbles_per_game_overall', 'key_passes_total_overall', 'goals_involved_per_90_overall'
        ]
    }

    selected_features = position_features.get(position_target.lower())
    if not selected_features:
        return f"No features defined for position: {position_target}"

    print(f"Selected features for position '{position_target}': {selected_features}")

            # 2. Préparer df
    df_pos = df.copy()
    df_pos['position'] = df_pos['position'].astype(str).str.lower()
    df_pos['full_name'] = df_pos['full_name'].astype(str).str.strip()
    
    # 🔥 Puis ICI tu fais l'imputation des salaires
    for pos in df_pos['position'].unique():
        pos_filter = df_pos['position'] == pos
        valid_salaries = df_pos.loc[pos_filter & (df_pos['annual_salary_eur'] > 200000), 'annual_salary_eur']
        
        if not valid_salaries.empty:
            median_salary = valid_salaries.median()
            df_pos.loc[pos_filter & (df_pos['annual_salary_eur'] == 0), 'annual_salary_eur'] = median_salary




    # 3. Chercher le joueur dans tout df
    player_name_clean = player_name.strip().lower()
    player_row = df_pos[df_pos['full_name'].str.lower() == player_name_clean]

    if player_row.empty:
        return f"Player '{player_name}' not found at all."

    # 4. Vérifier si sa position correspond
    player_position = player_row.iloc[0]['position']
    if position_target.lower() not in player_position:
        return f"Player '{player_name}' exists but not in position '{position_target}'. Found position: {player_position}"

    # 5. Maintenant filtrer tous les joueurs de même position
    df_filtered = df_pos[df_pos['position'].str.contains(position_target.lower())]

    df_filtered_others = df_filtered[df_filtered['full_name'].str.lower() != player_name_clean]

    # Nettoyer
    for col in selected_features:
        df_filtered_others[col] = pd.to_numeric(df_filtered_others[col], errors='coerce')
    df_filtered_others = df_filtered_others.dropna(subset=selected_features)

    # Remettre le joueur
    df_final = pd.concat([player_row, df_filtered_others])

    # Standardisation
    scaler = StandardScaler()
    X_final = scaler.fit_transform(df_final[selected_features])

    # Similarité
    player_idx = df_final[df_final['full_name'].str.lower() == player_name_clean].index[0]
    row_position = df_final.index.get_loc(player_idx)
    target_vector = X_final[row_position].reshape(1, -1)

    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(target_vector, X_final).flatten()
    similar_indices = similarities.argsort()[::-1][:top_n+1]

    # Colonnes importantes
    general_columns = [
        'full_name', 'age', 'league', 'position', 'Current Club',
        'minutes_played_overall', 'nationality', 'yellow_cards_overall', 'red_cards_overall','annual_salary_eur'
    ]

    final_columns = general_columns + selected_features
    similar_players = df_final.iloc[similar_indices]

    return similar_players[final_columns]


In [65]:
recommend_dynamic_features_by_position("Alexy Bosetti", "forward", df, top_n=5)


Selected features for position 'forward': ['goals_overall', 'xg_total_overall', 'shots_on_target_per_game_overall', 'dribbles_per_game_overall', 'key_passes_total_overall', 'goals_involved_per_90_overall']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_others[col] = pd.to_numeric(df_filtered_others[col], errors='coerce')


Unnamed: 0,full_name,age,league,position,Current Club,minutes_played_overall,nationality,yellow_cards_overall,red_cards_overall,annual_salary_eur,goals_overall,xg_total_overall,shots_on_target_per_game_overall,dribbles_per_game_overall,key_passes_total_overall,goals_involved_per_90_overall
12305,Alexy Bosetti,31,Ligue 2,forward,Annecy,184,France,0,0,800000.0,1,0.86,0.33,0.33,3.0,0.49
7607,Brenner,25,Serie A,forward,Udinese,386,Brazil,0,0,949567.0,1,1.15,0.44,0.44,5.0,0.7
12672,Mayron Antonio George Clayton,31,Ligue 2,forward,Pau,221,Costa Rica,0,0,800000.0,1,0.8,0.25,0.5,2.0,0.41
7324,Michael Gregoritsch,30,Bundesliga,forward,Freiburg,417,Austria,2,0,1140000.0,1,1.27,0.29,0.36,3.0,0.43
7133,Haris Tabakovic,30,Bundesliga,forward,Hoffenheim,467,Switzerland,0,0,1140000.0,2,1.77,0.62,0.54,4.0,0.58
7321,Mërgim Berisha,26,Bundesliga,forward,Hoffenheim,329,Germany,1,0,3500000.0,1,1.13,0.27,0.64,3.0,0.55


In [55]:
recommend_dynamic_features_by_position("Adam Webster", "defender", df, top_n=5)

Selected features for position 'defender': ['interceptions_total_overall', 'tackles_total_overall', 'aerial_duels_won_total_overall', 'blocks_total_overall', 'duels_per_game_overall', 'pass_completion_rate_overall']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_others[col] = pd.to_numeric(df_filtered_others[col], errors='coerce')


Unnamed: 0,full_name,age,league,position,Current Club,minutes_played_overall,nationality,yellow_cards_overall,red_cards_overall,annual_salary_eur,interceptions_total_overall,tackles_total_overall,aerial_duels_won_total_overall,blocks_total_overall,duels_per_game_overall,pass_completion_rate_overall
8,Adam Webster,29,Premier League,defender,Brighton & Hove Albion,1144,England,2,0,3317600.0,15.0,15.0,31.0,8.0,6.07,90.81
6994,Anthony Rouault,23,Bundesliga,defender,Stuttgart,1215,France,1,0,120000.0,17.0,15.0,34.0,9.0,6.0,90.43
11151,Edoardo Goldaniga,31,Serie B,defender,Como,1440,Italy,4,0,0.0,15.0,16.0,33.0,9.0,6.38,84.76
11220,Filippo Scaglia,32,Serie B,defender,Südtirol,1344,Italy,3,0,0.0,17.0,17.0,37.0,9.0,7.07,86.03
10458,Christopher Wooh,23,Ligue 1,defender,Rennes,1387,Cameroon,4,1,240000.0,19.0,18.0,36.0,9.0,6.61,85.9
5253,Taariq Fielies,32,Premier Soccer League,defender,AmaZulu,1620,South Africa,3,0,0.0,18.0,12.0,33.0,10.0,6.82,90.24
