**NOTEBOOK FOR ANALYSIS**

In [68]:
import pandas as pd
import warnings
import rich

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Read the players dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='2023-2024/best5leagues_datasetplayers.csv')

In [69]:
# List the column labels of the loaded dataset.
df.columns

Index(['Player', 'Nation', 'Squad', 'Age', 'Born', '90s', 'SCA SCA',
       'SCA SCA90', 'SCA Types PassLive', 'SCA Types PassDead', 'SCA Types TO',
       'SCA Types Sh', 'SCA Types Fld', 'SCA Types Def', 'GCA GCA',
       'GCA GCA90', 'GCA Types PassLive', 'GCA Types PassDead', 'GCA Types TO',
       'GCA Types Sh', 'GCA Types Fld', 'GCA Types Def', 'Position_2',
       'Position', 'League'],
      dtype='object')

In [70]:
# Keep only players whose '90s' value is at least `matches`.
# NOTE(review): '90s' is presumably minutes played / 90 (full-match
# equivalents), not a count of appearances — confirm against the data source.

# Non-numeric entries become NaN and therefore fail the >= comparison below.
df['90s'] = pd.to_numeric(df['90s'], errors='coerce')
matches = 12.0
# .copy() makes filtered_df an independent frame, so the column assignments
# done on it later are not writes to a slice of df (SettingWithCopyWarning).
filtered_df = df[df['90s'] >= matches].copy()

In [71]:
# Statistics to rank players by
stats_columns = ['SCA SCA90', 'GCA GCA90', 'SCA Types PassLive', 'GCA Types PassLive', 'GCA Types Def']

# Coerce every ranked column to numeric; unparseable entries become NaN.
for col in stats_columns:
    filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')

# Map each statistic to the five rows with the largest value for it,
# keeping only the player name and that statistic.
top_players = {
    stat: filtered_df.nlargest(5, stat)[['Player', stat]]
    for stat in stats_columns
}


In [72]:
# Print a heading and the ranking table for each statistic, followed by
# blank lines as a visual separator.
for stat in top_players:
    rich.print(f"Top 5 players for {stat}:")
    rich.print(top_players[stat])
    print("\n")





















**Aggregated Analysis for the Top 5 Leagues**

In [73]:
# Load the combined player statistics and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="2023-2024/combined_players_stats.csv")

df.head(n=5)

Unnamed: 0,Player,Nation,Pos,Age,Playing Time MP,Playing Time Starts,Playing Time Min,Playing Time 90s,Performance Gls,Performance Ast,...,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Matches,League,Team,MP
0,William Saliba,fr FRA,DF,22.0,38.0,38,3420.0,38.0,2.0,1.0,...,0.08,0.04,0.01,0.05,0.04,0.05,Matches,EPL,Arsenal,
1,Declan Rice,eng ENG,MF,24.0,38.0,37,3225.0,35.8,7.0,8.0,...,0.42,0.09,0.15,0.24,0.09,0.24,Matches,EPL,Arsenal,
2,Martin Ødegaard,no NOR,MF,24.0,35.0,35,3091.0,34.3,8.0,10.0,...,0.47,0.22,0.28,0.5,0.17,0.45,Matches,EPL,Arsenal,
3,Ben White,eng ENG,DF,25.0,37.0,35,2988.0,33.2,4.0,4.0,...,0.24,0.03,0.11,0.15,0.03,0.15,Matches,EPL,Arsenal,
4,Bukayo Saka,eng ENG,FW,21.0,35.0,35,2919.0,32.4,16.0,9.0,...,0.59,0.48,0.32,0.8,0.33,0.65,Matches,EPL,Arsenal,


In [74]:
# List the column labels of the combined statistics frame.
df.columns

Index(['Player', 'Nation', 'Pos', 'Age', 'Playing Time MP',
       'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s',
       'Performance Gls', 'Performance Ast', 'Performance G+A',
       'Performance G-PK', 'Performance PK', 'Performance PKatt',
       'Performance CrdY', 'Performance CrdR', 'Expected xG', 'Expected npxG',
       'Expected xAG', 'Expected npxG+xAG', 'Progression PrgC',
       'Progression PrgP', 'Progression PrgR', 'Per 90 Minutes Gls',
       'Per 90 Minutes Ast', 'Per 90 Minutes G+A', 'Per 90 Minutes G-PK',
       'Per 90 Minutes G+A-PK', 'Per 90 Minutes xG', 'Per 90 Minutes xAG',
       'Per 90 Minutes xG+xAG', 'Per 90 Minutes npxG',
       'Per 90 Minutes npxG+xAG', 'Matches', 'League', 'Team', 'MP'],
      dtype='object')

In [75]:
# Keep only players with at least `matches` in 'Playing Time 90s'.
# Non-numeric entries become NaN and therefore fail the >= comparison below.
df['Playing Time 90s'] = pd.to_numeric(df['Playing Time 90s'], errors='coerce')
matches = 12.0
# .copy() detaches the filtered rows from the frame they were selected from,
# so the column assignments done on df later are not writes to a slice
# (SettingWithCopyWarning).
df = df[df['Playing Time 90s'] >= matches].copy()

In [76]:
# Statistics to rank players by
stats_columns = ['Performance Gls', 'Expected xG', 'Per 90 Minutes Gls', 'Per 90 Minutes npxG+xAG', 'Progression PrgP']

# Coerce every ranked column to numeric; unparseable entries become NaN.
for col in stats_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Map each statistic to the five rows with the largest value for it,
# keeping only the player name and that statistic.
top_players = {
    stat: df.nlargest(5, stat)[['Player', stat]]
    for stat in stats_columns
}

In [77]:
# Print a heading and the ranking table for each statistic, followed by
# blank lines as a visual separator.
for stat in top_players:
    rich.print(f"Top 5 players for {stat}:")
    rich.print(top_players[stat])
    print("\n")





















In [78]:
def _load_qualified_players(csv_path: str, min_90s: float) -> pd.DataFrame:
    """Read a player-stats CSV and keep rows with 'Playing Time 90s' >= min_90s."""
    players = pd.read_csv(csv_path)
    # Non-numeric entries become NaN and therefore fail the >= comparison.
    players['Playing Time 90s'] = pd.to_numeric(players['Playing Time 90s'], errors='coerce')
    # .copy() returns an independent frame so later column assignments are
    # not writes to a slice of `players`.
    return players[players['Playing Time 90s'] >= min_90s].copy()


def _top_n_per_stat(players: pd.DataFrame, variables: list, n: int) -> dict:
    """Map each column in `variables` to the n rows with its largest values."""
    # Coerce all requested columns first so a missing column raises before
    # any ranking is computed.
    for col in variables:
        players[col] = pd.to_numeric(players[col], errors='coerce')
    return {stat: players.nlargest(n, stat)[['Player', stat]] for stat in variables}


def analize(path_file: str, variables: list, matches_played: float, top_n_players: int,
            reference_path: str = "2023-2024/combined_players_stats.csv") -> tuple:
    """
    Rank the top players of one league and of the Europe-wide reference dataset.

    Both CSV files are filtered to players whose 'Playing Time 90s' is at least
    `matches_played`; each column in `variables` is then coerced to numeric and
    the `top_n_players` rows with the largest values are selected.

    Parameters:
    - path_file (str): Path to the CSV file containing player statistics for the specific league.
    - variables (list): Column names to rank by (e.g., ['Performance Gls', 'Expected xG']).
      Every name must be a column of both CSV files.
    - matches_played (float): Minimum value of 'Playing Time 90s' (90-minute equivalents) for a
      player to be considered.
    - top_n_players (int): Number of top players to return for each variable, for both the
      league and the Europe-wide ranking.
    - reference_path (str): Path to the CSV file used for the Europe-wide ranking. Defaults to
      the combined statistics file.

    Returns:
    - tuple: A tuple containing:
        - str: A title for the best players of the league.
        - dict: Maps each variable to a DataFrame with columns ['Player', variable] holding the
                top players of the league.
        - str: A title for the best players in Europe.
        - dict: Maps each variable to a DataFrame with columns ['Player', variable] holding the
                top players of the reference dataset.

    Raises:
    - KeyError: If 'Playing Time 90s', 'Player' or one of `variables` is not a column of a file.
    """
    league_players = _load_qualified_players(path_file, matches_played)
    europe_players = _load_qualified_players(reference_path, matches_played)

    # Both rankings honour top_n_players (the league ranking used to be fixed at 5).
    top_players = _top_n_per_stat(league_players, variables, top_n_players)
    top_players_5_leagues = _top_n_per_stat(europe_players, variables, top_n_players)

    return "Best players of the league", top_players, "Best players in Europe", top_players_5_leagues

In [79]:
# Columns to rank players by
stats_columns = [
    'Performance Gls',
    'Expected xG',
    'Per 90 Minutes Gls',
    'Per 90 Minutes npxG+xAG',
    'Progression PrgP',
]

# Print the league ranking next to the Europe-wide ranking.
rich.print(
    analize(
        path_file="2023-2024/Eredivise/combined_players_stats_Eredivise.csv",
        variables=stats_columns,
        matches_played=12,
        top_n_players=5,
    )
)