This script aggregates statistical data for players in the Big 5 Leagues across several stat-type files. It addresses the problem that a player who played for multiple teams can appear in more than one row. For each stat type, the script loads the player file, selects the important variables, aggregates them per player using the player's URL as the key, saves the result as a CSV file, and prints a confirmation message for each file.

In [113]:
from pathlib import Path

import numpy as np
import pandas as pd

STANDARD STATS

In [114]:
# Read the standard stats file and preview its first two rows
standard_stats = pd.read_csv(filepath_or_buffer="datasets/6_big5_player_standard_stats.csv")
standard_stats.head(n=2)


Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,MP_Playing,Starts_Playing,...,Ast_Per,G+A_Per,G_minus_PK_Per,G+A_minus_PK_Per,xG_Per,xAG_Per,xG+xAG_Per,npxG_Per,npxG+xAG_Per,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,26,14,...,0.0,0.0,0.0,0.0,0.03,0.04,0.08,0.03,0.08,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,24,19,...,0.0,0.05,0.05,0.05,0.05,0.0,0.05,0.05,0.05,https://fbref.com/en/players/a94d93be/Cedric-A...


In [115]:
# Summarise dtypes and non-null counts to see which columns have missing values
standard_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Season_End_Year      2889 non-null   int64  
 1   Squad                2889 non-null   object 
 2   Comp                 2889 non-null   object 
 3   Player               2889 non-null   object 
 4   Nation               2888 non-null   object 
 5   Pos                  2889 non-null   object 
 6   Age                  2889 non-null   int64  
 7   Born                 2889 non-null   int64  
 8   MP_Playing           2889 non-null   int64  
 9   Starts_Playing       2889 non-null   int64  
 10  Min_Playing          2889 non-null   int64  
 11  Mins_Per_90_Playing  2889 non-null   float64
 12  Gls                  2889 non-null   int64  
 13  Ast                  2889 non-null   int64  
 14  G+A                  2889 non-null   int64  
 15  G_minus_PK           2889 non-null   i

In [116]:
# List every column name so the ones to keep can be picked in the next cell
standard_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'MP_Playing', 'Starts_Playing', 'Min_Playing',
       'Mins_Per_90_Playing', 'Gls', 'Ast', 'G+A', 'G_minus_PK', 'PK', 'PKatt',
       'CrdY', 'CrdR', 'xG_Expected', 'npxG_Expected', 'xAG_Expected',
       'npxG+xAG_Expected', 'PrgC_Progression', 'PrgP_Progression',
       'PrgR_Progression', 'Gls_Per', 'Ast_Per', 'G+A_Per', 'G_minus_PK_Per',
       'G+A_minus_PK_Per', 'xG_Per', 'xAG_Per', 'xG+xAG_Per', 'npxG_Per',
       'npxG+xAG_Per', 'Url'],
      dtype='object')

In [117]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
standard_col_selection = [
    "Url",
    "MP_Playing",
    "Starts_Playing",
    "Min_Playing",
    "Gls",
    "Ast",
    "G+A",
    "G_minus_PK",
    "PK",
    "PKatt",
    "CrdY",
    "CrdR",
    "PrgC_Progression",
    "PrgP_Progression",
    "PrgR_Progression",
]

In [118]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_standard_stats = standard_stats.loc[:, standard_col_selection]
new_standard_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Url               2889 non-null   object 
 1   MP_Playing        2889 non-null   int64  
 2   Starts_Playing    2889 non-null   int64  
 3   Min_Playing       2889 non-null   int64  
 4   Gls               2889 non-null   int64  
 5   Ast               2889 non-null   int64  
 6   G+A               2889 non-null   int64  
 7   G_minus_PK        2889 non-null   int64  
 8   PK                2889 non-null   int64  
 9   PKatt             2889 non-null   int64  
 10  CrdY              2889 non-null   int64  
 11  CrdR              2889 non-null   int64  
 12  PrgC_Progression  2884 non-null   float64
 13  PrgP_Progression  2884 non-null   float64
 14  PrgR_Progression  2884 non-null   float64
dtypes: float64(3), int64(11), object(1)
memory usage: 338.7+ KB


In [119]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `standard_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_standard_stats = (
    standard_stats[standard_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
standard_output_path = aggregated_dir / "agg_standard_stats.csv"
new_standard_stats.to_csv(standard_output_path, index=False)
print(f"Saved {len(new_standard_stats)} players to {standard_output_path}")

SHOOTING STATS

In [120]:
# Read the shooting stats file and preview its first two rows
shooting_stats = pd.read_csv(filepath_or_buffer="datasets/7_big5_player_shooting_stats.csv")
shooting_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,Gls_Standard,...,Dist_Standard,FK_Standard,PK_Standard,PKatt_Standard,xG_Expected,npxG_Expected,npxG_per_Sh_Expected,G_minus_xG_Expected,np:G_minus_xG_Expected,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,0,...,16.3,0.0,0,0,0.5,0.5,0.05,-0.5,-0.5,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,1,...,4.1,0.0,0,0,1.0,1.0,0.41,0.0,0.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [121]:
# Summarise dtypes and non-null counts to see which columns have missing values
shooting_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Season_End_Year         2889 non-null   int64  
 1   Squad                   2889 non-null   object 
 2   Comp                    2889 non-null   object 
 3   Player                  2889 non-null   object 
 4   Nation                  2888 non-null   object 
 5   Pos                     2889 non-null   object 
 6   Age                     2889 non-null   int64  
 7   Born                    2889 non-null   int64  
 8   Mins_Per_90             2889 non-null   float64
 9   Gls_Standard            2889 non-null   int64  
 10  Sh_Standard             2889 non-null   int64  
 11  SoT_Standard            2889 non-null   int64  
 12  SoT_percent_Standard    2361 non-null   float64
 13  Sh_per_90_Standard      2889 non-null   float64
 14  SoT_per_90_Standard     2889 non-null   

In [122]:
# List every column name so the ones to keep can be picked in the next cell
shooting_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'Gls_Standard', 'Sh_Standard', 'SoT_Standard',
       'SoT_percent_Standard', 'Sh_per_90_Standard', 'SoT_per_90_Standard',
       'G_per_Sh_Standard', 'G_per_SoT_Standard', 'Dist_Standard',
       'FK_Standard', 'PK_Standard', 'PKatt_Standard', 'xG_Expected',
       'npxG_Expected', 'npxG_per_Sh_Expected', 'G_minus_xG_Expected',
       'np:G_minus_xG_Expected', 'Url'],
      dtype='object')

In [123]:
# "Url" is the key the rows are grouped by later; the other entries are the
# shooting columns that get aggregated. The order is the output column order.
shooting_col_selection = [
    "Url",
    "Sh_Standard",
    "SoT_Standard",
    "Dist_Standard",
    "FK_Standard",
]

In [124]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_shooting_stats = shooting_stats.loc[:, shooting_col_selection]
new_shooting_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Url            2889 non-null   object 
 1   Sh_Standard    2889 non-null   int64  
 2   SoT_Standard   2889 non-null   int64  
 3   Dist_Standard  2359 non-null   float64
 4   FK_Standard    2884 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 113.0+ KB


In [125]:
# Rebuild the working frame from `shooting_stats` on every run, so re-running
# this cell never feeds it its own already aggregated, upper-cased output.
shooting_subset = shooting_stats[shooting_col_selection]
shooting_url = shooting_subset["Url"]

# Define the aggregation methods for the columns that are plain counts.
# NOTE: "sum" skips missing values, so a count that is missing in every row of
# a player is written as 0 rather than NaN.
aggregation_methods_shooting = {
    "Sh_Standard": "sum",
    "SoT_Standard": "sum",
    "FK_Standard": "sum"
    # Add more columns and corresponding aggregation methods as needed
}

# Group the data by the "Url" column and apply the aggregation methods
shooting_totals = shooting_subset.groupby("Url").agg(aggregation_methods_shooting)

# Dist_Standard is averaged rather than summed. A plain mean over a player's
# rows would let a row with 2 shots count as much as a row with 50, so each
# row is weighted by its shot count instead.
# NOTE(review): assumes Dist_Standard is an average per shot counted in
# Sh_Standard -- confirm against the data source's glossary.
shots = shooting_subset["Sh_Standard"]
avg_dist = shooting_subset["Dist_Standard"]
distance_total = (avg_dist * shots).groupby(shooting_url).sum()
# Only shots from rows that actually have a distance take part in the weight;
# a player with no such shots ends up with 0 / 0 = NaN, i.e. still "missing".
shots_with_dist = shots.where(avg_dist.notna()).groupby(shooting_url).sum()
shooting_totals["Dist_Standard"] = distance_total / shots_with_dist

# Restore the selected column order and upper-case the column names
new_shooting_stats = (
    shooting_totals
    .reset_index()
    .loc[:, shooting_col_selection]
    .rename(columns=str.upper)
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
shooting_output_path = aggregated_dir / "agg_shooting_stats.csv"
new_shooting_stats.to_csv(shooting_output_path, index=False)
print(f"Saved {len(new_shooting_stats)} players to {shooting_output_path}")

PASSING STATS

In [126]:
# Read the passing stats file and preview its first two rows
pass_stats = pd.read_csv(filepath_or_buffer="datasets/8_big5_player_passing_stats.csv")
pass_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,Cmp_Total,...,Ast,xAG,xA,A_minus_xAG,KP,Final_Third,PPA,CrsPA,PrgP,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,448.0,...,0,0.7,0.9,-0.7,11.0,43.0,10.0,5.0,51.0,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,607.0,...,0,0.0,0.1,0.0,0.0,13.0,0.0,0.0,31.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [127]:
# Summarise dtypes and non-null counts to see which columns have missing values
pass_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season_End_Year     2889 non-null   int64  
 1   Squad               2889 non-null   object 
 2   Comp                2889 non-null   object 
 3   Player              2889 non-null   object 
 4   Nation              2888 non-null   object 
 5   Pos                 2889 non-null   object 
 6   Age                 2889 non-null   int64  
 7   Born                2889 non-null   int64  
 8   Mins_Per_90         2889 non-null   float64
 9   Cmp_Total           2884 non-null   float64
 10  Att_Total           2884 non-null   float64
 11  Cmp_percent_Total   2860 non-null   float64
 12  TotDist_Total       2884 non-null   float64
 13  PrgDist_Total       2884 non-null   float64
 14  Cmp_Short           2884 non-null   float64
 15  Att_Short           2884 non-null   float64
 16  Cmp_pe

In [128]:
# List every column name so the ones to keep can be picked in the next cell
pass_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'Cmp_Total', 'Att_Total', 'Cmp_percent_Total',
       'TotDist_Total', 'PrgDist_Total', 'Cmp_Short', 'Att_Short',
       'Cmp_percent_Short', 'Cmp_Medium', 'Att_Medium', 'Cmp_percent_Medium',
       'Cmp_Long', 'Att_Long', 'Cmp_percent_Long', 'Ast', 'xAG', 'xA',
       'A_minus_xAG', 'KP', 'Final_Third', 'PPA', 'CrsPA', 'PrgP', 'Url'],
      dtype='object')

In [129]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
pass_col_selection = [
    "Url",
    "Cmp_Total",
    "Att_Total",
    "TotDist_Total",
    "PrgDist_Total",
    "KP",
    "Final_Third",
    "PPA",
    "CrsPA",
]

In [130]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_pass_stats = pass_stats.loc[:, pass_col_selection]
new_pass_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Url            2889 non-null   object 
 1   Cmp_Total      2884 non-null   float64
 2   Att_Total      2884 non-null   float64
 3   TotDist_Total  2884 non-null   float64
 4   PrgDist_Total  2884 non-null   float64
 5   KP             2884 non-null   float64
 6   Final_Third    2884 non-null   float64
 7   PPA            2884 non-null   float64
 8   CrsPA          2884 non-null   float64
dtypes: float64(8), object(1)
memory usage: 203.3+ KB


In [131]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `pass_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_pass_stats = (
    pass_stats[pass_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
pass_output_path = aggregated_dir / "agg_pass_stats.csv"
new_pass_stats.to_csv(pass_output_path, index=False)
print(f"Saved {len(new_pass_stats)} players to {pass_output_path}")

GCA STATS

In [132]:
# Read the GCA stats file and preview its first two rows
gca_stats = pd.read_csv(filepath_or_buffer="datasets/10_big5_player_gca_stats.csv")
gca_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,SCA_SCA,...,Def_SCA,GCA_GCA,GCA90_GCA,PassLive_GCA,PassDead_GCA,TO_GCA,Sh_GCA,Fld_GCA,Def_GCA,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,19.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,3.0,...,0.0,1.0,0.05,0.0,0.0,0.0,1.0,0.0,0.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [133]:
# Summarise dtypes and non-null counts to see which columns have missing values
gca_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Season_End_Year  2889 non-null   int64  
 1   Squad            2889 non-null   object 
 2   Comp             2889 non-null   object 
 3   Player           2889 non-null   object 
 4   Nation           2888 non-null   object 
 5   Pos              2889 non-null   object 
 6   Age              2889 non-null   int64  
 7   Born             2889 non-null   int64  
 8   Mins_Per_90      2889 non-null   float64
 9   SCA_SCA          2884 non-null   float64
 10  SCA90_SCA        2884 non-null   float64
 11  PassLive_SCA     2884 non-null   float64
 12  PassDead_SCA     2884 non-null   float64
 13  TO_SCA           2884 non-null   float64
 14  Sh_SCA           2884 non-null   float64
 15  Fld_SCA          2884 non-null   float64
 16  Def_SCA          2884 non-null   float64
 17  GCA_GCA       

In [134]:
# List every column name so the ones to keep can be picked in the next cell
gca_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'SCA_SCA', 'SCA90_SCA', 'PassLive_SCA',
       'PassDead_SCA', 'TO_SCA', 'Sh_SCA', 'Fld_SCA', 'Def_SCA', 'GCA_GCA',
       'GCA90_GCA', 'PassLive_GCA', 'PassDead_GCA', 'TO_GCA', 'Sh_GCA',
       'Fld_GCA', 'Def_GCA', 'Url'],
      dtype='object')

In [135]:
# "Url" is the key the rows are grouped by later; the other two entries are
# the stats that are totalled per player.
gca_col_selection = [
    "Url",
    "SCA_SCA",
    "GCA_GCA",
]

In [136]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_gca_stats = gca_stats.loc[:, gca_col_selection]
new_gca_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Url      2889 non-null   object 
 1   SCA_SCA  2884 non-null   float64
 2   GCA_GCA  2884 non-null   float64
dtypes: float64(2), object(1)
memory usage: 67.8+ KB


In [137]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `gca_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_gca_stats = (
    gca_stats[gca_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
gca_output_path = aggregated_dir / "agg_gca_stats.csv"
new_gca_stats.to_csv(gca_output_path, index=False)
print(f"Saved {len(new_gca_stats)} players to {gca_output_path}")

DEFENSE STATS

In [138]:
# Read the defense stats file and preview its first two rows
defense_stats = pd.read_csv(filepath_or_buffer="datasets/11_big5_player_defense_stats.csv")
defense_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,Tkl_Tackles,...,Tkl_percent_Challenges,Lost_Challenges,Blocks_Blocks,Sh_Blocks,Pass_Blocks,Int,Tkl+Int,Clr,Err,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,35.0,...,60.6,13.0,17.0,5.0,12.0,35,70.0,44.0,1.0,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,13.0,...,66.7,4.0,20.0,13.0,7.0,15,28.0,91.0,1.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [139]:
# Summarise dtypes and non-null counts to see which columns have missing values
defense_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Season_End_Year         2889 non-null   int64  
 1   Squad                   2889 non-null   object 
 2   Comp                    2889 non-null   object 
 3   Player                  2889 non-null   object 
 4   Nation                  2888 non-null   object 
 5   Pos                     2889 non-null   object 
 6   Age                     2889 non-null   int64  
 7   Born                    2889 non-null   int64  
 8   Mins_Per_90             2889 non-null   float64
 9   Tkl_Tackles             2884 non-null   float64
 10  TklW_Tackles            2889 non-null   int64  
 11  Def 3rd_Tackles         2884 non-null   float64
 12  Mid 3rd_Tackles         2884 non-null   float64
 13  Att 3rd_Tackles         2884 non-null   float64
 14  Tkl_Challenges          2884 non-null   

In [140]:
# List every column name so the ones to keep can be picked in the next cell
defense_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'Tkl_Tackles', 'TklW_Tackles', 'Def 3rd_Tackles',
       'Mid 3rd_Tackles', 'Att 3rd_Tackles', 'Tkl_Challenges',
       'Att_Challenges', 'Tkl_percent_Challenges', 'Lost_Challenges',
       'Blocks_Blocks', 'Sh_Blocks', 'Pass_Blocks', 'Int', 'Tkl+Int', 'Clr',
       'Err', 'Url'],
      dtype='object')

In [141]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
defense_col_selection = [
    "Url",
    "Tkl_Tackles",
    "TklW_Tackles",
    "Def 3rd_Tackles",
    "Mid 3rd_Tackles",
    "Att 3rd_Tackles",
    "Tkl_Challenges",
    "Att_Challenges",
    "Lost_Challenges",
    "Blocks_Blocks",
    "Sh_Blocks",
    "Pass_Blocks",
    "Int",
    "Tkl+Int",
    "Clr",
    "Err",
]

In [142]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_defense_stats = defense_stats.loc[:, defense_col_selection]
new_defense_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Url              2889 non-null   object 
 1   Tkl_Tackles      2884 non-null   float64
 2   TklW_Tackles     2889 non-null   int64  
 3   Def 3rd_Tackles  2884 non-null   float64
 4   Mid 3rd_Tackles  2884 non-null   float64
 5   Att 3rd_Tackles  2884 non-null   float64
 6   Tkl_Challenges   2884 non-null   float64
 7   Att_Challenges   2884 non-null   float64
 8   Lost_Challenges  2884 non-null   float64
 9   Blocks_Blocks    2884 non-null   float64
 10  Sh_Blocks        2884 non-null   float64
 11  Pass_Blocks      2884 non-null   float64
 12  Int              2889 non-null   int64  
 13  Tkl+Int          2884 non-null   float64
 14  Clr              2884 non-null   float64
 15  Err              2884 non-null   float64
dtypes: float64(13), int64(2), object(1)
memory usage: 361.2+ KB


In [143]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `defense_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_defense_stats = (
    defense_stats[defense_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
defense_output_path = aggregated_dir / "agg_defense_stats.csv"
new_defense_stats.to_csv(defense_output_path, index=False)
print(f"Saved {len(new_defense_stats)} players to {defense_output_path}")

POSSESSION STATS

In [144]:
# Read the possession stats file and preview its first two rows
possession_stats = pd.read_csv(filepath_or_buffer="datasets/12_big5_player_possession_stats.csv")
possession_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,Touches_Touches,...,TotDist_Carries,PrgDist_Carries,PrgC_Carries,Final_Third_Carries,CPA_Carries,Mis_Carries,Dis_Carries,Rec_Receiving,PrgR_Receiving,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,879.0,...,2988.0,1528.0,30.0,25.0,0.0,15.0,14.0,403.0,34.0,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,876.0,...,3235.0,1495.0,3.0,1.0,0.0,3.0,0.0,508.0,1.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [145]:
# Summarise dtypes and non-null counts to see which columns have missing values
possession_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Season_End_Year      2889 non-null   int64  
 1   Squad                2889 non-null   object 
 2   Comp                 2889 non-null   object 
 3   Player               2889 non-null   object 
 4   Nation               2888 non-null   object 
 5   Pos                  2889 non-null   object 
 6   Age                  2889 non-null   int64  
 7   Born                 2889 non-null   int64  
 8   Mins_Per_90          2889 non-null   float64
 9   Touches_Touches      2884 non-null   float64
 10  Def Pen_Touches      2884 non-null   float64
 11  Def 3rd_Touches      2884 non-null   float64
 12  Mid 3rd_Touches      2884 non-null   float64
 13  Att 3rd_Touches      2884 non-null   float64
 14  Att Pen_Touches      2884 non-null   float64
 15  Live_Touches         2884 non-null   f

In [146]:
# List every column name so the ones to keep can be picked in the next cell
possession_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'Touches_Touches', 'Def Pen_Touches',
       'Def 3rd_Touches', 'Mid 3rd_Touches', 'Att 3rd_Touches',
       'Att Pen_Touches', 'Live_Touches', 'Att_Take', 'Succ_Take',
       'Succ_percent_Take', 'Tkld_Take', 'Tkld_percent_Take',
       'Carries_Carries', 'TotDist_Carries', 'PrgDist_Carries', 'PrgC_Carries',
       'Final_Third_Carries', 'CPA_Carries', 'Mis_Carries', 'Dis_Carries',
       'Rec_Receiving', 'PrgR_Receiving', 'Url'],
      dtype='object')

In [147]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
possession_col_selection = [
    "Url",
    "Touches_Touches",
    "Def Pen_Touches",
    "Def 3rd_Touches",
    "Mid 3rd_Touches",
    "Att 3rd_Touches",
    "Att Pen_Touches",
    "Live_Touches",
    "Att_Take",
    "Succ_Take",
    "Tkld_Take",
    "Carries_Carries",
    "TotDist_Carries",
    "PrgDist_Carries",
    "PrgC_Carries",
    "Final_Third_Carries",
    "CPA_Carries",
    "Mis_Carries",
    "Dis_Carries",
    "Rec_Receiving",
    "PrgR_Receiving",
]

In [148]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_possession_stats = possession_stats.loc[:, possession_col_selection]
new_possession_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Url                  2889 non-null   object 
 1   Touches_Touches      2884 non-null   float64
 2   Def Pen_Touches      2884 non-null   float64
 3   Def 3rd_Touches      2884 non-null   float64
 4   Mid 3rd_Touches      2884 non-null   float64
 5   Att 3rd_Touches      2884 non-null   float64
 6   Att Pen_Touches      2884 non-null   float64
 7   Live_Touches         2884 non-null   float64
 8   Att_Take             2884 non-null   float64
 9   Succ_Take            2884 non-null   float64
 10  Tkld_Take            2884 non-null   float64
 11  Carries_Carries      2884 non-null   float64
 12  TotDist_Carries      2884 non-null   float64
 13  PrgDist_Carries      2884 non-null   float64
 14  PrgC_Carries         2884 non-null   float64
 15  Final_Third_Carries  2884 non-null   f

In [149]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `possession_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_possession_stats = (
    possession_stats[possession_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
possession_output_path = aggregated_dir / "agg_possession_stats.csv"
new_possession_stats.to_csv(possession_output_path, index=False)
print(f"Saved {len(new_possession_stats)} players to {possession_output_path}")

PLAYING TIME STATS

In [150]:
# Read the playing time stats file and preview its first two rows
playing_time_stats = pd.read_csv(filepath_or_buffer="datasets/13_big5_player_playing_time_stats.csv")
playing_time_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,MP_Playing.Time,Min_Playing.Time,...,onGA_Team.Success,plus_per__minus__Team.Success,plus_per__minus_90_Team.Success,On_minus_Off_Team.Success,onxG_Team.Success..xG.,onxGA_Team.Success..xG,xGplus_per__minus__Team.Success..xG,xGplus_per__minus_90_Team.Success..xG,On_minus_Off_Team.Success..xG,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33.0,1989.0,26,1361.0,...,30.0,-23.0,-1.52,-0.3,12.0,20.8,-8.8,-0.58,-0.19,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35.0,1986.0,24,1794.0,...,40.0,-29.0,-1.45,-0.24,19.0,28.4,-9.4,-0.47,0.0,https://fbref.com/en/players/a94d93be/Cedric-A...


In [151]:
# Summarise dtypes and non-null counts to see which columns have missing values
playing_time_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3527 entries, 0 to 3526
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Season_End_Year                        3527 non-null   int64  
 1   Squad                                  3527 non-null   object 
 2   Comp                                   3527 non-null   object 
 3   Player                                 3527 non-null   object 
 4   Nation                                 3523 non-null   object 
 5   Pos                                    3527 non-null   object 
 6   Age                                    3526 non-null   float64
 7   Born                                   3526 non-null   float64
 8   MP_Playing.Time                        3527 non-null   int64  
 9   Min_Playing.Time                       2889 non-null   float64
 10  Mn_per_MP_Playing.Time                 2889 non-null   float64
 11  Min_

In [152]:
# List every column name so the ones to keep can be picked in the next cell
playing_time_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'MP_Playing.Time', 'Min_Playing.Time', 'Mn_per_MP_Playing.Time',
       'Min_percent_Playing.Time', 'Mins_Per_90_Playing.Time', 'Starts_Starts',
       'Mn_per_Start_Starts', 'Compl_Starts', 'Subs_Subs', 'Mn_per_Sub_Subs',
       'unSub_Subs', 'PPM_Team.Success', 'onG_Team.Success',
       'onGA_Team.Success', 'plus_per__minus__Team.Success',
       'plus_per__minus_90_Team.Success', 'On_minus_Off_Team.Success',
       'onxG_Team.Success..xG.', 'onxGA_Team.Success..xG',
       'xGplus_per__minus__Team.Success..xG',
       'xGplus_per__minus_90_Team.Success..xG',
       'On_minus_Off_Team.Success..xG', 'Url'],
      dtype='object')

In [153]:
# "Url" is the key the rows are grouped by later; the other entries are the
# playing time columns that get aggregated. The order is the output column order.
playing_time_col_selection = [
    "Url",
    "Compl_Starts",
    "Subs_Subs",
    "unSub_Subs",
    "PPM_Team.Success",
    "onG_Team.Success",
    "onGA_Team.Success",
    "plus_per__minus__Team.Success",
]

In [154]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_playing_time_stats = playing_time_stats.loc[:, playing_time_col_selection]
new_playing_time_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3527 entries, 0 to 3526
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Url                            3527 non-null   object 
 1   Compl_Starts                   3527 non-null   int64  
 2   Subs_Subs                      3527 non-null   int64  
 3   unSub_Subs                     3527 non-null   int64  
 4   PPM_Team.Success               2891 non-null   float64
 5   onG_Team.Success               2891 non-null   float64
 6   onGA_Team.Success              2891 non-null   float64
 7   plus_per__minus__Team.Success  2891 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 220.6+ KB


In [155]:
# Rebuild the working frame from `playing_time_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output.
playing_time_subset = playing_time_stats[playing_time_col_selection]
playing_time_url = playing_time_subset["Url"]

# Define the aggregation methods for the columns that are plain totals.
# NOTE: "sum" skips missing values, so a total that is missing in every row of
# a player is written as 0 rather than NaN.
aggregation_methods = {
    "Compl_Starts": "sum",
    "Subs_Subs": "sum",
    "unSub_Subs": "sum",
    "onG_Team.Success": "sum",
    "onGA_Team.Success": "sum",
    "plus_per__minus__Team.Success": "sum"
    # Add more columns and corresponding aggregation methods as needed
}

# Group the data by the "Url" column and apply the aggregation methods
playing_time_totals = playing_time_subset.groupby("Url").agg(aggregation_methods)

# PPM_Team.Success is averaged rather than summed. A plain mean over a
# player's rows would let a row with 1 match count as much as a row with 30,
# so each row is weighted by its number of matches instead.
# NOTE(review): assumes PPM_Team.Success is an average over the matches counted
# in MP_Playing.Time -- confirm against the data source's glossary.
ppm = playing_time_subset["PPM_Team.Success"]
matches = playing_time_stats["MP_Playing.Time"]
points_total = (ppm * matches).groupby(playing_time_url).sum()
# Only matches from rows that actually have a PPM take part in the weight;
# a player with no such matches ends up with 0 / 0 = NaN, i.e. still "missing".
matches_with_ppm = matches.where(ppm.notna()).groupby(playing_time_url).sum()
playing_time_totals["PPM_Team.Success"] = points_total / matches_with_ppm

# Restore the selected column order and upper-case the column names
new_playing_time_stats = (
    playing_time_totals
    .reset_index()
    .loc[:, playing_time_col_selection]
    .rename(columns=str.upper)
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
playing_time_output_path = aggregated_dir / "agg_playing_time_stats.csv"
new_playing_time_stats.to_csv(playing_time_output_path, index=False)
print(f"Saved {len(new_playing_time_stats)} players to {playing_time_output_path}")

MISCELLANEOUS STATS

In [156]:
# Read the miscellaneous stats file and preview its first two rows
misc_stats = pd.read_csv(filepath_or_buffer="datasets/14_big5_player_misc_stats.csv")
misc_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,Mins_Per_90,CrdY,...,Int,TklW,PKwon,PKcon,OG,Recov,Won_Aerial,Lost_Aerial,Won_percent_Aerial,Url
0,2023,Ajaccio,Ligue 1,Mickaël Alphonse,GLP,DF,33,1989,15.1,2,...,35,23,0.0,1.0,1,89.0,32.0,27.0,54.2,https://fbref.com/en/players/0a63446f/Mickael-...
1,2023,Ajaccio,Ligue 1,Cédric Avinel,GLP,DF,35,1986,19.9,2,...,15,7,0.0,2.0,0,93.0,38.0,15.0,71.7,https://fbref.com/en/players/a94d93be/Cedric-A...


In [157]:
# Summarise dtypes and non-null counts to see which columns have missing values
misc_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Season_End_Year     2889 non-null   int64  
 1   Squad               2889 non-null   object 
 2   Comp                2889 non-null   object 
 3   Player              2889 non-null   object 
 4   Nation              2888 non-null   object 
 5   Pos                 2889 non-null   object 
 6   Age                 2889 non-null   int64  
 7   Born                2889 non-null   int64  
 8   Mins_Per_90         2889 non-null   float64
 9   CrdY                2889 non-null   int64  
 10  CrdR                2889 non-null   int64  
 11  2CrdY               2889 non-null   int64  
 12  Fls                 2889 non-null   int64  
 13  Fld                 2889 non-null   int64  
 14  Off                 2889 non-null   int64  
 15  Crs                 2889 non-null   int64  
 16  Int   

In [158]:
# List every column name so the ones to keep can be picked in the next cell
misc_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'Mins_Per_90', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off',
       'Crs', 'Int', 'TklW', 'PKwon', 'PKcon', 'OG', 'Recov', 'Won_Aerial',
       'Lost_Aerial', 'Won_percent_Aerial', 'Url'],
      dtype='object')

In [159]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
misc_col_selection = [
    "Url",
    "2CrdY",
    "Fls",
    "Fld",
    "Off",
    "Crs",
    "TklW",
    "PKwon",
    "PKcon",
    "OG",
    "Recov",
    "Won_Aerial",
    "Lost_Aerial",
]

In [160]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_misc_stats = misc_stats.loc[:, misc_col_selection]
new_misc_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Url          2889 non-null   object 
 1   2CrdY        2889 non-null   int64  
 2   Fls          2889 non-null   int64  
 3   Fld          2889 non-null   int64  
 4   Off          2889 non-null   int64  
 5   Crs          2889 non-null   int64  
 6   TklW         2889 non-null   int64  
 7   PKwon        2884 non-null   float64
 8   PKcon        2884 non-null   float64
 9   OG           2889 non-null   int64  
 10  Recov        2884 non-null   float64
 11  Won_Aerial   2884 non-null   float64
 12  Lost_Aerial  2884 non-null   float64
dtypes: float64(5), int64(7), object(1)
memory usage: 293.5+ KB


In [161]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `misc_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_misc_stats = (
    misc_stats[misc_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
misc_output_path = aggregated_dir / "agg_misc_stats.csv"
new_misc_stats.to_csv(misc_output_path, index=False)
print(f"Saved {len(new_misc_stats)} players to {misc_output_path}")

KEEPER STATS

In [162]:
# Read the keeper stats file and preview its first two rows
keeper_stats = pd.read_csv(filepath_or_buffer="datasets/15_big5_player_keepers_stats.csv")
keeper_stats.head(n=2)

Unnamed: 0,Season_End_Year,Squad,Comp,Player,Nation,Pos,Age,Born,MP_Playing,Starts_Playing,...,D,L,CS,CS_percent,PKatt_Penalty,PKA_Penalty,PKsv_Penalty,PKm_Penalty,Save_percent_Penalty,Url
0,2023,Ajaccio,Ligue 1,Benjamin Leroy,FRA,GK,33,1989,32,32,...,4.0,22.0,4.0,12.5,9.0,8.0,0.0,1.0,0.0,https://fbref.com/en/players/1e7aea56/Benjamin...
1,2023,Ajaccio,Ligue 1,Ghjuvanni Quilichini,FRA,GK,19,2002,1,0,...,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,,https://fbref.com/en/players/1b729a98/Ghjuvann...


In [163]:
# Summarise dtypes and non-null counts to see which columns have missing values
keeper_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Season_End_Year       208 non-null    int64  
 1   Squad                 208 non-null    object 
 2   Comp                  208 non-null    object 
 3   Player                208 non-null    object 
 4   Nation                208 non-null    object 
 5   Pos                   208 non-null    object 
 6   Age                   208 non-null    int64  
 7   Born                  208 non-null    int64  
 8   MP_Playing            208 non-null    int64  
 9   Starts_Playing        208 non-null    int64  
 10  Min_Playing           208 non-null    int64  
 11  Mins_Per_90           208 non-null    float64
 12  GA                    207 non-null    float64
 13  GA90                  207 non-null    float64
 14  SoTA                  207 non-null    float64
 15  Saves                 2

In [164]:
# List every column name so the ones to keep can be picked in the next cell
keeper_stats.columns

Index(['Season_End_Year', 'Squad', 'Comp', 'Player', 'Nation', 'Pos', 'Age',
       'Born', 'MP_Playing', 'Starts_Playing', 'Min_Playing', 'Mins_Per_90',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save_percent', 'W', 'D', 'L', 'CS',
       'CS_percent', 'PKatt_Penalty', 'PKA_Penalty', 'PKsv_Penalty',
       'PKm_Penalty', 'Save_percent_Penalty', 'Url'],
      dtype='object')

In [165]:
# "Url" is the key the rows are grouped by later; every other entry is a stat
# that is totalled per player. The order here is the column order of the output.
keeper_col_selection = [
    "Url",
    "GA",
    "SoTA",
    "Saves",
    "W",
    "D",
    "L",
    "CS",
    "PKatt_Penalty",
    "PKA_Penalty",
    "PKsv_Penalty",
    "PKm_Penalty",
]

In [166]:
# Keep only the key column and the selected stats, then check dtypes and nulls
new_keeper_stats = keeper_stats.loc[:, keeper_col_selection]
new_keeper_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Url            208 non-null    object 
 1   GA             207 non-null    float64
 2   SoTA           207 non-null    float64
 3   Saves          207 non-null    float64
 4   W              208 non-null    int64  
 5   D              207 non-null    float64
 6   L              207 non-null    float64
 7   CS             207 non-null    float64
 8   PKatt_Penalty  207 non-null    float64
 9   PKA_Penalty    207 non-null    float64
 10  PKsv_Penalty   207 non-null    float64
 11  PKm_Penalty    207 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 19.6+ KB


In [167]:
# Collapse all rows that share a "Url" into one row per player by summing the
# selected stats. The frame is rebuilt from `keeper_stats` on every run, so
# re-running this cell never feeds it its own already aggregated, upper-cased
# output (which no longer has a "Url" column).
# NOTE: sum() skips missing values, so a stat that is missing in every row of
# a player is written as 0 rather than NaN.
new_keeper_stats = (
    keeper_stats[keeper_col_selection]
    .groupby("Url")
    .sum()
    .reset_index()
    .rename(columns=str.upper)  # upper-case every column name
)

# to_csv does not create missing folders, so make sure the target exists first
aggregated_dir = Path("datasets/aggregated")
aggregated_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file and confirm what was written
keeper_output_path = aggregated_dir / "agg_keeper_stats.csv"
new_keeper_stats.to_csv(keeper_output_path, index=False)
print(f"Saved {len(new_keeper_stats)} players to {keeper_output_path}")