Data Visualization Notebook with FBref-DW Merged Data:

In [57]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as so

In [88]:
# Global variables:
# Current working directory of the kernel; this is the notebook's own folder
# only when the kernel was started there.
repo_dir = os.getcwd()
# Folder (inside repo_dir) that the merged FBref/DW CSV files are read from.
source_data_dir = os.path.join(repo_dir, 'fbref-dw-merges')

# Statistics dictionary for the DEF category.
# Every listed column is mapped to the same aggregation ('mean'), so the
# mapping is built from an annotated list of column names.
df_stat_dict = dict.fromkeys(
    [
        'MP',                   # total matches played
        '90s_r',                # number of 90s played
        'Min',                  # number of minutes played
        'Tkl',                  # total tackles
        'TklW',                 # total tackles won
        'Def 3rd',              # total tackles in def. 3rd
        'Mid 3rd',              # total tackles in mid. 3rd
        'Blocks.1',             # total blocks
        'Sh.3',                 # total shots blocked
        'market_value_in_eur',
        'big_6',                # big 6 dummy
    ],
    'mean',
)

# Statistics dictionary for the MID category.
# Every listed column is mapped to the same aggregation ('mean'), so the
# mapping is built from an annotated list of column names.
mf_stat_dict = dict.fromkeys(
    [
        'MP',                   # total matches played
        '90s_r',                # number of 90s played
        'Min',                  # number of minutes played
        'Gls.1',                # goals/90
        'Ast.1',                # assists/90
        'G-PK.1',               # npg per 90
        'G+A-PK',               # npg+a per 90
        'npxG+xAG.1',
        'SoT%',
        'Cmp%',                 # overall pass completion
        'Cmp%.1',               # mid-range pass completion
        'SCA90',
        'GCA90',
        'Tkl%',                 # successful tackle rate
        'PrgC.1',               # total progressive carries
        'market_value_in_eur',
        'big_6',                # big 6 dummy
    ],
    'mean',
)

# Create a statistics dictionary for the FW category
# (column name -> aggregation applied to it; every column uses 'mean'):
fw_stat_dict = {'MP': 'mean',            # total matches played
                '90s_r': 'mean',         # number of 90s played
                'Min': 'mean',           # number of minutes played
                'Gls.1': 'mean',         # goals/90
                'Ast.1': 'mean',         # assists/90
                'G-PK.1': 'mean',        # npg per 90
                'G+A-PK': 'mean',        # npg+a per 90
                'npxG+xAG.1': 'mean',    # per 90
                'npxG.1': 'mean',        # per 90
                'G/Sh': 'mean',          # goals per shot
                'G/SoT': 'mean',         # goals per shot on target
                'np:G-xG': 'mean',       # non-penalty goals minus npxG
                                         # NOTE(review): previously labelled "per 90" - confirm
                'A-xAG': 'mean',         # total A - xAG
                'Crs': 'mean',           # total crosses
                'GCA90': 'mean',
                'market_value_in_eur': 'mean',
                'big_6': 'mean'          # big 6 dummy
                }

# Column names per position, in the insertion order of each stat dictionary:
df_stat_cols = list(df_stat_dict)
mf_stat_cols = list(mf_stat_dict)
fw_stat_cols = list(fw_stat_dict)
# The three lists bundled in DF, MF, FW order:
stats_lists = [df_stat_cols, mf_stat_cols, fw_stat_cols]

In [84]:
# Functions:
def filter_by_season_pos(df, pos, season):
    """Return the rows of ``df`` that match one position and one season.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``'position'`` column and a ``'season'`` column.
    pos : str
        Pattern searched for inside ``'position'`` with
        ``Series.str.contains`` (interpreted as a regex, the pandas default),
        so ``'DF'`` also matches a multi-position value such as ``'DF,MF'``.
    season
        Value compared for equality against the ``'season'`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels.
    """
    # na=False: a missing position counts as "no match" rather than leaking
    # NaN into the boolean mask.
    position_match = df['position'].str.contains(pos, na=False)
    season_match = df['season'] == season
    return df[position_match & season_match]

In [89]:
# Load the merged EPL player table from the source data folder:
epl_csv_path = os.path.join(source_data_dir, "ENG-Premier League_fbref_dw_merge.csv")
epl_player_data = pd.read_csv(epl_csv_path)

# "Big 6" dummy: 1 when the row's team_x is one of the six listed clubs, else 0.
big6_teams = [
    'Arsenal',
    'Manchester City',
    'Manchester Utd',
    'Tottenham',
    'Chelsea',
    'Liverpool',
]
is_big6_team = epl_player_data['team_x'].isin(big6_teams)
epl_player_data['big_6'] = is_big6_team.astype(int)

In [90]:
# Create a list of position-level, 22/23 dataframes that are sliced to necessary stat columns.
# Order of the list is DF, MF, FW (paired with stats_lists by zip).
# .copy() makes every entry an independent frame, so adding columns to one of
# them later does not raise SettingWithCopyWarning.
epl_position_dfs = [
    filter_by_season_pos(epl_player_data, pos, 2223)[cols_list].copy()
    for pos, cols_list in zip(['DF', 'MF', 'FW'], stats_lists)
]

In [91]:
# Name the three position-level 22/23 dataframes (list order is DF, MF, FW):
epl_def_df, epl_mf_df, epl_fw_df = epl_position_dfs

In [93]:
# Display the defender dataframe.
# NOTE(review): the saved output of this cell already contains the *_per90
# columns, which are only created in the "Convert basic stats to per90 terms"
# cell further down - this cell was run after that one (execution count 93 vs
# 92), so the output will differ on a fresh Restart & Run All.
epl_def_df

Unnamed: 0,MP,90s_r,Min,Tkl,TklW,Def 3rd,Mid 3rd,Blocks.1,Sh.3,market_value_in_eur,big_6,Tkl_per90,TklW_per90,Def 3rd_per90,Mid 3rd_per90,Blocks.1_per90
1390,38.0,33.9,3055.0,60.0,31.0,23.0,30.0,47.0,11.0,4.750000e+07,1,1.769912,0.914454,0.678466,0.884956,1.386431
1392,2.0,0.3,29.0,0.0,0.0,0.0,0.0,1.0,0.0,4.000000e+06,1,0.000000,0.000000,0.000000,0.000000,3.333333
1393,6.0,2.2,198.0,8.0,5.0,4.0,3.0,3.0,1.0,4.000000e+06,0,3.636364,2.272727,1.818182,1.363636,1.363636
1399,7.0,4.7,427.0,7.0,5.0,6.0,1.0,5.0,3.0,1.066667e+07,1,1.489362,1.063830,1.276596,0.212766,1.063830
1402,27.0,8.8,795.0,20.0,11.0,9.0,9.0,8.0,2.0,2.750000e+07,1,2.272727,1.250000,1.022727,1.022727,0.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826,27.0,24.8,2230.0,43.0,24.0,25.0,14.0,16.0,6.0,2.000000e+07,0,1.733871,0.967742,1.008065,0.564516,0.645161
1831,21.0,14.6,1310.0,33.0,16.0,23.0,6.0,14.0,1.0,2.000000e+05,0,2.260274,1.095890,1.575342,0.410959,0.958904
1838,37.0,36.7,3307.0,36.0,20.0,26.0,9.0,70.0,42.0,2.166667e+07,0,0.980926,0.544959,0.708447,0.245232,1.907357
1839,26.0,20.2,1814.0,27.0,15.0,19.0,8.0,35.0,28.0,2.200000e+07,0,1.336634,0.742574,0.940594,0.396040,1.732673


DEFENDERS Visualizations:

In [92]:
# Convert basic stats to per90 terms.
# Columns are chosen by name instead of by position (the original slice 3:8
# picked exactly these five), so the cell keeps working if the stat dictionary
# is reordered or extended.
# NOTE(review): 'Sh.3' (shots blocked) is not converted, same as before - confirm
# that leaving it out is intended.
per90_source_cols = ['Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Blocks.1']

# Rows with zero 90s played would divide by zero and produce inf; mask them to
# NaN so their per-90 values are simply missing.
nineties_played = epl_def_df['90s_r'].where(epl_def_df['90s_r'] > 0)

# assign() returns a new frame rather than writing into a slice of the source
# data, and re-running the cell just overwrites the same *_per90 columns.
epl_def_df = epl_def_df.assign(
    **{f'{stat}_per90': epl_def_df[stat] / nineties_played for stat in per90_source_cols}
)
epl_def_df

Unnamed: 0,MP,90s_r,Min,Tkl,TklW,Def 3rd,Mid 3rd,Blocks.1,Sh.3,market_value_in_eur,big_6,Tkl_per90,TklW_per90,Def 3rd_per90,Mid 3rd_per90,Blocks.1_per90
1390,38.0,33.9,3055.0,60.0,31.0,23.0,30.0,47.0,11.0,4.750000e+07,1,1.769912,0.914454,0.678466,0.884956,1.386431
1392,2.0,0.3,29.0,0.0,0.0,0.0,0.0,1.0,0.0,4.000000e+06,1,0.000000,0.000000,0.000000,0.000000,3.333333
1393,6.0,2.2,198.0,8.0,5.0,4.0,3.0,3.0,1.0,4.000000e+06,0,3.636364,2.272727,1.818182,1.363636,1.363636
1399,7.0,4.7,427.0,7.0,5.0,6.0,1.0,5.0,3.0,1.066667e+07,1,1.489362,1.063830,1.276596,0.212766,1.063830
1402,27.0,8.8,795.0,20.0,11.0,9.0,9.0,8.0,2.0,2.750000e+07,1,2.272727,1.250000,1.022727,1.022727,0.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826,27.0,24.8,2230.0,43.0,24.0,25.0,14.0,16.0,6.0,2.000000e+07,0,1.733871,0.967742,1.008065,0.564516,0.645161
1831,21.0,14.6,1310.0,33.0,16.0,23.0,6.0,14.0,1.0,2.000000e+05,0,2.260274,1.095890,1.575342,0.410959,0.958904
1838,37.0,36.7,3307.0,36.0,20.0,26.0,9.0,70.0,42.0,2.166667e+07,0,0.980926,0.544959,0.708447,0.245232,1.907357
1839,26.0,20.2,1814.0,27.0,15.0,19.0,8.0,35.0,28.0,2.200000e+07,0,1.336634,0.742574,0.940594,0.396040,1.732673


In [None]:
# Create a scatterplot of market value against the big-6 dummy and each
# per-90 defensive statistic.
# The columns are selected by name; this is the same set the positional slice
# 10: used to cover, without depending on the column order of the frame.
def_plot_cols = ['big_6'] + [c for c in epl_def_df.columns if c.endswith('_per90')]
for col in def_plot_cols:
    fig, ax = plt.subplots()
    so.scatterplot(data=epl_def_df, x=col, y="market_value_in_eur", ax=ax)
    ax.set(title=f"Defenders: market value vs {col}",
           xlabel=col,
           ylabel="Market value (EUR)")
    plt.show()
    # Release the figure so a long loop does not keep every plot in memory.
    plt.close(fig)

MIDFIELDERS Visualizations:

In [None]:
# Create a regression plot (scatter plus fitted line) of market value against
# each midfielder statistic.
# The target column is skipped: plotting market value against itself carries
# no information.
mf_feature_cols = [c for c in epl_mf_df.columns if c != "market_value_in_eur"]
for col in mf_feature_cols:
    fig, ax = plt.subplots()
    so.regplot(data=epl_mf_df, x=col, y="market_value_in_eur", ax=ax)
    ax.set(title=f"Midfielders: market value vs {col}",
           xlabel=col,
           ylabel="Market value (EUR)")
    plt.show()
    # Release the figure so a long loop does not keep every plot in memory.
    plt.close(fig)