In [174]:
import pandas as pd
import json
import re

In [175]:
team_info = pd.read_csv('./data/players_21.csv', sep=',',encoding='utf-8')

In [176]:
def remove_special_characters(input_str):
    return re.sub(r'[^a-zA-Z0-9\s]', '', input_str)

# leggi il csv e impora i dati in un dataframe in utf-8
df = pd.read_csv('./data/2021-2022-player-stats.csv', sep=';',encoding='ISO-8859-1')
df["Player"] = df["Player"].str.replace('?', '')
df["Player"] = df["Player"].apply(remove_special_characters)

In [177]:
team_info['long_name_cleaned'] = team_info['long_name'].apply(remove_special_characters)

def find_matching_long_name(player_name):
    matching_rows = team_info[team_info['long_name_cleaned'].str.contains(player_name)]
    if not matching_rows.empty:
        return matching_rows['short_name'].iloc[0]
    else:
        return player_name 

df['Player'] = df['Player'].apply(find_matching_long_name)
df = df.merge(team_info[["short_name","player_face_url", "club_logo_url"]], left_on='Player', right_on='short_name', how='left')
df.drop(columns=['short_name'], inplace=True)
df.rename(columns={"player_face_url": "PlayerFaceUrl", "club_logo_url": "ClubLogoUrl"}, inplace=True)

In [178]:
# count number of player with the same SquadName
# note: the number of player with the same SquadName is > 1 (USEFUL for ffill)
print(df['Squad'].value_counts()[df['Squad'].value_counts() < 2])

Series([], Name: count, dtype: int64)


In [179]:
df['PlayerFaceUrl'].fillna('https://resources.premierleague.com/premierleague/photos/players/250x250/Photo-Missing.png', inplace=True)
# sort by SquadName
df.sort_values(by=['Squad'], inplace=True)
# fill none values on ClubLogoUrl field with the row with the same Squad
df['ClubLogoUrl'].fillna(method='ffill', inplace=True)
# correct: no results == no Squad with only 1 player

In [180]:
acronyms = {"Rk": "Rank", "Player": "Player's name", "Nation": "Player's nation", "Pos": "Position",
            "Squad": "Squad’s name", "Comp": "League that squat occupies", "Age": "Player's age",
            "Born": "Year of birth", "MP": "Matches played", "Starts": "Matches started",
            "Min": "Minutes played", "90s": "Minutes played divided by 90",
            "Goals": "Goals scored or allowed",
            "Shots": "Shots total (Does not include penalty kicks)",
            "SoT": "Shots on target (Does not include penalty kicks)",
            'SoT%': 'Shots on target percentage (Does not include penalty kicks)',
            'G/Sh': 'Goals per shot',
            'G/SoT': 'Goals per shot on target (Does not include penalty kicks)',
            'ShoDist': 'Average distance, in yards, from goal of all shots taken (Does not include penalty kicks)',
            'ShoFK': 'Shots from free kicks',
            'ShoPK': 'Penalty kicks made',
            "PasProg": "Completed passes that move the ball towards the opponent's goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area",
                  "PasAtt": "Passes attempted",
                  "PasLive": "Live-ball passes",
                  "PasDead": "Dead-ball passes",
                  "PasFK": "Passes attempted from free kicks",
                  "TB": "Completed pass sent between back defenders into open space",
                  "Sw": "Passes that travel more than 40 yards of the width of the pitch",
                  "PasCrs": "Crosses",
                  "TI": "Throw-Ins taken",
                  "CK": "Corner kicks",
                  "CkIn": "Inswinging corner kicks",
                  "CkOut": "Outswinging corner kicks",
                  "CkStr": "Straight corner kicks",
                  "PasCmp": "Passes completed",
                  "PasOff": "Offsides",
                  "PasBlocks": "Blocked by the opponent who was standing it the path",
                  "SCA": "Shot-creating actions",
                  "ScaPassLive": "Completed live-ball passes that lead to a shot attempt",
                  "ScaPassDead": "Completed dead-ball passes that lead to a shot attempt",
                  "ScaDrib": "Successful dribbles that lead to a shot attempt",
                  "ScaSh": "Shots that lead to another shot attempt",
                  "ScaFld": "Fouls drawn that lead to a shot attempt",
                  "ScaDef": "Defensive actions that lead to a shot attempt",
                  "GCA": "Goal-creating actions",
                  "GcaPassLive": "Completed live-ball passes that lead to a goal",
                  "GcaPassDead": "Completed dead-ball passes that lead to a goal",
                  "GcaDrib": "Successful dribbles that lead to a goal",
                  "GcaSh": "Shots that lead to another goal-scoring shot",
                  "GcaFld": "Fouls drawn that lead to a goal",
                  "GcaDef": "Defensive actions that lead to a goal",
                  "Tkl": "Number of players tackled",
                  "TklWon": "Tackles in which the tackler's team won possession of the ball",
                  "TklDef3rd": "Tackles in defensive 1/3",
                  "TklMid3rd": "Tackles in middle 1/3",
                  "TklAtt3rd": "Tackles in attacking 1/3",
                  "TklDri": "Number of dribblers tackled",
                  "TklDriAtt": "Number of times dribbled past plus number of tackles",
                  "TklDri%": "Percentage of dribblers tackled",
                  "TklDriPast": "Number of times dribbled past by an opposing player",
                  "Blocks": 'Number of times blocking the ball by standing in its path',
            'BlkSh': 'Number of times blocking a shot by standing in its path',
            'PKatt': 'Penalty kicks attempted',
            'PasTotCmp': 'Passes completed',
            'PasTotAtt': 'Passes attempted',
            'PasTotCmp%': 'Pass completion percentage',
            'PasTotDist': 'Total distance, in yards, that completed passes have traveled in any direction',
            'PasTotPrgDist': 'Total distance, in yards, that completed passes have traveled towards the opponent\'s goal',
            'PasShoCmp': 'Passes completed (Passes between 5 and 15 yards)',
            'PasShoAtt': 'Passes attempted (Passes between 5 and 15 yards)',
            'PasShoCmp%': 'Pass completion percentage (Passes between 5 and 15 yards)',
            'PasMedCmp': 'Passes completed (Passes between 15 and 30 yards)',
            'PasMedAtt': 'Passes attempted (Passes between 15 and 30 yards)',
            'PasMedCmp%': 'Pass completion percentage (Passes between 15 and 30 yards)',
            'PasLonCmp': 'Passes completed (Passes longer than 30 yards)',
            'PasLonAtt': 'Passes attempted (Passes longer than 30 yards)',
            'PasLonCmp%': 'Pass completion percentage (Passes longer than 30 yards)',
            'Assists': 'Assists',
            'PasAss': 'Passes that directly lead to a shot (assisted shots)',
            'Pas3rd': 'Completed passes that enter the 1/3 of the pitch closest to the goal',
            "PPA": "Completed passes into the 18-yard box",
            "CrsPA": "Completed crosses into the 18-yard box",
            "BlkPass": "Number of times blocking a pass by standing in its path",
                  "Int": "Interceptions",
                  "Tkl+Int": "Number of players tackled plus number of interceptions",
                  "Clr": "Clearances",
                  "Err": "Mistakes leading to an opponent's shot",
                  "Touches": "Number of times a player touched the ball. Note: Receiving a pass, then dribbling, then sending a pass counts as one touch",
                  "TouDefPen": "Touches in defensive penalty area",
                  "TouDef3rd": "Touches in defensive 1/3",
                  "TouMid3rd": "Touches in middle 1/3",
                  "TouAtt3rd": "Touches in attacking 1/3",
                  "TouAttPen": "Touches in attacking penalty area",
                  "TouLive": "Live-ball touches. Does not include corner kicks, free kicks, throw-ins, kick-offs, goal kicks or penalty kicks.",
                  "ToAtt": "Number of attempts to take on defenders while dribbling",
                  "ToSuc": "Number of defenders taken on successfully, by dribbling past them",
                  "ToSuc%": "Percentage of take-ons Completed Successfully",
                  "ToTkl": "Number of times tackled by a defender during a take-on attempt",
                  "ToTkl%": "Percentage of time tackled by a defender during a take-on attempt",
                  "Carries": "Number of times the player controlled the ball with their feet",
                  "CarTotDist": "Total distance, in yards, a player moved the ball while controlling it with their feet, in any direction",
                  "CarPrgDist": "Total distance, in yards, a player moved the ball while controlling it with their feet towards the opponent's goal",
                  "CarProg": "Carries that move the ball towards the opponent's goal at least 5 yards, or any carry into the penalty area",
                  "Car3rd": "Carries that enter the 1/3 of the pitch closest to the goal",
                  "CPA": "Carries into the 18-yard box",
                  "CarMis": "Number of times a player failed when attempting to gain control of a ball",
                  "CarDis": "Number of times a player loses control of the ball after being tackled by an opposing player",
                  "Rec": "Number of times a player successfully received a pass",
            "RecProg":
            "Completed passes that move the ball towards the opponents goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area Make this a dictionary",
            "CrdY": "Yellow cards",
                  "CrdR": "Red cards",
                  "2CrdY": "Second yellow card",
                  "Fls": "Fouls committed",
                  "Fld": "Fouls drawn",
                  "Off": "Offsides",
                  "Crs": "Crosses",
                  "TklW": "Tackles in which the tackler's team won possession of the ball",
                  "PKwon": "Penalty kicks won",
                  "PKcon": "Penalty kicks conceded",
                  "OG": "Own goals",
                  "Recov": "Number of loose balls recovered",
                  "AerWon": "Aerials won",
                  "AerLost": "Aerials lost",
                  "AerWon%": "Percentage of aerials won",
            }
# dump this dict into a json
with open('./data/acronyms.json', 'w') as fp:
    json.dump(acronyms, fp)


In [181]:
# stampa i valori distinti di comp
print(df['Pos'].unique())

# stampa la lista delle features
print(list(df.columns))

['GK' 'DF' 'MFFW' 'MF' 'FWMF' 'FW' 'MFDF' 'FWDF' 'DFMF' 'DFFW' 'GKMF']
['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Goals', 'Shots', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'ShoDist', 'ShoFK', 'ShoPK', 'PKatt', 'PasTotCmp', 'PasTotAtt', 'PasTotCmp%', 'PasTotDist', 'PasTotPrgDist', 'PasShoCmp', 'PasShoAtt', 'PasShoCmp%', 'PasMedCmp', 'PasMedAtt', 'PasMedCmp%', 'PasLonCmp', 'PasLonAtt', 'PasLonCmp%', 'Assists', 'PasAss', 'Pas3rd', 'PPA', 'CrsPA', 'PasProg', 'PasAtt', 'PasLive', 'PasDead', 'PasFK', 'TB', 'PasPress', 'Sw', 'PasCrs', 'CK', 'CkIn', 'CkOut', 'CkStr', 'PasGround', 'PasLow', 'PasHigh', 'PaswLeft', 'PaswRight', 'PaswHead', 'TI', 'PaswOther', 'PasCmp', 'PasOff', 'PasOut', 'PasInt', 'PasBlocks', 'SCA', 'ScaPassLive', 'ScaPassDead', 'ScaDrib', 'ScaSh', 'ScaFld', 'ScaDef', 'GCA', 'GcaPassLive', 'GcaPassDead', 'GcaDrib', 'GcaSh', 'GcaFld', 'GcaDef', 'Tkl', 'TklWon', 'TklDef3rd', 'TklMid3rd', 'TklAtt3rd', 'TklDri', 'TklDriAtt', 'TklDri%', 'T

In [182]:
# stampa le rows di comp con Serie A e mi selezioni solo la colonna goals e player
df.loc[df['Comp'] == 'Serie A', ['Shots', 'Player']]

Unnamed: 0,Shots,Player
3064,1.42,D. Zappacosta
2827,0.69,Rafael Tolói
2456,0.78,Marten de Roon
1207,1.10,H. Hateboer
349,2.47,Jeremie Boga
...,...,...
636,0.59,Mickal Cuisance
171,0.00,Issa Bah
1914,0.23,Marco Modolo
2752,0.58,Michael Svoboda


In [183]:
# remove null and duplicate values
# df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

print(df.describe().transpose())

          count         mean         std   min        25%      50%      75%   
Rk       3094.0  1463.119586  840.542193   1.0   736.2500  1470.50  2186.75  \
Age      3093.0    26.131911    4.615213  16.0    23.0000    26.00    29.00   
Born     3094.0  1994.723659   36.167323   0.0  1992.0000  1996.00  1999.00   
MP       3094.0    18.889463   11.616462   1.0     8.0000    20.00    29.00   
Starts   3094.0    13.825792   11.379233   0.0     3.0000    12.00    23.00   
...         ...          ...         ...   ...        ...      ...      ...   
OG       3094.0     0.003607    0.020783   0.0     0.0000     0.00     0.00   
Recov    3094.0     7.401823    3.611095   0.0     5.0825     7.38     9.36   
AerWon   3094.0     1.658452    1.912718   0.0     0.5300     1.22     2.26   
AerLost  3094.0     1.842599    2.196356   0.0     0.8500     1.40     2.20   
AerWon%  3094.0    40.605333   24.114787   0.0    27.0000    43.65    57.40   

            max  
Rk       2921.0  
Age        41.0

In [184]:
competitions = ["Serie A", "Premier League", "La Liga", "Bundesliga", "Ligue 1"]

forward_positions = ["FW", "FWMF", "FWDF"]
midfield_positions = ["MF", "MFDF", "MFFW"]
defender_positions = ["DF", "DFMF", "DFFW"]

common_features = ["Player", "Squad", "Comp", "MP", "Min", "Pos", "90s", "PlayerFaceUrl", "ClubLogoUrl"]

forward_features = ["Goals", "Shots", "SoT", "G/Sh", "G/SoT", "ShoDist", "GCA", "SCA", "Off", "PKwon", "ScaDrib", "Assists",
                    "ScaPassLive", "Car3rd", "ScaFld",  "Carries", "CarTotDist", "CarPrgDist", 'CPA', "CarMis", "CarDis"]  # "ToAtt", "ToSuc"

midfielder_features = ["Goals", "PasTotCmp", "PasTotCmp%", "PasTotDist", "PasTotPrgDist", "Assists", "PasAss", "Pas3rd", "Crs", "PasCmp",
                       "PasOff", "PasBlocks", "SCA", "ScaPassLive", "ScaPassDead", "ScaDrib", "ScaSh", "ScaFld", "GCA", "GcaPassLive",
                       "GcaPassDead", "GcaDrib", "GcaSh", "GcaFld", "Tkl", "TklWon", "TklDef3rd", "TklMid3rd", "TklAtt3rd", "TklDri",
                       "TklDriAtt", "TklDri%", "TklDriPast", "Blocks", "BlkSh", "Int", "Recov", "Carries", "CarTotDist", "CarPrgDist", "Fld"]

defender_features = ["PasTotCmp", "PasTotDist", "PasTotPrgDist", "Tkl", "TklWon", "TklDef3rd", "TklMid3rd", "TklAtt3rd", "TklDri", "TklDriAtt", "TklDriPast", "Blocks",
                     "BlkSh", "Int", "Tkl+Int", "Recov", "AerWon", "AerLost", "Carries", "CarTotDist", "CarPrgDist", "CrdY", "CrdR", "Fls", "Clr"]


positions = [forward_positions,midfield_positions,defender_positions]

In [185]:
# removing outliers, who never player more than 180 mins
df = df[df['90s'] > 5]

for competition in competitions:
    df_refined = df[df['Comp'] == competition]

    import os
    if not os.path.exists(f'./data/{competition}'):
        os.makedirs(f'./data/{competition}')

    for position in positions:
        pos, features_set = None, None
        
        if position == forward_positions:
            pos = "attk"
            features_set = common_features + forward_features
        elif position == midfield_positions:
            pos = "cen"
            features_set = common_features + midfielder_features
        else:
            pos = "dif"
            features_set = common_features + defender_features

        

        df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
        df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
        df_refined_3 = df_refined[df["Pos"] == position[2]][features_set]
        df_forward = pd.concat([df_refined_1, df_refined_2, df_refined_3], axis=0)
        df_forward.to_csv(f'./data/{competition}/{pos}.csv', index=False)
        

  df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
  df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
  df_refined_3 = df_refined[df["Pos"] == position[2]][features_set]
  df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
  df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
  df_refined_3 = df_refined[df["Pos"] == position[2]][features_set]
  df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
  df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
  df_refined_3 = df_refined[df["Pos"] == position[2]][features_set]
  df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
  df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
  df_refined_3 = df_refined[df["Pos"] == position[2]][features_set]
  df_refined_1 = df_refined[df["Pos"] == position[0]][features_set]
  df_refined_2 = df_refined[df["Pos"] == position[1]][features_set]
  df_refined_3 = df_refined[df["Pos"] == positio