# Introduction
For our analysis, more detailed data than provided by transfermarkt.ch would be helpful. Such data is provided by [fbref.com](https://fbref.com/en/). In this notebook we will try to get that data.

In [None]:
#pip install sportsipy

In [None]:
#import packages
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Webscraping preparation
Before we can scrape the website, we need to make some preparations:
## Get Player ids for the URL

In [None]:
#get the players for which we need data: the loop below reads the index of this table and its
#"Current Team (2020/21)" column
# NOTE(review): unpickling can execute arbitrary code - only load a "stats_tm.pkl" you created yourself
file="stats_tm.pkl"
tm=pd.read_pickle(file)

In [None]:
#create two parallel lists, in row order of `tm`: the index labels (used as player names below)
#and the current team of every player
player_list=tm.index.tolist()
team_list=tm["Current Team (2020/21)"].tolist()

In [None]:
#The url for each player on fbref includes a unique player id. To get this id we can use the sportsipy api, which provides
# the id we need. However, the api does not contain the ids of all players, which is why a lot of player ids will be missing.
#This also means we cannot use the players whose ids are missing.
from sportsipy.fb.roster import Roster

#Map the url form of every player name (spaces replaced by "-") to the player id reported by the api
player_dic={}
for player_name,team_name in zip(player_list,team_list):#walk through players and their teams in lockstep
    try:
        roster=Roster(team_name) #roster of the player's current team
        player=roster(player_name) #look the player up in that roster
        url_name=player.name.replace(" ","-")
        player_dic[url_name]=player.player_id
    except ValueError:
        #the api raises a ValueError if the team/player is not available; such players are skipped
        continue

In [None]:
#Save that dictionary with the ids as pickle
import pickle
#the context manager closes the file even if pickling raises
with open("player_dic.pkl","wb") as f:
    pickle.dump(player_dic,f)

## Dictionary with names for the tables available on fbref.com

In [None]:
#Names of the field-player tables on fbref.com, keyed 0-8; player_stats uses these keys to name
#the dataframe it builds for each table
stats_dic=dict(enumerate((
    "Standard",
    "Shooting",
    "Passing",
    "Pass_Types",
    "Goal_and_Shot_Creation",
    "Defensive_Actions",
    "Possession",
    "Playing_Time",
    "Miscellaneous",
)))

In [None]:
#Names of the goalkeeper tables on fbref.com, keyed 0-1; gk_stats uses these keys to name its dataframes
gk_stats_dic=dict(enumerate(("Standard","Advanced Goalkeeping")))

## Functions to create the necessary Dataframes out of the html codes

In [None]:
#get position of the player
def get_position(data):
    """Return the text of the paragraph that starts with "Position".

    Only the first three <p> elements of `data` are inspected (the same window
    as before). If none of them starts with "Position" - or the page has fewer
    than three paragraphs - an empty string is returned, so callers can always
    slice the result and run substring checks on it.
    """
    for paragraph in data.find_all("p")[:3]:
        text=paragraph.text
        if text[0:8]=="Position":
            return text
    return ""

In [None]:
#get foot of the player
def get_foot(details):
    """Return "Right" or "Left" depending on which "% <side>" marker `details` contains.

    "% Right" is checked before "% Left"; np.nan is returned when neither is present.
    """
    for side in ("Right","Left"):
        if "% "+side in details:
            return side
    return np.nan

In [None]:
#get unique values of a list while preserving order
def unique_values(lst):
    """Return the distinct elements of `lst` in order of first appearance."""
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(lst))

In [None]:
# Create a dataframe out of the scraped rows, labelled with the header names that start at "Age" and
# indexed by the seasons for which data is available
def df_creation(rows,columns,seasons):
    """Build a dataframe from the first len(seasons) entries of `rows`.

    The column labels are the slice of `columns` that starts at "Age" and is as
    wide as the first row; `seasons` becomes the index.
    """
    season_count=len(seasons) #only rows that belong to a season are used
    first_stat=columns.index("Age") #"Age" is the first header that has a value in every row
    labels=columns[first_stat:first_stat+len(rows[0])]
    return pd.DataFrame(rows[:season_count],columns=labels,index=seasons)

In [None]:
#get all seasons in which player has played
def get_seasons(columns, index):
    """Return the entries of `columns` found at the positions listed in `index`."""
    return [columns[position] for position in index]

In [None]:
#get the table data from the html code (given by tr and td)
def get_data(alpha,ind):
    """Return the cell texts of table `alpha[ind]` as a list of rows.

    Every <tr> that contains at least one <td> contributes one list with the
    text of each of its <td> cells; rows without <td> cells are skipped.
    """
    rows=[]
    for row in alpha[ind].find_all('tr'):
        cells=row.find_all('td') #search the row once instead of once per cell
        if cells:
            rows.append([cell.text for cell in cells])
    return rows

In [None]:
#get all column names from html code
def get_columns(alpha,ind):
    """Return the `.string` of every <th> element of table `alpha[ind]`, in document order."""
    return [header.string for header in alpha[ind].find_all('th')]

In [None]:
#find all tables in html code and return them
def get_alpha(page):
    """Return every <table> element found in the parsed document `page`."""
    # search the document that was passed in, not whatever the global `data` currently holds
    return page.find_all("table")

In [None]:
#function that combines the previous functions to create the dataframe
def get_df(alpha,ind,Season):
    """Build the dataframe of table `alpha[ind]` and return its entry for `Season`.

    The result of `.loc[Season]` is returned unchanged; player_stats and
    gk_stats handle both a Series and a DataFrame result.
    """
    header_cells=get_columns(alpha,ind)
    season_positions=get_index(header_cells)
    table_rows=get_data(alpha,ind)
    season_labels=get_seasons(header_cells,season_positions)
    return df_creation(table_rows,header_cells,season_labels).loc[Season]

In [None]:
#finds the header entries that name a season, so that only seasons the player played in are used
def get_index(columns):
    """Return the positions of the entries of `columns` that contain "20".

    Entries that do not support the `in` test (it raises TypeError, e.g. for
    None) are skipped.
    """
    def names_season(label):
        try:
            return "20" in label
        except TypeError:
            return False

    return [position for position,label in enumerate(columns) if names_season(label)]

In [None]:
#There are 9 different tables with data. Since every table appears multiple times in the html resultset "alpha",
#we need to find the first appearance of every table so we don't have any duplicates
def get_index_table(alpha):
    """Return the positions in `alpha` at which a different table starts.

    Position 0 is always included. Another position is added whenever the
    second distinct header of a table differs from the second distinct header
    of the table directly before it.
    """
    index_tables=[0]
    previous_headers=None
    for position,table in enumerate(alpha):
        headers=[cell.string for cell in table.find_all('th')]
        if position!=0 and unique_values(previous_headers)[1]!=unique_values(headers)[1]:
            index_tables.append(position)
        previous_headers=headers
    return index_tables

In [None]:
#Goalkeeper variant of get_index_table: every table appears multiple times in the html resultset "alpha",
#so we look for the first appearance of the table that follows the first one and stop there
def get_index_table_gk(alpha):
    """Return [0] plus, if there is one, the first position where the table changes.

    A change is detected when the second distinct header of a table differs
    from the second distinct header of the table directly before it; the
    search stops at the first change.
    """
    index_tables=[0]
    previous_headers=None
    for position,table in enumerate(alpha):
        headers=[cell.string for cell in table.find_all('th')]
        if position!=0 and unique_values(previous_headers)[1]!=unique_values(headers)[1]:
            index_tables.append(position)
            break
        previous_headers=headers
    return index_tables

In [None]:
#drops the rows of a df in which at least one value is missing
def drop_empty(df):
    """Remove, in place, every row of `df` that contains an empty string.

    Rows are removed by index label, so all rows sharing the label of an
    affected row are removed together. The labels are collected before anything
    is dropped, so duplicate labels no longer raise a KeyError. Returns the same
    (modified) `df`.
    """
    has_blank=df.eq("").any(axis=1).to_numpy()
    df.drop(index=df.index[has_blank].unique(),inplace=True)
    return df

In [None]:
#transform the numbers in the df into a format we can later work with
def clean_numbers(df):
    """Return a copy of `df` with thousands separators removed and blanks set to "0".

    Every "," inside a string value is deleted, and values that are exactly ""
    become "0" (for instance if a player has not taken a penalty, the table
    shows "" instead of 0). The previous version wrote into the row copies
    handed out by iterrows(), which pandas does not guarantee to propagate to
    the frame, and it discarded the result of the ""->"0" replacement.

    The input frame is left untouched; use the return value.
    """
    without_separators=df.replace(",","",regex=True)
    return without_separators.replace("","0")

In [None]:
#adds a row with the total of all competitions for a player in a season
def add_total(df):
    """Return a copy of `df` with an extra last row "Total" holding the column sums.

    Only numeric columns are summed. The existing index labels are kept and
    "Total" is appended to them. Uses pd.concat because DataFrame.append was
    removed in pandas 2.0.
    """
    totals=df.sum(numeric_only=True)
    total_row=totals.to_frame().T #one-row frame whose columns are the summed columns
    combined=pd.concat([df,total_row],ignore_index=True)
    combined.index=list(df.index)+["Total"]
    return combined

In [None]:
#checks if data is missing in a series object (if a lot of values in succession are "") or if it should actually be a 0
# if missing: replace entire series with "Invalid". If it should be 0: leave the row as it is; "" is replaced by 0 by another
# function later
def check_missing_data_ser(ser, num=4):
    """Overwrite `ser` with "Invalid" if it contains `num` consecutive "" values.

    The series is modified in place and also returned. Every window of `num`
    neighbouring values is checked, including the one that ends at the last
    element (the previous version stopped one window early). `num` should be
    at least 1.
    """
    values=list(ser) #materialise once instead of once per window
    for start in range(len(values)-num+1):
        if values[start:start+num].count("")==num:
            ser[:]="Invalid"
            break
    return ser

In [None]:
# checks missing data for an entire dataframe
def check_missing_data_df(df):
    """Return a copy of `df` in which every row with missing data is set to "Invalid".

    Each row is judged by check_missing_data_ser. The verdict is written back
    by position, because writing into the rows handed out by iterrows() is not
    guaranteed to change the frame. The input frame is left untouched; use the
    return value.
    """
    checked=df.copy()
    for position in range(len(checked)):
        row=check_missing_data_ser(checked.iloc[position].copy())
        if len(row)>0 and (row=="Invalid").all():
            checked.iloc[position,:]="Invalid"
    return checked

In [None]:
#drops rows that were deemed invalid
def drop_invalid(df):
    """Return the rows of `df` whose "Age" value is not "Invalid"."""
    keep=df.Age != "Invalid"
    return df[keep]

In [None]:
# Creates dataframe if data for a player was only available for a single competition
def df_single_comp(df):
    """Turn the single season row `df` (a Series) into a one-row numeric dataframe.

    Returns None if the row is flagged as missing data. Otherwise the
    descriptive columns are dropped, the numbers are cleaned and converted, and
    the row is labelled "Total".
    """
    df=check_missing_data_ser(df)
    # first value by position: plain [0] on a label-indexed Series is a deprecated positional fallback
    if df.iloc[0]=="Invalid":
        return None
    df=df.to_frame().T.drop(["Comp","Squad","Country","Age","Matches"],axis=1)
    if "LgRank" in df.columns:
        df=df.drop("LgRank",axis=1)
    df=clean_numbers(df)
    df=df.apply(pd.to_numeric)
    df.index=["Total"]
    return df

In [None]:
# Creates dataframe if data for a player was available for multiple competitions
def df_mult_comp(df):
    """Turn the season rows `df` (one per competition) into a numeric dataframe with a "Total" row.

    Rows flagged as missing data are removed first. If nothing is left at that
    point, or after the numeric conversion and dropna, the (empty) frame is
    returned as it is.
    """
    df=drop_invalid(check_missing_data_df(df))
    if df is None or df.empty:
        return df
    df.index=df["Comp"] #competitions become the row labels
    unused=["Age","Squad","Country","Matches","Comp"]
    if "LgRank" in df.columns:
        unused.append("LgRank")
    df=df.drop(unused,axis=1)
    df=clean_numbers(df).replace("","0").apply(pd.to_numeric).dropna()
    if df.empty:
        return df
    return add_total(df)

In [None]:
# function that gets all stats for a player with the previous functions and stores them in a dictionary for every player
# (9 df per player)
def player_stats(data,Season):
    """Return a dictionary {table name: dataframe} for one field player and `Season`.

    The table name is looked up in stats_dic from the position of the table in
    the page: position/6 if the position is a multiple of 6, otherwise
    position/5 if it is a multiple of 5. Tables at other positions are built
    but not stored.
    """
    tables_by_name={}
    all_tables=get_alpha(data)
    for ind in get_index_table(all_tables):
        table=get_df(all_tables,ind,Season)
        # a Series means the season has a single competition row
        build=df_single_comp if type(table) is pd.Series else df_mult_comp
        table=build(table)
        for stride in (6,5):
            if ind%stride==0:
                tables_by_name[stats_dic[ind//stride]]=table
                break
    return tables_by_name

In [None]:
# function that gets all stats for a GK with the previous functions and stores them in a dictionary for every GK (2 df per GK)
def gk_stats(data,Season):
    """Return a dictionary {table name: dataframe} for one goalkeeper and `Season`.

    The table name is looked up in gk_stats_dic from the position of the table
    in the page, using the first of the divisors 6, 5, 4 that divides the
    position. Tables at other positions are built but not stored.
    """
    tables_by_name={}
    all_tables=get_alpha(data)
    for ind in get_index_table_gk(all_tables):
        table=get_df(all_tables,ind,Season)
        # a Series means the season has a single competition row
        build=df_single_comp if type(table) is pd.Series else df_mult_comp
        table=build(table)
        for stride in (6,5,4):
            if ind%stride==0:
                tables_by_name[gk_stats_dic[ind//stride]]=table
                break
    return tables_by_name

In [None]:
# define the Season we want to look at and create one empty dictionary per position to collect the scraped data,
# plus one dictionary for the strong foot of every player
Season="2018-2019"
GK_dic,DF_dic,MF_dic,FW_dic={},{},{},{}
foot_dic={}

# Webscraping
Now that we have done all preparations, we can proceed with the actual webscraping:

In [None]:
#the three field positions are scraped the same way; each one collects into its own dictionary
field_dics={"DF":DF_dic,"MF":MF_dic,"FW":FW_dic}
for name,value in player_dic.items():#player (url) name and player id
    try:
        page="".join(["https://fbref.com/en/players/",value,"/all_comps/",name,"-Stats---All-Competitions"]) #get page
        html=requests.get(page).text #get webpage
        data=BeautifulSoup(html,'html5') #BeautifulSoup object from which we can get the data
        details=get_position(data) #part of the page where position and foot are listed
        position=details[10:12] #two-letter position code
        foot=get_foot(details) #strong foot of the player
        foot_dic[name]=foot
        #add the data of the player to the dictionary of the corresponding position
        if position=="GK":
            GK_dic[name]=gk_stats(data,Season)
        elif position in ("DF","MF","FW"):
            field_dics[position][name]=player_stats(data,Season)
        else:
            print(f"Invalid Structure for {name}")
    except KeyError:
        #raised for instance for Rodrygo, because he didn't play a 2018/2019 season --> skip that player
        print(f"Key Error for {name}")
        continue
    except IndexError:
        #raised once --> skip that player
        print (f"IndexError for {name}")
        continue

# Data Handling
Now that we have all the data, we need to bring it into a form we can work with:
## Delete missing data
We have some players for which data is missing. Thus, we need to delete those players from the dictionary.

In [None]:
#function that deletes those players from the dictionary that have data missing
def del_player_missing_data(dic):
    """Return a new dictionary with only the players that have at least 9 tables.

    `dic` itself is not modified.
    """
    return {player:tables for player,tables in dic.items() if len(tables)>=9}

In [None]:
#function that deletes those goalkeepers from the dictionary that have data missing
def del_player_missing_data_GK(dic):
    """Return a new dictionary with only the goalkeepers that have at least 2 tables.

    `dic` itself is not modified.
    """
    return {player:tables for player,tables in dic.items() if len(tables)>=2}

In [None]:
# list with all position codes to iterate through
positions_all="GK DF MF FW".split()

In [None]:
#delete those players who have data missing: goalkeepers are checked with the goalkeeper variant,
#the field positions with the general one (explicit assignments instead of exec-built statements)
GK_dic=del_player_missing_data_GK(GK_dic)
DF_dic=del_player_missing_data(DF_dic)
MF_dic=del_player_missing_data(MF_dic)
FW_dic=del_player_missing_data(FW_dic)

In [None]:
#delete players which have a None object instead of a dataframe in at least one case (because data is missing)
def del_None(dic):
    """Return a new dictionary without the players that have None stored for any table.

    `dic` itself is not modified.
    """
    return {
        player:tables
        for player,tables in dic.items()
        if not any(tables[stat] is None for stat in tables)
    }

In [None]:
#for every position, delete the players with missing (None) tables
#(explicit assignments instead of exec-built statements)
GK_dic=del_None(GK_dic)
DF_dic=del_None(DF_dic)
MF_dic=del_None(MF_dic)
FW_dic=del_None(FW_dic)

In [None]:
#delete players which have empty dataframes
def del_empty(dic):
    """Return a new dictionary without the players that have an empty table (`.empty` is true).

    `dic` itself is not modified.
    """
    return {
        player:tables
        for player,tables in dic.items()
        if not any(tables[stat].empty for stat in tables)
    }

In [None]:
#for every position, delete the players with empty dataframes
#(explicit assignments instead of exec-built statements)
GK_dic=del_empty(GK_dic)
DF_dic=del_empty(DF_dic)
MF_dic=del_empty(MF_dic)
FW_dic=del_empty(FW_dic)

## Data Transformation
For each table, we need the total data for each player for our analysis. Thus, we need to transform the data for each player and create a dataframe with the total value for each table for each player. We then need to add the total values to a dataframe with all players (in a specific position) so we can analyse the dataframe later. For each table, we create a function that cleans the data for that specific table. What the data values mean (for instance npxG+xA/90) can be viewed on this [website](https://fbref.com/en/players/dea698d9/all_comps/Cristiano-Ronaldo-Stats---All-Competitions) for each table *(This is just an explanatory link for one player. If you hover over the stat you are interested in, a text will pop up that explains what that stat means).*

In [None]:
def standard_transform(dic):
    """Return one row per player with the "Total" values of the "Standard" table.

    The columns at positions 11-15 and the last five columns are relabelled as
    per-90 columns, and those per-90 values of the "Total" row are recomputed
    from the totals divided by the total "90s". The tables stored in `dic` are
    modified in place (new labels, updated "Total" row).
    """
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Standard"]
        labels=list(table.columns)
        labels[11:16]=['Gls/90', 'Ast/90', 'G+A/90', 'G-PK/90', 'G+A-PK/90']
        labels[-5:]=['xG/90', 'xA/90', 'xG+xA/90', 'npxG/90', 'npxG+xA/90']
        table.columns=labels
        nineties=table.loc["Total","90s"]
        # plain quotients first ...
        per90={
            stat+"/90":table.loc["Total",stat]/nineties
            for stat in ("Gls","Ast","G-PK","xG","xA","npxG","npxG+xA")
        }
        # ... then the combined values, as sums of the per-90 values
        per90["G+A/90"]=per90["Gls/90"]+per90["Ast/90"]
        per90["G+A-PK/90"]=per90["G-PK/90"]+per90["Ast/90"]
        per90["xG+xA/90"]=per90["xG/90"]+per90["xA/90"]
        for column,value in per90.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def passing_transform(dic):
    """Return one row per player with the "Total" values of the "Passing" table.

    The table columns are relabelled and the completion percentages of the
    "Total" row are recomputed as completed/attempted*100. The tables stored in
    `dic` are modified in place.
    """
    labels=['90s','Cmp (Total)','Att (Total)', 'Cmp% (Total)', 'TotDist', 'PrgDist', 'Cmp (Short)', 'Att (Short)', 'Cmp% (Short)', 'Cmp (Medium)', 'Att (Medium)', 'Cmp% (Medium)', 'Cmp (Long)', 'Att (Long)', 'Cmp% (Long)', 'Ast', 'xA', 'A-xA', 'KP', '1/3', 'PPA', 'CrsPA','Prog']
    distances=("Short","Medium","Long","Total")
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Passing"]
        table.columns=labels
        rates={
            f"Cmp% ({distance})":table.loc["Total",f"Cmp ({distance})"]/table.loc["Total",f"Att ({distance})"]*100
            for distance in distances
        }
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def shooting_transform(dic):
    """Return one row per player with the "Total" values of the "Shooting" table.

    The rate columns of the "Total" row (SoT%, Sh/90, SoT/90, G/Sh, G/SoT,
    npxG/Sh) are recomputed from the totals. The tables stored in `dic` are
    modified in place.
    """
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Shooting"]
        nineties=table.loc["Total","90s"]
        on_target=table.loc["Total","SoT"]
        shots=table.loc["Total","Sh"]
        goals=table.loc["Total","Gls"]
        rates={
            "SoT%":on_target/shots*100,
            "Sh/90":shots/nineties,
            "SoT/90":on_target/nineties,
            "G/Sh":goals/shots,
            "G/SoT":goals/on_target,
            "npxG/Sh":table.loc["Total","npxG"]/shots,
        }
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def pass_types_transform(dic):
    """Return one row per player with the "Total" values of the "Pass_Types" table.

    The table columns are relabelled in place; no values are recomputed.
    """
    labels=['90s', 'Att', 'Live', 'Dead', 'FK', 'TB', 'Press', 'Sw', 'Crs', 'CK', 'In (CK)', 'Out (CK)', 'Str (CK)', 'Ground (Height)', 'Low (Height)', 'High (Hight)', 'Left Foot', 'Right Foot', 'Head', 'TI', 'Other Body Part', 'Cmp', 'Off', 'Out', 'Int', 'Blocks']
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Pass_Types"]
        table.columns=labels
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def gasc_transform(dic):
    """Return one row per player with the "Total" values of the "Goal_and_Shot_Creation" table.

    The table columns are relabelled and SCA90 / GCA90 of the "Total" row are
    recomputed as SCA and GCA divided by the total "90s". The tables stored in
    `dic` are modified in place.
    """
    labels=['90s', 'SCA', 'SCA90', 'PassLive (SCA)', 'PassDead (SCA)', 'Drib (SCA)', 'Sh (SCA)', 'Fld (SCA)', 'Def (SCA)', 'GCA', 'GCA90', 'PassLive (GCA)', 'PassDead (GCA)', 'Drib (GCA)', 'Sh (GCA)', 'Fld (GCA)', 'Def (GCA)']
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Goal_and_Shot_Creation"]
        table.columns=labels
        nineties=table.loc["Total","90s"]
        per90={action+"90":table.loc["Total",action]/nineties for action in ("SCA","GCA")}
        for column,value in per90.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def defensive_transform(dic):
    """Return one row per player with the "Total" values of the "Defensive_Actions" table.

    The table columns are relabelled and the two percentage columns of the
    "Total" row are recomputed as successes/attempts*100. The tables stored in
    `dic` are modified in place.
    """
    labels=['90s', 'Tkl (Total)', 'TklW (Total)', 'Def 3rd (Tkl)', 'Mid 3rd (Tkl)', 'Att 3rd (Tkl)', 'Tkl (vs. dribbles)', 'Att (vs. dribbles)', 'Tkl% (vs. dribbles)', 'Past (vs. dribbles)', 'Press', 'Succ (Press)', 'Succ % (Press)', 'Def 3rd (Press)', 'Mid 3rd (Press)', 'Att 3rd (Press)', 'Blocks', 'Sh', 'ShSv', 'Pass', 'Int', 'Tkl+Int', 'Clr', 'Err']
    # (percentage column, successes column, attempts column)
    ratios=(
        ("Tkl% (vs. dribbles)","Tkl (vs. dribbles)","Att (vs. dribbles)"),
        ("Succ % (Press)","Succ (Press)","Press"),
    )
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Defensive_Actions"]
        table.columns=labels
        rates={
            target:table.loc["Total",successes]/table.loc["Total",attempts]*100
            for target,successes,attempts in ratios
        }
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def possession_transform(dic):
    """Return one row per player with the "Total" values of the "Possession" table.

    The table columns are relabelled and the two percentage columns of the
    "Total" row are recomputed as successes/attempts*100. The tables stored in
    `dic` are modified in place.
    """
    labels=['90s', 'Touches', 'Def Pen (Touches)', 'Def 3rd (Touches)', 'Mid 3rd (Touches)', 'Att 3rd (Touches)', 'Att Pen (Touches)', 'Live (Touches)', 'Succ (Dribbles)', 'Att (Dribbles)', 'Succ% (Dribbles)', '#Pl (Dribbles)', 'Nutmegs', 'Carries', 'TotDist (Carries)', 'PrgDist (Carries)', 'Prog (Carries)', 'Fin 1/3 (Carries)', 'PA (Carries)', 'Mis (Carries)', 'Dis (Carries)', 'Target of pass', 'Rec', 'Rec%', 'Prog']
    # (percentage column, successes column, attempts column)
    ratios=(
        ("Succ% (Dribbles)","Succ (Dribbles)","Att (Dribbles)"),
        ("Rec%","Rec","Target of pass"),
    )
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Possession"]
        table.columns=labels
        rates={
            target:table.loc["Total",successes]/table.loc["Total",attempts]*100
            for target,successes,attempts in ratios
        }
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def playing_time_transform(dic):
    """Return one row per player with the "Total" values of the "Playing_Time" table.

    The "On-Off" columns are dropped. For tables with more than one competition
    row the averages of the "Total" row are rebuilt from the competition rows:
    Mn/Start weighted by Starts, Mn/Sub weighted by Subs and Min% weighted by
    Min. Mn/MP, +/-90 and xG+/-90 are recomputed from the totals. The tables
    stored in `dic` are modified in place.

    Fixes compared to the previous version: Mn/Sub was weighted by Starts
    instead of Subs; the xG value was stored under "+/-90" (overwriting the
    goal-based value) instead of "xG+/-90"; and a second run failed because
    "On-Off" had already been dropped.
    """
    df=pd.DataFrame()
    for player in list(dic.keys()):
        ddf=dic[player]["Playing_Time"]
        # errors="ignore" keeps the function re-runnable on tables that were already processed
        ddf.drop("On-Off",axis=1,inplace=True,errors="ignore")
        total={}
        nineties=ddf.loc["Total","90s"]
        if ddf.shape[0]>2:
            mn_start=0
            mn_sub=0
            min_percent=0
            # every row except the last one ("Total") is one competition
            for i in range(ddf.shape[0]-1):
                comp=ddf.iloc[i]
                min_percent=min_percent+comp["Min"]*comp["Min%"]
                mn_start=mn_start+comp["Mn/Start"]*comp["Starts"]
                mn_sub=mn_sub+comp["Mn/Sub"]*comp["Subs"]
            total["Mn/Start"]=mn_start/ddf.loc["Total","Starts"]
            total["Mn/Sub"]=mn_sub/ddf.loc["Total","Subs"]
            total["Min%"]=min_percent/ddf.loc["Total","Min"]
        total["Mn/MP"]=ddf.loc["Total","Min"]/ddf.loc["Total","MP"]
        total["+/-90"]=ddf.loc["Total","+/-"]/nineties
        total["xG+/-90"]=ddf.loc["Total","xG+/-"]/nineties
        for key in list(total.keys()):
            ddf.loc["Total",key]=total[key]
        df[player]=ddf.loc["Total"]
    return df.T

In [None]:
def miscellaneous_transform(dic):
    """Return one row per player with the "Total" values of the "Miscellaneous" table.

    The table columns are relabelled and the aerial-duel win percentage of the
    "Total" row is recomputed as won/(won+lost)*100. The tables stored in `dic`
    are modified in place.
    """
    labels=['90s', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs', 'Int', 'TklW', 'PKwon', 'PKcon', 'OG', 'Recov', 'Won (Aerial Duels)', 'Lost (Aerial Duels)', 'Won% (Aerial Duels)']
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Miscellaneous"]
        table.columns=labels
        duels=table.loc["Total","Won (Aerial Duels)"]+table.loc["Total","Lost (Aerial Duels)"]
        win_rate=table.loc["Total","Won (Aerial Duels)"]/duels*100
        table.loc["Total","Won% (Aerial Duels)"]=win_rate
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def gk_standard_transform(dic):
    """Return one row per goalkeeper with the "Total" values of the "Standard" table.

    The last column is relabelled "Save% (Penalties)" and the rate columns of
    the "Total" row (GA90, Save%, CS%, Save% (Penalties)) are recomputed from
    the totals. The tables stored in `dic` are modified in place.
    """
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Standard"]
        labels=list(table.columns)
        labels[-1]="Save% (Penalties)"
        table.columns=labels
        nineties=table.loc["Total","90s"]
        rates={
            "GA90":table.loc["Total","GA"]/nineties,
            "Save%":table.loc["Total","Saves"]/table.loc["Total","SoTA"]*100,
            "CS%":table.loc["Total","CS"]/table.loc["Total","MP"]*100,
            "Save% (Penalties)":table.loc["Total","PKsv"]/table.loc["Total","PKatt"]*100,
        }
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
def advanced_gk(dic):
    """Return one row per goalkeeper with the "Total" values of the "Advanced Goalkeeping" table.

    The table columns are relabelled and the rate columns of the "Total" row
    are recomputed from the totals. For tables with more than two rows,
    PSxG/SoT and the two "Pass Longer 40Y%" columns are rebuilt from the rows
    before the last one. The tables stored in `dic` are modified in place.
    """
    labels=['90s', 'GA', 'PKA', 'FKA', 'CKA', 'OGA', 'PSxG', 'PSxG/SoT', 'PSxG+/-', 'PSxG+/-/90', 'Cmp (Pass Longer 40Y)', 'Att (Pass Longer 40Y)', 'Cmp% (Pass Longer 40Y)', 'Att (Passes)', 'Att (Throws)', 'Pass Longer 40Y%', 'AvgLen (Pass)', 'Att (GK)', 'Pass Longer 40Y% (GK)', 'AvgLen (GK)', 'Opp (Crosses)', 'Stp (Crosses)', 'Stp% (Crosses)', '#OPA', '#OPA/90', 'AvgDist']
    summary=pd.DataFrame()
    for player,tables in dic.items():
        table=tables["Advanced Goalkeeping"]
        table.columns=labels
        nineties=table.loc["Total","90s"]
        rates={
            "PSxG+/-/90":table.loc["Total","PSxG+/-"]/nineties,
            "Cmp% (Pass Longer 40Y)":table.loc["Total","Cmp (Pass Longer 40Y)"]/table.loc["Total","Att (Pass Longer 40Y)"]*100,
            "Stp% (Crosses)":table.loc["Total","Stp (Crosses)"]/table.loc["Total","Opp (Crosses)"]*100,
            "#OPA/90":table.loc["Total","#OPA"]/nineties,
        }
        row_count=table.shape[0]
        if row_count>2:
            shots_faced=0
            launched_passes=0
            launched_goal_kicks=0
            # every row except the last one is accumulated
            for position in range(row_count-1):
                comp=table.iloc[position]
                shots_faced=shots_faced+comp["PSxG"]/comp["PSxG/SoT"]
                launched_passes=launched_passes+comp["Att (Passes)"]*comp["Pass Longer 40Y%"]
                launched_goal_kicks=launched_goal_kicks+comp["Att (GK)"]*comp["Pass Longer 40Y% (GK)"]
            rates["PSxG/SoT"]=table.loc["Total","PSxG"]/shots_faced
            rates["Pass Longer 40Y%"]=launched_passes/table.loc["Total","Att (Passes)"]
            rates["Pass Longer 40Y% (GK)"]=launched_goal_kicks/table.loc["Total","Att (GK)"]
        for column,value in rates.items():
            table.loc["Total",column]=value
        summary[player]=table.loc["Total"]
    return summary.T

In [None]:
#number of players left over all four positions
sum(len(position_dic) for position_dic in (FW_dic,GK_dic,DF_dic,MF_dic))

In [None]:
#Create the total dataframes for goalkeepers (one row per goalkeeper) with the functions created before;
#both calls also relabel and update the tables stored in GK_dic in place
GK_standard=gk_standard_transform(GK_dic)
GK_advanced=advanced_gk(GK_dic)

In [None]:
#Create list with the field positions to iterate through and one empty dictionary per position
#that will hold the dataframes to save, keyed by file name
positions="DF MF FW".split()
files_DF={}
files_MF={}
files_FW={}

In [None]:
#Iterate through every position and store one total dataframe per table in a module-level variable
#named "<position>_<table>" (for instance DF_standard); FileCreation looks these names up later.
#The names are built with a lookup table and globals() instead of exec-ing generated source code.
field_transforms=(
    ("standard",standard_transform),
    ("shooting",shooting_transform),
    ("passing",passing_transform),
    ("pass_types",pass_types_transform),
    ("gasc",gasc_transform),
    ("defensive",defensive_transform),
    ("possession",possession_transform),
    ("playing_time",playing_time_transform),
    ("miscellaneous",miscellaneous_transform),
)
for position in positions:
    position_dic=globals()[f"{position}_dic"] #e.g. DF_dic
    for table_name,transform in field_transforms:
        globals()[f"{position}_{table_name}"]=transform(position_dic)

# Save files
Now that we have the dataframes, we will save them as a csv file:

In [None]:
#Create function that creates the file name for every table and position
def FileCreation(position):
    """Register the nine total dataframes of `position` under their csv file names.

    Fills the module-level dictionary files_<position> with entries
    "<position>/<file>.csv" -> module-level dataframe <position>_<table>.
    The names are resolved through globals() instead of exec-ing generated
    source code; a missing name therefore raises KeyError.
    """
    # (file name without extension, suffix of the variable that holds the dataframe)
    tables=(
        ("standard","standard"),
        ("shooting","shooting"),
        ("passing","passing"),
        ("pass_types","pass_types"),
        ("Goals_and_Shooting_Creation","gasc"),
        ("defensive","defensive"),
        ("possession","possession"),
        ("playing_time","playing_time"),
        ("miscellaneous","miscellaneous"),
    )
    namespace=globals()
    files=namespace[f"files_{position}"]
    for file_stem,variable_suffix in tables:
        files[f"{position}/{file_stem}.csv"]=namespace[f"{position}_{variable_suffix}"]

In [None]:
#fill files_DF, files_MF and files_FW: FileCreation registers every dataframe of a position under its csv file name
for position in positions:
    FileCreation(position)

In [None]:
#save the GK stats in csv files
# NOTE(review): the paths point into a "GK" folder - presumably it has to exist already; confirm before running
GK_standard.to_csv("GK/GK_standard.csv")
GK_advanced.to_csv("GK/GK_advanced.csv")

In [None]:
#create list with the file dictionaries of all field positions (files_DF, files_MF, files_FW),
#looked up by name instead of exec-ing generated source code
field_files=[globals()[f"files_{position}"] for position in positions]

In [None]:
#save every dataframe under the path it was registered with
for files in field_files:
    for file,frame in files.items():
        frame.to_csv(file)

In [None]:
# Save dictionary with the strong foot of every player as pickle
import pickle
#the context manager closes the file even if pickling raises
with open("foot_dic.pkl","wb") as f:
    pickle.dump(foot_dic,f)