***import relevant packages***

In [2]:
import os
import pandas as pd
import numpy as np
# non-interactive plotting
import matplotlib.pyplot as plt
import seaborn as sns
# interactive plotting
import plotly.graph_objects as go

# Attributes and stats datasets

## Import datasets

In [3]:
def imp_fmt_data(dir):
    """
    This function imports the attributes and statistics datasets and formats it adequately to work on it.
    To get the proper html raw data we specify the directory.
    """
    os.chdir(dir) # change directory
    # import attributes dataset
    attributes = pd.read_html(dir + "/squad_attributes.html")[0]
    # import stats dataset
    stats = pd.read_html(dir + "/squad_stats.html")[0]
    # convert unavailable data ('-') to NaN
    for df in (attributes,stats):
        df.replace('-',np.nan,inplace=True)
    # remove units ('km', '%') to treat as numeric data
    for cols in ['Distance','Dist/90']: # remove 'km'
        stats[cols] = stats[cols].str.replace('km','').astype('float')
    rm_pct = ['Gwin','Pens Saved Ratio','Pen/R','Cr C/A','Hdr %',
              'Pas %','Shot %','Tck R']
    for cols in rm_pct: #remove '%' and convert to decimal
        stats[cols] = stats[cols].str.replace('%','').astype('float')
        stats[cols] = stats[cols]/100
    # format App to get number of starts + number of substitute entries
    apps = stats['Apps']
    stats['Apps'] = [int(x[0:x.find(' (')])+int(x[(x.find(' (')+2):len(x)-1]) \
         if x.find(' (')!=-1 else x for x in apps]
    # format data to correct type
    ch_to_int = ['Age','AT Apps','AT Gls','AT Lge Apps','AT Lge Gls','Apps',
                 'Ast','Clean sheets','FA','Fls','D','Lost','G. Mis','Won',
                 'Gls','Conc','Int Ast','Int Conc','Mins','Last C','Last Gl',
                 'Pens','Pens Faced','Pens Saved','Pens S','PoM','Red',
                 'Starts','Tgls','Tcon','Yel','Hdrs A','ChC','Cr A','Cr C',
                 'Drb','Hdrs','Itc','K Hdrs','K Pas','K Tck','Gl Mst',
                 'Mstks','Off','Pas A','Ps C','Svh','Svp','Svt','Shots',
                 'ShT','Tck A','Tck W']
    for cols in stats.columns[2:len(stats.columns)]:
        stats[cols] = stats[cols].astype('float16')
    for cols in ch_to_int:
        stats[cols] = stats[cols].astype('Int16')
    # return attributes dataset and statistics dataset
    return [attributes,stats]

## Import raw datasets and construct each position's dataset

In [17]:
directory = "C:/Users/ybena/Documents/FM_Stat_Analysis"
# adjust directory depending on squad analysis or scouting analysis, season and quarter
directory = directory + ""

# import raw datasets
attr, stats = imp_fmt_data(directory)

# construct stats datasets for each task
mand_stats = ['Name', 'Position','Age','Apps','Starts']
def_1v1 = mand_stats + ['Fls','K Tck','Tck W','Tck A','Tck R']
aer_def = mand_stats + ['Aer A/90','Hdrs W/90','Hdr %']
build_up = mand_stats + ['Ps A/90','Ps C/90','Pas %']
cuts_pass = mand_stats + ['Int/90']
cross = mand_stats + ['Cr A','Cr C','Cr C/A']
crea_chances = mand_stats + ['K Ps/90','Asts/90','Ps C/90']
drib_1v1 = mand_stats + ['Drb']
scoring = mand_stats + ['Shot/90','ShT/90','Gls/90','Gls']

def task_subset_df(df0,*args):
    #result = []
    for x in args:
        x = df0[x].dropna()
        #result.append(x)
    return x

def_1v1 = task_subset_df(stats,def_1v1)
aer_def = task_subset_df(stats,aer_def)
build_up = task_subset_df(stats,build_up)
cuts_pass = task_subset_df(stats,cuts_pass)
cross = task_subset_df(stats,cross)
crea_chances = task_subset_df(stats,crea_chances)
drib_1v1 = task_subset_df(stats,drib_1v1)
scoring = task_subset_df(stats,scoring)

Unnamed: 0,Name,Position,Age,Apps,Starts,Shot/90,ShT/90,Gls/90,Gls
2,Daniele Capelli,D (RC),33,24,22,0.5,0.409912,0.049988,1
3,Carlo Pelagatti,D (RC),31,25,25,0.320068,0.199951,0.040009,1
4,Matteo Lovato,D (C),20,9,7,0.409912,0.27002,0.140015,1
8,NicolÃ² Fazzi,"D/WB (RL), M (RC), AM (R)",25,29,20,3.099609,1.44043,0.160034,3
10,Manuel Nicoletti,"D (LC), WB (L)",21,26,24,0.379883,0.130005,0.040009,1


# Functions to visualize KPIs in interactive plots

In [5]:
def scatter_pointsize(starts):
    """
    This function determines the size of a point in our scatterplots based on the number of starts of the player.
    The more a player has started games, the more his point is bigger.
    """
    return [0.02 if x<10 else 0.04 if x>25 else 0.08 for x in starts]

def scatter_intplot(df,kpi_x,kpi_y,tg_ratio,task):
    """
    This function produces the interactive plots, based on two KPIs to compare and a target completion ratio if appropriate.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        # KPIs
        x=df[kpi_x], y=df[kpi_y],
        mode='markers',
        # label the text boxes with useful info as Name, Position, Club, Division, Apps, Starts
        text=df['Name']
    ))
    # Add 100% and target completion ratios
    cp = np.linspace(0,100,1000)
    fig.add_trace(go.Scatter(
        x=cp, y=cp,
        name="100% completion ratio"
    ))
    fig.add_trace(go.Scatter(
        x=cp, y=tg_ratio*cp,
        name=str(int(tg_ratio*100)) + "% completion ratio"
    ))
    # Add titles and labels
    fig.update_layout(title=task,
                     xaxis_title=kpi_x, yaxis_title=kpi_y)
    fig.show()