***import relevant packages***

In [8]:
import os
import pandas as pd
import numpy as np
# non-interactive plotting
import matplotlib.pyplot as plt
import seaborn as sns
# interactive plotting
import plotly.graph_objects as go

***some short helpful functions***

In [9]:
def strl(lst):
    """
    This function converts a list of numeric values into a list of string element-wise.
    """
    return [str(x) for x in lst]

def avg(lst):
    """
    This function calculates the mean of a list/series/etc.
    """
    return sum(lst)/len(lst)

# Attributes and stats datasets

## Function to import and format datasets

In [81]:
def imp_fmt_data(dir):
    """
    This function imports the attributes and statistics datasets and formats it adequately to work on it.
    To get the proper html raw data we specify the directory.
    """
    os.chdir(dir) # change directory
    # import attributes dataset
    attributes = pd.read_html(dir + "/squad_attributes.html")[0]
    # import stats dataset
    stats = pd.read_html(dir + "/squad_stats.html")[0]
    # convert unavailable data ('-') to NaN
    for df in (attributes,stats):
        df.replace('-',np.nan,inplace=True)
    # remove units ('km', '%') to treat as numeric data
    for cols in ['Distance','Dist/90']: # remove 'km'
        stats[cols] = stats[cols].str.replace('km','').astype('float')
    rm_pct = ['Gwin','Pens Saved Ratio','Pen/R','Cr C/A','Hdr %',
              'Pas %','Shot %','Tck R']
    for cols in rm_pct: #remove '%' and convert to decimal
        stats[cols] = stats[cols].str.replace('%','').astype('float')
        stats[cols] = stats[cols]/100
    # format App to get number of starts + number of substitute entries
    apps = stats['Apps']
    stats['Apps'] = [int(x[0:x.find(' (')])+int(x[(x.find(' (')+2):len(x)-1]) \
         if x.find(' (')!=-1 else x for x in apps]
    # format data to correct type
    ch_to_int = ['Age','AT Apps','AT Gls','AT Lge Apps','AT Lge Gls','Apps',
                 'Ast','Clean sheets','FA','Fls','D','Lost','G. Mis','Won',
                 'Gls','Conc','Int Ast','Int Conc','Mins','Last C','Last Gl',
                 'Pens','Pens Faced','Pens Saved','Pens S','PoM','Red',
                 'Starts','Tgls','Tcon','Yel','Hdrs A','ChC','Cr A','Cr C',
                 'Drb','Hdrs','Itc','K Hdrs','K Pas','K Tck','Gl Mst',
                 'Mstks','Off','Pas A','Ps C','Svh','Svp','Svt','Shots',
                 'ShT','Tck A','Tck W']
    for cols in stats.columns[4:len(stats.columns)]:
        stats[cols] = stats[cols].astype('float64')
    for cols in ch_to_int:
        stats[cols] = stats[cols].astype('Int16')
    # return attributes dataset and statistics dataset
    return [attributes,stats]

## Import raw datasets and construct each position's relevant statistics dataset

In [84]:
directory = "C:/Users/ybena/Documents/FM_Stat_Analysis"
# adjust directory depending on squad analysis or scouting analysis, season and quarter
directory = directory + ""

# import raw datasets
attr, stats = imp_fmt_data(directory)

# construct stats datasets for each task
mand_stats = ['Name','Age','Position','Club','Division','Mins','Apps','Starts']
def_1v1 = mand_stats + ['Fls','Tck A','Tck W','Tck R']
aer_def = mand_stats + ['Hdrs W/90','Aer A/90','Hdr %']
build_up = mand_stats + ['Ps A/90','Ps C/90','Pas %']
cuts_pass = mand_stats + ['Int/90']
cross = mand_stats + ['Cr C','Cr A','Asts/90','Drb','Cr C/A']
crea_chances = mand_stats + ['Ch C/90','Tgls/90','K Ps/90','Ps C/90','Asts/90']
drib_1v1 = mand_stats + ['Drb','Ch C/90','FA']
scoring = mand_stats + ['ShT/90','Shot/90','Gls/90','Shot %']
work_ethic = mand_stats + ['Dist/90']

def per90min(df,lst):
    """
    This function divides a set of statistics by 90mins played. The df should have a column named 'Mins'.
    """
    for x in lst:
        df.loc[:,x] = df.loc[:,x]*90/df['Mins']
        df.rename(columns={x:x+'/90'}, inplace=True)
    return df

def roundstat(df):
    """
    This function rounds all float values to 2 decimals.
    """
    for x in df.columns:
        if df.loc[:,x].dtype=='float64':
            df.loc[:,x] = round(df.loc[:,x],2)
    return df

def_1v1 = stats[def_1v1] # subset
def_1v1.insert(len(def_1v1.columns)-1, 'Fls/TckA', stats['Fls']/stats['Tck A']) # add stats
def_1v1 = per90min(def_1v1,['Fls','Tck A','Tck W']) # divide per 90min
def_1v1 = roundstat(def_1v1).dropna() # round and drop NaN

aer_def = stats[aer_def]
aer_def = roundstat(aer_def).dropna()

build_up = stats[build_up]
build_up = roundstat(build_up).dropna()

# cuts_pass = fmt_df(cuts_pass)

cross = stats[cross]
cross = per90min(cross,['Cr C','Cr A','Drb'])
cross = roundstat(cross).dropna()

crea_chances = stats[crea_chances]
crea_chances = roundstat(crea_chances).dropna()

drib_1v1 = stats[drib_1v1]
drib_1v1 = per90min(drib_1v1,['Drb','FA'])
drib_1v1 = roundstat(drib_1v1).dropna()

scoring = stats[scoring]
scoring.insert(len(scoring.columns),'Gls/ShT', stats['Gls/90']/stats['ShT/90'])
scoring = roundstat(scoring).dropna()

work_ethic = stats[work_ethic]
work_ethic.loc[:,'Mins'] = work_ethic.loc[:,'Mins'].div(90)
work_ethic.rename(columns={'Mins':'Mins/90'}, inplace=True)
work_ethic = roundstat(work_ethic).dropna()


#work_ethic.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


# Data visualization

## Functions to plot relevant data

In [95]:
def scatter_pointsize(starts):
    """
    This function determines the size of a point in our scatterplots based on the number of starts of the player.
    The more a player has started games, the more his point is bigger.
    """
    return [0.02 if x<10 else 0.04 if x>25 else 0.08 for x in starts]

def scatter_intplot(df,kpi_x,kpi_y,task,tg_ratio=0):
    """
    This function produces the interactive plots, based on two KPIs to compare and a target completion ratio if appropriate.
    The df should always include Name, Age, Position, Starts, Apps
    """
    # to avoid errors due to trying to plot NaN values, remove all rows containing at leat one NaN
    #df = df.dropna()
    # list containing the points description
    hovtext = ['Name: ' + i1 + '<br>'
              +'Age: ' + i2 + '<br>'
              +'Position: ' + i3 + '<br>'
              +'Apps: ' + i4 + '(' + i5 + ')' + '<br>'
              +'Club: ' + i6 + '<br>'
              +'Division: ' + i7 + '<br>'
              for i1,i2,i3,i4,i5,i6,i7
              in zip(df['Name'], strl(df['Age']), df['Position'],
                     strl(df['Starts']), strl(df['Apps']-df['Starts']),
                     df['Club'], df['Division'])
              ]
    # create a figure object to contain the plot
    fig = go.Figure()
    # add the main points
    fig.add_trace(go.Scatter(
        # KPIs
        x=df[kpi_x], y=df[kpi_y],
        mode='markers',
        # label the text boxes with useful info as Name, Position, Club, Division, Apps, Starts
        text=hovtext
    ))
    # Add 100% and target completion ratios if needed
    if tg_ratio != 0:
        cp = np.linspace(0,100,1000)
        fig.add_trace(go.Scatter(
            x=cp, y=cp,
            name="100% completion ratio"
        ))
        fig.add_trace(go.Scatter(
            x=cp, y=tg_ratio*cp,
            name=str(int(tg_ratio*100)) + "% completion ratio"
        ))
    # Add titles and labels
    fig.update_layout(title=task, xaxis_title=kpi_x, yaxis_title=kpi_y)
    fig.show()

## 1v1 defending

In [96]:
scatter_intplot(def_1v1,'Fls/90','Tck A/90','Comparing fouls and attempted tackles')

In [97]:
scatter_intplot(def_1v1,'Tck A/90','Tck W/90','Comparing attempted tackes with won tackes',0.7)

## Aerial defending

In [98]:
scatter_intplot(aer_def,'Aer A/90','Hdrs W/90','Comparing attempted aerial challenges with won headers',0.8)

## Build up

In [99]:
scatter_intplot(build_up,'Ps A/90','Ps C/90','Comparing attempted passes with completed passes',0.9)

## Interceptions

## Crossing

In [100]:
scatter_intplot(cross,'Cr A/90','Cr C/90','Comparing attempted crosses with completed crosses',0.4)

In [102]:
scatter_intplot(cross,'Asts/90','Cr C/90','Comparing completed crosses with assists')

In [103]:
scatter_intplot(cross,'Drb/90','Cr A/90','Comparing attempted crosses with dribbles made')

## Creating chances

In [105]:
scatter_intplot(crea_chances,'Ch C/90','Tgls/90','Comparing chances created with team goals scored')

In [108]:
scatter_intplot(crea_chances,'Ps C/90','K Ps/90','Comparing completed passes with key passes',0.03)

In [109]:
scatter_intplot(crea_chances,'Ps C/90','Asts/90','Comparing completed passes with assists')

## 1v1 dribbling

In [111]:
scatter_intplot(drib_1v1,'Drb/90','Ch C/90','Comparing chances created with dribbles made')

In [112]:
scatter_intplot(drib_1v1,'Drb/90','FA/90','Comparing fouls against with dribbles made')

## Scoring goals

In [114]:
scatter_intplot(scoring,'Shot/90','ShT/90','Comparing shots against with shots on target',0.5)

In [115]:
scatter_intplot(scoring,'ShT/90','Gls/90','Comparing shots on target against with goals',0.3)

## Work ethics

In [117]:
scatter_intplot(work_ethic,'Mins/90','Dist/90','Comparing average minutes played per game against average distance per game')