In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the 2021-2022 player statistics; the file is semicolon-delimited.
df = pd.read_csv(
    '2021-2022 Football Player Stats.csv',
    delimiter=';',
)

#### The dataset contains statistics from the top 5 leagues in Europe (Premier League/England, Ligue 1/France, Bundesliga/Germany, Serie A/Italy, La Liga/Spain) for the 2021-2022 season


#### All the statistics are either per 90 minutes or percentage

In [3]:
# Keep players with roughly 15 full matches of playing time
# (15 x 90 = 1350 minutes, rounded up to 1400).
# The comparison is strict: exactly 1400 minutes is excluded.
df = df.loc[df['Min'].gt(1400)]

In [4]:
# Split the data by position, beginning with the defenders:
# rows whose 'Pos' is DF, DFFW or DFMF.
defenders = df[
    df['Pos'].isin(['DF', 'DFFW', 'DFMF'])
]

### These are some of the stats we'd be interested in for defenders

PasTotCmp% : Pass completion percentage

TklWon : Tackles in which the tackler's team won possession of the ball

TklDriPast : Number of times dribbled past by an opposing player

Press% : Percentage of time the squad gained possession within five seconds of applying pressure

Blocks : Number of times blocking the ball by standing in its path

Int : Interceptions

Clr : Clearances

Err : Mistakes leading to an opponent's shot

Recov : Number of loose balls recovered

AerWon% : Percentage of aerials won

CarProg : Carries that move the ball towards the opponent's goal at least 5 yards, or any carry into the penalty area

PasProg : Completed passes that move the ball towards the opponent's goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area


In [5]:
# Separate the numeric features from the identifying player columns.
defender_features = defenders.loc[:, [
    'PasTotCmp%', 'TklWon', 'TklDriPast', 'Press%',
    'Blocks', 'Int', 'Clr', 'Err', 'Recov', 'AerWon%',
    'CarProg', 'PasProg',
]]
defender_info = defenders.loc[:, ['Player', 'Nation', 'Pos', 'Squad', 'Comp']]

In [6]:
# Drop the row labels left over from filtering and renumber both frames from 0.
defender_info = defender_info.reset_index(drop=True)
defender_features = defender_features.reset_index(drop=True)

In [7]:
# Place player info and features side by side in one frame.
defender_all = pd.concat(
    [defender_info, defender_features],
    axis='columns',
)

In [8]:
# Standardizer (zero mean, unit variance per column); defaults spelled out.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [9]:
# Learn the column means/variances, then standardize the same data.
defender_features_scaled = scaler.fit(defender_features).transform(defender_features)

In [10]:
# The scaler returns a bare array; wrap it again with the feature names.
defender_features_scaled = pd.DataFrame(
    data=defender_features_scaled,
    columns=defender_features.columns,
)

In [11]:
# Classic defender stats: tackles won and blocks.
# Cluster the standardized values into four groups for better visualization.
tmp_X = defender_features_scaled[['TklWon','Blocks']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run and does not
# depend on the installed scikit-learn default for n_init.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [14]:
# Plot the unscaled stats, one marker per defender, colored by cluster.
# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=defender_all,x='TklWon',y='Blocks',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'TklWon':'Tackles Won (per 90 min)',
                    'Blocks':'Blocks (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# The cluster ids carry no meaning of their own, so the legend stays hidden
# (this takes the place of hiding the continuous color bar).
fig.update_layout(showlegend=False)

fig.show()

### notable players

Players scoring the highest in both tackles and blocks:
- Danilo Soares at Bochum in the Bundesliga
- Petar Stojanovic at Empoli in Serie A
- Cristian Romero at Tottenham in the Premier League

Players with considerably higher number of blocks:
- Kevin Akpoguma at Hoffenheim in the Bundesliga
- Charlie Taylor at Burnley in the Premier League

Interestingly, Virgil van Dijk is at the very bottom left of the plot, meaning he has the lowest combined block and tackling numbers. However, he's a phenomenal defender and carries out his job in a different fashion by positioning himself well for interceptions and winning aerial duels.

In [107]:
# More 'modern' defenders: interceptions and successful pressing.
tmp_X = defender_features_scaled[['Int','Press%']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [109]:
# Interceptions against pressing success, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=defender_all,x='Int',y='Press%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Int':'Interceptions (per 90 min)',
                    'Press%':'Successful pressing (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Players scoring the highest in both categories
- Timo H&uuml;bers at K&ouml;ln in the Bundesliga
- Konstantinos Mavropanos at Stuttgart in the Bundesliga
- Gleison Bremer at Torino in Serie A

The player with by far the highest number of interceptions is Hassane Kamara at Watford in the Premier League.

In [110]:
# Ball-playing defenders: progressive carries and progressive passes.
tmp_X = defender_features_scaled[['CarProg','PasProg']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [111]:
# Progressive carries against progressive passes, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=defender_all,x='CarProg',y='PasProg',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'CarProg':'Progressive carries (per 90 min)',
                    'PasProg':'Progressive passes (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Players scoring the highest in both categories
- Alphonso Davies at Bayern Munich in the Bundesliga
- Aymeric Laporte at Manchester City in the Premier League
- João Cancelo at Manchester City in the Premier League
- Jo&euml;l Matip at Liverpool in the Premier League

Players with the highest amount of progressive passes
- Trent Alexander-Arnold at Liverpool in the Premier League
- Jordi Alba at Barcelona in La Liga

In [112]:
# Error proneness: errors leading to a shot and times dribbled past.
tmp_X = defender_features_scaled[['Err','TklDriPast']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [113]:
# Errors leading to a shot against times dribbled past, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=defender_all,x='Err',y='TklDriPast',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Err':'Errors leading to a shot (per 90 min)',
                    'TklDriPast':'Dribbled past (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

This plot turned out to be interesting. I was not expecting so many players to have zero errors leading to a shot. Let's then take a look at the extremes for each category.

The player that has the most errors leading to a shot is Rafael Czichos at K&ouml;ln in the Bundesliga. Don't worry Rafael, I think you are fully capable of improving!

The player that is dribbled past the most luckily has zero errors leading to a shot. That player is Petar Stojanović at Empoli in Serie A.

In [114]:
# Look for the best all-round defenders by aggregating every statistic.
#
# Work on a copy and flip the sign of the two "bad" stats so that a larger
# value always means a better performance.
defender_score = defender_features.copy()
defender_score['TklDriPast'] = defender_score['TklDriPast'].mul(-1)
defender_score['Err'] = defender_score['Err'].mul(-1)
defender_score.head()

Unnamed: 0,PasTotCmp%,TklWon,TklDriPast,Press%,Blocks,Int,Clr,Err,Recov,AerWon%,CarProg,PasProg
0,75.5,1.16,-0.66,26.0,2.69,1.75,2.19,-0.0,5.53,22.7,5.44,2.94
1,82.4,1.39,-0.42,35.9,1.87,3.11,3.2,-0.0,6.77,59.8,2.75,2.72
2,90.3,0.43,-0.39,29.1,1.7,1.7,4.08,-0.04,10.7,64.0,3.48,3.12
3,79.4,1.55,-0.56,34.0,1.43,1.24,1.2,-0.0,8.92,47.1,6.37,4.9
4,71.4,1.28,-1.17,25.1,1.83,1.39,2.44,-0.0,8.33,46.6,2.61,3.06


In [115]:
# Standardize the sign-adjusted stats with a fresh scaler and keep the
# original column names.
scaler = StandardScaler()
defender_score_scaled = pd.DataFrame(
    scaler.fit_transform(defender_score),
    columns=defender_score.columns,
)

In [116]:
# Preview the first five standardized rows.
defender_score_scaled.head(n=5)

Unnamed: 0,PasTotCmp%,TklWon,TklDriPast,Press%,Blocks,Int,Clr,Err,Recov,AerWon%,CarProg,PasProg
0,-0.852621,0.125736,0.269613,-1.646764,2.029583,-0.442302,-0.697968,0.800803,-1.951506,-2.804576,0.937015,-0.223639
1,0.168634,0.672501,0.875444,0.486694,0.147528,1.856719,-0.017453,0.800803,-1.19922,0.312186,-0.432277,-0.393603
2,1.337897,-1.609649,0.951173,-0.978711,-0.242654,-0.526825,0.57547,-0.250746,1.18504,0.665027,-0.060685,-0.084578
3,-0.27539,1.052859,0.522042,0.077243,-0.862355,-1.304435,-1.365006,0.800803,0.105146,-0.754738,1.410413,1.29058
4,-1.459454,0.411005,-1.017778,-1.840715,0.055721,-1.050867,-0.529523,0.800803,-0.252796,-0.796743,-0.503541,-0.130932


In [117]:
# Collapse the features to two dimensions:
# the (sign-flipped) negative stats on one axis, the positive stats on the other.
defender_score_scaled['neg_stats'] = (
    defender_score_scaled.loc[:, ['TklDriPast', 'Err']].sum(axis='columns')
)
defender_score_scaled['pos_stats'] = (
    defender_score_scaled.loc[:, [
        'PasTotCmp%', 'TklWon', 'Press%', 'Blocks', 'Int',
        'Clr', 'Recov', 'AerWon%', 'CarProg', 'PasProg',
    ]].sum(axis='columns')
)

In [118]:
# Cluster the defenders on the two aggregated scores.
tmp_X = defender_score_scaled[['pos_stats','neg_stats']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [119]:
# Store the cluster id next to the aggregated scores ...
defender_score_scaled['label'] = labels

# ... and join the scores and cluster id onto the player info frame.
defender_score_info = pd.concat(
    [
        defender_info,
        defender_score_scaled[['pos_stats', 'neg_stats', 'label']],
    ],
    axis='columns',
)

In [120]:
# Aggregated positive against aggregated (sign-flipped) negative score,
# colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=defender_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

According to the aggregation, Jose Luis Palomino at Atalanta in Serie A is the best performing defender

Notable players in the top performing cluster
- Jo&euml;l Matip at Liverpool in the Premier League
- Benjamin Pavard at Bayern Munich in the Bundesliga
- Niklas S&uuml;le at Bayern Munich in the Bundesliga 
- Gerard Piqué at Barcelona in La Liga


In [121]:
# Same defenders, now with fullback / progression-oriented features.
# Only the feature frame is rebuilt here.
fullback_features = defenders.loc[:, [
    'PasTotCmp%', 'TklDriPast', 'Press%',
    'Int', 'Err', 'CarProg', 'PasProg',
    'Pas3rd', 'Car3rd', 'PasAss', 'SCA', 'DriSucc%',
]]

In [122]:
# Drop the row labels left over from filtering and renumber from 0.
fullback_features = fullback_features.reset_index(drop=True)

In [123]:
# Place player info and fullback features side by side in one frame.
fullback_all = pd.concat(
    [defender_info, fullback_features],
    axis='columns',
)

In [124]:
# Fresh standardizer for the fullback features; defaults spelled out.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [125]:
# Learn the column means/variances, then standardize the same data.
fullback_features_scaled = scaler.fit(fullback_features).transform(fullback_features)

In [126]:
# The scaler returns a bare array; wrap it again with the feature names.
fullback_features_scaled = pd.DataFrame(
    data=fullback_features_scaled,
    columns=fullback_features.columns,
)

In [127]:
# Accurate and progressive passing.
tmp_X = fullback_features_scaled[['PasTotCmp%','PasProg']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [128]:
# Progressive passes against passing accuracy, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=fullback_all,x='PasProg',y='PasTotCmp%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'PasProg':'Progressive passes (per 90 min)',
                    'PasTotCmp%':'Passing accuracy (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Top performing players in this category
- Trent Alexander-Arnold at Liverpool in the Premier League
- Jordi Alba at Barcelona in La Liga
- João Cancelo at Manchester City in the Premier League
- Andrew Robertson at Liverpool in the Premier League

In [129]:
# Goal threats: passes into the final third of the pitch and
# shot-creating actions.
tmp_X = fullback_features_scaled[['Pas3rd','SCA']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [130]:
# Passes into the final third against shot-creating actions, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=fullback_all,x='Pas3rd',y='SCA',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Pas3rd':'Passes into final third (per 90 min)',
                    'SCA':'Shot-creating actions (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Interestingly, these two statistics seem to branch off from each other, meaning that players are either passing the ball into the final third, or making shot-creating actions. The shot-creating action players therefore seem more likely to receive the ball in the final third, while the players passing it to the final third will receive the ball further back and therefore create fewer shot-creating actions.

Players passing it the most into the final third
- Aymeric Laporte at Manchester City in the Premier League
- Kyle Walker at Manchester City in the Premier League
- Jordi Alba at Barcelona in La Liga
- João Cancelo at Manchester City in the Premier League

Players making shot-creating actions
- Trent Alexander-Arnold at Liverpool in the Premier League
- Andrew Robertson at Liverpool in the Premier League
- Federico Dimarco at Inter in Serie A
- Filip Kostić at Frankfurt in the Bundesliga

Here we see a clear difference between the fullbacks playing at Liverpool and Manchester City. Based on this, we can assume that City fullbacks receive the ball further back on the pitch, starting the build-up to an attack, while Liverpool fullbacks are much further up the pitch, where they deliver crosses or dribble into the opponent's penalty area.



In [131]:
# Fancy-pants: carries into the final third of the pitch and
# successful dribbles.
tmp_X = fullback_features_scaled[['Car3rd','DriSucc%']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [132]:
# Carries into the final third against dribble success, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=fullback_all,x='Car3rd',y='DriSucc%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Car3rd':'Carries into the final third (per 90 min)',
                    'DriSucc%':'Successful dribbles (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

In this category, the best performing players are Alphonso Davies at Bayern Munich in the Bundesliga and João Cancelo at Manchester City in the Premier League. Surprisingly (or not surprisingly if you've watched Liverpool play in recent years), Jo&euml;l Matip at Liverpool has a very high successful dribble rate and plenty of carries into the final third while playing as a centre-back.

In [133]:
# Look for the best attack/progressive-minded defenders.
#
# Work on a copy and flip the sign of the two "bad" stats so that a larger
# value always means a better performance.
fullback_score = fullback_features.copy()
fullback_score['TklDriPast'] = fullback_score['TklDriPast'].mul(-1)
fullback_score['Err'] = fullback_score['Err'].mul(-1)
fullback_score.head()

Unnamed: 0,PasTotCmp%,TklDriPast,Press%,Int,Err,CarProg,PasProg,Pas3rd,Car3rd,PasAss,SCA,DriSucc%
0,75.5,-0.66,26.0,1.75,-0.0,5.44,2.94,1.56,1.66,0.59,1.19,42.3
1,82.4,-0.42,35.9,3.11,-0.0,2.75,2.72,2.45,0.73,0.24,0.63,72.7
2,90.3,-0.39,29.1,1.7,-0.04,3.48,3.12,4.33,0.71,0.21,0.64,75.0
3,79.4,-0.56,34.0,1.24,-0.0,6.37,4.9,4.1,2.03,1.63,2.63,69.2
4,71.4,-1.17,25.1,1.39,-0.0,2.61,3.06,2.78,1.0,0.78,1.44,73.7


In [134]:
# Standardize the sign-adjusted stats with a fresh scaler and keep the
# original column names.
scaler = StandardScaler()
fullback_score_scaled = pd.DataFrame(
    scaler.fit_transform(fullback_score),
    columns=fullback_score.columns,
)

In [135]:
# Sum the (sign-flipped) negative stats and the positive stats per player.
fullback_score_scaled['neg_stats'] = (
    fullback_score_scaled.loc[:, ['TklDriPast', 'Err']].sum(axis='columns')
)
fullback_score_scaled['pos_stats'] = (
    fullback_score_scaled.loc[:, [
        'PasTotCmp%', 'Press%', 'Int', 'CarProg', 'PasProg',
        'Pas3rd', 'Car3rd', 'PasAss', 'SCA', 'DriSucc%',
    ]].sum(axis='columns')
)

In [136]:
# Cluster the defenders on the two aggregated fullback scores.
tmp_X = fullback_score_scaled[['pos_stats','neg_stats']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [137]:
# Store the cluster id next to the aggregated scores ...
fullback_score_scaled['label'] = labels

# ... and join the scores and cluster id onto the player info frame.
fullback_score_info = pd.concat(
    [
        defender_info,
        fullback_score_scaled[['pos_stats', 'neg_stats', 'label']],
    ],
    axis='columns',
)

In [138]:
# Aggregated positive against aggregated (sign-flipped) negative score,
# colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=fullback_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Bayern Munich, Liverpool, and Manchester City are the clear winners in this category. Their footballing philosophy clearly involves a lot of progressive actions from their defenders and fullbacks. From these teams the following players are in the top performing cluster

- Bayern Munich
    - Alphonso Davies
    - Benjamin Pavard
    - Dayot Upamecano
    - Lucas Hernandez
    - Niklas S&uuml;le
- Liverpool
    - Trent Alexander-Arnold
    - Jo&euml;l Matip
    - Andrew Robertson
- Manchester City
    - João Cancelo
    - Aymeric Laporte
    - Kyle Walker
    - Rúben Dias
    
Another notable player from the top performing cluster is Marc Cucurella at Brighton in the Premier League. Chelsea clearly noticed his quality and bought him on August 5 2022 for 65 million euros.

### These are some of the stats we'd be interested in for midfielders


SoT% : Shots on target percentage (Does not include penalty kicks)

G/Sh : Goals per shot

ShoDist : Average distance, in yards, from goal of all shots taken (Does not include penalty kicks)

PasTotCmp% : Pass completion percentage

PasLonCmp% : Pass completion percentage (Passes longer than 30 yards)

PasAss : Passes that directly lead to a shot (assisted shots)

Pas3rd : Completed passes that enter the 1/3 of the pitch closest to the goal

PPA : Completed passes into the 18-yard box

PasProg : Completed passes that move the ball towards the opponent's goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area

SCA : Shot-creating actions

TklWon : Tackles in which the tackler's team won possession of the ball

TklDriPast : Number of times dribbled past by an opposing player

Press% : Percentage of time the squad gained possession within five seconds of applying pressure

Int : Interceptions

Err : Mistakes leading to an opponent's shot

DriSucc% : Percentage of dribbles completed successfully

CarProg : Carries that move the ball towards the opponent's goal at least 5 yards, or any carry into the penalty area

Car3rd : Carries that enter the 1/3 of the pitch closest to the goal

CPA : Carries into the 18-yard box

CarMis : Number of times a player failed when attempting to gain control of a ball

CarDis : Number of times a player loses control of the ball after being tackled by an opposing player

Rec% : Percentage of time a player successfully received a pass


In [139]:
# All midfielders: rows whose 'Pos' is MF, MFFW or MFDF.
midfielders = df[
    df['Pos'].isin(['MF', 'MFFW', 'MFDF'])
]

# The midfielders are examined through three feature sets:
#   - defensive:  deeper-lying midfielders who tackle and play
#                 progressive passes
#   - box-to-box: midfielders who play assist passes and are a threat
#                 around the opponent's penalty area
#   - attacking:  midfielders who shoot, dribble, and are a danger in
#                 the opponent's penalty area

# Defensive midfielders.
mid_def_features = midfielders.loc[:, [
    'PasTotCmp%', 'PasProg', 'TklWon',
    'TklDriPast', 'Press%', 'Int',
    'Err', 'CarMis', 'CarDis',
]]

# Box-to-box midfielders.
mid_b2b_features = midfielders.loc[:, [
    'PasTotCmp%', 'PasLonCmp%', 'PasAss',
    'Pas3rd', 'PPA', 'PasProg', 'SCA', 'TklDriPast',
    'Err', 'CarProg', 'CarMis', 'CarDis',
]]

# Attacking midfielders.
mid_att_features = midfielders.loc[:, [
    'SoT%', 'G/Sh', 'PasTotCmp%', 'PasAss',
    'Pas3rd', 'PPA', 'SCA', 'Err', 'DriSucc%',
    'CarProg', 'Car3rd', 'CPA', 'CarMis', 'CarDis', 'Rec%',
]]

# Identifying player columns.
mid_info = midfielders.loc[:, ['Player', 'Nation', 'Pos', 'Squad', 'Comp']]

In [140]:
# Drop the row labels left over from filtering and renumber every frame from 0.
mid_info = mid_info.reset_index(drop=True)
mid_def_features = mid_def_features.reset_index(drop=True)
mid_b2b_features = mid_b2b_features.reset_index(drop=True)
mid_att_features = mid_att_features.reset_index(drop=True)

# Player info next to each feature set (unscaled values, used for plotting).
mid_def_all = pd.concat([mid_info, mid_def_features], axis='columns')
mid_b2b_all = pd.concat([mid_info, mid_b2b_features], axis='columns')
mid_att_all = pd.concat([mid_info, mid_att_features], axis='columns')

# Standardize each feature set with its own freshly fitted scaler.
scaler = StandardScaler()
mid_def_features_scaled = pd.DataFrame(
    scaler.fit_transform(mid_def_features),
    columns=mid_def_features.columns,
)

scaler = StandardScaler()
mid_b2b_features_scaled = pd.DataFrame(
    scaler.fit_transform(mid_b2b_features),
    columns=mid_b2b_features.columns,
)

scaler = StandardScaler()
mid_att_features_scaled = pd.DataFrame(
    scaler.fit_transform(mid_att_features),
    columns=mid_att_features.columns,
)


In [141]:
# Defensively sound: tackles won and interceptions.
tmp_X = mid_def_features_scaled[['TklWon','Int']]
# Pin the seed and the number of restarts so the clusters are reproducible.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

In [142]:
# Tackles won against interceptions, colored by cluster.
# Cluster ids are categorical: pass them as strings with a discrete palette.
fig = px.scatter(data_frame=mid_def_all,x='TklWon',y='Int',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'TklWon':'Tackles won (per 90 min)',
                    'Int':'Interceptions (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Some of the best performing players in this category
- Jean Onana at Bordeaux in Ligue 1
- Wilfred Ndidi at Leicester in the Premier League
- Aurélien Tchouaméni at Monaco in Ligue 1

In [143]:
# Passers: progressive passes and accurate passing.
tmp_X = mid_def_features_scaled[['PasProg','PasTotCmp%']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_def_all,x='PasProg',y='PasTotCmp%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'PasProg':'Progressive passes (per 90 min)',
                    'PasTotCmp%':'Passing accuracy (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

In this category there are three players that clearly stand out from the rest
- Thiago Alcântara at Liverpool in the Premier League
- Joshua Kimmich at Bayern Munich in the Bundesliga
- Toni Kroos at Real Madrid in La Liga

In [144]:
# Penalty box passers: passes into the opponent's penalty area and
# passing accuracy.
tmp_X = mid_b2b_features_scaled[['PPA','PasTotCmp%']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_b2b_all,x='PPA',y='PasTotCmp%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'PPA':'Passes into penalty area (per 90 min)',
                    'PasTotCmp%':'Passing accuracy (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Best performing players in this category
- Kevin De Bruyne at Manchester City in the Premier League
- Nicolo Barella at Inter in Serie A
- David Silva at Real Sociedad in La Liga
- Marco Verratti at PSG in Ligue 1

To no one's surprise, Kevin De Bruyne leads by quite a margin in passing into the opponent's penalty area.

In [145]:
# Progressive carriers: passes into the opponent's third and
# progressive carries.
tmp_X = mid_b2b_features_scaled[['Pas3rd','CarProg']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_b2b_all,x='Pas3rd',y='CarProg',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Pas3rd':'Passing into final third (per 90 min)',
                    'CarProg':'Progressive carries (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

In this category there are two players in a league of their own
- Thiago Alcântara at Liverpool in the Premier League
- Toni Kroos at Real Madrid in La Liga

The player with the most progressive carries is Bernardo Silva at Manchester City in the Premier League

In [146]:
# Carriers and creators: shot-creating actions and carries into the
# opponent's penalty area.
tmp_X = mid_att_features_scaled[['SCA','CPA']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_att_all,x='SCA',y='CPA',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'SCA':'Shot-creating actions (per 90 min)',
                    'CPA':'Carries into the penalty area (per 90 min)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Two players are well above the rest in this category
- Nabil Fekir at Real Betis in La Liga
- Franck Ribéry at Salernitana in Serie A

Kevin De Bruyne leads in shot-creating actions 

In [147]:
# Reliable dribblers: successfully received passes and successful dribbles.
tmp_X = mid_att_features_scaled[['Rec%','DriSucc%']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_att_all,x='Rec%',y='DriSucc%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Rec%':'Passes received successfully (%)',
                    'DriSucc%':'Successful dribbles (%)',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Top performing players in this category
- Thiago Mendes at Lyon in Ligue 1
- Sergio Busquets at Barcelona in La Liga
- Stanislav Lobotka at Napoli in Serie A
- Salva Sevilla at Mallorca in La Liga
- Azor Matusiwa at Reims in Ligue 1

In [66]:
# Display the defensive-midfielder feature names before building the score.
mid_def_features.columns

Index(['PasTotCmp%', 'PasProg', 'TklWon', 'TklDriPast', 'Press%', 'Int', 'Err',
       'CarMis', 'CarDis'],
      dtype='object')

In [148]:
# Look for the best defense-minded midfielders.
#
# Work on a copy and flip the sign of the four "bad" stats so that a larger
# value always means a better performance.
mid_def_score = mid_def_features.copy()
mid_def_score['TklDriPast'] = mid_def_score['TklDriPast'].mul(-1)
mid_def_score['Err'] = mid_def_score['Err'].mul(-1)
mid_def_score['CarMis'] = mid_def_score['CarMis'].mul(-1)
mid_def_score['CarDis'] = mid_def_score['CarDis'].mul(-1)
mid_def_score.head()

Unnamed: 0,PasTotCmp%,PasProg,TklWon,TklDriPast,Press%,Int,Err,CarMis,CarDis
0,91.7,2.96,1.24,-1.46,27.9,1.86,-0.0,-0.84,-1.46
1,81.6,4.18,2.23,-3.17,28.2,2.56,-0.0,-0.85,-1.46
2,73.6,5.9,1.35,-2.07,29.1,2.15,-0.04,-1.83,-2.59
3,84.4,3.75,2.02,-0.63,31.8,2.69,-0.0,-0.43,-1.01
4,71.4,2.47,0.95,-0.76,32.7,1.71,-0.0,-0.63,-0.82


In [149]:
# Standardize the sign-adjusted stats with a fresh scaler and keep the
# original column names.
scaler = StandardScaler()
mid_def_score_scaled = pd.DataFrame(
    scaler.fit_transform(mid_def_score),
    columns=mid_def_score.columns,
)

In [150]:
# Sum the (sign-flipped) negative stats and the positive stats per player.
mid_def_score_scaled['neg_stats'] = (
    mid_def_score_scaled.loc[:, [
        'TklDriPast', 'Err', 'CarMis', 'CarDis',
    ]].sum(axis='columns')
)
mid_def_score_scaled['pos_stats'] = (
    mid_def_score_scaled.loc[:, [
        'PasTotCmp%', 'Press%', 'Int', 'PasProg', 'TklWon',
    ]].sum(axis='columns')
)

In [151]:
# Cluster the midfielders on the two aggregated defensive scores.
tmp_X = mid_def_score_scaled[['pos_stats','neg_stats']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

mid_def_score_scaled['label'] = labels

# Join the aggregated scores and cluster id onto the player info frame.
mid_def_score_info = pd.concat([mid_info,mid_def_score_scaled[['pos_stats',
                                                                      'neg_stats',
                                                                      'label']]],axis=1)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_def_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Best deep-lying/defensive midfielders based on aggregate scoring
- Thiago Alcântara at Liverpool in the Premier League
- Aurélien Tchouaméni at Monaco in Ligue 1
- Cheick Doucouré at Lens in Ligue 1

In [72]:
# Display the box-to-box feature names before building the score.
mid_b2b_features.columns

Index(['PasTotCmp%', 'PasLonCmp%', 'PasAss', 'Pas3rd', 'PPA', 'PasProg', 'SCA',
       'TklDriPast', 'Err', 'CarProg', 'CarMis', 'CarDis'],
      dtype='object')

In [152]:
# Look for the best box-to-box midfielders.

# Work on a copy and flip the sign of the four "bad" stats so that a larger
# value always means a better performance.
mid_b2b_score = mid_b2b_features.copy()
mid_b2b_score['TklDriPast'] = -mid_b2b_score['TklDriPast']
mid_b2b_score['Err'] = -mid_b2b_score['Err']
mid_b2b_score['CarMis'] = -mid_b2b_score['CarMis']
mid_b2b_score['CarDis'] = -mid_b2b_score['CarDis']

# Standardize the sign-adjusted stats with a fresh scaler.
scaler = StandardScaler()
mid_b2b_score_scaled = scaler.fit_transform(mid_b2b_score)
mid_b2b_score_scaled = pd.DataFrame(mid_b2b_score_scaled,columns=mid_b2b_score.columns)

# Sum the (sign-flipped) negative stats and the positive stats per player.
mid_b2b_score_scaled['neg_stats'] = mid_b2b_score_scaled[['TklDriPast','Err',
                                                           'CarMis','CarDis']].sum(axis=1)
mid_b2b_score_scaled['pos_stats'] = mid_b2b_score_scaled[['PasTotCmp%','PasLonCmp%',
                                                           'PasAss','Pas3rd',
                                                           'PPA','PasProg','SCA',
                                                           'CarProg']].sum(axis=1)

tmp_X = mid_b2b_score_scaled[['pos_stats','neg_stats']]
# k-means starts from random centroids; pin the seed and the number of
# restarts so the cluster assignment is the same on every run.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(tmp_X)

mid_b2b_score_scaled['label'] = labels

# Join the aggregated scores and cluster id onto the player info frame.
mid_b2b_score_info = pd.concat([mid_info,mid_b2b_score_scaled[['pos_stats',
                                                                      'neg_stats',
                                                                      'label']]],axis=1)

# Cluster ids are categories, not magnitudes, so they are passed as strings
# with a discrete palette rather than interpolated on a continuous scale.
fig = px.scatter(data_frame=mid_b2b_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats',
                    'color':'Cluster'
                })

fig.update_traces(marker={'size':5})
# Hide the legend; the cluster ids themselves are arbitrary.
fig.update_layout(showlegend=False)

fig.show()

### notable players

Best players in this category based on aggregate scoring
- Toni Kroos at Real Madrid in La Liga
- Thiago Alcântara at Liverpool in the Premier League
- Joshua Kimmich at Bayern Munich in the Bundesliga
- Marco Verratti at PSG in Ligue 1
- Kevin De Bruyne at Manchester City in the Premier League

In [74]:
# list the columns of mid_att_features (the frame the aggregate score below is built from)
mid_att_features.columns

Index(['SoT%', 'G/Sh', 'PasTotCmp%', 'PasAss', 'Pas3rd', 'PPA', 'SCA', 'Err',
       'DriSucc%', 'CarProg', 'Car3rd', 'CPA', 'CarMis', 'CarDis', 'Rec%'],
      dtype='object')

In [153]:
# let's see if we can find out the best-of-the-best attacking midfielders 

# columns where a higher raw value is worse / better for the player
neg_cols = ['Err','CarMis','CarDis']
pos_cols = ['SoT%','G/Sh','PasTotCmp%','PasAss',
            'Pas3rd','PPA','SCA','DriSucc%','CarProg',
            'Car3rd','CPA','Rec%']

# first we need to negate the negative stats so they don't count
# as positive features
mid_att_score = mid_att_features.copy()
mid_att_score[neg_cols] = -mid_att_score[neg_cols]

# make a new scaler object and scale our data
scaler = StandardScaler()
mid_att_score_scaled = scaler.fit_transform(mid_att_score)
mid_att_score_scaled = pd.DataFrame(mid_att_score_scaled,columns=mid_att_score.columns)

# because the negative stats were negated above, a higher 'neg_stats'
# value means fewer mistakes (i.e. higher is better on both axes)
mid_att_score_scaled['neg_stats'] = mid_att_score_scaled[neg_cols].sum(axis=1)
mid_att_score_scaled['pos_stats'] = mid_att_score_scaled[pos_cols].sum(axis=1)

# random_state/n_init are pinned so the clusters are reproducible across runs
tmp_X = mid_att_score_scaled[['pos_stats','neg_stats']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

mid_att_score_scaled['label'] = labels

# NOTE(review): assumes mid_info has a 0..n-1 index aligned with the scaled
# frame - confirm against the cell that builds it
mid_att_score_info = pd.concat([mid_info,mid_att_score_scaled[['pos_stats',
                                                                      'neg_stats',
                                                                      'label']]],axis=1)

# cluster ids are categories: pass them as strings so plotly assigns discrete
# colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=mid_att_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Best performing players based on aggregate score
- Kevin De Bruyne at Manchester City in the Premier League
- Marco Verratti at PSG in Ligue 1
- Bernardo Silva at Manchester City in the Premier League
- Luis Alberto at Lazio in Serie A

### These are some of the stats we'd be interested in for forwards

SoT% : Shots on target percentage (Does not include penalty kicks)

G/Sh : Goals per shot

PasTotCmp% : Pass completion percentage

PasAss : Passes that directly lead to a shot (assisted shots)

PPA : Completed passes into the 18-yard box

SCA : Shot-creating actions

TklAtt3rd : Tackles in attacking 1/3

PresAtt3rd : Number of times applying pressure to opposing player who is receiving, carrying or releasing the ball, in the attacking 1/3

Int : Interceptions

Err : Mistakes leading to an opponent's shot

DriSucc% : Percentage of dribbles completed successfully

CPA : Carries into the 18-yard box

CarMis : Number of times a player failed when attempting to gain control of a ball

CarDis : Number of times a player loses control of the ball after being tackled by an opposing player

Rec% : Percentage of time a player successfully received a pass

Off : Offsides

AerWon% : Percentage of aerials won

TouAttPen : Touches in attacking penalty area

RecProg : Progressive passes received, i.e. completed passes received that move the ball towards the opponent's goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area


In [154]:
# keep every player whose listed position starts with forward
forward_positions = ['FW','FWDF','FWMF']
is_forward = df['Pos'].isin(forward_positions)
forwards = df.loc[is_forward]

# stats used to profile forwards (see the glossary above)
forward_feature_cols = ['SoT%','G/Sh','PasTotCmp%','PasAss',
                        'PPA','SCA','TklAtt3rd','PresAtt3rd','Int',
                        'Err','DriSucc%','CPA','CarMis','CarDis',
                        'Rec%','Off','AerWon%','TouAttPen','RecProg']
forward_features = forwards[forward_feature_cols]

# identifying columns, used later for hover text in the plots
forward_info_cols = ['Player','Nation','Pos','Squad','Comp']
forward_info = forwards[forward_info_cols]

In [155]:
# give both frames a fresh 0..n-1 index so they line up row-by-row in the
# concat below (and with the scaled frame, which gets a default RangeIndex).
# Re-bind instead of inplace=True: both frames are selections taken from
# another frame, and re-binding keeps the cell a plain, chainable assignment
forward_info = forward_info.reset_index(drop=True)
forward_features = forward_features.reset_index(drop=True)

forward_all = pd.concat([forward_info,forward_features],axis=1)

# scale every feature to zero mean / unit variance so no single stat
# dominates the distance used by KMeans
scaler = StandardScaler()
forward_features_scaled = scaler.fit_transform(forward_features)
forward_features_scaled = pd.DataFrame(forward_features_scaled,
                                      columns=forward_features.columns)

In [156]:
# penalty area threats
# carries and passes into the opponent's penalty area
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = forward_features_scaled[['CPA','PPA']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values so the axes stay in per-90 units; cluster ids are
# passed as strings so plotly assigns discrete colours instead of
# interpolating the qualitative palette
fig = px.scatter(data_frame=forward_all,x='CPA',y='PPA',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'CPA':'Carries into penalty area (per 90 min)',
                    'PPA':'Passes into penalty area (per 90 min)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

There are two players at the far extremes of each statistic. Jack Grealish at Manchester City in the Premier League leads in carries into the penalty area, and Lionel Messi at PSG in Ligue 1 leads in passes into the penalty area. 

Other notably good performers are
- Vinicius Junior at Real Madrid in La Liga
- Ousmane Dembélé at Barcelona in La Liga
- Gerard Deulofeu at Udinese in Serie A

In [157]:
# dribbling chance creators
# shot-creating-actions and successful dribbles
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = forward_features_scaled[['SCA','DriSucc%']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values; cluster ids are passed as strings so plotly
# assigns discrete colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=forward_all,x='SCA',y='DriSucc%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'SCA':'Shot-creating actions (per 90 min)',
                    'DriSucc%':'Successful dribbles (%)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

These five players lead the pack in this category
- Dimitri Payet at Marseille in Ligue 1
- Lionel Messi at PSG in Ligue 1
- Luis Muriel at Atalanta in Serie A
- Neymar at PSG in Ligue 1
- Vinicius Junior at Real Madrid in La Liga

In [158]:
# accurate finishers
# goals per shot and shot on target percentage
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = forward_features_scaled[['G/Sh','SoT%']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values; cluster ids are passed as strings so plotly
# assigns discrete colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=forward_all,x='G/Sh',y='SoT%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'G/Sh':'Goals per shot',
                    'SoT%':'Shots on target (%)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Top performing players in this category
- Alberth Elis at Bordeaux in Ligue 1
- Juanmi at Real Betis in La Liga
- Son Heung-min at Tottenham in the Premier League
- Pere Milla at Elche in La Liga
- Jamie Vardy at Leicester in the Premier League

In [159]:
# pressing forwards
# pressing and tackles in the final third
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = forward_features_scaled[['PresAtt3rd','TklAtt3rd']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values; cluster ids are passed as strings so plotly
# assigns discrete colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=forward_all,x='PresAtt3rd',y='TklAtt3rd',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'PresAtt3rd':'Pressing in final third (per 90 min)',
                    'TklAtt3rd':'Tackles in final third (per 90 min)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Top performing players
- Rafael Borré at Frankfurt in the Bundesliga
- Georginio Rutter at Hoffenheim in the Bundesliga
- Wout Weghorst at Burnley in the Premier League
- Karim Onisiwo at Mainz in the Bundesliga

In [160]:
# target forwards
# successfully received passes and aerial duels won
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = forward_features_scaled[['Rec%','AerWon%']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values; cluster ids are passed as strings so plotly
# assigns discrete colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=forward_all,x='Rec%',y='AerWon%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Rec%':'Passes received successfully (%)',
                    'AerWon%':'Aerial duels won (%)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Now here's a surprising result. Lionel Messi is at the tippy-top of this category. Unsurprisingly, his passes received percentage is high. The surprising part is that he has the highest percentage of aerials won. 

Other notable performers in this category
- Angel Di Maria at PSG in Ligue 1
- Florian Sotoca at Lens in Ligue 1
- Lucas Moura at Tottenham in the Premier League

In [100]:
# list the columns of forward_features (the frame the aggregate score below is built from)
forward_features.columns

Index(['SoT%', 'G/Sh', 'PasTotCmp%', 'PasAss', 'PPA', 'SCA', 'TklAtt3rd',
       'PresAtt3rd', 'Int', 'Err', 'DriSucc%', 'CPA', 'CarMis', 'CarDis',
       'Rec%', 'Off', 'AerWon%', 'TouAttPen', 'RecProg'],
      dtype='object')

In [101]:
# let's see if we can find out the best-of-the-best forwards 

# columns where a higher raw value is worse / better for the player
neg_cols = ['Err','CarMis','CarDis','Off']
pos_cols = ['SoT%','G/Sh','PasTotCmp%','PasAss',
            'PPA','SCA','TklAtt3rd','PresAtt3rd','Int',
            'DriSucc%','CPA','Rec%','AerWon%',
            'TouAttPen','RecProg']

# first we need to negate the negative stats so they don't count
# as positive features
forward_score = forward_features.copy()
forward_score[neg_cols] = -forward_score[neg_cols]

# make a new scaler object and scale our data
scaler = StandardScaler()
forward_score_scaled = scaler.fit_transform(forward_score)
forward_score_scaled = pd.DataFrame(forward_score_scaled,columns=forward_score.columns)

# because the negative stats were negated above, a higher 'neg_stats'
# value means fewer mistakes (i.e. higher is better on both axes)
forward_score_scaled['neg_stats'] = forward_score_scaled[neg_cols].sum(axis=1)
forward_score_scaled['pos_stats'] = forward_score_scaled[pos_cols].sum(axis=1)

# random_state/n_init are pinned so the clusters are reproducible across runs
tmp_X = forward_score_scaled[['pos_stats','neg_stats']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

forward_score_scaled['label'] = labels

# forward_info must share the scaled frame's 0..n-1 index for this concat
# to line up (it is re-indexed in the forwards scaling cell)
forward_score_info = pd.concat([forward_info,forward_score_scaled[['pos_stats',
                                                                      'neg_stats',
                                                                      'label']]],axis=1)

# cluster ids are categories: pass them as strings so plotly assigns discrete
# colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=forward_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Players with the best aggregate score
- Jack Grealish at Manchester City in the Premier League
- Angel Di Maria at PSG in Ligue 1
- Riyad Mahrez at Manchester City in the Premier League
- Lionel Messi at PSG in Ligue 1
- Serge Gnabry at Bayern Munich in the Bundesliga

### These are some of the stats we'd be interested in for goalkeepers


#### there aren't many goalkeeper-related stats in the dataset, so we'll make use of what we have 


PasTotCmp% : Pass completion percentage
    
PasProg : Completed passes that move the ball towards the opponent's goal at least 10 yards from its furthest point in the last six passes, or any completed pass into the penalty area

PasInt : Passes intercepted by the opponent

PasBlocks : Passes blocked by an opponent who was standing in the path

Int : Interceptions

Clr : Clearances

Err : Mistakes leading to an opponent's shot

Recov : Number of loose balls recovered

Fls : Fouls committed

In [102]:
# goalkeepers only
is_goalie = df['Pos'].isin(['GK'])
goalies = df.loc[is_goalie]

# the dataset has few keeper-specific stats, so work with what is there
goalie_feature_cols = ['PasTotCmp%','PasProg','PasInt',
                       'PasBlocks','Int','Clr','Err',
                       'Recov','Fls']
goalies_features = goalies[goalie_feature_cols]

# identifying columns, used later for hover text in the plots
goalie_info_cols = ['Player','Nation','Pos','Squad','Comp']
goalie_info = goalies[goalie_info_cols]

In [103]:
# give both frames a fresh 0..n-1 index so they line up row-by-row in the
# concat below (and with the scaled frame, which gets a default RangeIndex).
# Re-bind instead of inplace=True: both frames are selections taken from
# another frame, and re-binding keeps the cell a plain, chainable assignment
goalie_info = goalie_info.reset_index(drop=True)
goalies_features = goalies_features.reset_index(drop=True)

goalies_all = pd.concat([goalie_info,goalies_features],axis=1)

# scale every feature to zero mean / unit variance so no single stat
# dominates the distance used by KMeans
scaler = StandardScaler()
goalies_features_scaled = scaler.fit_transform(goalies_features)
goalies_features_scaled = pd.DataFrame(goalies_features_scaled,
                                      columns=goalies_features.columns)

In [104]:
# passing vs errors leading to a shot
# clustering is done on the scaled features; random_state/n_init are pinned
# so the clusters (and colours) are reproducible across runs
tmp_X = goalies_features_scaled[['Err','PasTotCmp%']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

# plot the unscaled values; cluster ids are passed as strings so plotly
# assigns discrete colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=goalies_all,x='Err',y='PasTotCmp%',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'Err':'Errors leading to shots (per 90 min)',
                    'PasTotCmp%':'Passing accuracy (%)'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

Gianluigi Donnarumma, Niki Mäenpää, and Benoît Costil are the unfortunate leaders in errors leading to shots. Donnarumma, however, redeems himself to some extent by being fantastically accurate with his passing.

In [105]:
# list the columns of goalies_features (the frame the aggregate score below is built from)
goalies_features.columns

Index(['PasTotCmp%', 'PasProg', 'PasInt', 'PasBlocks', 'Int', 'Clr', 'Err',
       'Recov', 'Fls'],
      dtype='object')

In [106]:
# let's see if we can find out the best-of-the-best goalkeepers

# columns where a higher raw value is worse / better for the player
neg_cols = ['Err','Fls','PasInt','PasBlocks']
pos_cols = ['PasTotCmp%','PasProg','Int','Clr','Recov']

# first we need to negate the negative stats so they don't count
# as positive features
goalies_score = goalies_features.copy()
goalies_score[neg_cols] = -goalies_score[neg_cols]

# make a new scaler object and scale our data
scaler = StandardScaler()
goalies_score_scaled = scaler.fit_transform(goalies_score)
goalies_score_scaled = pd.DataFrame(goalies_score_scaled,columns=goalies_score.columns)

# because the negative stats were negated above, a higher 'neg_stats'
# value means fewer mistakes (i.e. higher is better on both axes)
goalies_score_scaled['neg_stats'] = goalies_score_scaled[neg_cols].sum(axis=1)
goalies_score_scaled['pos_stats'] = goalies_score_scaled[pos_cols].sum(axis=1)

# random_state/n_init are pinned so the clusters are reproducible across runs
tmp_X = goalies_score_scaled[['pos_stats','neg_stats']]
model = KMeans(n_clusters=4,n_init=10,random_state=42)
labels = model.fit_predict(tmp_X)

goalies_score_scaled['label'] = labels

# goalie_info must share the scaled frame's 0..n-1 index for this concat
# to line up (it is re-indexed in the goalkeeper scaling cell)
goalies_score_info = pd.concat([goalie_info,goalies_score_scaled[['pos_stats',
                                                                      'neg_stats',
                                                                      'label']]],axis=1)

# cluster ids are categories: pass them as strings so plotly assigns discrete
# colours instead of interpolating the qualitative palette
fig = px.scatter(data_frame=goalies_score_info,x='pos_stats',y='neg_stats',
                color=labels.astype(str),hover_name='Player',
                hover_data=['Squad','Comp','Nation'],
                color_discrete_sequence=px.colors.qualitative.Plotly,
                labels={
                    'pos_stats':'Aggregated positive stats',
                    'neg_stats':'Aggregated negative stats'
                })

fig.update_traces(marker={'size':5})
# cluster numbers are arbitrary, so keep the legend hidden
fig.update_layout(showlegend=False)

fig.show()

### notable players

With the statistics at hand, Manuel Neuer and Jasper Cillessen have the best aggregate score. The most polarizing goalkeeper is Manuel Riemann, with the best aggregate positive stats and one of the worst aggregate negative stats.