In [1]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [2]:
from itertools import repeat 
import numpy as np
import  matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from IPython.display import Image

In [3]:
# Read the player table from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='premier_league_players.csv')

In [4]:
# Rebind `data` to a deep copy of itself.
data = data.copy(deep=True)

In [5]:
# Preview the first rows of the loaded table as a sanity check.
data.head()

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,34.0,26.0,17.0,28.0,375.0,489.0,2,0,0,
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,,,,,,,23,0,125,8.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,,,,,,,2,0,9,0.0


In [6]:
# Inspect the appearance counts; this column is the divisor for the
# per-appearance statistics computed further down.
data['Appearances']

0       64
1        0
2        0
3      160
4       16
      ... 
566     31
567     78
568    105
569      1
570      0
Name: Appearances, Length: 571, dtype: int64

In [7]:
# List every column with its dtype and non-null count to see where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 59 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    571 non-null    object 
 1   Jersey Number           563 non-null    float64
 2   Club                    571 non-null    object 
 3   Position                571 non-null    object 
 4   Nationality             570 non-null    object 
 5   Age                     570 non-null    float64
 6   Appearances             571 non-null    int64  
 7   Wins                    571 non-null    int64  
 8   Losses                  571 non-null    int64  
 9   Goals                   571 non-null    int64  
 10  Goals per match         309 non-null    float64
 11  Headed goals            502 non-null    float64
 12  Goals with right foot   502 non-null    float64
 13  Goals with left foot    502 non-null    float64
 14  Penalties scored        309 non-null    fl

# Data preprocessing

In [8]:
# Keep only players whose nationality, age and jersey number are all known.
# The three checks are combined into a single row mask, and the result is
# copied so that the column assignments in the following cells write to an
# independent frame rather than to a slice of the raw table (which would
# raise SettingWithCopyWarning).
has_identity = (
    data['Nationality'].notna()
    & data['Age'].notna()
    & data['Jersey Number'].notna()
)
data = data[has_identity].copy()

In [9]:

# Strip the '%' sign from the percentage columns and convert them to float.
# Values are passed through str() first, so missing entries become the text
# 'nan' and are parsed back to NaN by the float conversion.
for pct_col in ('Cross accuracy %', 'Shooting accuracy %', 'Tackle success %'):
    as_text = data[pct_col].astype(str)
    data[pct_col] = as_text.str.replace(r'%', '').astype(float)

In [10]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None

# `features` is the full set of column labels; `datta` is the cleaned table
# selected on those labels and is the starting point for the per-appearance
# normalisation below.
features = data.columns
datta = data[features]

datta.head()

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,34.0,26.0,17.0,28.0,375.0,489.0,2,0,0,
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,,,,,,,23,0,125,8.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,,,,,,,2,0,9,0.0


In [11]:
# Build the per-appearance ("per-game") version of the table.
# Players who have not yet made an appearance are dropped first to avoid a
# division by zero. The .copy() makes the result an independent frame, so the
# assignment below does not write to a slice of `datta`.
datta_appNonZero = datta[datta['Appearances'] > 0].copy()

# Columns to divide by appearances: everything except the text/identity
# fields and the statistics that are already ratios or per-match values.
cols = features.drop(['Age', 'Name', 'Appearances', 'Club', 'Nationality', 'Jersey Number',
                      'Cross accuracy %', 'Position', 'Goals per match', 'Passes per match',
                      'Tackle success %', 'Shooting accuracy %'])

# Replace the columns wholesale instead of writing through .loc[:, cols]:
# several of them are integer counts while the quotients are floats, and an
# in-place .loc write would have to silently upcast them (deprecated in
# recent pandas releases).
datta_appNonZero[cols] = datta_appNonZero[cols].div(datta_appNonZero['Appearances'], axis=0)

In [12]:
# Split the cleaned (un-normalised) table into one frame per playing position.
goalies, defenders, midfielders, forwards = (
    data[data['Position'] == role]
    for role in ('Goalkeeper', 'Defender', 'Midfielder', 'Forward')
)

In [13]:
# Players with at least 38 appearances (a season's worth of games).
# data_38app keeps the raw totals; all_players and the position frames with a
# trailing underscore come from the per-appearance table datta_appNonZero.
data_38app = data[data['Appearances'] >= 38]
all_players = datta_appNonZero[datta_appNonZero['Appearances'] >= 38]

goalies_, defenders_, midfielders_, forwards_ = (
    all_players[all_players['Position'] == role]
    for role in ('Goalkeeper', 'Defender', 'Midfielder', 'Forward')
)

# Visualization: general

In [14]:
# Pie chart of total appearances per nationality, labelled inside each slice
# with the country name and its percentage share.
fig = px.pie(
    data,
    names='Nationality',
    values='Appearances',
    title="countries represented in the EPL by number of appearances",
)
fig.update_traces(textinfo='percent+label', textposition='inside')
iplot(fig)

In [15]:
# Sunburst of appearances: inner ring is the position, outer ring the nationality.
fig = px.sunburst(
    data,
    path=['Position', 'Nationality'],
    values='Appearances',
    title='Countries represented in the EPL by number of appearances and by players position',
)
iplot(fig)

# Players by Appearances

In [16]:
# Stacked bars of appearances per position, coloured by club; hovering a
# segment also shows the player's name.
fig = px.bar(
    data,
    x='Position',
    y='Appearances',
    color='Club',
    hover_data=['Name'],
    title='Players appearances by position',
    height=600,
)
iplot(fig)

In [17]:
# Stacked bars of appearances per club, coloured by position; hovering a
# segment also shows the player's name.
fig = px.bar(
    data,
    x='Club',
    y='Appearances',
    color='Position',
    hover_data=['Name'],
    height=600,
    title='Appearances by club',  # was misspelled 'Apearance by club'
)
iplot(fig)

# Players Age

In [18]:
# Age distribution per position, drawn as violins with an embedded box plot.
fig = px.violin(
    data,
    x='Position',
    y='Age',
    box=True,
    title='Players Age distribution by position',
)
iplot(fig)

# Age distribution per club, drawn as box plots.
fig = px.box(
    data,
    x='Club',
    y='Age',
    title='Players Age distribution by club',
)
iplot(fig)

In [19]:
from plotly.subplots import make_subplots 

In [20]:
# Goalkeepers with at least 38 appearances, using the raw (un-normalised) totals.
goalies_38app = goalies.loc[goalies['Appearances'] >= 38]

In [21]:
# Top goalkeepers for four statistics, shown twice: overall totals in the
# left column and per-game values in the right column.
head = 5  # number of goalkeepers shown in each chart

# One subplot row per statistic.
goalie_stats = ['Clean sheets', 'Saves', 'High Claims', 'Catches']

# Left column: raw totals for keepers with at least 38 appearances.
df1, df2, df3, df4 = (
    goalies_38app.sort_values(by=stat, ascending=False).head(head)
    for stat in goalie_stats
)

# Right column: the same statistics per appearance. These have to be ranked
# on the normalised frame `goalies_`; ranking `goalies_38app` again (as the
# cell did before) simply repeats the left column.
df11, df12, df13, df14 = (
    goalies_.sort_values(by=stat, ascending=False).head(head)
    for stat in goalie_stats
)

# make_subplots assigns titles row by row, so the overall/per-game titles
# are interleaved for each statistic.
fig = make_subplots(
    rows=len(goalie_stats),
    cols=2,
    subplot_titles=[
        f'{stat} ({scope})'
        for stat in goalie_stats
        for scope in ('overall', 'per-game')
    ],
)

# Column number -> ranked frames, in the same order as goalie_stats.
panels = {
    1: (df1, df2, df3, df4),
    2: (df11, df12, df13, df14),
}
for col, frames in panels.items():
    for row, (stat, top_rows) in enumerate(zip(goalie_stats, frames), start=1):
        fig.add_trace(
            go.Bar(
                y=top_rows['Name'],
                x=top_rows[stat],
                hovertext=top_rows['Club'],
                orientation='h',
            ),
            row=row,
            col=col,
        )

# The leading bar of every chart gets its own colour; the list length follows
# `head` so changing the number of keepers keeps the colouring consistent.
fig.update_traces(
    marker_color=['rgb(110, 102,250)'] + ['rgb(210,202,82)'] * (head - 1),
    marker_line_color='rgb(8,48,107)',
    marker_line_width=2.5,
    opacity=0.6,
)
fig.update_layout(
    title_text='Top stats for goalkeepers',
    title_x=0.5,
    showlegend=False,
    autosize=False,
    width=1200,
    height=1600,
)
fig.show()

In [24]:
# Top defenders for five statistics, shown twice: overall totals in the left
# column and per-game values in the right column.

# NOTE(review): defenders_attr is not used anywhere in this cell, and several
# of its entries do not match the column names plotted below. It is left
# untouched in case another cell reads it — confirm and remove if unused.
defenders_attr = ['Blocked shots', 'Interception','Clearances', 'Headed Clearnace','Clearances of line',
                 'Dual win', 'Successful 50/50s', "aerial battle win"]
top = 5  # number of defenders shown in each chart

# Columns that are actually ranked and plotted, one subplot row each.
defender_stats = ['Blocked shots', 'Interceptions', 'Clearances',
                  'Headed Clearance', 'Clearances off line']

# make_subplots assigns titles row by row, so the overall/per-game titles
# are interleaved for each statistic.
fig = make_subplots(
    rows=len(defender_stats),
    cols=2,
    horizontal_spacing=0.05,
    vertical_spacing=0.05,
    subplot_titles=[
        f'{stat} ({scope})'
        for stat in defender_stats
        for scope in ('overall', 'per-game')
    ],
)

# Raw totals for defenders with at least 38 appearances (left column); the
# per-appearance frame `defenders_` feeds the right column.
defenders_38app = defenders[defenders['Appearances'] >= 38]

for col, source in enumerate((defenders_38app, defenders_), start=1):
    for row, stat in enumerate(defender_stats, start=1):
        df = source.sort_values(by=stat, ascending=False).head(top)
        fig.add_trace(
            go.Bar(x=df['Name'], y=df[stat], orientation='v'),
            row=row,
            col=col,
        )

# The leading bar of every chart gets its own colour; the list length follows
# `top` instead of being hard-coded.
fig.update_traces(
    marker_color=['rgb(96, 96, 96)'] + ['rgb(210,202,82)'] * (top - 1),
    marker_line_color='rgb(8,48,107)',
    marker_line_width=2.5,
    opacity=0.6,
)
fig.update_layout(
    title_text='Top Defender Qualities',
    title_x=0.5,
    showlegend=False,
    autosize=False,
    width=1400,
    height=1600,
)

fig.show()