# Soccer Player Archetype Detection and Clustering

### Importing Packages and Reading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import *
from sklearn import metrics 
from scipy.spatial.distance import cdist 

In [2]:
def data_reader(original_path):
    data_original = pd.read_csv(original_path)
    return data_original

In [3]:
data_path = './data/data.csv' 
data_pd = data_reader(data_path)

### Defining Some Functions to Complete the Pre-processing Process

- data_separator() takes the original pandas dataframe, drops unnecessary attributes and any instances with null value. Then, it returns players and goalkeepers seperately. I seperated goalkeepers from other players since goalkeepers have different characteristics in terms of attributes. Goalkeeper archetype clustering might be a potential future work.

- organize_attributes() functions take the pandas dataframe of players, and then it creates a list of special features: pace, shooting, passing, dribbling, defending and pythsical. (goalkeeping skills are not used in this study.) These features are adapted from the stats of original FIFA player cards (https://www.futbin.com/stats-calculator).

- organize_attributes_takeAll() directly returns the quantitative statistics given in the dataset (without calculating any extra feature). 

- weight_formatter() and height_formatter() takes the dataframe of players and returns their numeric weight and height data in terms of lbs and centimeters.

In [26]:

def data_separator(data_original):
    """
    - seperates goalkeepers from the players, returns as a new pandas df, 'goalkeepers'.
    - cleans the unnecessary attributes of the data instances and returns the players as a pandas df, 'players'.
    """
    
    goalkeepers = data_original[data_original['Position'] == 'GK']
    goalkeepers = goalkeepers.drop(['Unnamed: 0','ID','Photo','Flag','Potential','Club',\
    'Club Logo','Value','Wage','Special','Preferred Foot','International Reputation','Weak Foot','Skill Moves',\
    'Work Rate','Body Type','Real Face','Position','Jersey Number','Joined','Loaned From','Contract Valid Until',\
    'Release Clause', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',\
    'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB'], axis=1)
    
    players = data_original[data_original.Position != 'GK']

    players = data_original.drop(['Unnamed: 0','ID','Photo','Flag','Potential','Club',\
    'Club Logo','Value','Wage','Special','Preferred Foot','International Reputation','Weak Foot','Skill Moves',\
    'Work Rate','Body Type','Real Face','Position','Jersey Number','Joined','Loaned From','Contract Valid Until',\
    'Release Clause'], axis=1)
    
    # if still, there are instances with NaN values, delete them all.
    players = players.dropna(how='any',axis=0) 

    """
    print(players.columns)
    ==> 
    Index(['Name', 'Age', 'Overall', 'Potential', 'Height', 'Weight', 'LS', 'ST',
       'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM',
       'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB',
       'RCB', 'RB', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
       'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
       'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes'],
      dtype='object')
    """
    players = players.reset_index(drop=True)
    goalkeepers = goalkeepers.reset_index(drop=True)


    return players, goalkeepers

In [27]:
def organize_attributes_takeAll(players):
    
    return players[['Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
       'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
       'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle']]


def organize_attributes(players):    
    attribute_groups = dict()
    attribute_groups['pace'] = ['Acceleration','SprintSpeed']
    attribute_groups['shooting'] = ['Finishing','LongShots','Penalties','Positioning','ShotPower','Volleys']
    attribute_groups['passing'] = ['Crossing','Curve','FKAccuracy','LongPassing','ShortPassing','Vision']
    attribute_groups['dribbling'] = ['Agility','Balance','BallControl','Composure','Dribbling','Reactions']
    attribute_groups['defending'] = ['HeadingAccuracy','Interceptions','Marking','StandingTackle','SlidingTackle']
    attribute_groups['physical'] = ['Aggression','Jumping','Stamina','Strength']
    attribute_groups['gk'] = ['GKDiving', 'GKHandling','GKKicking', 'GKPositioning', 'GKReflexes']
    
    
    attr_list = []

    for value in attribute_groups.values():
        attr_list += value



    pace_index = (players[attribute_groups['pace']].mean(axis=1)) 
    shooting_index = (players[attribute_groups['shooting']].mean(axis=1)) 
    passing_index = (players[attribute_groups['passing']].mean(axis=1)) 
    dribbling_index = (players[attribute_groups['dribbling']].mean(axis=1))
    defending_index = (players[attribute_groups['defending']].mean(axis=1))
    physical_index = (players[attribute_groups['physical']].mean(axis=1)) 
    # gk_index = (players[attribute_groups['gk']].mean(axis=1)) 

    # pos_list = ['LS','ST','RS','LW','LF','CF','RF','RW','LAM','CAM','RAM',\
    # 'LM','LCM','CM','RCM','RM','LWB','LDM','CDM','RDM','RWB','LB','LCB','CB','RCB','RB']
    # position_data = players[pos_list]
    #position_data = position_data.div(players['Overall'].values, axis = 0)


    all_index= pd.concat([pace_index,shooting_index,passing_index,dribbling_index,defending_index,physical_index], axis=1 , ignore_index=True)


    all_index.columns = ['pace_index','shooting_index','passing_index','dribbling_index','defending_index','physical_index']
    #all_index = all_index / all_index.mean(axis=1)
    all_index = all_index.div(all_index.mean(axis=1).to_frame()[0], axis=0)
    
    return all_index.round(2)

In [28]:
def weight_formatter(players):
    """
    all weights are used in lbs.
    """
    
    for index, row in players.iterrows():
        players.at[index, 'Weight'] = float(row['Weight'].split('lbs')[0])
    
    return players['Weight']


def height_formatter(players):
    """
    converted all heights to centimeters.
    """
    for index, row in players.iterrows():
        players.at[index, 'Height'] = round(float(row['Height'].split("'")[0]) * 30.48 + float(row['Height'].split("'")[1]) * 2.54)
    
    return players['Height']

In [30]:
# Dropping the goalkeeper instances and removing unnecessary attributes.
data_pd_cleaned, goalkeepers = data_separator(data_pd)

# Creating some new features by using the columns in the input dataframe.
data_fielded = organize_attributes(data_pd_cleaned)

# Creating height and weight attributes for each player and adding to the current dataframe data_fielded.
data_fielded['height'] = height_formatter(data_pd_cleaned)
data_fielded['weight'] = weight_formatter(data_pd_cleaned)

### Rescaling the Input Data

In [10]:
def rescale_df(df, option = 0):
    """
    set option to 0 for normalization
    set option to 1 for standardization
    """
    if option == 0:
        temp_df = (df - df.min())/(df.max() - df.min()) 
    elif option == 1:
        temp_df = (df - df.mean()) / df.std()
    return temp_df
    



In [None]:
rescaled_data = rescale_df(data_fielded, 0)

In [11]:
rescaled_data

Unnamed: 0,pace_index,shooting_index,passing_index,dribbling_index,defending_index,physical_index,height,weight
0,0.511628,0.743590,0.773196,0.716418,0.123077,0.160377,0.3125,0.368421
1,0.503876,0.752137,0.608247,0.537313,0.153846,0.367925,0.6875,0.548872
2,0.558140,0.717949,0.711340,0.716418,0.130769,0.169811,0.416667,0.300752
3,0.372093,0.683761,0.731959,0.477612,0.338462,0.311321,0.520833,0.330827
4,0.550388,0.700855,0.701031,0.701493,0.138462,0.207547,0.375,0.398496
5,0.364341,0.615385,0.701031,0.597015,0.438462,0.226415,0.375,0.270677
6,0.410853,0.726496,0.608247,0.522388,0.276923,0.386792,0.583333,0.601504
7,0.364341,0.487179,0.505155,0.328358,0.653846,0.452830,0.583333,0.533835
8,0.410853,0.769231,0.587629,0.537313,0.207692,0.424528,0.583333,0.496241
9,0.271318,0.700855,0.783505,0.492537,0.500000,0.150943,0.583333,0.43609


## Clustering 

### Finding the Ideal Number of Clusters

In [13]:
"""

distortions = [] 
inertias = [] 
mapping1 = {} 
mapping2 = {} 
K = range(1,10) 
  
for k in K: 
    #Building and fitting the model 
    kmeanModel = KMeans(n_clusters=k) 
    kmeanModel.fit(rescaled_data)     
      
    distortions.append(sum(np.min(cdist(rescaled_data, kmeanModel.cluster_centers_, 
                      'euclidean'),axis=1)) / rescaled_data.shape[0]) 
    inertias.append(kmeanModel.inertia_) 
  
    mapping1[k] = sum(np.min(cdist(rescaled_data, kmeanModel.cluster_centers_, 
                 'euclidean'),axis=1)) / rescaled_data.shape[0] 
    mapping2[k] = kmeanModel.inertia_ 

#kmeans = KMeans(n_clusters = 9)
#kmeans.fit(rescaled_data)"""

"\n\ndistortions = [] \ninertias = [] \nmapping1 = {} \nmapping2 = {} \nK = range(1,10) \n  \nfor k in K: \n    #Building and fitting the model \n    kmeanModel = KMeans(n_clusters=k) \n    kmeanModel.fit(rescaled_data)     \n      \n    distortions.append(sum(np.min(cdist(rescaled_data, kmeanModel.cluster_centers_, \n                      'euclidean'),axis=1)) / rescaled_data.shape[0]) \n    inertias.append(kmeanModel.inertia_) \n  \n    mapping1[k] = sum(np.min(cdist(rescaled_data, kmeanModel.cluster_centers_, \n                 'euclidean'),axis=1)) / rescaled_data.shape[0] \n    mapping2[k] = kmeanModel.inertia_ \n\n#kmeans = KMeans(n_clusters = 9)\n#kmeans.fit(rescaled_data)"

In [14]:
"""plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('The Elbow Method using Distortion') 
plt.show() """

"plt.plot(K, distortions, 'bx-') \nplt.xlabel('Values of K') \nplt.ylabel('Distortion') \nplt.title('The Elbow Method using Distortion') \nplt.show() "

In [15]:
number_of_clusters = 10

kmeanModel = KMeans(n_clusters = number_of_clusters ) 
kmeanModel.fit(rescaled_data) 

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [16]:
rescaled_data['Name'] = data_pd_cleaned['Name']
rescaled_data['Age'] = data_pd_cleaned['Age']

In [17]:
rescaled_data['Cluster'] = kmeanModel.labels_

In [18]:
#rescaled_data

In [22]:
for each in range(number_of_clusters):
    print(rescaled_data[rescaled_data['Cluster']==each][:3][['Name', 'Cluster']])   


# rescaled_data[rescaled_data['Cluster']==8]

            Name  Cluster
36  K. Koulibaly        0
52     R. Varane        0
63    M. Benatia        0
        Name  Cluster
0   L. Messi        1
2  Neymar Jr        1
4  E. Hazard        1
            Name  Cluster
3   K. De Bruyne        2
5      L. Modrić        2
11   David Silva        2
               Name  Cluster
9          T. Kroos        3
16  Sergio Busquets        3
22         Casemiro        3
          Name  Cluster
35   M. Icardi        4
148  T. Werner        4
215   J. Vardy        4
           Name  Cluster
12     N. Kanté        5
40   Jordi Alba        5
53  M. Verratti        5
            Name  Cluster
10      D. Godín        6
19  G. Chiellini        6
29    M. Hummels        6
                Name  Cluster
1  Cristiano Ronaldo        7
6          L. Suárez        7
8     R. Lewandowski        7
            Name  Cluster
7   Sergio Ramos        8
33  Thiago Silva        8
34     S. Umtiti        8
                Name  Cluster
111         Carvajal        9
167 