# Soccer Player Archetype Detection and Clustering

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
def data_reader(original_path):
    """
    Load the raw player table from a CSV source.

    Parameters
    ----------
    original_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as ``pandas.read_csv`` returns it.
    """
    return pd.read_csv(original_path)

In [17]:
# Relative location of the raw player CSV that the rest of the notebook works on.
data_path = './data/data.csv'
# Raw, unmodified table; later cells derive cleaned frames from it.
data_pd = data_reader(data_path)

In [18]:
# Preview only the first rows: slicing the whole frame with [:] builds a
# full-length object just to display it.
data_pd.head(10)

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M
5,5,183277,E. Hazard,27,https://cdn.sofifa.org/players/4/19/183277.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,91,Chelsea,...,91.0,34.0,27.0,22.0,11.0,12.0,6.0,8.0,8.0,€172.1M
6,6,177003,L. Modrić,32,https://cdn.sofifa.org/players/4/19/177003.png,Croatia,https://cdn.sofifa.org/flags/10.png,91,91,Real Madrid,...,84.0,60.0,76.0,73.0,13.0,9.0,7.0,14.0,9.0,€137.4M
7,7,176580,L. Suárez,31,https://cdn.sofifa.org/players/4/19/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,91,91,FC Barcelona,...,85.0,62.0,45.0,38.0,27.0,25.0,31.0,33.0,37.0,€164M
8,8,155862,Sergio Ramos,32,https://cdn.sofifa.org/players/4/19/155862.png,Spain,https://cdn.sofifa.org/flags/45.png,91,91,Real Madrid,...,82.0,87.0,92.0,91.0,11.0,8.0,9.0,7.0,11.0,€104.6M
9,9,200389,J. Oblak,25,https://cdn.sofifa.org/players/4/19/200389.png,Slovenia,https://cdn.sofifa.org/flags/44.png,90,93,Atlético Madrid,...,70.0,27.0,12.0,18.0,86.0,92.0,78.0,88.0,89.0,€144.5M


In [19]:

def data_separator(data_original):
    """
    Remove goalkeepers and non-feature columns from the raw player table.

    Rows whose 'Position' equals 'GK' are excluded, the columns listed in
    ``dropped_columns`` are removed, every row that still contains a NaN is
    discarded, and the index is renumbered from 0.

    Parameters
    ----------
    data_original : pandas.DataFrame
        Raw table. It must contain 'Position' and every column named in
        ``dropped_columns``; a missing column raises ``KeyError`` from
        ``DataFrame.drop``.

    Returns
    -------
    pandas.DataFrame
        The remaining (non-goalkeeper) players with a fresh 0..n-1 index.
        ``data_original`` itself is not modified.
    """
    dropped_columns = [
        'Unnamed: 0', 'ID', 'Photo', 'Flag', 'Potential', 'Club',
        'Club Logo', 'Value', 'Wage', 'Special', 'Preferred Foot',
        'International Reputation', 'Weak Foot', 'Skill Moves',
        'Work Rate', 'Body Type', 'Real Face', 'Position', 'Jersey Number',
        'Joined', 'Loaned From', 'Contract Valid Until', 'Release Clause',
    ]

    # Exclude goalkeepers first ...
    outfield = data_original[data_original['Position'] != 'GK']

    # ... and drop the columns from the *filtered* frame. Dropping from
    # data_original here would silently bring the goalkeepers back.
    players = outfield.drop(dropped_columns, axis=1)

    # If there are still instances with NaN values, delete them all.
    players = players.dropna(how='any', axis=0)

    return players.reset_index(drop=True)

In [20]:
def organize_attributes(players):
    """
    Build six composite skill indices per player.

    For each skill group the member attribute columns are averaged row-wise.
    Each player's six averages are then divided by that player's own mean
    across the six, so the result describes the *shape* of the skill profile
    rather than its absolute level. Goalkeeping attributes are not part of
    any index.

    Parameters
    ----------
    players : pandas.DataFrame
        Must contain every attribute column named in ``skill_groups``.

    Returns
    -------
    pandas.DataFrame
        Columns 'pace_index', 'shooting_index', 'passing_index',
        'dribbling_index', 'defending_index', 'physical_index' (in that
        order), indexed like ``players`` and rounded to two decimals.
    """
    skill_groups = {
        'pace': ['Acceleration', 'SprintSpeed'],
        'shooting': ['Finishing', 'LongShots', 'Penalties', 'Positioning', 'ShotPower', 'Volleys'],
        'passing': ['Crossing', 'Curve', 'FKAccuracy', 'LongPassing', 'ShortPassing', 'Vision'],
        'dribbling': ['Agility', 'Balance', 'BallControl', 'Composure', 'Dribbling', 'Reactions'],
        'defending': ['HeadingAccuracy', 'Interceptions', 'Marking', 'StandingTackle', 'SlidingTackle'],
        'physical': ['Aggression', 'Jumping', 'Stamina', 'Strength'],
    }

    # One row-wise mean per group, kept in the dict's insertion order.
    group_means = [players[columns].mean(axis=1) for columns in skill_groups.values()]

    relative = pd.concat(group_means, axis=1, ignore_index=True)
    relative.columns = [group + '_index' for group in skill_groups]

    # Scale each row by its own average index (aligned on the row index).
    per_player_mean = relative.mean(axis=1)
    relative = relative.div(per_player_mean, axis=0)

    return relative.round(2)

In [21]:
def weight_formatter(players):
    """
    Convert the 'Weight' column to numeric pounds (all weights stay in lbs).

    String values are split on the 'lbs' suffix and the leading part is
    parsed as a float. Values that are already numeric are passed through,
    so calling the function again on the same frame (for example when the
    cell is re-run) is harmless instead of raising ``AttributeError``.

    Parameters
    ----------
    players : pandas.DataFrame
        Frame with a 'Weight' column. The column is overwritten in place
        with the converted values.

    Returns
    -------
    pandas.Series
        ``players['Weight']`` after conversion, with float dtype.
    """
    def _to_pounds(value):
        # Only raw strings carry the unit suffix; anything else is taken to
        # be a number already.
        if isinstance(value, str):
            return float(value.split('lbs')[0])
        return float(value)

    # Column-wise conversion instead of a per-row iterrows()/.at loop; the
    # explicit astype gives a real float column rather than object dtype.
    players['Weight'] = players['Weight'].map(_to_pounds).astype(float)

    return players['Weight']


def height_formatter(players):
    """
    Convert the 'Height' column from feet'inches strings to centimeters.

    A string value is split on the apostrophe; the first part is read as
    feet (x 30.48 cm) and the second as inches (x 2.54 cm), and the sum is
    rounded to the nearest whole centimeter with the built-in ``round``.
    Values that are not strings are passed through unchanged, so calling the
    function again on an already converted frame (for example when the cell
    is re-run) is harmless instead of raising ``AttributeError``.

    Parameters
    ----------
    players : pandas.DataFrame
        Frame with a 'Height' column. The column is overwritten in place
        with the converted values.

    Returns
    -------
    pandas.Series
        ``players['Height']`` after conversion.
    """
    def _to_centimeters(value):
        if not isinstance(value, str):
            # Already converted (or missing): leave as is.
            return value
        parts = value.split("'")
        return round(float(parts[0]) * 30.48 + float(parts[1]) * 2.54)

    # Column-wise conversion instead of a per-row iterrows()/.at loop.
    players['Height'] = players['Height'].map(_to_centimeters)

    return players['Height']


In [22]:
# Dropping the goalkeeper instances and removing unnecessary attributes.
data_pd_cleaned = data_separator(data_pd)

data_fielded = organize_attributes(data_pd_cleaned)

data_fielded["weight"] = weight_formatter(data_pd_cleaned)
data_fielded["height"] = height_formatter(data_pd_cleaned)



In [23]:
# Min-max scale every column: (x - min) / (max - min).
# (z-score alternative that was tried: (data_fielded-data_fielded.mean())/data_fielded.std())
column_min = data_fielded.min()
column_span = data_fielded.max() - column_min
normalized_df = (data_fielded - column_min) / column_span

# Subsequent cells use the scaled values under the original name.
data_fielded = normalized_df

In [24]:
# NOTE(review): the wildcard import hides which sklearn.cluster names the
# notebook relies on; only AgglomerativeClustering is used in this cell.
from sklearn.cluster import *



# Despite the variable name, the model is AgglomerativeClustering (11 clusters),
# not k-means; the name is kept because a later cell reads `kmeans.labels_`.
kmeans = AgglomerativeClustering(n_clusters = 11)
# Fit on the scaled feature frame; the estimator repr is this cell's output.
kmeans.fit(data_fielded)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=11,
                        pooling_func='deprecated')

In [25]:
# Attach the player's name and age from the cleaned table so cluster members
# can be read by a human; the assignment aligns on the row index.
# NOTE(review): this adds non-feature columns to the same frame that was
# passed to kmeans.fit(), so re-running the fit cell afterwards would feed it
# these columns too.
data_fielded['Name'] = data_pd_cleaned['Name']
data_fielded['Age'] = data_pd_cleaned['Age']

In [26]:
# Store the cluster label the fitted model assigned to each row.
data_fielded['Cluster'] = kmeans.labels_

In [44]:
# Show the members of one cluster (label 9) for manual inspection.
data_fielded.loc[data_fielded['Cluster'].eq(9)]

Unnamed: 0,pace_index,shooting_index,passing_index,dribbling_index,defending_index,physical_index,weight,height,Name,Age,Cluster
1,0.503876,0.752137,0.608247,0.537313,0.153846,0.367925,0.548872,0.6875,Cristiano Ronaldo,33,9
6,0.410853,0.726496,0.608247,0.522388,0.276923,0.386792,0.601504,0.583333,L. Suárez,31,9
28,0.596899,0.726496,0.597938,0.477612,0.207692,0.273585,0.496241,0.6875,P. Aubameyang,29,9
32,0.434109,0.811966,0.587629,0.611940,0.153846,0.367925,0.646617,0.625,G. Higuaín,30,9
39,0.496124,0.786325,0.432990,0.522388,0.238462,0.424528,0.578947,0.625,C. Immobile,28,9
46,0.596899,0.692308,0.618557,0.537313,0.192308,0.264151,0.413534,0.583333,L. Sané,22,9
93,0.449612,0.752137,0.680412,0.537313,0.146154,0.377358,0.518797,0.625,K. Benzema,30,9
104,0.550388,0.709402,0.556701,0.567164,0.230769,0.301887,0.43609,0.583333,A. Martial,22,9
112,0.527132,0.683761,0.701031,0.522388,0.123077,0.386792,0.466165,0.416667,M. Depay,24,9
140,0.542636,0.803419,0.474227,0.417910,0.200000,0.415094,0.548872,0.6875,Louri Beretta,26,9
