In [70]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
pd.options.display.max_columns = 99

In [77]:
# Load the lineup statistics table that every later cell works from.
lineup_stats_path = "../data/lineup_stats.csv"
full_df = pd.read_csv(lineup_stats_path)

In [78]:
# Clustering only uses the numeric statistics: strip the player names, the
# team and the seconds column, then treat missing values as zero.
columns_to_drop = [
    "player1", "player2", "player3", "player4", "player5",
    "TEAM", "seconds",
]
df = full_df.drop(columns=columns_to_drop).fillna(0)

In [79]:
# Relative numbers are what matters: express every statistic as a share of
# its lineup's row total. Clustering the raw counts would mostly split the
# lineups into high and low playing time.
row_totals = df.sum(axis=1)
# A row whose statistics sum to zero has no meaningful shares. Dividing by
# NaN instead of 0 and filling afterwards keeps such a row as all zeros
# rather than NaN/inf, which KMeans cannot fit.
df = df.div(row_totals.where(row_totals != 0), axis=0).fillna(0.0)

In [80]:
# Keep the descriptive (non-clustered) fields of every lineup, keyed by the
# row label of full_df, so cluster labels can be attached to them later.
lineup_fields = ("player1", "player2", "player3", "player4", "player5", "TEAM", "seconds")
lineups_indexed = {
    row_label: {field: row[field] for field in lineup_fields}
    for row_label, row in full_df.iterrows()
}

In [81]:
# Preview the first three rows of the per-lineup share table.
df.head(3)

Unnamed: 0,2FGA,2FGM,3FGA,3FGM,AG,AS,CM,CMU,D,FTA,FTM,FV,O,OF,RV,ST,TO
0,0.180723,0.072289,0.060241,0.036145,0.012048,0.108434,0.048193,0.0,0.096386,0.108434,0.072289,0.024096,0.036145,0.0,0.072289,0.036145,0.036145
1,0.138614,0.059406,0.138614,0.09901,0.029703,0.128713,0.059406,0.0,0.108911,0.039604,0.029703,0.0,0.019802,0.009901,0.069307,0.029703,0.039604
2,0.154639,0.082474,0.134021,0.051546,0.020619,0.082474,0.051546,0.0,0.092784,0.061856,0.061856,0.010309,0.051546,0.0,0.113402,0.020619,0.010309


In [82]:
def generate_clusters(data, n_clusters, lineups, df, random_state=0):
    """Cluster the rows of ``data`` with KMeans and record each row's label.

    Both ``lineups`` and ``df`` are modified in place and also returned.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix handed to ``KMeans.fit``; one row per lineup.
    n_clusters : int
        Number of clusters to fit.
    lineups : dict
        Mapping whose keys ``0 .. len(data) - 1`` each hold a dict; the
        label of row ``k`` is stored under ``lineups[k]["cluster"]``.
    df : DataFrame
        Frame with one row per row of ``data`` (same order); it receives a
        ``"clusters"`` column holding the labels.
    random_state : int or None, default 0
        Seed forwarded to KMeans so that re-running the notebook yields the
        same clusters. Pass ``None`` for scikit-learn's unseeded behaviour.

    Returns
    -------
    tuple
        ``(lineups, df)`` -- the same objects that were passed in.
    """
    est = KMeans(n_clusters=n_clusters, random_state=random_state)
    est.fit(data)
    labels = est.labels_
    # Labels are positional: the k-th label belongs to the k-th row of data.
    for position, label in enumerate(labels):
        lineups[position]["cluster"] = label
    df["clusters"] = labels
    return lineups, df

def group_by_clusters(df):
    """Return the per-cluster mean of every statistic column.

    ``df`` must contain a ``"clusters"`` column plus the statistic columns
    listed below. The result is indexed by cluster label, with one column
    per statistic.
    """
    stat_columns = [
        '2FGA', '2FGM', '3FGA', '3FGM', 'AG', 'AS', 'CM', 'CMU', 'D',
        'FTA', 'FTM', 'FV', 'O', 'OF', 'RV', 'ST', 'TO',
    ]
    # Every statistic is summarised the same way: by its mean.
    mean_of_each = dict.fromkeys(stat_columns, "mean")
    return df.groupby(["clusters"]).agg(mean_of_each)
    

In [87]:
# Fit two clusters on the normalised shares, attach the labels to the lineup
# records and to full_df, then average the statistics within each cluster.
lineups_indexed, full_df = generate_clusters(
    data=df,
    n_clusters=2,
    lineups=lineups_indexed,
    df=full_df,
)
grouped_full_df = group_by_clusters(full_df)

In [88]:
# Show the mean of each statistic per cluster, as returned by group_by_clusters.
grouped_full_df

Unnamed: 0_level_0,2FGA,2FGM,3FGA,3FGM,AG,AS,CM,CMU,D,FTA,FTM,FV,O,OF,RV,ST,TO
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,24.478431,13.411765,13.054902,5.105882,1.580392,11.313725,10.266667,0.200787,13.870588,7.054902,5.372549,1.698039,6.27451,0.85098,10.898039,3.803922,6.682353
1,16.0,8.510417,10.901042,4.34375,1.026042,8.338542,8.145833,0.177083,10.666667,12.072917,9.3125,1.119792,4.71875,0.651042,11.145833,2.984375,5.234375


In [76]:
# Show only the first rows: rendering the whole frame bloats the notebook and
# puts every player name into the saved output.
full_df.head(10)

Unnamed: 0,player1,player2,player3,player4,player5,2FGA,2FGM,3FGA,3FGM,AG,AS,CM,CMU,D,FTA,FTM,FV,O,OF,RV,ST,TEAM,TO,seconds,clusters
0,"ABALDE, ALBERTO","DOORNEKAMP, AARON","GREEN, ERICK","PLEISS, TIBOR","THOMAS, WILL",9,6,2,3,1,9,4,0.0,8,3,6,2,3,0,6,3,PAM,3,448.0,6
1,"ABALDE, ALBERTO","DUBLJEVIC, BOJAN","MARTINEZ, RAFA","SAN EMETERIO, FERNANDO","THOMAS, WILL",8,6,4,10,3,13,6,0.0,11,1,3,0,2,1,7,3,PAM,4,516.0,0
2,"ABALDE, ALBERTO","DUBLJEVIC, BOJAN","MARTINEZ, RAFA","SASTRE, JOAN","THOMAS, WILL",7,8,8,5,2,8,5,0.0,9,0,6,1,5,0,11,2,PAM,1,530.0,3
3,"ABALDE, ALBERTO","GREEN, ERICK","PLEISS, TIBOR","SAN EMETERIO, FERNANDO","THOMAS, WILL",6,5,3,3,0,4,1,0.0,5,0,2,0,3,0,6,1,PAM,4,343.0,6
4,"ABALDE, ALBERTO","GREEN, ERICK","PLEISS, TIBOR","THOMAS, WILL","VAN ROSSOM, SAM",4,5,5,3,1,3,2,0.0,8,1,4,2,3,0,5,1,PAM,2,330.0,6
5,"ABALDE, ALBERTO","GREEN, ERICK","PLEISS, TIBOR","THOMAS, WILL","VIVES, GUILLEM",3,2,6,4,0,5,6,1.0,4,2,3,0,3,1,4,3,PAM,6,341.0,2
6,"ABALDE, ALBERTO","MARTINEZ, RAFA","PLEISS, TIBOR","SASTRE, JOAN","THOMAS, WILL",5,6,4,10,1,11,10,0.0,14,0,5,2,2,1,11,1,PAM,7,485.0,7
7,"ABALDE, ALBERTO","MARTINEZ, RAFA","PLEISS, TIBOR","THOMAS, WILL","VAN ROSSOM, SAM",4,7,4,7,0,12,5,0.0,8,0,5,1,5,0,3,2,PAM,4,454.0,0
8,"ABASS, AWUDU","BERTANS, DAIRIS","GUDAITIS, ARTURAS","JERRELLS, CURTIS","M'BAYE, AMATH",5,4,3,2,1,3,3,0.0,8,3,3,1,4,0,5,1,MIL,3,307.0,6
9,"ADAMS, JOSH","BALBAY, DOGUS","DUNSTON, BRYANT","SIMON, KRUNOSLAV","STIMAC, VLADIMIR",5,4,1,4,0,3,2,0.0,8,1,1,2,1,0,6,0,IST,2,377.0,6
