## Part 2: Unsupervised clustering

In [34]:
import os
import pandas as pd
import numpy as np
import sklearn

%matplotlib inline
import matplotlib.pyplot as plt                         # Matplotlib's pyplot: MATLAB-like syntax
import seaborn as sns                                   # For pretty plots

import warnings
warnings.filterwarnings('ignore')

This preprocessing is exactly the same as in Part 1; see that notebook for more information.

In [35]:
# Load the two CSV exports from the local `data` directory.
df = pd.read_csv(os.path.join('data', 'CrowdstormingDataJuly1st.csv'))
dfd = pd.read_csv(os.path.join('data', 'crowdstorm_disaggregated.csv'))
df_cols = set(df.columns)
dfd_cols = set(dfd.columns)

# The skin tone of a row is the average of the two raters' scores.
df['skintone'] = 0.5 * (df['rater1'] + df['rater2'])

# Mean skin tone per player. `groupby` already groups along the rows by
# default, so the deprecated `axis=0` keyword is not passed.
df_skinton = df[["playerShort", "skintone"]].reset_index().groupby('playerShort').mean()
mean_skin = set(df_skinton['skintone'].dropna().unique().tolist())
unique_skin = set(df['skintone'].dropna().unique().tolist())

# Categorical columns that get integer-encoded, and the placeholder used for
# their missing values.
LABEL_FIELDS = ['club', 'leagueCountry', 'position', 'Alpha_3']
UNKNOWN_LABEL = 'MISSING'
label_encoders = {}

from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the raw frame `df` is left untouched.
df_categorized = df.copy()

# Integer-encode every categorical field; missing entries become their own
# class (UNKNOWN_LABEL). Each fitted encoder is kept for later decoding.
for label in LABEL_FIELDS:
    encoder = LabelEncoder()
    filled = df_categorized[label].fillna(UNKNOWN_LABEL)
    df_categorized[label] = encoder.fit_transform(filled)
    label_encoders[label] = encoder

# Skin tone is handled separately: only the non-null rows are encoded and
# written back, so NaN values stay NaN.
encoder = LabelEncoder()
skintone = df_categorized['skintone'].dropna()
df_categorized.loc[skintone.index, 'skintone'] = encoder.fit_transform(skintone)
label_encoders['skintone'] = encoder


# Keep only the columns needed downstream (explicit copy, so the assignments
# below do not touch `df_categorized`).
df_subset = df_categorized[['playerShort', 'club', 'leagueCountry', 'position','games', 'yellowCards', 'redCards', 'meanIAT', 'skintone', 'birthday', 'height',
       'weight', 'victories', 'ties', 'defeats', 'goals']].copy()

# Treat missing values in these columns as zero. Assign the result back
# instead of calling `fillna(..., inplace=True)` on a column selection:
# the chained in-place form is not guaranteed to modify `df_subset`.
for f in ['meanIAT', 'yellowCards', 'redCards']:
    df_subset[f] = df_subset[f].fillna(0)

# Use the raw (un-encoded) skin tone from `df` rather than the encoded one.
df_subset["skintone"] = df["skintone"]
# Per-row cost: meanIAT weighted by cards (a red card counts 3x) and games.
df_subset["cost"] = df_subset['meanIAT']*(df_subset['yellowCards'] + 3*df_subset['redCards'])*df_subset["games"]

# Unsupervised clustering

We keep the dataframe `df_subset` because it carries the `cost` column from Part 1, which is likely a useful aggregate of the racial bias a player was exposed to.



In [36]:
# Collapse the rows of `df_subset` to one row per player.
# `axis=1` is passed by keyword: the positional form `.drop(cols, 1)` is no
# longer accepted by recent pandas releases.
# NOTE: `position` was integer-encoded earlier in this notebook; "first" keeps
# the first value seen for each player.
df_aggregated = df_subset.drop(["meanIAT", "club", "leagueCountry"], axis=1).set_index(["playerShort"]).groupby(level=0).agg(
    {
        "skintone": "mean",
        "height": "mean",
        "weight": "mean",
        "position": "first",
        "games": "sum",
        "yellowCards": "sum",
        "redCards": "sum",
        "cost": "sum",
        "victories": "max",
        "ties": "max",
        "defeats": "max",
        "goals": "max",
     }
).reset_index(drop=True).dropna()  # discard the player id, then players with any missing value

# Turn the summed counts into per-game rates.
df_aggregated["yellowCards"] = df_aggregated["yellowCards"] / df_aggregated["games"]
df_aggregated["redCards"] = df_aggregated["redCards"] / df_aggregated["games"]
df_aggregated["cost"] = df_aggregated["cost"] / df_aggregated["games"]

# Clustering features: everything except `games` (already used to normalise
# the rates) and `skintone`, which is held out as the reference for `accuracy`.
orig_x = df_aggregated.drop(["games", "skintone"], axis=1)

x = orig_x.copy()
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# Features are standardised so that no single column dominates the distances.
data = sklearn.preprocessing.scale(x.values)
features = list(x.columns)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Baseline: two clusters on all features, seeded for reproducibility.
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)

We arbitrarily split the skintone feature into two classes: values strictly below 0.5, and values equal to or above 0.5.

In [37]:
def accuracy(skintone, labels):
    """Score how well a 2-cluster labelling matches the skin-tone split.

    Skin tones are binarised at 0.5 (strictly below vs. equal or above) and
    compared with the cluster labels element by element. Cluster ids are
    arbitrary, so the better of the two possible label assignments is
    reported; the result is therefore never below 0.5.

    The comparison is positional: inputs are converted to NumPy arrays first,
    so a pandas index on either argument cannot cause misalignment.

    Parameters
    ----------
    skintone : array-like of float
        Skin-tone value per sample (Series, ndarray or list).
    labels : array-like of int or bool
        Cluster label per sample, expected to be 0/1.

    Returns
    -------
    float
        max(score, 1 - score), where score is the mean absolute difference
        between the binarised skin tone and the labels. NaN for empty input.

    Raises
    ------
    ValueError
        If `skintone` and `labels` do not have the same shape.
    """
    below_threshold = (np.asarray(skintone, dtype=float) < 0.5).astype(int)
    # Cast to int so boolean labels can be subtracted as well.
    cluster_ids = np.asarray(labels).astype(int)
    if below_threshold.shape != cluster_ids.shape:
        raise ValueError(
            "skintone and labels must have the same shape, got {} and {}".format(
                below_threshold.shape, cluster_ids.shape))
    if below_threshold.size == 0:
        return np.nan

    score = np.abs(below_threshold - cluster_ids).mean()
    # Either cluster may be the "below 0.5" one; keep the better assignment.
    return max(score, 1 - score)

# Baseline scores for the KMeans clustering fitted on all features.
cur_score = silhouette_score(data, kmeans.labels_)
cur_acc = accuracy(df_aggregated["skintone"], kmeans.labels_)

print("Original silhouette score: {} \t accuracy: {}".format(cur_score, cur_acc))

print(features)
# Greedy backward elimination: in each pass, drop the first feature whose
# removal improves the accuracy against the skin-tone split. At most
# len(features) - 1 passes, so at least one feature always remains.
for i in range(len(features) - 1):
    for f in features:
        # `.values` / `axis=1` replace the removed `as_matrix()` and the
        # positional axis argument of `drop`.
        data = sklearn.preprocessing.scale(x.drop(f, axis=1).values)
        kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
        score = silhouette_score(data, kmeans.labels_)
        acc = accuracy(df_aggregated["skintone"], kmeans.labels_)
        if acc > cur_acc:
            print("Dropping {} gives Silhouette score {} \t accuracy: {}".format(f, score, acc))
            features.remove(f)
            cur_score = score
            cur_acc = acc
            x = x.drop(f, axis=1)
            break
    else:
        # No candidate improved the accuracy: `x` is unchanged and the fits
        # are seeded, so further passes would repeat the same work.
        break


Original silhouette score: 0.2041511586863509 	 accuracy: 0.5460358056265985
['defeats', 'ties', 'position', 'height', 'victories', 'goals', 'weight', 'redCards', 'cost', 'yellowCards']
Dropping cost gives Silhouette score 0.1961789358486745 	 accuracy: 0.5607416879795397
Dropping position gives Silhouette score 0.22167562819333234 	 accuracy: 0.5652173913043479
Dropping defeats gives Silhouette score 0.20314673724726415 	 accuracy: 0.571611253196931
Dropping height gives Silhouette score 0.2344718699049821 	 accuracy: 0.579923273657289
Dropping ties gives Silhouette score 0.23748648860185428 	 accuracy: 0.6368286445012787


We see that whichever features we drop, we don't get anything better than about 64% accuracy, which is only about 14 percentage points better than a random classifier. Let's try another clustering algorithm.

In [38]:
from sklearn.cluster import SpectralClustering

# Start again from the full feature set.
x = orig_x.copy()
features = list(x.columns)
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
data = sklearn.preprocessing.scale(x.values)

# Spectral clustering has random initialisation steps; seed it, as is done
# for the KMeans runs, so that re-running the notebook gives the same result.
classifier = SpectralClustering(n_clusters=2,
                              eigen_solver='amg',
                              affinity="rbf",
                              random_state=0)
spectral = classifier.fit(data)

cur_score = silhouette_score(data, spectral.labels_)
cur_acc = accuracy(df_aggregated["skintone"], spectral.labels_)

print("Original silhouette score: {} \t accuracy: {}".format(cur_score, cur_acc))


Original silhouette score: 0.34029044932690966 	 accuracy: 0.7576726342710998


An accuracy of about 76% is considerably better than with KMeans. Let's now try removing features.

In [39]:
# Greedy backward elimination with the spectral clusterer: in each pass, drop
# the first feature whose removal improves the silhouette score. At most
# len(features) - 1 passes, so at least one feature always remains.
# NOTE(review): silhouette scores are computed on different feature subsets
# here, so compare them with care.
for i in range(len(features) - 1):
    for f in features:
        # `.values` / `axis=1` replace the removed `as_matrix()` and the
        # positional axis argument of `drop`.
        data = sklearn.preprocessing.scale(x.drop(f, axis=1).values)
        results = classifier.fit(data)
        score = silhouette_score(data, results.labels_)
        acc = accuracy(df_aggregated["skintone"], results.labels_)
        if score > cur_score:
            print("Dropping {} gives Silhouette score {} \t accuracy: {}".format(f, score, acc))
            features.remove(f)
            cur_score = score
            cur_acc = acc
            x = x.drop(f, axis=1)
            break
    else:
        # No candidate improved the score and `x` is unchanged, so stop
        # instead of re-fitting the same candidates again.
        break

Dropping defeats gives Silhouette score 0.3592034629889609 	 accuracy: 0.7576726342710998
Dropping ties gives Silhouette score 0.4292783662765878 	 accuracy: 0.7589514066496164
Dropping goals gives Silhouette score 0.4856503015170032 	 accuracy: 0.7576726342710998
Dropping position gives Silhouette score 0.5256528436120402 	 accuracy: 0.7576726342710998
Dropping yellowCards gives Silhouette score 0.5624662958621395 	 accuracy: 0.7576726342710998
Dropping weight gives Silhouette score 0.5821891941509146 	 accuracy: 0.7589514066496164
Dropping height gives Silhouette score 0.6086258378129109 	 accuracy: 0.7576726342710998
Dropping victories gives Silhouette score 0.7170147246501964 	 accuracy: 0.7576726342710998
Dropping redCards gives Silhouette score 0.7566283280168588 	 accuracy: 0.7576726342710998


As we saw above