## Description
The first part is focused on inferring the missing value about skin color rating using machine learning algorithm. The second part try to infer wether referees' behavior change depending of the skin-color of the player.

Let start by doing some imports.

In [88]:
import os
import pandas as pd
import numpy as np
import sklearn

%matplotlib inline
import matplotlib.pyplot as plt                         # Matplotlib's pyplot: MATLAB-like syntax
import seaborn as sns                                   # For pretty plots

In [111]:
df = pd.read_csv(os.path.join('data', 'CrowdstormingDataJuly1st.csv'))
dfd = pd.read_csv(os.path.join('data', 'crowdstorm_disaggregated.csv'))
df_cols = set(df.columns)
dfd_cols = set(dfd.columns)
df['skintone'] = 0.5 * (df['rater1']+df['rater2'])
df_skinton = df[["playerShort", "skintone"]].reset_index().groupby('playerShort', axis=0).mean()
mean_skin = set(df_skinton['skintone'].dropna().unique().tolist())
unique_skin = set(df['skintone'].dropna().unique().tolist())
LABEL_FIELDS = ['club', 'leagueCountry', 'position', 'Alpha_3']
UNKNOWN_LABEL = 'MISSING'
label_encoders = {}

from sklearn.preprocessing import LabelEncoder

df_categorized = df.copy()

for label in LABEL_FIELDS:
    encoder = LabelEncoder()
    values = df_categorized[label].fillna(UNKNOWN_LABEL, inplace=False)
    encoder.fit(values)
    transformed = encoder.transform(values)
    df_categorized[label] = transformed
    
    label_encoders[label] = encoder
    
# Special case for the skintone where we need to keep NaN values
encoder = LabelEncoder()
skintone = df_categorized['skintone']
skintone = skintone[skintone.notnull()]
encoder.fit(skintone)
df_categorized.loc[skintone.index, 'skintone'] = encoder.transform(skintone)
label_encoders['skintone'] = encoder


df_subset = df_categorized[['playerShort', 'club', 'leagueCountry', 'position','games', 'yellowCards', 'redCards', 'meanIAT', 'skintone', 'birthday', 'height',
       'weight', 'victories', 'ties', 'defeats', 'goals']].copy()

for f in ['meanIAT', 'yellowCards', 'redCards']:
    df_subset[f].fillna(0, inplace=True)
df_subset["skintone"] = df["skintone"]
df_subset["cost"] = df_subset['meanIAT']*(df_subset['yellowCards'] + 3*df_subset['redCards'])

# Unsupervised clustering

We keep the dataframe `df_subset` because we want to keep the `cost` column from part 1 as a likely useful aggregate of racial bias that a player received.



In [147]:
df_aggregated = df_subset.drop(["meanIAT", "club", "leagueCountry"], 1).set_index(["playerShort"]).groupby(level=0).agg(
    {
        "skintone": "mean",
        "height": "mean",
        "weight": "mean",
        "position": "first",
        "games": "max",
        "yellowCards": "sum",
        "redCards": "sum",
        "cost": "sum",
        "victories": "max",
        "ties": "max",
        "defeats": "max",
        "goals": "max",
     }
).reset_index().drop(["playerShort"], 1).dropna()
df_aggregated["yellowCards"] = df_aggregated["yellowCards"] / df_aggregated["games"]
df_aggregated["redCards"] = df_aggregated["redCards"] / df_aggregated["games"]
df_aggregated["cost"] = df_aggregated["cost"] / df_aggregated["games"]

x = df_aggregated.drop(["games", "skintone"], axis=1)

data = sklearn.preprocessing.scale(x.as_matrix())
features = list(x.columns)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
sklearn.preprocessing.scale(x.as_matrix())

array([[ 1.50519227, -0.27763032,  2.69206622, ..., -0.69747321,
        -1.08764045, -0.95884737],
       [ 0.5503674 ,  1.23795022,  1.25605272, ..., -0.26859413,
        -1.36742164, -0.60411455],
       [ 2.22131092,  0.48015995,  1.6150561 , ..., -0.69747321,
         1.7101715 , -1.12672296],
       ..., 
       [ 1.26648605, -0.65652546,  1.6150561 , ...,  1.33970242,
        -1.08764045, -0.39334683],
       [-1.12057612, -1.0354206 , -1.25697089, ...,  2.56200779,
         0.03148433, -0.09501521],
       [-0.40445746,  0.10126481,  0.1790426 , ..., -0.69747321,
        -0.24829687, -0.61036593]])

We arbitrarily split the skintone feature in two parts, the ones strictly below 0.5 and the ones equal or above 0.5.

In [144]:
def accuracy(skintone, labels):
    skintone = skintone < 0.5
    score = ((skintone - labels).apply(np.abs)).mean()
    if score < 0.5:
        return 1 - score
    return score

x["label"] = kmeans.labels_
cur_score = silhouette_score(data, kmeans.labels_)
cur_acc = accuracy(df_aggregated["skintone"], kmeans.labels_)

print("Original silhouette score: {} \t accuracy: {}".format(cur_score, acc))

print(features)
for i in range(0, len(features)):
    score_changed = False
    for f in features:
        data = sklearn.preprocessing.scale(x.drop(f, 1).as_matrix())
        kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
        score = silhouette_score(data, kmeans.labels_)
        acc = accuracy(df_aggregated["skintone"], kmeans.labels_)
        if score > cur_score:
            print("Dropping {} gives Silhouette score {} \t accuracy: {}".format(f, score, acc))
            print(np.mean(kmeans.labels_))
            #print(pd.concat([df_aggregated.reset_index()["skintone"], pd.Series(kmeans.labels_)], axis=1))
            del(features[features.index(f)])
            cur_score = score
            cur_acc = acc
            x = x.drop(f, 1)
            break


Original silhouette score: 0.17034142025361285 	 accuracy: 0.5287723785166241
['victories', 'goals', 'defeats', 'weight', 'yellowCards', 'ties', 'height', 'redCards', 'position', 'cost']
Dropping victories gives Silhouette score 0.23899959801234397 	 accuracy: 0.5287723785166241
0.608056265985
Dropping goals gives Silhouette score 0.2422054038445399 	 accuracy: 0.5300511508951407
0.610613810742
Dropping weight gives Silhouette score 0.2698625506279337 	 accuracy: 0.5294117647058824
0.609974424552
Dropping yellowCards gives Silhouette score 0.300717168567046 	 accuracy: 0.5294117647058824
0.608695652174
Dropping height gives Silhouette score 0.3506022529584603 	 accuracy: 0.5287723785166241
0.608056265985
Dropping redCards gives Silhouette score 0.40306090631917657 	 accuracy: 0.5287723785166241
0.608056265985
Dropping defeats gives Silhouette score 0.4120488101318473 	 accuracy: 0.5287723785166241
0.608056265985
Dropping ties gives Silhouette score 0.41917064228427436 	 accuracy: 0.528



Dropping cost gives Silhouette score 1.0 	 accuracy: 0.5287723785166241
0.608056265985
