In [90]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

# Importing and simple cleaning of data

First of all, some simple cleaning. We remove features that have nothing to do with the skin color. We remove playerShort (we will use 'player' later for aggregating), birthday, Alpha_3 (since it is the same as refCountry) and photoID.

There are missing values for height, weight and position. And also 163 dyads miss the information for the implicit association test and the explicit bias scores.

First we will remove the refrees that got in the data by mistake (as mentioned in the preprocessing article) and see if the problem is solved then. After removing these refrees, there were still 110 samples with missing data which we just removed.

The missing values of height and weight are replaced by the respective means and the missing values in position are replaced by the most frequently occuring position.

In [254]:
df = pd.read_csv ('CrowdstormingDataJuly1st.csv')
df.head ()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


In [255]:
# remove values without skin color rating
df_train_raw = df [pd.notnull (df ['rater1'])]

In [256]:
# drop unimportant features
df_train_raw = df_train_raw.drop(['birthday', 'player', 'Alpha_3', 'photoID'], axis=1)

In [257]:
# remove referees that are not supposed to be in here
refrees_to_remove = df_train_raw.groupby('refNum').sum()[df_train_raw.groupby('refNum').sum()['games'] < 22].index.tolist()
df_train_raw = df_train_raw[~df_train_raw.refNum.isin(refrees_to_remove)]

In [258]:
# remove samples that don't have IAT or Exp score information
df_train_raw = df_train_raw[df_train_raw.meanIAT.notnull()]

In [259]:
# set missing height, weight and position values to mean
mean_height = df_train_raw.height.mean()
mean_weight = df_train_raw.weight.mean()
most_frequent_position = df_train_raw['position'].value_counts().index[0]

df_train_raw.loc[df_train_raw.height.isnull(),'height'] = mean_height
df_train_raw.loc[df_train_raw.weight.isnull(),'weight'] = mean_weight
df_train_raw['position'] = df_train_raw['position'].fillna(most_frequent_position)

# Advanced pre-processing

We create a new feature called "skinColor" which is the average of both ratings, then mapped to either "white" or "black" to simplify our classification.

Position, club and leagueCountry are not numerical, we use dummy variables to make them numerical. For now we left out the club feature because it would cause a lot of dummy variables and we assume it is not a significant feature. We will check in the end if we get significantly better result if we add this feature.

We drop height and weight, as there are not features that we are looking for to influence the model.

**TODO: Aggregrate per player** (Done)

**TODO: feature engineering**

In [260]:
df_train = df_train_raw

# create feature "skinColor"
def attribute_skin_label(val):
    if val > 0.5:
        return 1 #  "black"
    else:
        return 0  # "white"
#Replace rating with either white or black
df_train['skinColor'] = df_train[['rater1','rater2']].mean(axis=1).apply(attribute_skin_label)
df_train = df_train.drop(['rater1', 'rater2'], 1)

# raters = pd.DataFrame (df_train_raw [['rater1', 'rater2']].mean (axis=1))
# raters.columns = ['skinColor']
# df_train = pd.merge (raters, df_train_raw.drop (['rater1', 'rater2'], 1), right_index = True, left_index = True)

# add dummy variables for position, club and leagueCountry
n_positions = len(df_train.position.unique())
#n_club = len(df_train.club.unique())
n_leagueCountry = len(df_train.leagueCountry.unique())

d_positions = pd.get_dummies(df_train['position'])
#d_club = pd.get_dummies(df_train['club'])
d_leagueCountry = pd.get_dummies(df_train['leagueCountry'])

df_train = pd.concat([df_train, d_positions, d_leagueCountry], axis=1)
df_train = df_train.drop(['position', 'club', 'leagueCountry','height','weight'], 1)

In [261]:
# Checking how many "white" and "black" players we have
df_train['skinColor'].value_counts()

0    98033
1    18489
Name: skinColor, dtype: int64

## Player aggregation

We aggregate per player because we want to predict the skin color of a given player, not of a given dyad.

In [262]:
df_agg = df_train.groupby('playerShort').mean()

# Making the model


We decide to keep meanIAT and meanExp. As we have aggregated by player, the represent the average amount of bias the players receive over all the referees they meet. 
We decide to drop refNum, refCountry, nIAT, seIAT, nExp, seExp for simplicity.


In [263]:
clf = RandomForestClassifier(n_estimators=10)
Y = np.asarray(df_agg['skinColor'], dtype='str')
X = df_agg.drop(['skinColor','refNum','refCountry','nIAT','seIAT','nExp','seExp'], 1)

In [266]:
X.columns  # Just checking the columns we are using

Index(['games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards',
       'yellowReds', 'redCards', 'meanIAT', 'meanExp', 'Attacking Midfielder',
       'Center Back', 'Center Forward', 'Center Midfielder',
       'Defensive Midfielder', 'Goalkeeper', 'Left Fullback',
       'Left Midfielder', 'Left Winger', 'Right Fullback', 'Right Midfielder',
       'Right Winger', 'England', 'France', 'Germany', 'Spain'],
      dtype='object')

# Cross validation

We check the accuracy of our model through cross validation.

In [267]:
cross_val_score(clf, X, Y, cv=5, scoring='accuracy')

array([ 0.82018927,  0.81388013,  0.81072555,  0.85488959,  0.83860759])

In [268]:
clf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Feature importance

We check what the most important features are.
TODO: graph of feature importance distribution

We see that the "average bias" the players encounter from referees is the most important factor in determining skin color.

Also the position they play in has very little relevance.

The number of cards they receive is somewhat relevant


In [269]:
feature_importance = {}
for x,y in zip(X.columns,clf.feature_importances_):
    feature_importance[x]=y
import operator
sorted_feature_importance = sorted(feature_importance.items(), key=operator.itemgetter(1),reverse=True)
for i in sorted_feature_importance:
    print (i)

('meanExp', 0.13039977114870807)
('meanIAT', 0.10516268203297434)
('victories', 0.091983221676714064)
('goals', 0.089677250668306605)
('defeats', 0.079975712345983191)
('yellowCards', 0.077810031676821073)
('games', 0.073926672438518679)
('ties', 0.071810129118304714)
('redCards', 0.056151387326115618)
('yellowReds', 0.045225446622801882)
('France', 0.041166973539558366)
('Germany', 0.023990159843422475)
('Spain', 0.014999508623595975)
('Center Forward', 0.014628896137841402)
('England', 0.012740605046678882)
('Left Winger', 0.011132061375131043)
('Right Winger', 0.010940911875398054)
('Center Back', 0.010241343730599145)
('Goalkeeper', 0.0090814391660954761)
('Right Fullback', 0.0058170225456167877)
('Defensive Midfielder', 0.0056407093078226726)
('Left Midfielder', 0.0047022109511610805)
('Left Fullback', 0.0046701608118241214)
('Attacking Midfielder', 0.0041043895620666822)
('Center Midfielder', 0.0026854338405417713)
('Right Midfielder', 0.0013358685873979051)
