In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
try:
    # Current location of these helpers (scikit-learn >= 0.18).
    from sklearn.model_selection import cross_val_score, learning_curve
except ImportError:
    # Legacy modules, removed in scikit-learn 0.20.
    from sklearn.cross_validation import cross_val_score
    from sklearn.learning_curve import learning_curve

# Importing and simple cleaning of data

First of all, some simple cleaning. We remove features that have nothing to do with the skin color: birthday, player (we keep 'playerShort' and use it later for aggregating), Alpha_3 (since it is the same as refCountry) and photoID.

There are missing values for height, weight and position. In addition, 163 dyads are missing the implicit association test (IAT) and explicit bias scores.

First we remove the referees that got into the data by mistake (as mentioned in the preprocessing article) and check whether that solves the problem. After removing these referees, there were still 110 samples with missing data, which we simply removed.

The missing values of height and weight are replaced by the respective means, and the missing values in position are replaced by the most frequently occurring position.

In [None]:
# Load the raw data set; the path is relative to the working directory.
df = pd.read_csv('CrowdstormingDataJuly1st.csv')
df.head()

In [None]:
# Keep only the rows that have a skin color rating. All samples have either
# two raters or none, so checking rater1 alone is sufficient.
has_rating = df['rater1'].notnull()
df_train_raw = df[has_rating]

In [None]:
# Columns that are not used as features anywhere further down.
unimportant_features = ['birthday', 'player', 'Alpha_3', 'photoID']
df_train_raw = df_train_raw.drop(unimportant_features, axis=1)

In [None]:
# Remove referees that are not supposed to be in here: those whose rows add up
# to fewer than 22 games in total.
# Only the 'games' column is aggregated, and only once, instead of summing
# every column of the frame twice just to read that one total.
games_per_referee = df_train_raw.groupby('refNum')['games'].sum()
refrees_to_remove = games_per_referee[games_per_referee < 22].index.tolist()
df_train_raw = df_train_raw[~df_train_raw.refNum.isin(refrees_to_remove)]

In [None]:
# Remove samples that don't have IAT or Exp score information.
# Both score columns are checked (a row is dropped if either is missing), so
# the filter does what this comment says instead of testing meanIAT only.
df_train_raw = df_train_raw.dropna(subset=['meanIAT', 'meanExp'])

In [None]:
# Impute missing values: height and weight get their column mean, position
# gets the most frequently occurring position.
mean_height = df_train_raw.height.mean()
mean_weight = df_train_raw.weight.mean()
most_frequent_position = df_train_raw['position'].value_counts().index[0]

# fillna with a per-column mapping returns a new frame. Writing through .loc
# into a frame that was produced by boolean filtering can trigger pandas'
# SettingWithCopyWarning, which this form avoids.
df_train_raw = df_train_raw.fillna({
    'height': mean_height,
    'weight': mean_weight,
    'position': most_frequent_position,
})

# Advanced pre-processing

We create a new feature called "skinColor": the average of both ratings, mapped to "black" (1) if the average is above 0.5 and to "white" (0) otherwise, to simplify our classification.

Position, club and leagueCountry are not numerical, so we use dummy variables to make them numerical. For now we leave out the club feature because it would create a lot of dummy variables and we assume it is not a significant feature. We will check in the end whether we get significantly better results if we add this feature.

We drop height and weight, as they are not features that we want to influence the model.

**TODO: Aggregate per player** (Done)

**TODO: feature engineering**

In [None]:
# Work on an independent copy. A plain assignment only creates a second name
# for df_train_raw, so the skinColor column added below would also end up in
# the raw frame.
df_train = df_train_raw.copy()

# create feature "skinColor"
def attribute_skin_label(val):
    """Binarise an averaged rating: 1 ("black") if val is above 0.5, else 0 ("white")."""
    return 1 if val > 0.5 else 0
# Replace the two ratings with a single binary label (1 = "black", 0 = "white").
# assign() returns a new frame, so the object df_train was bound to is left
# untouched. axis is passed by keyword because pandas 2.0 no longer accepts it
# as a positional argument of drop().
mean_rating = df_train[['rater1', 'rater2']].mean(axis=1)
df_train = (
    df_train
    .assign(skinColor=mean_rating.apply(attribute_skin_label))
    .drop(['rater1', 'rater2'], axis=1)
)

# raters = pd.DataFrame (df_train_raw [['rater1', 'rater2']].mean (axis=1))
# raters.columns = ['skinColor']
# df_train = pd.merge (raters, df_train_raw.drop (['rater1', 'rater2'], 1), right_index = True, left_index = True)

# Add dummy variables for position and leagueCountry. club is deliberately not
# encoded; it is dropped below together with the encoded source columns and
# with height and weight.
n_positions = len(df_train.position.unique())
n_leagueCountry = len(df_train.leagueCountry.unique())

d_positions = pd.get_dummies(df_train['position'])
d_leagueCountry = pd.get_dummies(df_train['leagueCountry'])

df_train = pd.concat([df_train, d_positions, d_leagueCountry], axis=1)
# axis has to be a keyword argument: the positional form drop(labels, 1) was
# removed in pandas 2.0.
df_train = df_train.drop(['position', 'club', 'leagueCountry', 'height', 'weight'], axis=1)

In [None]:
# Checking the balance between the "white" (0) and "black" (1) labels.
# Note: df_train still has one row per dyad here (players are aggregated
# further down), so these are dyad counts, not player counts.
df_train['skinColor'].value_counts()

## Player aggregation

We aggregate per player because we want to predict the skin color of a given player, not of a given dyad.

In [None]:
# Interaction features: each card count multiplied by each bias score
# (meanIAT, meanExp) of the same row. The new column is named
# <bias prefix><card suffix>, e.g. 'IATRedCards'.
card_columns = [
    ('RedCards', 'redCards'),
    ('YellowRedCards', 'yellowReds'),
    ('YellowCards', 'yellowCards'),
]
bias_columns = [('IAT', 'meanIAT'), ('Exp', 'meanExp')]

for card_suffix, card_column in card_columns:
    for bias_prefix, bias_column in bias_columns:
        df_train[bias_prefix + card_suffix] = df_train[card_column] * df_train[bias_column]

In [None]:
# Count-like columns are added up over all rows of a player.
count_columns = ['games', 'victories', 'ties', 'defeats', 'goals',
                 'yellowCards', 'yellowReds', 'redCards']

# The label, the dummy variables, the bias scores and the interaction features
# are averaged over all rows of a player.
average_columns = [
    'skinColor',
    'Attacking Midfielder', 'Center Back', 'Center Forward',
    'Center Midfielder', 'Defensive Midfielder', 'Goalkeeper',
    'Left Fullback', 'Left Midfielder', 'Left Winger', 'Right Fullback',
    'Right Midfielder', 'Right Winger',
    'England', 'France', 'Germany', 'Spain',
    'meanIAT', 'meanExp',
    'IATRedCards', 'ExpRedCards', 'IATYellowRedCards', 'ExpYellowRedCards',
    'IATYellowCards', 'ExpYellowCards',
]

rows_by_player = df_train.groupby('playerShort')
summed = rows_by_player[count_columns].sum()
meaned = rows_by_player[average_columns].mean()

In [None]:
# Put the summed and the averaged per-player columns side by side; both frames
# come from a groupby on 'playerShort'.
df_agg = pd.concat([summed, meaned], axis=1)

In [None]:
# Preview the first rows of the aggregated frame.
df_agg.head()

In [None]:
# List the columns that are available after aggregation.
df_agg.columns

# Making the model


We decide to keep meanIAT and meanExp. As we have aggregated by player, they represent the average amount of bias the players receive over all the referees they meet. 
For simplicity we do not use refNum, refCountry, nIAT, seIAT, nExp and seExp as features; they are not among the columns kept by the per-player aggregation.


In [None]:
clf = RandomForestClassifier(n_estimators=10)
# Target: the aggregated skinColor label, converted to strings.
Y = np.asarray(df_agg['skinColor'], dtype='str')
# Features: everything except the label. The referee-level columns listed here
# are not part of df_agg (the aggregation selects its columns explicitly), so
# a strict drop() would raise on them; errors='ignore' keeps the list as a
# guard without failing. axis is a keyword because pandas 2.0 removed the
# positional form drop(labels, 1).
X = df_agg.drop(
    ['skinColor', 'refNum', 'refCountry', 'nIAT', 'seIAT', 'nExp', 'seExp'],
    axis=1,
    errors='ignore',
)

In [None]:
# Sanity check: list the feature columns that are fed to the classifier.
X.columns

# Cross validation

We check the accuracy of our model through cross validation.

In [None]:
# 5-fold cross-validation of clf on (X, Y), scored by accuracy.
cross_val_score(clf, X, Y, cv=5, scoring='accuracy')

In [None]:
# Fit on all of X and Y; the fitted forest's feature_importances_ are read in
# the next section.
clf.fit(X,Y)

## Feature importance

We check what the most important features are.
TODO: graph of feature importance distribution

We see that the "average bias" the players encounter from referees is the most important factor in determining skin color.

Also the position they play in has very little relevance.

The number of cards they receive is somewhat relevant


In [None]:
import operator

# Map every feature name to its importance in the fitted forest.
feature_importance = dict(zip(X.columns, clf.feature_importances_))

# Print the (feature, importance) pairs, most important first.
sorted_feature_importance = sorted(
    feature_importance.items(), key=operator.itemgetter(1), reverse=True
)
for name_and_importance in sorted_feature_importance:
    print(name_and_importance)

# Learning curves

In [None]:
def plot_learning_curve(title, estimator, X, y, cv=20):
    """
    Draw the training and cross-validation learning curves of an estimator.

    A new figure is opened, the mean score of each curve is plotted against
    the training set size, and a band of one standard deviation is shaded
    around each mean.

    Parameters
    ----------
    title : string
        Title for the chart.

    estimator : estimator object
        Passed unchanged to ``learning_curve``.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features)
        Target relative to X for classification or regression.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation splitting strategy, passed unchanged to
        ``learning_curve``. Defaults to 20.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can show or save the
        current figure.
    """
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, scores_on_train, scores_on_test = learning_curve(estimator, X, y, cv=cv)
    plt.grid()

    # (scores, colour, legend label) of each curve, in drawing order.
    curves = [
        (scores_on_train, "r", "Training score"),
        (scores_on_test, "g", "Cross-validation score"),
    ]

    # Shaded bands first: mean +/- one standard deviation along axis 1.
    for scores, colour, _ in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)

    # Then the mean curves on top of the bands.
    for scores, colour, label in curves:
        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


title = "Learning Curves (RandomForestClassifier)"

estimator = RandomForestClassifier(n_estimators=10)
Y = np.asarray(df_agg['skinColor'], dtype='str')
# The referee-level columns listed here are not part of df_agg, so a strict
# drop() would raise on them; errors='ignore' keeps the list as a guard.
# axis is a keyword because pandas 2.0 removed the positional form.
X = df_agg.drop(
    ['skinColor', 'refNum', 'refCountry', 'nIAT', 'seIAT', 'nExp', 'seExp'],
    axis=1,
    errors='ignore',
)

plot_learning_curve(title, estimator, X, Y)

plt.show()

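The cross-validation score is significantly worse than the training score and does not improve when adding more data. A large, persistent gap between the two curves like this points to high variance (the model overfits the training data) rather than high bias.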

In [None]:
title = "Learning Curves (RandomForestClassifier) - combined features"

# Feature groups used for this run, concatenated in this order.
card_features = ['yellowCards', 'yellowReds', 'redCards']
position_features = [
    'Attacking Midfielder', 'Center Back', 'Center Forward',
    'Center Midfielder', 'Defensive Midfielder', 'Goalkeeper',
    'Left Fullback', 'Left Midfielder', 'Left Winger',
    'Right Fullback', 'Right Midfielder', 'Right Winger',
]
league_features = ['England', 'France', 'Germany', 'Spain']
bias_features = ['meanIAT', 'meanExp']
interaction_features = [
    'IATRedCards', 'ExpRedCards',
    'IATYellowRedCards', 'ExpYellowRedCards',
    'IATYellowCards', 'ExpYellowCards',
]

estimator = RandomForestClassifier(n_estimators=10)
Y = np.asarray(df_agg['skinColor'], dtype='str')
X = df_agg[card_features + position_features + league_features
           + bias_features + interaction_features]

plot_learning_curve(title, estimator, X, Y)

plt.show()

Tried to combine the IAT/Exp information with the red cards. Same result.

In [None]:
title = "Learning Curves (RandomForestClassifier) - Only using position"

# Only the position dummy variables are used as features in this run.
position_features = [
    'Attacking Midfielder', 'Center Back', 'Center Forward',
    'Center Midfielder', 'Defensive Midfielder', 'Goalkeeper',
    'Left Fullback', 'Left Midfielder', 'Left Winger',
    'Right Fullback', 'Right Midfielder', 'Right Winger',
]

estimator = RandomForestClassifier(n_estimators=10)
Y = np.asarray(df_agg['skinColor'], dtype='str')
X = df_agg[position_features]

plot_learning_curve(title, estimator, X, Y)

plt.show()

Here I was trying to get a bad score by only using the position dummy variables. I'm not sure what to think of these results. The first two graphs look like high bias but might also be overfitting, because the difference between test and train is significant while both scores separately are alright. This last graph might confirm this because it has low complexity (so definitely not overfitting) and here the curves are close together around 80% (the same as the cross-validation score of the more complex model).