# Homework 4: machine learning

## Data processing

In [None]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, KFold

Read the file and keep only the rows whose `rater1` skin-color rating is present (players without a rating are dropped).

In [None]:
# Load the table (the first CSV column becomes the index) and drop every row
# whose `rater1` value is missing.
dyads = (
    pd.read_csv("CrowdstormingDataJuly1st.csv", index_col=0)
    .dropna(subset=['rater1'])
)
print(dyads.shape)
dyads.head()

## Feature selection

Aggregate the data per player. For each player we keep the following variables:
- The height and weight of the player
- The total number of games played
- The total number of victories, ties and defeats
- The total number of goals scored
- The total number of red cards, yellow-red cards and yellow cards received

In [None]:
# Rows that share the same index label are grouped together (one group per player).
group_players = dyads.groupby(level=0)

# Height and weight are taken from the first available row of each group;
# every count column is summed over the group's rows.
first_value_columns = ['height', 'weight']
summed_columns = ['games', 'victories', 'defeats', 'ties', 'goals',
                  'redCards', 'yellowReds', 'yellowCards']

aggregation = {column: 'first' for column in first_value_columns}
aggregation.update({column: 'sum' for column in summed_columns})

players = group_players.agg(aggregation)
print(players.shape)
players.head()

If a player's height or weight is NaN, we replace it with the average height or weight, respectively, computed over all players.

In [None]:
# Mean of each body measurement over all players (NaN entries are skipped by `mean`).
column_means = players[['height', 'weight']].mean()
av_height = column_means['height']
av_weight = column_means['weight']

# Fill the gaps of each column with that column's own mean.
players = players.fillna(value={'height': av_height, 'weight': av_weight})

We create extra features by dividing the per-player totals by the number of games played (stored in the `percentage_*` columns):
- The fraction of games that ended in a victory, a tie or a defeat
- The number of red cards, yellow-red cards and yellow cards per game

In [None]:
# Totals that are turned into per-game rates by dividing by the games played.
count_columns = ['victories', 'ties', 'defeats', 'redCards', 'yellowReds', 'yellowCards']
for column in count_columns:
    players['percentage_' + column] = players[column].div(players['games'])
players.head()

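Compute extra features: for each player, the correlation (across that player's rows) between each of `meanIAT` and `meanExp` and each card count (red, yellow, yellow-red). Undefined correlations are set to 0.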

In [None]:
# Per-player correlation matrices, restricted to the columns that the
# feature-building cell actually reads. Correlating the whole frame is much
# slower and, on pandas versions where `corr` no longer drops non-numeric
# columns silently, fails on the text columns. Pairwise correlations do not
# depend on which other columns are present, so the values read from `c`
# are unchanged.
correlation_columns = ['meanIAT', 'meanExp', 'redCards', 'yellowCards', 'yellowReds']
c = group_players[correlation_columns].corr()

In [None]:
# `c` is indexed by (player, column name). Taking the cross-section for one
# score on the second level leaves a frame indexed by player whose columns
# hold the correlation of that score with every other column.
for racism in ('meanIAT', 'meanExp'):
    per_player = c.xs(racism, level=1)
    for card in ('redCards', 'yellowCards', 'yellowReds'):
        # Undefined correlations (NaN) are stored as 0.
        players['cor_' + racism + card] = per_player[card].fillna(0)
players.head()

Transform categorical data into numerical values (for example, Spain = 3) so that they can be used by the random forest. We encode:
- club
- country of the league
- position

In [None]:
le = preprocessing.LabelEncoder()

categorical_values = ['club', 'leagueCountry', 'position']
for name in categorical_values:
    # First available value of the column for each player.
    categorie = group_players.agg({name: 'first'})
    # Missing categories are made an explicit 'nan' label. Previously this
    # happened only implicitly, when a list mixing strings and NaN was coerced
    # to a NumPy string array. Doing it explicitly keeps the same labels and
    # avoids handing the encoder a mix of strings and floats.
    labels = categorie[name].fillna('nan').astype(str)
    # `.values` replaces `as_matrix()`, which newer pandas releases no longer
    # provide. Wrapping the codes in a Series aligns them with `players` by
    # index label instead of relying on row order.
    players[name] = pd.Series(le.fit_transform(labels.values), index=labels.index)

players.head()

In [None]:
# Target variable: for each player, the first available `rater1` value
# among that player's rows (kept as a one-column frame).
skin_color = group_players[['rater1']].first()
skin_color.head()

## Assignment 1: predict player's skin color

We convert the pandas data frames to NumPy arrays to match the data format expected by scikit-learn. We also multiply the player's skin-color rating by 4 so that the classes are whole numbers (0.25 becomes 1, and so on).

In [None]:
# Feature matrix: one row per player, one column per column of `players`.
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
X = players.values

# Class labels: map 0.25 to 1 etc. by multiplying the rating by 4.
# The result is rounded and cast so that the labels really are integers
# rather than floats that merely hold whole numbers.
# Rows without a `rater1` value were dropped when the data was loaded, so no
# NaN is expected here.
Y = np.rint(skin_color.values.flatten() * 4).astype(int)

Train the random forest and evaluate it with 4-fold cross-validation

In [None]:
# Fixed seed: the forest draws random bootstrap samples, so without it every
# run yields different scores and different feature importances.
RANDOM_STATE = 0

kf = KFold(n_splits=4)
clf = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=None,
                             random_state=RANDOM_STATE)

for train_index, test_index in kf.split(X):
    # Refit on this fold's training part. After the loop `clf` holds the model
    # of the last fold, which is what later cells inspect.
    clf = clf.fit(X[train_index], Y[train_index])
    # test model
    Y_predict = clf.predict(X[test_index])
    Y_predict2 = clf.predict(X[train_index])
    # Accuracy = fraction of samples whose predicted label equals the true one.
    print("accurancy predictions test data: ", np.mean(Y[test_index] == Y_predict))
    print("accurancy predictions training data: ", np.mean(Y[train_index] == Y_predict2))

In [None]:
# Accuracy of `clf` on each of 4 cross-validation folds.
cross_val_score(estimator=clf, X=X, y=Y, cv=4, scoring='accuracy')

### Feature importance

In [None]:
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees of the
# forest. The per-tree values must be used here: repeating the forest-level
# importances once per tree always gives a standard deviation of zero.
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]
# Column names in that same order (the columns of X are the columns of `players`).
feature_names = players.columns[indices]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    # The name and the value must both refer to the f-th ranked feature.
    print("%d. feature %s (%f)" % (f + 1, feature_names[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
# Label the bars with feature names rather than raw column positions.
plt.xticks(range(X.shape[1]), feature_names, rotation=90)
plt.xlim([-1, X.shape[1]])
plt.ylabel("Importance")
plt.show()