# Homework 4: machine learning

## Data processing

In [163]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.cross_validation import KFold

Read the file and remove all rows belonging to players that have no skin-color rating (i.e. rows where `rater1` is missing).

In [164]:
# Load the table (the first CSV column becomes the index) and keep only the
# rows that actually carry a `rater1` value.
dyads = (
    pd.read_csv("CrowdstormingDataJuly1st.csv", index_col=0)
    .dropna(subset=['rater1'])
)
print(dyads.shape)
dyads.head()

(124621, 27)


Unnamed: 0_level_0,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,...,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,...,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,...,1.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752


## Feature selection

Aggregate the data for each player. We consider the following variables:
- The height and weight of the player
- The total number of games played
- The total number of victories, ties and defeats
- The total number of goals scored
- The total number of red cards, yellow-red cards and yellow cards received

In [165]:
# One group per distinct value of index level 0 (the `playerShort` index).
group_players = dyads.groupby(level=0)

# Physical attributes use the 'first' aggregate of each group; every count
# column is summed over the rows of the group. Keys are inserted in the same
# order as before so the resulting column order is unaffected.
aggregation = dict.fromkeys(['height', 'weight'], 'first')
aggregation.update(dict.fromkeys(
    ['games', 'victories', 'defeats', 'ties', 'goals',
     'redCards', 'yellowReds', 'yellowCards'],
    'sum'))

players = group_players.agg(aggregation)
print(players.shape)
players.head()

(1585, 10)


Unnamed: 0_level_0,goals,height,yellowReds,weight,defeats,games,yellowCards,victories,redCards,ties
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
aaron-hughes,9,182.0,0,71.0,228,654,19,247,0,179
aaron-hunt,62,183.0,0,73.0,122,336,42,141,1,73
aaron-lennon,31,165.0,0,63.0,115,412,11,200,0,97
aaron-ramsey,39,178.0,0,76.0,68,260,31,150,1,42
abdelhamid-el-kaoutari,1,180.0,4,73.0,43,124,8,41,2,40


If the weight or the height of a player is NaN, we replace it with the average weight or height, respectively, of all the players.

In [166]:
# Both column means are computed before any value is filled in.
av_height, av_weight = (players[column].mean() for column in ('height', 'weight'))

# Substitute the column mean wherever the measurement is missing.
for column, column_mean in (('height', av_height), ('weight', av_weight)):
    players[column] = players[column].fillna(column_mean)

We create extra features by normalizing the counts by the number of games played:
- The fraction of games that ended in a victory, a tie or a defeat
- The number of red cards, yellow-red cards and yellow cards received per game played

In [167]:
# Count columns that get a per-game ratio; each new column is named
# 'percentage_<column>' and holds count / games (a ratio, not scaled by 100).
categorical_values = ['victories', 'ties', 'defeats', 'redCards', 'yellowReds', 'yellowCards']
games_played = players['games']
for name in categorical_values:
    ratio_column = 'percentage_' + name
    players[ratio_column] = players[name].div(games_played)
players.head()

Unnamed: 0_level_0,goals,height,yellowReds,weight,defeats,games,yellowCards,victories,redCards,ties,percentage_victories,percentage_ties,percentage_defeats,percentage_redCards,percentage_yellowReds,percentage_yellowCards
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
aaron-hughes,9,182.0,0,71.0,228,654,19,247,0,179,0.377676,0.2737,0.348624,0.0,0.0,0.029052
aaron-hunt,62,183.0,0,73.0,122,336,42,141,1,73,0.419643,0.217262,0.363095,0.002976,0.0,0.125
aaron-lennon,31,165.0,0,63.0,115,412,11,200,0,97,0.485437,0.235437,0.279126,0.0,0.0,0.026699
aaron-ramsey,39,178.0,0,76.0,68,260,31,150,1,42,0.576923,0.161538,0.261538,0.003846,0.0,0.119231
abdelhamid-el-kaoutari,1,180.0,4,73.0,43,124,8,41,2,40,0.330645,0.322581,0.346774,0.016129,0.032258,0.064516


Transform the categorical data into numerical values (for example, Spain = 3) so that it can be used in a random forest. We use:
- club
- country of the league
- position

In [168]:
# A single encoder instance is re-fitted for every column, so after the loop
# `le` only holds the mapping of the last column processed.
le = preprocessing.LabelEncoder()

categorical_values = ['club', 'leagueCountry', 'position']
for name in categorical_values:
    # Per-player value of the column: the 'first' aggregate of each group.
    categorie = group_players.agg({name:'first'})
    # `.values` replaces DataFrame.as_matrix(), which newer pandas releases no
    # longer provide. The list is built once and handed to the encoder in the
    # same form as before.
    labels = categorie.values.flatten().tolist()
    # fit_transform learns the label -> integer code mapping and applies it
    # in one call instead of a separate fit() and transform().
    players[name] = le.fit_transform(labels)

players.head()

Unnamed: 0_level_0,goals,height,yellowReds,weight,defeats,games,yellowCards,victories,redCards,ties,percentage_victories,percentage_ties,percentage_defeats,percentage_redCards,percentage_yellowReds,percentage_yellowCards,club,leagueCountry,position
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
aaron-hughes,9,182.0,0,71.0,228,654,19,247,0,179,0.377676,0.2737,0.348624,0.0,0.0,0.029052,34,0,1
aaron-hunt,62,183.0,0,73.0,122,336,42,141,1,73,0.419643,0.217262,0.363095,0.002976,0.0,0.125,91,2,0
aaron-lennon,31,165.0,0,63.0,115,412,11,200,0,97,0.485437,0.235437,0.279126,0.0,0.0,0.026699,83,0,10
aaron-ramsey,39,178.0,0,76.0,68,260,31,150,1,42,0.576923,0.161538,0.261538,0.003846,0.0,0.119231,6,0,3
abdelhamid-el-kaoutari,1,180.0,4,73.0,43,124,8,41,2,40,0.330645,0.322581,0.346774,0.016129,0.032258,0.064516,51,1,1


In [169]:
# Per-player target: the 'first' aggregate of the `rater1` column, kept as a
# one-column DataFrame.
skin_color = group_players[['rater1']].first()
skin_color.head()

Unnamed: 0_level_0,rater1
playerShort,Unnamed: 1_level_1
aaron-hughes,0.25
aaron-hunt,0.0
aaron-lennon,0.25
aaron-ramsey,0.0
abdelhamid-el-kaoutari,0.25


## Assignment 1: predict player's skin color

We convert the pandas data frames to NumPy arrays in order to match the data format expected by scikit-learn. We also multiply the player's skin-color rating by 4 so that every class label is a whole number (0.25 becomes 1, and so on) instead of a fraction.

In [170]:
# Feature matrix and target vector as NumPy arrays. `.values` replaces
# DataFrame.as_matrix(), which newer pandas releases no longer provide.
X = players.values
Y = skin_color.values.flatten()
# map 0.25 to 1 etc: scale by 4, then round and cast so the labels really are
# integers (assumes ratings are multiples of 0.25, as in the sample shown).
Y = np.rint(Y * 4).astype(int)

In [176]:
# Fixed seed so the forest, and therefore the printed scores, are repeatable
# from one run to the next.
SEED = 0

# NOTE(review): this call signature belongs to the legacy
# sklearn.cross_validation.KFold imported at the top of the notebook; moving
# to sklearn.model_selection.KFold requires changing both the import and this
# cell (n_splits=..., kf.split(X)).
kf = KFold(len(X), n_folds=4)
clf = RandomForestClassifier(n_estimators=500, random_state=SEED)

for train_index, test_index in kf:
    # fit() rebuilds the forest from scratch on the current training fold
    clf = clf.fit(X[train_index], Y[train_index])
    # test model: share of held-out players whose predicted label equals the true one
    Y_predict = clf.predict(X[test_index])
    print("accurancy predictions: ", float(np.mean(Y[test_index] == Y_predict)))

accurancy predictions:  0.3853904282115869
accurancy predictions:  0.4116161616161616
accurancy predictions:  0.4494949494949495
accurancy predictions:  0.42676767676767674
