# Sebastian Garcia

# Running Models

- Use Random Forest on the FIFA dataset to predict a diagnosis:
- Clean and preprocess data.
- Run the model on the data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2


In [2]:
fifa = pd.read_csv('FIFA_2018_Statistics.csv')


In [3]:
fifa.head()


Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0


In [4]:
fifa.describe()


Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,1st Goal,Goals in PSO,Own goals,Own goal Time
count,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,...,128.0,128.0,128.0,128.0,128.0,128.0,94.0,128.0,12.0,12.0
mean,1.320312,49.992188,12.59375,3.914062,5.273438,3.359375,4.71875,1.34375,14.890625,2.726562,...,462.648438,106.664062,13.546875,1.695312,0.015625,0.015625,39.457447,0.203125,1.0,45.833333
std,1.156519,10.444074,5.245827,2.234403,2.409675,2.403195,2.446072,1.193404,4.724262,2.049447,...,151.186311,11.749537,4.619131,1.325454,0.124507,0.124507,24.496506,0.807049,0.0,29.978275
min,0.0,25.0,3.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,...,189.0,80.0,5.0,0.0,0.0,0.0,1.0,0.0,1.0,12.0
25%,0.0,42.0,9.0,2.0,4.0,1.75,3.0,0.0,11.0,1.0,...,351.0,101.0,10.0,1.0,0.0,0.0,18.25,0.0,1.0,21.75
50%,1.0,50.0,12.0,3.5,5.0,3.0,5.0,1.0,15.0,2.0,...,462.0,104.5,13.0,2.0,0.0,0.0,39.0,0.0,1.0,35.0
75%,2.0,58.0,15.0,5.0,7.0,4.0,6.0,2.0,18.0,4.0,...,555.25,109.0,16.0,2.0,0.0,0.0,54.75,0.0,1.0,75.75
max,6.0,75.0,26.0,12.0,11.0,10.0,11.0,5.0,26.0,9.0,...,1137.0,148.0,25.0,6.0,1.0,1.0,90.0,4.0,1.0,90.0


In [5]:
fifa.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 27 columns):
Date                      128 non-null object
Team                      128 non-null object
Opponent                  128 non-null object
Goal Scored               128 non-null int64
Ball Possession %         128 non-null int64
Attempts                  128 non-null int64
On-Target                 128 non-null int64
Off-Target                128 non-null int64
Blocked                   128 non-null int64
Corners                   128 non-null int64
Offsides                  128 non-null int64
Free Kicks                128 non-null int64
Saves                     128 non-null int64
Pass Accuracy %           128 non-null int64
Passes                    128 non-null int64
Distance Covered (Kms)    128 non-null int64
Fouls Committed           128 non-null int64
Yellow Card               128 non-null int64
Yellow & Red              128 non-null int64
Red                       128 non-nul

In [6]:
# See if we have NaNs
fifa.isna().sum()


Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64

In [7]:
# Replace the NaNs with '0'
fifa['1st Goal'] = fifa['1st Goal'].fillna(0)
fifa['Own goals'] = fifa['Own goals'].fillna(0)
fifa.isna().sum()


Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                    0
Round                       0
PSO                         0
Goals in PSO                0
Own goals                   0
Own goal Time             116
dtype: int64

In [8]:
# Assign independent and dependent variables
X = fifa.drop(['Man of the Match', 'Opponent', '1st Goal',
                  'Own goals', 'Own goal Time', 'Date'], axis=1)
y = fifa['Man of the Match']


In [9]:
# Enconde the categorical variables in order to turn it to numerical
def one_hot(cat):
    dummies = pd.get_dummies(X[cat], prefix=cat)
    res = pd.concat([X, dummies], axis=1)
    return(res)


In [10]:
# Select categorical and numerical columns
cat_features = X.select_dtypes(include=['object'])
num_features = X.select_dtypes(include=['int'])


In [11]:
# Apply encoding to the categorical columns
for cat_var in cat_features:
    X = one_hot(cat_var)
    

In [12]:
# Drop the old ones from the dataset
X.drop(cat_features, axis=1, inplace = True)


In [13]:
# Split data train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [14]:
X


Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,Team_Tunisia,Team_Uruguay,Round_3rd Place,Round_Final,Round_Group Stage,Round_Quarter Finals,Round_Round of 16,Round_Semi- Finals,PSO_No,PSO_Yes
0,5,40,13,7,3,3,6,3,11,0,...,0,0,0,0,1,0,0,0,1,0
1,0,60,6,0,3,3,2,1,25,2,...,0,0,0,0,1,0,0,0,1,0
2,0,43,8,3,3,2,0,1,7,3,...,0,0,0,0,1,0,0,0,1,0
3,1,57,14,4,6,4,5,1,13,3,...,0,1,0,0,1,0,0,0,1,0
4,0,64,13,3,6,4,5,0,14,2,...,0,0,0,0,1,0,0,0,1,0
5,1,36,8,2,5,1,2,0,22,2,...,0,0,0,0,1,0,0,0,1,0
6,3,39,8,3,2,3,4,1,13,2,...,0,0,0,0,1,0,0,0,1,0
7,3,61,12,5,5,2,5,3,13,0,...,0,0,0,0,1,0,0,0,1,0
8,2,51,12,5,4,3,5,0,19,1,...,0,0,0,0,1,0,0,0,1,0
9,1,49,4,1,2,1,1,0,16,4,...,0,0,0,0,1,0,0,0,1,0


# Random Forest

In [15]:
# Checking distribution
y.value_counts()


No     64
Yes    64
Name: Man of the Match, dtype: int64

In [16]:
# Build Random Forest model
rfc = RandomForestClassifier(n_estimators=100, max_depth=3)
rfc.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
scores_f = cross_val_score(rfc, X_train, y_train, cv=3, scoring='roc_auc')
scores_f.mean()


0.7264705882352942

# Feature Selection

In [18]:
f_imp = rfc.feature_importances_
f_names = list(X_train)


In [19]:
f_imp_df = pd.DataFrame(f_imp, f_names)
f_imp_df = f_imp_df.rename(columns={0: 'Feature Importance'})
f_imp_df = f_imp_df.sort_values('Feature Importance', ascending=False)
f_imp_df.head()


Unnamed: 0,Feature Importance
Goal Scored,0.178292
Corners,0.084378
Attempts,0.063702
Blocked,0.059755
Passes,0.0579


In [20]:
corr = X.corr()
corr.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)


Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO,Team_Argentina,Team_Australia,Team_Belgium,Team_Brazil,Team_Colombia,Team_Costa Rica,Team_Croatia,Team_Denmark,Team_Egypt,Team_England,Team_France,Team_Germany,Team_Iceland,Team_Iran,Team_Japan,Team_Korea Republic,Team_Mexico,Team_Morocco,Team_Nigeria,Team_Panama,Team_Peru,Team_Poland,Team_Portugal,Team_Russia,Team_Saudi Arabia,Team_Senegal,Team_Serbia,Team_Spain,Team_Sweden,Team_Switzerland,Team_Tunisia,Team_Uruguay,Round_3rd Place,Round_Final,Round_Group Stage,Round_Quarter Finals,Round_Round of 16,Round_Semi- Finals,PSO_No,PSO_Yes
Goal Scored,1.0,0.035,0.14,0.46,-0.02,-0.087,0.04,0.045,0.047,-0.12,0.14,0.044,0.014,0.03,-0.049,-0.035,-0.09,-0.011,0.028,-0.088,0.2,0.049,0.028,-0.088,0.14,-0.089,-0.088,0.082,0.14,-0.088,-0.088,-0.088,0.028,-0.043,-0.089,-0.088,-0.043,-0.088,-0.088,-0.088,0.028,0.15,-0.088,0.0018,-0.088,0.067,-0.021,-0.011,0.047,0.014,-0.035,0.18,-0.074,0.012,0.059,-0.05,0.016,-0.016
Ball Possession %,0.035,1.0,0.54,0.3,0.36,0.52,0.54,0.058,0.27,-0.29,0.71,0.88,-0.059,-0.3,-0.21,0.091,0.024,-0.01,0.24,0.015,0.056,0.11,0.0045,-0.13,0.12,-0.082,-0.074,0.086,-0.049,0.25,-0.19,-0.26,0.048,-0.15,0.074,-0.0048,-0.059,-0.16,0.025,-0.0098,0.065,-0.22,0.12,-0.089,-0.069,0.32,-0.21,0.13,0.015,-0.035,9.5e-05,9.5e-05,0.0013,0.00019,-0.002,0.00013,0.0029,-0.0029
Attempts,0.14,0.54,1.0,0.73,0.72,0.75,0.69,-0.017,0.14,-0.27,0.4,0.58,0.17,-0.25,-0.19,-0.075,0.0098,0.15,0.023,-0.057,0.12,0.31,-0.098,-0.097,0.18,-0.089,-0.087,0.052,-0.04,0.29,-0.018,-0.17,-0.072,-0.047,0.057,-0.027,0.012,-0.15,-0.057,-0.077,0.014,-0.13,-0.018,-0.077,-0.057,0.17,-0.061,0.048,-0.018,-0.015,0.022,-0.026,-0.19,0.026,0.18,0.091,-0.17,0.17
On-Target,0.46,0.3,0.73,1.0,0.32,0.33,0.41,0.073,0.093,-0.32,0.29,0.35,0.065,-0.19,-0.12,-0.052,-0.023,0.04,0.047,-0.087,0.19,0.33,-0.074,-0.064,0.025,-0.074,-0.16,-0.037,0.04,0.19,0.006,-0.18,0.0069,-0.017,-0.054,-0.04,-0.064,-0.087,-0.017,-0.087,-0.033,-0.01,-0.04,0.006,-0.11,0.11,0.026,-0.033,0.029,0.044,0.033,0.033,-0.11,0.024,0.1,0.0069,-0.068,0.068
Off-Target,-0.02,0.36,0.72,0.32,1.0,0.3,0.44,-0.096,0.14,-0.13,0.19,0.4,0.23,-0.19,-0.12,-0.15,-0.12,0.19,-0.095,-0.039,0.044,0.15,-0.13,-0.061,0.3,0.017,0.025,0.087,-0.013,0.18,0.068,-0.018,-0.11,-0.15,0.054,0.047,0.0039,-0.061,-0.039,-0.061,-0.13,-0.17,0.0039,-0.061,0.025,0.13,-0.073,0.017,-0.061,-0.0062,-0.014,-0.041,-0.085,0.024,0.026,0.17,-0.2,0.2
Blocked,-0.087,0.52,0.75,0.33,0.3,1.0,0.64,-0.0022,0.086,-0.14,0.4,0.53,0.082,-0.17,-0.16,0.034,0.17,0.11,0.1,-0.0017,0.036,0.22,-0.0082,-0.088,0.064,-0.14,-0.066,0.064,-0.11,0.28,-0.11,-0.17,-0.046,0.063,0.12,-0.066,0.085,-0.17,-0.066,-0.023,0.086,-0.098,-0.0017,-0.11,-0.045,0.14,-0.081,0.12,-0.0017,-0.064,0.034,-0.045,-0.18,0.015,0.22,0.029,-0.11,0.11
Corners,0.04,0.54,0.69,0.41,0.44,0.64,1.0,-0.034,0.085,-0.23,0.33,0.52,0.1,-0.17,-0.17,0.015,0.12,0.089,0.094,-0.0033,0.084,0.3,-0.11,-0.067,0.098,-0.016,-0.17,0.084,-0.17,0.23,0.06,-0.19,0.0023,0.018,0.021,-0.025,-0.0033,-0.19,-0.11,0.06,0.094,0.04,-0.025,-0.11,-0.046,0.094,-0.13,0.13,-0.088,-0.093,-0.011,-0.037,-0.13,0.03,0.14,0.039,-0.07,0.07
Offsides,0.045,0.058,-0.017,0.073,-0.096,-0.0022,-0.034,1.0,0.089,0.0065,0.13,0.035,0.031,-0.04,-0.048,0.23,-0.036,-0.065,0.024,-0.045,-0.041,0.077,0.061,0.042,-0.012,-0.014,-0.0014,0.16,-0.19,0.13,-0.13,-0.045,-0.014,-0.18,0.024,-0.088,-0.088,0.13,0.13,-0.0014,-0.052,0.0095,-0.0014,0.26,0.042,0.024,-0.024,-0.052,0.086,-0.16,-0.089,-0.036,0.23,-0.16,-0.15,0.024,0.075,-0.075
Free Kicks,0.047,0.27,0.14,0.093,0.14,0.086,0.085,0.089,1.0,-0.23,0.13,0.18,0.077,0.08,-0.027,0.03,-0.1,0.14,0.21,-0.084,-0.12,0.082,-0.0054,-0.029,0.14,-0.21,-0.2,0.086,0.042,0.037,-0.14,0.047,-0.024,-0.095,-0.072,-0.051,0.047,0.11,0.037,-0.04,0.071,-0.11,0.11,-0.062,0.0036,0.071,0.022,-0.024,0.037,0.022,-0.17,-0.01,0.033,0.054,-0.041,0.052,-0.11,0.11
Saves,-0.12,-0.29,-0.27,-0.32,-0.13,-0.14,-0.23,0.0065,-0.23,1.0,-0.19,-0.26,0.13,0.075,0.0097,0.14,0.017,0.11,-0.11,0.046,0.13,-0.17,-0.02,0.046,0.015,0.22,0.021,-0.035,-0.1,0.071,0.021,-0.03,0.068,0.12,0.31,-0.11,-0.055,-0.0045,-0.03,-0.11,-0.064,0.027,-0.08,-0.08,0.046,-0.22,-0.052,0.024,0.17,-0.052,0.048,-0.045,-0.095,0.035,0.085,0.024,-0.098,0.098


In [22]:
# Drop columns
X_new = X.drop(['Off-Target', 'Blocked', 'PSO_No', 'PSO_Yes', 'Goals in PSO'], axis=1)


In [23]:
# Split data train and test
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y, test_size=0.25, random_state=42)


In [24]:
# Build Random Forest model
rfc_new = RandomForestClassifier(n_estimators=100, max_depth=3)
rfc_new.fit(X_train_new, y_train_new)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
# Evaluate model
scores_f_new = cross_val_score(rfc, X_train_new, y_train_new, cv=3, scoring='roc_auc')
scores_f_new.mean()


0.7040032679738563

# Hyperparameter Tuning

In [26]:
param_grid_random = {
    'n_estimators': [400, 600, 800],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 6, 7],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 4, 6]
}


In [27]:
# Build Randomized Search model
rfc_rs = RandomizedSearchCV(rfc, param_grid_random, n_iter=100, cv=3, scoring='roc_auc', n_jobs=-1)
rfc_rs.fit(X_train, y_train)




RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [400, 600, 800], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [5, 6, 7], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 4, 6]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [28]:
# Checking for best parameters
rfc_rs.best_params_


{'n_estimators': 800,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 7}

In [29]:
# Best Model Random Search
# Applying best model
rfc_rs.best_estimator_


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
# Evaluate RandomSearch model
scores_rs = cross_val_score(rfc_rs.best_estimator_, X_train, y_train, cv=3, scoring='roc_auc')
scores_rs.mean()


0.746813725490196

In [31]:
param_grid = {
    'n_estimators': [100, 250, 400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3]
}


In [32]:
# Build Grid Search model
rfc_gs = GridSearchCV(rfc, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
rfc_gs.fit(X_train, y_train)




GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 250, 400], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [2, 3, 4, 5], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [33]:
# Checking for best parameters
rfc_gs.best_params_


{'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 250}

In [34]:
# Best Model Grid Search
# Applying best Grid Search model
rfc_gs.best_estimator_


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
# Evaluate best GridSearch model
scores_gs = cross_val_score(rfc_gs.best_estimator_, X_train, y_train, cv=3, scoring='roc_auc')
scores_gs.mean()


0.7544934640522875