# Modeling

Begin writing a function that creates and attempts to optimize a Random Forest Classifier model. It will utilize cross-validation and grid search. Once this is complete and functional, we can begin adding other algorithms.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

# Get Some Data

In [2]:
#Load the prepared game data; the first CSV column is used as the row index
df = pd.read_csv('../bryant/bryant_games.csv', index_col = [0])
#Display the frame to eyeball the available columns
df

Unnamed: 0,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_5,assistsplayer_6,assistsplayer_7,assistsplayer_8,assistsplayer_9,currentGold_1,...,team_totalGold_100,team_totalGold_200,team_trueDamageDoneToChampions_100,team_trueDamageDoneToChampions_200,team_ward_player_100,team_ward_player_200,team_assistsplayer_100,team_assistsplayer_200,team_xp_100,team_xp_200
0,1.0,7.0,4.0,1.0,4.0,7.0,3.0,7.0,5.0,28.0,...,28867,38700,3545,1502,85,48,10,29,34091,38008
1,2.0,7.0,7.0,6.0,16.0,0.0,5.0,5.0,6.0,343.0,...,43786,36709,3880,2386,83,53,39,23,45729,42371
2,1.0,11.0,3.0,3.0,4.0,3.0,9.0,2.0,5.0,3176.0,...,40967,37160,1277,2605,65,62,14,30,42047,43392
3,4.0,10.0,3.0,3.0,6.0,4.0,12.0,6.0,11.0,709.0,...,35334,40147,5504,4052,97,120,18,43,39955,43949
4,1.0,3.0,3.0,5.0,7.0,2.0,0.0,2.0,1.0,532.0,...,35376,31749,274,1431,117,86,20,8,40045,37315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,3.0,12.0,3.0,2.0,5.0,1.0,6.0,5.0,3.0,347.0,...,36696,39327,5203,6964,75,53,16,27,41820,45552
719,3.0,5.0,5.0,3.0,11.0,1.0,2.0,5.0,2.0,1075.0,...,36296,34454,1334,2512,89,77,28,15,40534,39114
720,1.0,8.0,3.0,4.0,2.0,3.0,7.0,8.0,2.0,380.0,...,38400,42115,1885,2757,268,54,14,28,39394,44462
721,4.0,15.0,4.0,6.0,2.0,4.0,3.0,15.0,2.0,421.0,...,39021,46473,7779,10837,52,138,19,39,38692,49258


This data has already been prepared, so all I need to do is split it up into X and y groups and then into train and test sets. Please keep in mind this data set is only a fraction of our expected data set, and is only being used to check the functionality of my model.

__Split into X and y Groups__

In [3]:
#The target is the winningTeam column; every other column is a feature
y = df['winningTeam']
X = df.drop(columns=['winningTeam'])

__Create Train and Test Sets__

In [4]:
#Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [5]:
#Sanity check: the training features and the training target must have the same number of rows
X_train.shape, y_train.shape

((578, 207), (578,))

__Create Dummy Variables__

In [6]:
#One-hot encode the categorical columns of the training features
X_train = pd.get_dummies(X_train, drop_first = True)

#Encode the test features as well and align them to the training columns, so a
#model fit on X_train can later predict on X_test: dummy columns that were not
#produced for the training rows are dropped, and training columns missing from
#the test rows are added and filled with 0
X_test = pd.get_dummies(X_test).reindex(columns = X_train.columns, fill_value = 0)

X_train

Unnamed: 0,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_5,assistsplayer_6,assistsplayer_7,assistsplayer_8,assistsplayer_9,currentGold_1,...,gameVersion_11.12.379.4946,gameVersion_11.14.384.6677,gameVersion_11.16.390.1945,gameVersion_11.17.393.607,gameVersion_11.17.394.4489,gameVersion_11.18.395.7538,gameVersion_11.19.398.9466,gameVersion_11.20.400.7328,gameVersion_11.21.403.3002,gameVersion_11.22.406.3587
172,5.0,4.0,4.0,4.0,3.0,3.0,2.0,0.0,1.0,908.0,...,0,0,0,0,0,0,0,0,0,1
229,5.0,11.0,2.0,1.0,6.0,6.0,3.0,8.0,7.0,667.0,...,0,0,0,0,0,0,1,0,0,0
601,3.0,10.0,3.0,2.0,4.0,1.0,7.0,2.0,3.0,530.0,...,0,0,0,0,0,0,0,0,0,1
457,5.0,9.0,14.0,5.0,9.0,6.0,8.0,4.0,6.0,813.0,...,0,0,0,0,0,0,0,1,0,0
164,3.0,8.0,3.0,3.0,4.0,3.0,4.0,1.0,3.0,517.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,2.0,1.0,6.0,2.0,7.0,2.0,6.0,2.0,4.0,1182.0,...,0,0,0,0,0,0,0,0,0,1
322,0.0,8.0,2.0,1.0,3.0,6.0,7.0,0.0,9.0,575.0,...,0,0,0,0,0,0,0,0,0,1
382,1.0,10.0,6.0,3.0,4.0,0.0,5.0,6.0,8.0,1028.0,...,0,0,0,0,0,0,0,0,0,1
365,2.0,1.0,7.0,3.0,10.0,0.0,5.0,0.0,0.0,837.0,...,0,0,0,0,0,0,1,0,0,0


__Create a Baseline__

Since this is a classification problem, I will set the baseline to whichever team has the most wins.

In [7]:
#Set team 100.0 to be blue_team and team 200.0 to be red_team
def get_team_color(value):
    """
    This function translates a winning team id into a readable team label.

    Team 100 is returned as 'blue_team' and team 200 is returned as 'red_team'.

    A value that is already one of those two labels is returned unchanged, so applying
    this function a second time to an already converted target does not relabel every
    'blue_team' row as 'red_team'.

    Any other value (a missing value, an unknown team id, an unknown string) raises a
    ValueError instead of silently being counted as a red_team win.
    """
    #Already converted labels pass straight through
    if isinstance(value, str):
        if value in ('blue_team', 'red_team'):
            return value
        raise ValueError('Unrecognized team label: {!r}'.format(value))

    if value == 100:
        return 'blue_team'
    if value == 200:
        return 'red_team'

    #NaN never compares equal to anything, so missing values also end up here
    raise ValueError('Unrecognized winning team id: {!r}'.format(value))

In [8]:
#Convert the numeric winningTeam ids in the training target into team labels
#NOTE: y_train is rebound here, so run this cell only once per fresh train/test split
y_train = y_train.apply(get_team_color)

In [9]:
#Count the wins per team in the training target to find the majority class
y_train.value_counts()

red_team     295
blue_team    283
Name: winningTeam, dtype: int64

In [10]:
#Use the dummy classifier to set the baseline
#'most_frequent' always predicts the majority class found in y_train (red_team here,
#295 wins vs 283), so the baseline stays correct if the data or the split changes
from sklearn.dummy import DummyClassifier

baseline = DummyClassifier(strategy = 'most_frequent')
baseline.fit(X_train, y_train)

#Now get the baseline accuracy
baseline.score(X_train, y_train)

0.5103806228373703

__Train a Single Model__

Train a single model to find out about how long it will take with so many features. From there, I will be able to estimate how long the grid search might take to complete.

In [11]:
#Build a forest with default hyperparameters (only the seed is fixed) and fit it in one step
model = RandomForestClassifier(random_state=123).fit(X_train, y_train)

#Accuracy on the same rows the model was trained on
model.score(X_train, y_train)

1.0

The above model finished training extremely quickly, so I don't think there is anything to worry about. Just be mindful of how many models will actually be produced with the given ranges for the hyperparameters.

__Implement GridSearchCV__

In [12]:
#Search tree depth and leaf size (5 through 10 each) with 5-fold cross-validation
clf = RandomForestClassifier(random_state=123)

grid = GridSearchCV(
    estimator=clf,
    param_grid={'max_depth': range(5, 11), 'min_samples_leaf': range(5, 11)},
    cv=5,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
             param_grid={'max_depth': range(5, 11),
                         'min_samples_leaf': range(5, 11)})

In [13]:
#Best mean cross-validated score found by the search and the parameters that produced it
grid.best_score_, grid.best_params_

(0.9532983508245877, {'max_depth': 6, 'min_samples_leaf': 5})

__Write RandomForestClassifier Function__

In [14]:
#Hyperparameter grid for the random forest search: both settings are tried from 5 through 10
rf_dict = dict(
    max_depth=range(5, 11),
    min_samples_leaf=range(5, 11),
)

In [15]:
def get_random_forest_models(X_train, y_train, param_dict, cv = 5):
    """
    This function creates and returns an optimized random forest classification model. It also
    prints out the best model's mean cross-validated score and parameters.

    This function takes in the X and y training sets to fit the models.

    This function takes in a dictionary that contains the parameters to be iterated through.
    Any parameters accepted by the grid search may be used; the ones that were searched are
    printed, with 'max_depth' and 'min_samples_leaf' listed first under their friendly names.

    This function also takes in a value for the number of cross validation folds to do.
    The cv value defaults to 5 and is passed on to the grid search.

    This function returns the best_estimator_ attribute of the fitted grid search.
    """
    #Create the classifier model
    clf = RandomForestClassifier(random_state = 123)

    #Create the GridSearchCV object (use the caller's cv instead of a fixed 5)
    grid = GridSearchCV(clf, param_dict, cv = cv)

    #Fit the GridSearchCV object
    grid.fit(X_train, y_train)

    #Print the best model's score
    print('Mean Cross-Validated Accuracy: ', round(grid.best_score_, 4))

    #Print the best parameters; only look up keys that were actually searched so a
    #grid without max_depth or min_samples_leaf does not raise a KeyError
    friendly_names = {
        'max_depth': 'Max Depth',
        'min_samples_leaf': 'Min Samples Per Leaf'
    }
    best_params = grid.best_params_
    for name, label in friendly_names.items():
        if name in best_params:
            print(label + ': ', best_params[name])
    for name, value in best_params.items():
        if name not in friendly_names:
            print(name + ': ', value)

    #Return the best model
    return grid.best_estimator_

In [16]:
#Run the search with the rf_dict grid and keep the estimator the function returns
best_model = get_random_forest_models(X_train, y_train, rf_dict)

Mean Cross-Validated Accuracy:  0.9533
Max Depth:  6
Min Samples Per Leaf:  5


In [17]:
#Check to see if the function returned a usable fitted model
#NOTE: this scores the model on the same rows that were passed to the grid search, so it is
#training accuracy and is expected to be higher than the mean cross-validated accuracy printed above
best_model.score(X_train, y_train)

0.9982698961937716

### AdaBoostClassifier

I will use the AdaBoostClassifier with a RandomForestClassifier as the base_estimator.

In [18]:
from sklearn.ensemble import AdaBoostClassifier

In [19]:
#Boost a default random forest and search the number of boosting rounds (50 to 100, step 10)
rf = RandomForestClassifier(random_state=123)
adaBoost = AdaBoostClassifier(rf, random_state=123)

grid = GridSearchCV(
    estimator=adaBoost,
    param_grid={'n_estimators': range(50, 101, 10)},
    cv=5,
)

#Run the 5-fold search on the training data
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=AdaBoostClassifier(base_estimator=RandomForestClassifier(random_state=123),
                                          random_state=123),
             param_grid={'n_estimators': range(50, 101, 10)})

In [20]:
#Best mean cross-validated score of the boosted model and the n_estimators value that produced it
grid.best_score_, grid.best_params_

(0.9567316341829086, {'n_estimators': 50})

__Let's see if it can improve performance of our best RandomForest model from earlier__

In [21]:
#Boost the tuned forest returned by get_random_forest_models instead of a default one
adaBoost = AdaBoostClassifier(best_model, random_state=123)

#Search the number of boosting rounds (50 to 100, step 5) with 5-fold cross-validation
grid = GridSearchCV(
    estimator=adaBoost,
    param_grid={'n_estimators': range(50, 101, 5)},
    cv=5,
).fit(X_train, y_train)

#Best mean cross-validated score and the parameters that produced it
grid.best_score_, grid.best_params_

(0.9584557721139431, {'n_estimators': 50})

It is actually slightly better than before

In [22]:
#Create a function for AdaBoost
def get_adaBoosted_model(X_train, y_train, model_to_boost, param_dict, cv = 5):
    """
    This function creates and returns an optimized AdaBoosted classification model. It also
    prints out the best model's mean cross-validated score and parameters.

    This function takes in the X and y training sets to fit the models.

    This function takes in the model to boost, which is handed to AdaBoostClassifier as its
    base estimator.

    This function takes in a dictionary that contains the parameters to be iterated through.
    Any parameters accepted by the grid search may be used; the ones that were searched are
    printed, with 'n_estimators' and 'learning_rate' listed first under their friendly names.

    This function also takes in a value for the number of cross validation folds to do.
    The cv value defaults to 5 and is passed on to the grid search.

    This function returns the best_estimator_ attribute of the fitted grid search.
    """
    #Create the AdaBoost Classifier
    adaBoost_clf = AdaBoostClassifier(model_to_boost, random_state = 123)

    #Create the GridSearchCV object (use the caller's cv instead of a fixed 5)
    grid = GridSearchCV(adaBoost_clf, param_dict, cv = cv)

    #Fit the GridSearchCV object
    grid.fit(X_train, y_train)

    #Print the best model's score
    print('Mean Cross-Validated Accuracy: ', round(grid.best_score_, 4))

    #Print the best parameters; only look up keys that were actually searched so a
    #grid without n_estimators or learning_rate does not raise a KeyError
    friendly_names = {
        'n_estimators': 'Num Estimators',
        'learning_rate': 'Learning Rate'
    }
    best_params = grid.best_params_
    for name, label in friendly_names.items():
        if name in best_params:
            print(label + ': ', best_params[name])
    for name, value in best_params.items():
        if name not in friendly_names:
            print(name + ': ', value)

    #Return the best model
    return grid.best_estimator_

In [23]:
#Hyperparameter grid for the AdaBoost search: 50 through 60 boosting rounds and
#integer learning rates 1 through 5
adaBoost_params = dict(
    n_estimators=range(50, 61),
    learning_rate=range(1, 6),
)

In [24]:
#Test the above function: boost the tuned random forest (best_model) over the adaBoost_params grid
ada_boosted_clf = get_adaBoosted_model(X_train, y_train, best_model, adaBoost_params)

Mean Cross-Validated Accuracy:  0.9602
Num Estimators:  50
Learning Rate:  5


In [25]:
#The boosted model scored slightly higher than the random forest alone.
#Rank the features by the importance the boosted model assigned to them and show the top 10
best_features = pd.DataFrame(
    data=ada_boosted_clf.feature_importances_,
    index=X_train.columns,
)
best_features.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
towers_lost_team200,0.071173
towers_lost_team100,0.05206
inhibs_lost_team200,0.051986
inhibs_lost_team100,0.035715
team_totalGold_100,0.029264
team_xp_100,0.022004
baron_team100,0.018865
team_totalGold_200,0.012477
team_xp_200,0.011803
dragon_team100,0.011389
