# Modeling

Begin writing a function that creates and attempts to optimize a Random Forest Classifier model. It will utilize cross-validation and grid search. Once this is complete and functional, we can begin adding other algorithms.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
import prepare
import acquire

# Get Some Data

In [2]:
#Load the already-prepared match data from the working directory and display it
#NOTE(review): the displayed frame has 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look like
#row indexes written out by earlier to_csv calls - confirm, and drop them so they are not used as features
df = pd.read_csv('final_10.csv')
df

Unnamed: 0.1,Unnamed: 0,airdragon_team100,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_4,assistsplayer_5,assistsplayer_6,assistsplayer_7,...,xp_4,xp_5,xp_6,xp_7,xp_8,xp_9,chemtechdragon_team200,riftherald_team200,airdragon_team200,waterdragon_team200
0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3033.0,2297.0,3978.0,3193.0,4131.0,2971.0,,,,
1,1,0.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,1.0,...,2907.0,2719.0,2691.0,3758.0,4331.0,3662.0,,,,
2,2,0.0,1.0,2.0,1.0,4.0,5.0,7.0,1.0,3.0,...,3192.0,3234.0,4260.0,3189.0,4858.0,2538.0,,,,
3,3,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,...,2921.0,2417.0,4801.0,4457.0,3837.0,2916.0,1.0,,,
4,4,1.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,3.0,...,3005.0,2671.0,4865.0,3579.0,4496.0,2644.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4858,4858,0.0,1.0,1.0,2.0,2.0,1.0,4.0,0.0,1.0,...,3365.0,2670.0,4031.0,3851.0,2589.0,3621.0,,,,
4859,4859,0.0,3.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,...,3119.0,4443.0,3667.0,4158.0,2940.0,3527.0,,,,
4860,4860,0.0,2.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,...,4471.0,2958.0,4988.0,4474.0,2615.0,4185.0,,,,
4861,4861,0.0,0.0,4.0,2.0,1.0,0.0,2.0,0.0,5.0,...,3347.0,1972.0,5326.0,2685.0,3750.0,3402.0,,,,


Although this data has already been prepared, I still need to drop the column called 'killsplayer_0'. It represents how many kills were made by game objects, not players, and contains several null values. After that, all I need to do is split the data into X and y groups and then into train and test sets. Please keep in mind that this data set is only a fraction of our expected data set, and is only being used to check the functionality of my model.

__Drop 'killsplayer_0' Column__

In [3]:
#Run the project's clean helper; per the notebook text above, this is where 'killsplayer_0' is dropped
#(prepare.clean is defined outside this notebook - TODO confirm what else it changes)
df = prepare.clean(df)

__Split into X and y Groups__

In [4]:
#Separate the target column (winningTeam) from the feature columns
y = df['winningTeam']
X = df.drop(columns = ['winningTeam'])

__Create Train and Test Sets__

In [5]:
#Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [6]:
#Confirm the training features and labels have the same number of rows
X_train.shape, y_train.shape

((3890, 229), (3890,))

__Create Dummy Variables__

In [7]:
#One-hot encode the non-numeric columns of the training features and display the result
#NOTE(review): the output shows matchId_NA1_... dummy columns, i.e. the match identifier is being
#encoded as a feature, and only X_train is encoded in this section (X_test is not) - confirm this is intended
X_train = pd.get_dummies(X_train, drop_first = True)
X_train

Unnamed: 0.1,Unnamed: 0,airdragon_team100,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_4,assistsplayer_5,assistsplayer_6,assistsplayer_7,...,matchId_NA1_4112907924,matchId_NA1_4112912287,matchId_NA1_4112918754,matchId_NA1_4112919909,matchId_NA1_4112924167,matchId_NA1_4112925405,matchId_NA1_4112932537,matchId_NA1_4112942647,matchId_NA1_4112957466,matchId_NA1_4112995056
3513,3513,0.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1250,1250,0.0,1.0,0.0,0.0,3.0,0.0,5.0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,0
3532,3532,0.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3858,3858,0.0,1.0,6.0,3.0,1.0,0.0,4.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
528,528,0.0,1.0,2.0,2.0,2.0,3.0,2.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,1593,0.0,3.0,2.0,1.0,0.0,1.0,5.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4060,4060,0.0,2.0,5.0,1.0,0.0,1.0,1.0,0.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1346,1346,0.0,2.0,2.0,2.0,0.0,2.0,2.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3454,3454,0.0,0.0,0.0,1.0,0.0,2.0,3.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


__Create a Baseline__

Since this is a classification problem, I will set the baseline to whichever team has the most wins.

In [8]:
#Set team 100.0 to be blue_team and team 200.0 to be red_team
def get_team_color(value):
    """
    This function converts a winningTeam value into a readable team label.

    A value of 100.0 (or the label 'blue_team' itself) returns 'blue_team'.
    Every other value returns 'red_team'.

    Accepting the 'blue_team' label makes the function idempotent, so applying it a
    second time to labels that were already converted does not turn every
    label into 'red_team'.
    """
    if value == 100.0 or value == 'blue_team':
        return 'blue_team'

    return 'red_team'

In [9]:
#Replace the numeric team ids in the training labels with readable team names
y_train = y_train.apply(get_team_color)

In [10]:
#Count wins per team to see which class is the majority
y_train.value_counts()

red_team     2017
blue_team    1873
Name: winningTeam, dtype: int64

In [11]:
#Use the dummy classifier to set the baseline
#'most_frequent' always predicts the majority class seen during fit (red_team, per the counts above),
#so the baseline stays correct if the data or the split changes instead of relying on a hard-coded label
from sklearn.dummy import DummyClassifier

baseline = DummyClassifier(strategy = 'most_frequent')
baseline.fit(X_train, y_train)

#Now get the baseline accuracy (the share of training games won by the majority team)
baseline.score(X_train, y_train)

0.5185089974293059

__Train a Single Model__

Train a single model to find out roughly how long training takes with so many features. From there, I will be able to estimate how long the grid search might take to complete.

In [12]:
#Build and fit a random forest in one step (default hyperparameters, only random_state is set)
model = RandomForestClassifier(random_state = 123).fit(X_train, y_train)

#Accuracy on the same data the model was trained on
model.score(X_train, y_train)

1.0

The above model finished training extremely quickly, so I don't think there is anything to worry about. Just be mindful of how many models will actually be produced with the given ranges for the hyperparameters.

__Implement GridSearchCV__

In [13]:
#Base estimator with a fixed seed
clf = RandomForestClassifier(random_state = 123)

#Search tree depth and minimum leaf size, each from 5 through 10, with 5-fold cross-validation
grid = GridSearchCV(
    estimator = clf,
    param_grid = {'max_depth': range(5, 11), 'min_samples_leaf': range(5, 11)},
    cv = 5,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
             param_grid={'max_depth': range(5, 11),
                         'min_samples_leaf': range(5, 11)})

In [14]:
#What was the best score and best parameters
#best_score_ is the mean cross-validated score of the best parameter combination
grid.best_score_, grid.best_params_

(0.6280205655526991, {'max_depth': 8, 'min_samples_leaf': 7})

__Write RandomForestClassifier Function__

In [15]:
#Grid for the random forest search: 15 depths x 15 leaf sizes = 225 combinations
rf_dict = dict(
    max_depth = range(1, 16),
    min_samples_leaf = range(1, 16),
)

In [16]:
def get_random_forest_models(X_train, y_train, param_dict, cv = 5):
    """
    This function creates and returns an optimized random forest classification model. It also
    prints out the best model's mean cross-validated accuracy score and parameters.
    
    Parameters:
        X_train, y_train: the training features and labels used to fit the models.
        param_dict: dictionary of hyperparameter names mapped to the values to iterate through.
        cv: the cross validation setting handed to GridSearchCV (number of folds). Defaults to 5.
    
    Returns grid.best_estimator_, the best model found by the grid search.
    """
    #Create the classifier model (fixed seed so repeated runs give the same forests)
    clf = RandomForestClassifier(random_state = 123)
    
    #Create the GridSearchCV object, using the caller's cv value rather than a fixed 5
    grid = GridSearchCV(clf, param_dict, cv = cv)
    
    #Fit the GridSearchCV object
    grid.fit(X_train, y_train)
    
    #Print the best model's score and parameters
    print('Mean Cross-Validated Accuracy: ', round(grid.best_score_, 4))
    
    #Only print parameters that were actually tuned, so a param_dict without
    #max_depth or min_samples_leaf does not raise a KeyError after the search has finished
    best_params = grid.best_params_
    known_labels = {'max_depth': 'Max Depth: ', 'min_samples_leaf': 'Min Samples Per Leaf: '}
    for name, label in known_labels.items():
        if name in best_params:
            print(label, best_params[name])
    for name in best_params:
        if name not in known_labels:
            print(name + ': ', best_params[name])
    
    #Return the best model
    return grid.best_estimator_

In [None]:
#Run the grid search over rf_dict and keep the best estimator it returns
best_model = get_random_forest_models(X_train, y_train, rf_dict)

In [None]:
#Check to see if the function returned a fitted, usable model
#NOTE(review): this is accuracy on the training data itself, which is usually higher than the
#mean cross-validated score printed by the search, so the two numbers are not expected to match
best_model.score(X_train, y_train)

__What were the Most Important Features?__

In [None]:
#One row per feature (indexed by column name), holding that feature's importance in column 0
best_features = pd.DataFrame(data = best_model.feature_importances_, index = X_train.columns)

#Show the ten most important features, largest first
best_features.sort_values(by = 0, ascending = False).iloc[:10]

### AdaBoostClassifier

I will use the AdaBoostClassifier with a RandomForestClassifier as the base_estimator.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#Base learner: a default random forest with a fixed seed
rf = RandomForestClassifier(random_state = 123)

#Wrap the forest in AdaBoost and search over 50, 60, ..., 100 boosting rounds with 5-fold CV
adaBoost = AdaBoostClassifier(rf, random_state = 123)
grid = GridSearchCV(
    estimator = adaBoost,
    param_grid = {'n_estimators': range(50, 101, 10)},
    cv = 5,
)

#Fit every candidate; the fitted search object is the cell's output
grid.fit(X_train, y_train)

In [None]:
#What was the best score and best parameters
#best_score_ is the mean cross-validated score of the best n_estimators value
grid.best_score_, grid.best_params_

__Let's see if it can improve performance of our best RandomForest model from earlier__

In [None]:
#Boost the tuned random forest returned by the earlier grid search
adaBoost = AdaBoostClassifier(best_model, random_state = 123)

#Build the search over 50, 55, ..., 100 boosting rounds and fit it in one step
grid = GridSearchCV(
    estimator = adaBoost,
    param_grid = {'n_estimators': range(50, 101, 5)},
    cv = 5,
).fit(X_train, y_train)

#Best mean cross-validated score and the parameters that produced it
grid.best_score_, grid.best_params_

It is actually slightly better than before

In [None]:
#Create a function for AdaBoost
def get_adaBoosted_model(X_train, y_train, model_to_boost, param_dict, cv = 5):
    """
    This function creates and returns an optimized AdaBoosted classification model. It also
    prints out the best model's mean cross-validated accuracy score and parameters.
    
    Parameters:
        X_train, y_train: the training features and labels used to fit the models.
        model_to_boost: the estimator handed to AdaBoostClassifier as the model to boost
            (a random forest in this notebook).
        param_dict: dictionary of AdaBoost hyperparameter names mapped to the values to iterate through.
        cv: the cross validation setting handed to GridSearchCV (number of folds). Defaults to 5.
    
    Returns grid.best_estimator_, the best boosted model found by the grid search.
    """
    #Create the AdaBoost Classifier (fixed seed so repeated runs give the same result)
    adaBoost_clf = AdaBoostClassifier(model_to_boost, random_state = 123)
    
    #Create the GridSearchCV object, using the caller's cv value rather than a fixed 5
    grid = GridSearchCV(adaBoost_clf, param_dict, cv = cv)
    
    #Fit the GridSearchCV object
    grid.fit(X_train, y_train)
    
    #Print the best model's score and parameters
    print('Mean Cross-Validated Accuracy: ', round(grid.best_score_, 4))
    
    #Only print parameters that were actually tuned, so a param_dict without
    #n_estimators or learning_rate does not raise a KeyError after the search has finished
    best_params = grid.best_params_
    known_labels = {'n_estimators': 'Num Estimators: ', 'learning_rate': 'Learning Rate: '}
    for name, label in known_labels.items():
        if name in best_params:
            print(label, best_params[name])
    for name in best_params:
        if name not in known_labels:
            print(name + ': ', best_params[name])
    
    #Return the best model
    return grid.best_estimator_

In [None]:
#Grid for the AdaBoost search: 11 estimator counts (50-60) x 5 learning rates (1-5) = 55 combinations
#NOTE(review): AdaBoost learning rates are commonly at or below 1 - confirm that whole numbers
#up to 5 (rather than fractional values) were intended
adaBoost_params = dict(
    n_estimators = range(50, 61),
    learning_rate = range(1, 6),
)

In [None]:
#Test the above function
#best_model (the tuned random forest from earlier) is the estimator being boosted
ada_boosted_clf = get_adaBoosted_model(X_train, y_train, best_model, adaBoost_params)

In [None]:
#The boosted model scored slightly better than the random forest alone.
#Which features mattered most to it? One row per feature, importance in column 0
best_features = pd.DataFrame(data = ada_boosted_clf.feature_importances_, index = X_train.columns)

#Ten most important features, largest first
best_features.sort_values(by = 0, ascending = False).iloc[:10]

# Test Dataset at 15 Minute Mark

To get the data at the 15 minute mark, I'll have to reload all of the match data json files and run them through the prepare function.

In [None]:
#Timeline chunk files to iterate through, named by the start/end range each one covers
timeline_files = [
    'timeline_data_start_{}_end_{}.json'.format(start, end)
    for start, end in [(4000, 5000), (5000, 6000), (6000, 7000), (7000, 8000),
                       (8000, 9000), (9000, 10000), (10000, 10657)]
]

In [None]:
#Other-game-data chunk files to iterate through, covering the same ranges as the timeline files
other_data_files = [
    'other_game_data_start_{}_end_{}.json'.format(start, end)
    for start, end in [(4000, 5000), (5000, 6000), (6000, 7000), (7000, 8000),
                       (8000, 9000), (9000, 10000), (10000, 10657)]
]

In [None]:
#Collect the records from every timeline chunk file into one list
#(the combined data is saved afterwards so this does not have to be repeated)
timeline_list = []

for file in timeline_files:
    #Read the chunk and turn it into a list of dicts, one per row
    temp_file = pd.read_json(file).to_dict(orient = 'records')

    #Append this chunk's records to the combined list
    timeline_list += temp_file

In [None]:
#Convert to df
timeline_df = pd.DataFrame(timeline_list)

#Now save this complete file as a single json
#so the individual chunk files do not have to be read and combined again
timeline_df.to_json('timeline_data_start_4000_end_10657.json')

In [None]:
#Collect the records from every other-game-data chunk file into one list
#(the combined data is saved afterwards so this does not have to be repeated)
game_data_list = []

for file in other_data_files:
    #Read the chunk and turn it into a list of dicts, one per row
    temp_file = pd.read_json(file).to_dict(orient = 'records')

    #Append this chunk's records to game_data_list
    game_data_list += temp_file

In [None]:
#Convert to df
game_data_df = pd.DataFrame(game_data_list)

#Now save this complete file as a single json
#so the individual chunk files do not have to be read and combined again
game_data_df.to_json('other_game_data_start_4000_end_10657.json')

In [None]:
import prepare

#Now the lists are created, run them through Joshua C's prepare and prep functions
#The third argument (15) presumably selects the 15 minute mark - TODO confirm against prepare.py
#NOTE(review): further down, prepare.prepare is called with a single dataframe and unpacked into
#train/test, so the two calls appear to rely on different versions of prepare.py
match_info_minute_15 = prepare.prepare(timeline_list, game_data_list, 15)

In [None]:
#Save this csv
#index = False keeps the row index out of the file
match_info_minute_15.to_csv('match_data_start_4000_end_10657_minute_15.csv', index = False)

In [None]:
#Now that we have the prepared data for the 15 minute mark, go through the same process you did before
#prepare.prep is a project helper defined outside this notebook
match_info_15 = prepare.prep(match_info_minute_15)

In [None]:
#killsplayer_0 can be dropped because it's not an actual player.
#Reassign instead of dropping in place, and ignore an already-missing column,
#so this cell can be re-run without raising a KeyError
match_info_15 = match_info_15.drop(columns = ['killsplayer_0'], errors = 'ignore')

In [None]:
#Separate the target column (winningTeam) from the feature columns
y = match_info_15['winningTeam']
X = match_info_15.drop(columns = ['winningTeam'])

In [None]:
#Create dummy vars
#Encoding happens before the train/test split here, so both sets end up with the same columns
X = pd.get_dummies(X, drop_first = True)

In [None]:
#Split into train and test sets
#20% of the rows are held out; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [None]:
#Parameter grid to iterate through: 15 depths x 15 leaf sizes = 225 combinations
rf_dict = dict(
    max_depth = range(1, 16),
    min_samples_leaf = range(1, 16),
)

In [None]:
#Tune a random forest on the 15 minute training data and keep the best estimator
best_model = get_random_forest_models(X_train, y_train, rf_dict, cv = 5)

In [None]:
#Which features mattered most? One row per feature, importance in column 0
best_features = pd.DataFrame(data = best_model.feature_importances_, index = X_train.columns)

#Ten most important features, largest first
best_features.sort_values(by = 0, ascending = False).iloc[:10]

# Build Models For New Data at 15 Minutes

In [None]:
#Load the extracted dataframe for the new data
#The csv is read by relative path, so it must sit in the notebook's working directory
extracted_df = pd.read_csv('new_extracted_data_smith.csv')

In [None]:
#Prepare the extracted data
import prepare

#Here prepare.prepare takes the extracted dataframe and its result is unpacked into train and test
#NOTE(review): the earlier 15 minute section calls prepare.prepare(timeline_list, game_data_list, 15)
#and gets a single dataframe back, so the two sections appear to need different versions of prepare.py
train, test = prepare.prepare(extracted_df)

In [None]:
#Check the row and column counts of the prepared train and test frames
train.shape, test.shape

In [None]:
#Drop columns that are categorical. These columns don't offer any value
#The column list comes from train and is applied to both frames so they keep matching columns
cols_to_drop = train.select_dtypes('object').columns

#Reassign rather than dropping in place, so the frames returned by prepare are not mutated
train = train.drop(columns = cols_to_drop)
test = test.drop(columns = cols_to_drop)

#Show which columns were removed (a bare name only displays when it is the cell's last line)
cols_to_drop

In [None]:
#Re-check the shapes to confirm the categorical columns were removed from both frames
train.shape, test.shape

In [None]:
#Separate features from the winningTeam target, one frame at a time
X_train = train.drop(columns = ['winningTeam'])
y_train = train['winningTeam']

X_test = test.drop(columns = ['winningTeam'])
y_test = test['winningTeam']

In [None]:
#Parameter grid to iterate through: 15 depths x 15 leaf sizes = 225 combinations
rf_dict = dict(
    max_depth = range(1, 16),
    min_samples_leaf = range(1, 16),
)

In [None]:
#Now create models and return the best one
#The search covers every combination in rf_dict, fitted on the new training data
best_model = get_random_forest_models(X_train, y_train, rf_dict, cv = 5)

In [None]:
#What were the top ten features?
def get_best_features(model, X_train, num_features = 10):
    """
    This function gets the best features of the desired model and prints them out.
    
    Parameters:
        model: a fitted model that exposes a feature_importances_ attribute.
        X_train: the dataframe the model was trained on; its columns label the importances.
        num_features: how many of the top features to show. Defaults to 10.
    
    This function returns nothing.
    """
    #Use the model that was passed in (not the notebook-level best_model), so the
    #function reports on whichever model the caller asks about
    best_features = pd.DataFrame(model.feature_importances_, X_train.columns)
    
    #Column 0 holds the importances; print the largest ones first
    print(best_features.sort_values(by = 0, ascending = False).head(num_features))

In [None]:
#Print the ten most important features (num_features is left at its default of 10)
get_best_features(best_model, X_train)

# Build Models For New Data at 10 Minutes

In [None]:
import acquire
import prepare
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

__Extract Function Does Not Work For Time = 10__

I need to wait until Joshua can take a look and fix the issue. However, this is not top priority.

In [None]:
#Need to extract data for the 10 minute mark
#NOTE: per the markdown note above, the extract function does not currently work for time = 10,
#so this call is expected to fail until acquire.build_extracted_df is fixed
extracted_data = acquire.build_extracted_df(username = 'smith', path = './new_data/', time = 10)