# Model New Data

In [1]:
import numpy as np
import pandas as pd
import acquire
import prepare
import model
from env import api_key

In [5]:
#Load final_10 csv file
data = pd.read_csv('../master10_csv/final_10.csv', index_col = [0])
data

Unnamed: 0,airdragon_team100,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_4,assistsplayer_5,assistsplayer_6,assistsplayer_7,assistsplayer_8,...,xp_4,xp_5,xp_6,xp_7,xp_8,xp_9,chemtechdragon_team200,riftherald_team200,airdragon_team200,waterdragon_team200
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3033.0,2297.0,3978.0,3193.0,4131.0,2971.0,,,,
1,0.0,1.0,1.0,2.0,2.0,2.0,2.0,0.0,1.0,2.0,...,2907.0,2719.0,2691.0,3758.0,4331.0,3662.0,,,,
2,0.0,1.0,2.0,1.0,4.0,5.0,7.0,1.0,3.0,2.0,...,3192.0,3234.0,4260.0,3189.0,4858.0,2538.0,,,,
3,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,2921.0,2417.0,4801.0,4457.0,3837.0,2916.0,1.0,,,
4,1.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,...,3005.0,2671.0,4865.0,3579.0,4496.0,2644.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4858,0.0,1.0,1.0,2.0,2.0,1.0,4.0,0.0,1.0,2.0,...,3365.0,2670.0,4031.0,3851.0,2589.0,3621.0,,,,
4859,0.0,3.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,1.0,...,3119.0,4443.0,3667.0,4158.0,2940.0,3527.0,,,,
4860,0.0,2.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,...,4471.0,2958.0,4988.0,4474.0,2615.0,4185.0,,,,
4861,0.0,0.0,4.0,2.0,1.0,0.0,2.0,0.0,5.0,1.0,...,3347.0,1972.0,5326.0,2685.0,3750.0,3402.0,,,,


In [87]:
data.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4863 entries, 0 to 4862
Data columns (total 196 columns):
 #    Column                            Dtype  
---   ------                            -----  
 0    airdragon_team100                 float64
 1    assistsplayer_1                   float64
 2    assistsplayer_10                  float64
 3    assistsplayer_2                   float64
 4    assistsplayer_3                   float64
 5    assistsplayer_4                   float64
 6    assistsplayer_5                   float64
 7    assistsplayer_6                   float64
 8    assistsplayer_7                   float64
 9    assistsplayer_8                   float64
 10   assistsplayer_9                   float64
 11   baron_team100                     float64
 12   chemtechdragon_team100            float64
 13   currentGold_1                     float64
 14   currentGold_10                    float64
 15   currentGold_2                     float64
 16   currentGold_3         

In [6]:
#Now prepare it
train, test = prepare.prepare(data)
train.shape, test.shape

((3890, 229), (973, 229))

In [7]:
#Drop all columns that are categorical. They are not useful
cols_to_drop = train.select_dtypes('object').columns
cols_to_drop

Index(['gameMode', 'gameName', 'gameType', 'gameVersion', 'matchId'], dtype='object')

In [8]:
train = train.drop(columns = cols_to_drop)
test = test.drop(columns = cols_to_drop)

In [9]:
train.shape, test.shape

((3890, 224), (973, 224))

In [10]:
#Now separate it into X and y groups
X_train, y_train = train.drop(columns = ['winningTeam']), train.winningTeam
X_test, y_test = test.drop(columns = ['winningTeam']), train.winningTeam

In [12]:
#Create the dict of hyperparameters to optimize across
param_dict = {
    'max_depth': range(1, 16),
    'min_samples_leaf': range(1, 16)
}

In [13]:
#Now build the models
best_model = model.get_random_forest_models(X_train, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6344
Max Depth:  13
Min Samples Per Leaf:  14


In [14]:
#What were the most important features?
best_features = pd.DataFrame(best_model.feature_importances_, X_train.columns)
best_features.sort_values(by = 0, ascending = False).head(10)

Unnamed: 0,0
RedTeamTotalGoldDifference,0.043678
BlueTeamTotalGoldDifference,0.042757
RedTeamXp,0.022739
RedTeamTotalGold,0.021644
gameDuration,0.019262
BlueTeamXp,0.018272
BlueTeamTotalGold,0.017611
RedTeamTotalDamageDoneToChampions,0.0166
totalGold_9,0.011461
RedTeamKills,0.011025


# Can We Remove Individual Stats and Maintain or Improve Model Accuracy?

In [20]:
#Only keep columns that have the word 'team' in their name
cols_to_keep = X_train.columns[X_train.columns.str.contains('Team')]
cols_to_keep

Index(['BlueTeamDeaths', 'RedTeamDeaths', 'BlueTeamGoldPerSec',
       'RedTeamGoldPerSec', 'BlueTeamJungleMinionsKilled',
       'RedTeamJungleMinionsKilled', 'BlueTeamKills', 'RedTeamKills',
       'BlueTeamLevel', 'RedTeamLevel', 'BlueTeamMagicDamageDoneToChampions',
       'RedTeamMagicDamageDoneToChampions', 'BlueTeamMinionsKilled',
       'RedTeamMinionsKilled', 'BlueTeamPhysicalDamageDoneToChampions',
       'RedTeamPhysicalDamageDoneToChampions',
       'BlueTeamTimeEnemySpentControlled', 'RedTeamTimeEnemySpentControlled',
       'BlueTeamTotalDamageDoneToChampions',
       'RedTeamTotalDamageDoneToChampions', 'BlueTeamTotalGold',
       'RedTeamTotalGold', 'BlueTeamTrueDamageDoneToChampions',
       'RedTeamTrueDamageDoneToChampions', 'BlueTeamWards', 'RedTeamWards',
       'BlueTeamAssists', 'RedTeamAssists', 'BlueTeamXp', 'RedTeamXp',
       'BlueTeamTotalGoldDifference', 'RedTeamTotalGoldDifference',
       'BlueTeamMVPKills', 'RedTeamMVPKills'],
      dtype='object')

In [22]:
X_train_team_data = X_train.loc[:, cols_to_keep]
X_train_team_data

Unnamed: 0,BlueTeamDeaths,RedTeamDeaths,BlueTeamGoldPerSec,RedTeamGoldPerSec,BlueTeamJungleMinionsKilled,RedTeamJungleMinionsKilled,BlueTeamKills,RedTeamKills,BlueTeamLevel,RedTeamLevel,...,BlueTeamWards,RedTeamWards,BlueTeamAssists,RedTeamAssists,BlueTeamXp,RedTeamXp,BlueTeamTotalGoldDifference,RedTeamTotalGoldDifference,BlueTeamMVPKills,RedTeamMVPKills
3513,4,8,30,20,54,44,8,4,36,30,...,35,208,9,2,17886,15180,1046,-1046,2.0,2.0
1250,3,6,20,30,60,52,6,3,35,33,...,14,12,9,5,18575,16532,1327,-1327,3.0,3.0
3532,3,10,30,30,64,52,10,3,37,34,...,19,14,5,3,21106,17694,4519,-4519,4.0,1.0
3858,9,12,20,20,56,64,12,9,35,34,...,29,16,9,11,18593,17114,1433,-1433,5.0,4.0
528,9,11,20,20,40,55,11,9,34,33,...,12,193,10,7,18305,17077,834,-834,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6,14,30,20,49,38,14,6,35,32,...,18,10,10,7,18275,15159,4619,-4619,5.0,2.0
4060,11,4,30,20,44,48,4,11,32,35,...,14,13,5,13,15977,18056,-2111,2111,2.0,4.0
1346,8,8,30,30,56,51,8,8,36,35,...,14,10,8,9,18739,17614,250,-250,4.0,6.0
3454,4,4,30,20,64,60,4,4,36,34,...,14,15,6,3,18604,18467,1594,-1594,2.0,3.0


In [24]:
#Build the models
best_model = model.get_random_forest_models(X_train_team_data, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6334
Max Depth:  8
Min Samples Per Leaf:  14


In [26]:
#What were the most important features?
best_features = pd.DataFrame(best_model.feature_importances_, X_train_team_data.columns)
best_features.sort_values(by = 0, ascending = False).head(10)

Unnamed: 0,0
RedTeamTotalGoldDifference,0.123389
BlueTeamTotalGoldDifference,0.117637
RedTeamXp,0.054142
RedTeamTotalGold,0.0515
BlueTeamXp,0.051006
RedTeamTotalDamageDoneToChampions,0.048396
BlueTeamTotalGold,0.046152
BlueTeamPhysicalDamageDoneToChampions,0.031445
BlueTeamTotalDamageDoneToChampions,0.030607
BlueTeamMagicDamageDoneToChampions,0.030401


# Can We Keep Only One Team's Data and Maintain or Improve Model Accuracy?

In [27]:
cols_to_keep = X_train_team_data.columns[X_train_team_data.columns.str.contains('Blue')]
cols_to_keep

Index(['BlueTeamDeaths', 'BlueTeamGoldPerSec', 'BlueTeamJungleMinionsKilled',
       'BlueTeamKills', 'BlueTeamLevel', 'BlueTeamMagicDamageDoneToChampions',
       'BlueTeamMinionsKilled', 'BlueTeamPhysicalDamageDoneToChampions',
       'BlueTeamTimeEnemySpentControlled',
       'BlueTeamTotalDamageDoneToChampions', 'BlueTeamTotalGold',
       'BlueTeamTrueDamageDoneToChampions', 'BlueTeamWards', 'BlueTeamAssists',
       'BlueTeamXp', 'BlueTeamTotalGoldDifference', 'BlueTeamMVPKills'],
      dtype='object')

In [28]:
X_train_blue_team = X_train_team_data.loc[:, cols_to_keep]
X_train_blue_team

Unnamed: 0,BlueTeamDeaths,BlueTeamGoldPerSec,BlueTeamJungleMinionsKilled,BlueTeamKills,BlueTeamLevel,BlueTeamMagicDamageDoneToChampions,BlueTeamMinionsKilled,BlueTeamPhysicalDamageDoneToChampions,BlueTeamTimeEnemySpentControlled,BlueTeamTotalDamageDoneToChampions,BlueTeamTotalGold,BlueTeamTrueDamageDoneToChampions,BlueTeamWards,BlueTeamAssists,BlueTeamXp,BlueTeamTotalGoldDifference,BlueTeamMVPKills
3513,4,30,54,8,36,4418,192,6593,401302,11500,16113,487,35,9,17886,1046,2.0
1250,3,20,60,6,35,3617,226,9274,355801,13729,16660,835,14,9,18575,1327,3.0
3532,3,30,64,10,37,2008,227,10075,334801,12518,19832,432,19,5,21106,4519,4.0
3858,9,20,56,12,35,6012,213,10534,351805,18813,18884,2262,29,9,18593,1433,5.0
528,9,20,40,11,34,7679,188,7935,231642,16812,17716,1197,12,10,18305,834,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,6,30,49,14,35,7577,207,12252,408303,21641,19403,1809,18,10,18275,4619,5.0
4060,11,30,44,4,32,7488,172,5356,352066,14220,15704,1370,14,5,15977,-2111,2.0
1346,8,30,56,8,36,2395,183,9920,199400,12844,17589,526,14,8,18739,250,4.0
3454,4,30,64,4,36,3184,228,8200,398893,11385,16759,0,14,6,18604,1594,2.0


In [29]:
#Build the models
best_model = model.get_random_forest_models(X_train_blue_team, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6252
Max Depth:  4
Min Samples Per Leaf:  14


In [31]:
y_train.value_counts(normalize = True)

200.0    0.518509
100.0    0.481491
Name: winningTeam, dtype: float64

In [32]:
#What were the most important features?
best_features = pd.DataFrame(best_model.feature_importances_, X_train_blue_team.columns)
best_features.sort_values(by = 0, ascending = False).head(10)

Unnamed: 0,0
BlueTeamTotalGoldDifference,0.348754
BlueTeamTotalGold,0.133557
BlueTeamDeaths,0.114156
BlueTeamXp,0.108818
BlueTeamMinionsKilled,0.045884
BlueTeamKills,0.033393
BlueTeamTotalDamageDoneToChampions,0.031304
BlueTeamMagicDamageDoneToChampions,0.027262
BlueTeamAssists,0.025697
BlueTeamPhysicalDamageDoneToChampions,0.024809


# Can We Only Use Individual Stats and Maintain or Improve Model Accuracy?

In [50]:
cols_to_keep = X_train.columns[(X_train.columns.str.contains('Team') == False)& (X_train.columns.str.contains('dragon') == False)]
cols_to_keep

Index(['assistsplayer_1', 'assistsplayer_10', 'assistsplayer_2',
       'assistsplayer_3', 'assistsplayer_4', 'assistsplayer_5',
       'assistsplayer_6', 'assistsplayer_7', 'assistsplayer_8',
       'assistsplayer_9',
       ...
       'xp_10', 'xp_2', 'xp_3', 'xp_4', 'xp_5', 'xp_6', 'xp_7', 'xp_8', 'xp_9',
       'riftherald_team200'],
      dtype='object', length=175)

In [52]:
X_train_players = X_train.loc[:, cols_to_keep]
X_train_players

Unnamed: 0,assistsplayer_1,assistsplayer_10,assistsplayer_2,assistsplayer_3,assistsplayer_4,assistsplayer_5,assistsplayer_6,assistsplayer_7,assistsplayer_8,assistsplayer_9,...,xp_10,xp_2,xp_3,xp_4,xp_5,xp_6,xp_7,xp_8,xp_9,riftherald_team200
3513,1.0,0.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,1.0,...,2699.0,3323.0,4350.0,3785.0,3204.0,1617.0,3805.0,5010.0,2049.0,0.0
1250,1.0,0.0,0.0,3.0,0.0,5.0,0.0,3.0,0.0,2.0,...,4197.0,4627.0,2781.0,2777.0,3916.0,4206.0,2758.0,2365.0,3006.0,0.0
3532,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,...,3073.0,3965.0,4961.0,3586.0,3179.0,3995.0,3442.0,4516.0,2668.0,0.0
3858,1.0,6.0,3.0,1.0,0.0,4.0,1.0,0.0,0.0,4.0,...,2517.0,3877.0,4989.0,3019.0,2615.0,3654.0,3877.0,3737.0,3329.0,0.0
528,1.0,2.0,2.0,2.0,3.0,2.0,0.0,2.0,1.0,2.0,...,1997.0,3179.0,4250.0,3041.0,3102.0,3822.0,3981.0,3556.0,3721.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,3.0,2.0,1.0,0.0,1.0,5.0,0.0,2.0,1.0,2.0,...,2800.0,3413.0,4569.0,3028.0,2421.0,3449.0,2877.0,3477.0,2556.0,0.0
4060,2.0,5.0,1.0,0.0,1.0,1.0,0.0,6.0,2.0,0.0,...,2847.0,2933.0,4149.0,3099.0,1914.0,3995.0,3233.0,4440.0,3541.0,0.0
1346,2.0,2.0,2.0,0.0,2.0,2.0,4.0,0.0,2.0,1.0,...,2802.0,4641.0,5293.0,3224.0,2681.0,4258.0,3652.0,4161.0,2741.0,0.0
3454,0.0,0.0,1.0,0.0,2.0,3.0,0.0,1.0,0.0,2.0,...,2152.0,3437.0,4279.0,3331.0,2885.0,4645.0,3949.0,4759.0,2962.0,0.0


In [53]:
#Build the models
best_model = model.get_random_forest_models(X_train_players, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6206
Max Depth:  10
Min Samples Per Leaf:  5


In [54]:
#What were the most important features?
best_features = pd.DataFrame(best_model.feature_importances_, X_train_players.columns)
best_features.sort_values(by = 0, ascending = False).head(10)

Unnamed: 0,0
totalGold_9,0.020772
totalGold_2,0.018935
totalGold_8,0.016779
totalGold_4,0.015249
totalGold_10,0.014764
totalGold_1,0.01388
totalDamageDoneToChampions_6,0.013619
xp_2,0.0132
totalGold_7,0.01312
gameDuration,0.012908


# Will a KNN Model Improve Accuracy?

In [64]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [60]:
#First need to scale the data
#Create the scaler
scaler = MinMaxScaler()

#Fit and transform on train data
X_train_scaled = scaler.fit_transform(X_train)

In [72]:
#Build a function for KNN
def get_KNN_models(X_train_scaled, y_train, param_dict, cv = 5):
    """
    This function takes in scaled data and builds an optimized KNN classification model. 
    It will use the parameters specified in the param_dict to optimizie across.
    
    This function utilizes GridSearchCV.
    
    This function returns the best model and prints out its parameters and mean
    cross-validated accuracy.
    """
    #Create the KNN model
    clf = KNeighborsClassifier()
    
    #Create the GridSearchCV object
    grid = GridSearchCV(clf, param_dict, cv)
    
    #Fit the GridSearchCV object
    grid.fit(X_train_scaled, y_train)
    
    #Print the best model's score and parameters
    print('Mean Cross-Validated Accuracy: ', round(grid.best_score_, 4))
    print('Num Neighbors: ', grid.best_params_['n_neighbors'])
    print('Weights: ', grid.best_params_['weights'])
    
    #Return the best model
    return grid.best_estimator_

In [79]:
#Create a param_dict for KNN model
param_dict = {
    'n_neighbors': range(200, 500),
    'weights': ['uniform', 'distance']
}

In [80]:
#Test the function. Run it on the full X_train_scaled data set (Team and Player data)
best_model = get_KNN_models(X_train_scaled, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6254
Num Neighbors:  216
Weights:  uniform


# Can We Use Only Team Data and Maintain or Improve KNN Model Accuracy?

In [81]:
#Scale the data
X_train_team_data_scaled = scaler.fit_transform(X_train_team_data)

In [82]:
#build the models
best_model = get_KNN_models(X_train_team_data_scaled, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6301
Num Neighbors:  499
Weights:  distance


# Can We Use Only A Single Team's Data and Maintain or Improve KNN Model Accuracy?

In [83]:
#Scale the data
X_train_blue_team_scaled = scaler.fit_transform(X_train_blue_team)

In [84]:
#Build the models
best_model = get_KNN_models(X_train_blue_team_scaled, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6252
Num Neighbors:  229
Weights:  uniform


# Can We Only Use Individual Stats and Maintain or Improve KNN Model Accuracy?

In [85]:
#Scale the data
X_train_players_scaled = scaler.fit_transform(X_train_players)

In [86]:
#Build the models
best_model = get_KNN_models(X_train_players_scaled, y_train, param_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.6103
Num Neighbors:  494
Weights:  uniform


# Try Using Josh C's Suggested Features for Both Random Forest and KNN Models

In [88]:
suggested_features = [
    'xp_1',
    'killsplayer_2',
    'killsplayer_4',
    'assistsplayer_5'
]

In [89]:
X_train_suggested = X_train.loc[:, suggested_features]
X_train_suggested

Unnamed: 0,xp_1,killsplayer_2,killsplayer_4,assistsplayer_5
3513,3224.0,2.0,1.0,2.0
1250,4474.0,2.0,3.0,5.0
3532,5415.0,1.0,3.0,2.0
3858,4093.0,3.0,5.0,4.0
528,4733.0,2.0,1.0,2.0
...,...,...,...,...
1593,4844.0,3.0,4.0,5.0
4060,3882.0,2.0,1.0,1.0
1346,2900.0,4.0,0.0,2.0
3454,4672.0,1.0,2.0,3.0


__Random Forest Classifier__

In [90]:
rf_dict = {
    'max_depth': range(1, 16),
    'min_samples_leaf': range(1, 16)
}

In [92]:
#Build the random forest models
best_model = model.get_random_forest_models(X_train_suggested, y_train, rf_dict, cv = 5)

Mean Cross-Validated Accuracy:  0.5545
Max Depth:  5
Min Samples Per Leaf:  13


__KNN Classifier__

In [93]:
#Scale the data
X_train_suggested_scaled = scaler.fit_transform(X_train_suggested)

In [94]:
knn_dict = {
    'n_neighbors': range(200, 500),
    'weights': ['uniform', 'distance']
}

In [None]:
#Build the KNN models
best_model = get_KNN_models(X_train_suggested_scaled, y_train, knn_dict, cv = 5)