# Regression

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn

# To create test/train splits
from sklearn.model_selection import train_test_split
# To help us go through different parameter configurations for
# each type of model.
from sklearn.model_selection import GridSearchCV
# To help us evaluate the model on each trial or "split"
from sklearn.model_selection import cross_val_score

# Naive Bayes
from sklearn.neighbors import KNeighborsRegressor
# Neural Networks
from sklearn.neural_network import MLPRegressor

# Confusion matrix
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import Imputer

In [21]:
# Import our regression evaluation metrics
from sklearn.metrics import explained_variance_score, r2_score

In [22]:
# This is to stop the barrage of warning messages we'll get later
import warnings; warnings.simplefilter('ignore')

In [23]:
# Create a seed to use later.
seed = 42

In [24]:
# First import your dataset as usual
dataset = "datasets/SkillCraft1_Dataset.csv"

df = pd.read_csv(dataset, delimiter=",")

df.head()

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.0
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0.0,0.000208
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,4.043,22,0.000745,6,0.0,0.000189
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,5.3e-05,0.000543,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.0,0.001329,0.002368,22.6885,62.0813,9.374,15,0.001174,4,0.0,1.9e-05


In [25]:
df.describe()

Unnamed: 0,GameID,LeagueIndex,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
count,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0,3395.0
mean,4805.012371,4.184094,117.046947,0.004299,0.000374,4.364654,9.8e-05,0.000387,0.003463,40.361562,63.739403,5.272988,22.131664,0.001032,6.534021,5.9e-05,0.000142
std,2719.944851,1.517327,51.945291,0.005284,0.000225,2.360333,0.000166,0.000377,0.000992,17.15357,19.238869,1.494835,7.431719,0.000519,1.857697,0.000111,0.000265
min,52.0,1.0,22.0596,0.0,0.0,0.0,0.0,0.0,0.000679,6.6667,24.0936,2.0389,5.0,7.7e-05,2.0,0.0,0.0
25%,2464.5,3.0,79.9002,0.001258,0.000204,3.0,0.0,0.00014,0.002754,28.95775,50.4466,4.27285,17.0,0.000683,5.0,0.0,0.0
50%,4874.0,4.0,108.0102,0.0025,0.000353,4.0,4e-05,0.000281,0.003395,36.7235,60.9318,5.0955,22.0,0.000905,6.0,0.0,2e-05
75%,7108.5,5.0,142.7904,0.005133,0.000499,6.0,0.000119,0.000514,0.004027,48.2905,73.6813,6.0336,27.0,0.001259,8.0,8.6e-05,0.000181
max,10095.0,8.0,389.8314,0.043088,0.001752,10.0,0.003019,0.004041,0.007971,237.1429,176.3721,18.5581,58.0,0.005149,13.0,0.000902,0.003084


In [26]:
# Let's see how big our dataset is
n_samples, n_columns = df.shape
print("Number of samples: {}".format(n_samples))

Number of samples: 3395


In [27]:
# Step 1: Create visualization

# For this dataset, we can't get it to plot easily, we'll try this later

In [28]:
# Step 2: Separate features and labels


#imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
#imputer = imputer.fit(df[::])
#df = imputer.transform(df[::])
columns_to_encode =["Age","HoursPerWeek","TotalHours"]
df = pd.get_dummies(data=df, columns=columns_to_encode)

df_X = df.drop("LeagueIndex", axis=1)
df_X.head()

Unnamed: 0,GameID,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,...,TotalHours_936,TotalHours_94,TotalHours_95,TotalHours_950,TotalHours_96,TotalHours_960,TotalHours_980,TotalHours_990,TotalHours_999,TotalHours_?
0,52,143.718,0.003515,0.00022,7,0.00011,0.000392,0.004849,32.6677,40.8673,...,0,0,0,0,0,0,0,0,0,0
1,55,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,...,0,0,0,0,0,0,0,0,0,0
2,56,69.9612,0.001101,0.000336,4,0.000294,0.000461,0.002926,44.6475,75.3548,...,0,0,0,0,0,0,0,0,0,0
3,57,107.6016,0.001034,0.000213,1,5.3e-05,0.000543,0.003783,29.2203,53.7352,...,0,0,0,0,0,0,0,0,0,0
4,58,122.8908,0.001136,0.000327,2,0.0,0.001329,0.002368,22.6885,62.0813,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df_y = df['LeagueIndex']
df_y.head()



0    5
1    5
2    4
3    3
4    3
Name: LeagueIndex, dtype: int64

In [30]:
# Step 3: Split into test, train
# We want 70% train, 30% test
train_X, test_X, train_y, test_y = train_test_split(df_X, df_y,
                                                    test_size=0.3,
                                                    random_state=seed)

## Step 4: Train Estimators

To demonstrate how to do crossfold-validation (CV), we'll use two algorithms:
- [Neural Network](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
- [KNN](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)


Here's what we'll be doing:
- Defining a "grid" of possibilities for the models' parameters
- Training on "splits" of our data
- Keeping the best model of each type (KNN, NN) that performed best on our dataset
- Showing the accuracies and confusion matrices of those best-performing models

__JUPYTER PRO-TIP__: You can time the run-time of individual cells in Jupyter by putting `%%time` at the start of any cell.

In [31]:
%%time
# We're going to build nn_params, a list of dictionaries
# Each dictionary has the variable name to try modifying, and
# which values to try for it.
# All combinations of variables in the dictionary below will be tried

nn_params = [
    {
        "hidden_layer_sizes": [
            (20,), (30,), # models with 1 hidden layer
        ],
        "max_iter": [40, 100],
        "activation": ["logistic", "tanh", "relu"],
        "learning_rate_init": [1e-4, 1e-3],
    },
]

# nn_tester will test the permutations of parameters in nn_params
# using 5 trials (splitting train set into 5, training on 4 and testing
# on the 5th).
nn_model = MLPRegressor(random_state=seed)
nn_experimenter = GridSearchCV(nn_model, nn_params, cv=5)
nn_experimenter.fit(train_X, train_y)

print("Best parameter set found: ")
print(nn_experimenter.best_params_)

Best parameter set found: 
{'activation': 'logistic', 'hidden_layer_sizes': (30,), 'learning_rate_init': 0.001, 'max_iter': 100}
CPU times: user 2min 43s, sys: 1min 27s, total: 4min 11s
Wall time: 4min 18s


## Step 5: Evaluation for Regression

Evaluating regression problems are slightly different, we use two different metrics for this:
- [explained_variance_score]() -- best possible score is `1.0`, lower values are worse
- [r2_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html#sklearn.metrics.explained_variance_score) -- best possible score is `1.0`, lower values are worse

We will use just `r2_score()` below for now but `explained_variance_score()` works similarly.

In [32]:
train_preds = nn_experimenter.predict(train_X)
test_preds = nn_experimenter.predict(test_X)

train_r2 = r2_score(train_preds, train_y)
test_r2 = r2_score(test_preds, test_y)

print("Train R2 score: {:.1f}%".format(train_r2))
print("Test R2 score {:.1f}%".format(test_r2))

Train R2 score: -0.7%
Test R2 score -0.8%


In [33]:
%%time

# Now we can do the same thing with KNN

# Notice here how we decide we only want one dictionary of params
# to try all permutations of.
knn_params = [
    {
        "n_neighbors": [2, 3, 4, 5, 8],
        "p": [1, 2],
        "algorithm": ["ball_tree", "kd_tree"]
    },
]

knn_model = KNeighborsRegressor()
knn_exp = GridSearchCV(knn_model, knn_params, cv=5)

knn_exp.fit(train_X, train_y)

print("Best parameter set found: ")
print(knn_exp.best_params_)

Best parameter set found: 
{'algorithm': 'ball_tree', 'n_neighbors': 8, 'p': 1}
CPU times: user 39.7 s, sys: 1.63 s, total: 41.4 s
Wall time: 46.1 s


In [34]:
train_preds = knn_exp.predict(train_X)
test_preds = knn_exp.predict(test_X)

train_r2 = r2_score(train_preds, train_y)
test_r2 = r2_score(test_preds, test_y)

print("Train R2 score: {:.1f}%".format(train_r2))
print("Test R2 score {:.1f}%".format(test_r2))

Train R2 score: 0.1%
Test R2 score -0.2%


In [35]:
best_knn = KNeighborsRegressor()
best_knn.set_params(**knn_exp.best_params_)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=8, p=1,
          weights='uniform')

In [36]:
knn_exp.best_estimator_

KNeighborsRegressor(algorithm='ball_tree', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=8, p=1,
          weights='uniform')

In [37]:
best_knn.fit(train_X, train_y)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=8, p=1,
          weights='uniform')

In [38]:
trn_knn = best_knn.predict(train_X)
test_knn =  best_knn.predict(test_X)