In [30]:
#Set up packages and load data frame
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

trainDF = pd.read_csv('../data/raw/train.csv')
testDF = pd.read_csv('../data/raw/test.csv')

#Experiment 1 - make goal values absolute
trainDF['3P Made'] = trainDF['3P Made'].abs()
trainDF['3PA'] = trainDF['3PA'].abs()
trainDF['3P%'] = trainDF['3P%'].abs()


In [10]:
#Remove Id columns as irrelevant


trainDF_cleaned = trainDF.copy()
trainDF_cleaned.drop(['Id_old','Id'], axis=1, inplace=True)
target = trainDF_cleaned.pop('TARGET_5Yrs')

testDF_cleaned = testDF.copy()
testDF_cleaned.drop(['Id_old','Id'], axis=1, inplace=True)

In [None]:
#Explore data
trainDF_cleaned.head
trainDF_cleaned.info()
trainDF_cleaned.describe()

In [44]:
#Experiment 2 - GridSearch with cross-validation

k_range = list(range(1, 31))
param_grid = {
    'n_neighbors': [27],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
    }

grid = GridSearchCV(knn, param_grid, verbose = 1, cv=10, n_jobs = -1)
grid.fit(trainDF, target)

grid.cv_results_
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)


Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.8s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=27, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [27],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [12]:
#Data pre-processing

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
trainDF_cleaned = scaler.fit_transform(trainDF_cleaned)
testDF_cleaned = scaler.fit_transform(testDF_cleaned)

In [13]:
#Save scaler model
from joblib import dump
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [14]:
#Split data into 20% val and 80% training data
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split (trainDF_cleaned, target, test_size=0.2, random_state=8)

np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)

np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)



In [15]:
#Baseline model

y_mean = y_train.mean()
y_base = np.full((len(y_train), 1), y_mean)

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

print(mse(y_train, y_base, squared=False))
print(mae(y_train, y_base))

0.3736996987472026
0.27930292968750003


In [47]:
#KNN Classification Model

#Import knearest neighbors Classifier model
from sklearn.neighbors import KNeighborsClassifier
#Instantiate KNN Classifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=27, p=2,
                     weights='uniform')

#Train the model using the training sets
knn.fit(X_train, y_train)

#Predict on val and train set
y_val_predKNN = knn.predict(X_val)
y_train_predKNN = knn.predict(X_train)

Confusion Matrix 
 [[   5  252]
 [   6 1337]]
Accuracy: 0.83875


In [58]:
#Evaluation - Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Print confusion matrix to evaluate classification accuracy
cm_val = confusion_matrix(y_val, y_val_predKNN)
print("Confusion Matrix \n", cm_val)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_val, y_val_predKNN))

#AUC score
knn_auc = roc_auc_score(y_val,y_val_predKNN)
print('AUC: %.2f' % knn_auc)


AUC: 0.51


In [59]:
#Compare against baseline - worse in mse but better in mae
print(mse(y_train, y_train_predKNN, squared=False))
print(mae(y_train, y_train_predKNN))

0.4085033659592048
0.166875


In [48]:
#Predict on test set for submission
y_test_predKNN = knn.predict_proba(testDF_cleaned)[:,1]
print(y_test_predKNN)

[0.59259259 0.77777778 1.         ... 0.77777778 0.85185185 0.88888889]


In [49]:
#Convert predictions to Kaggle submission format
TestPredDF = pd.DataFrame(y_test_predKNN, columns =['Target_5Yrs'])
TestPredDF.index.name = "Id"
TestPredDF.to_csv('../data/interim/KNN_GridsearchAll.csv')