# Modeling

In [None]:
# Core Packages
import seaborn as sns
import matplotlib.pyplot as plt
import os 
%matplotlib inline

# ML Packages
from sklearn.linear_model import SGDRegressor, ElasticNetCV, LogisticRegression
from sklearn.metrics import mean_squared_error, make_scorer, f1_score, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Core Packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os 
%matplotlib inline

# Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Silence every warning issued through the `warnings` module from this point on.
# NOTE(review): a blanket 'ignore' also hides any convergence or deprecation
# warnings the estimators below may emit — consider narrowing it to a category.
import warnings
warnings.filterwarnings('ignore')

### Choosing a Machine Learning Algorithm
The model-comparison approach below is adapted from the Kaggle kernel [A Data Science Framework: To Achieve 99% Accuracy](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy).

In [None]:
# Make the project folder the working directory so the relative CSV path used below resolves.
# NOTE(review): this absolute path only exists on the original author's machine;
# point it at your local checkout (or a configurable data directory) before running.
import os
os.chdir("/Users/erikgregorywebb/Documents/Python/nba-prediction/")

In [None]:
# Load the training split and the held-out split used by the preprocessing cell.
# NOTE(review): `test` was referenced later in the notebook without ever being loaded,
# which raises NameError on a fresh kernel. The file name "test.csv" is assumed by
# analogy with "train.csv" — confirm it matches the actual data file.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
# Candidate classifiers to benchmark. Every estimator is created with its default
# arguments, except the two kernel SVMs, which are constructed with probability=True.
# The groups are concatenated in order, so the list order matches the grouping below.
MLA = (
    # Ensemble methods
    [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
    ]
    # Gaussian processes
    + [gaussian_process.GaussianProcessClassifier()]
    # Generalized linear models
    + [
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
    ]
    # Naive Bayes
    + [
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
    ]
    # Nearest neighbors
    + [neighbors.KNeighborsClassifier()]
    # Support vector machines
    + [
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
    ]
    # Decision trees
    + [
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
    ]
    # Discriminant analysis
    + [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # XGBoost
    + [XGBClassifier()]
)

In [None]:
# Columns removed from both splits before modeling (labelled "non-numeric" by the
# original author). Listed once so the two splits cannot drift apart.
dropped_columns = ['Team', 'Opponent', 'Team-Score', 'Location', 'Opponent-Score', 'Date', 'Time', 'Season']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

# Separate the "Win" label from each split, leaving only predictor columns behind
target_train = train['Win']
target_test = test['Win']
train = train.drop(columns='Win')
test = test.drop(columns='Win')

# Standardize: the scaler is fitted on the training predictors only, and that same
# fitted transform is then applied to both splits
scaler = StandardScaler().fit(train)
scaled_train = scaler.transform(train)
scaled_test = scaler.transform(test)

In [None]:
# Cross-validation splitter: 2 independent random shuffles, each fitting on 70% of the
# rows and scoring on 20% (the remaining 10% is left out of that round). The fixed
# random_state keeps the splits repeatable across runs.
# (The previous comment described 10 runs with an 80/20 split, which did not match
# the arguments actually passed.)
cv_split = model_selection.ShuffleSplit(n_splits=2, train_size=0.7, test_size=0.2, random_state=0)

# Column layout and empty results table for the algorithm comparison
MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Time']
MLA_compare = pd.DataFrame(columns=MLA_columns)

In [None]:
# Benchmark every candidate with the shared splitter and collect one summary record each.
# Records are gathered in a list and converted to the table in a single step, so
# re-running this cell rebuilds the table from scratch instead of patching rows in place.
MLA_records = []
for alg in MLA:
    print(alg)  # progress indicator

    # return_train_score must be requested explicitly: current scikit-learn releases
    # leave 'train_score' out of the result by default, so reading it raised KeyError.
    cv_results = model_selection.cross_validate(
        alg, scaled_train, target_train, cv=cv_split, return_train_score=True
    )

    MLA_records.append({
        'MLA Name': alg.__class__.__name__,
        'MLA Parameters': str(alg.get_params()),
        # Means are taken over the splits produced by cv_split
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        'MLA Time': cv_results['fit_time'].mean(),
    })

MLA_compare = pd.DataFrame(MLA_records, columns=MLA_columns)

In [None]:
# Rank the algorithms from highest to lowest mean test accuracy (reorders the table in place)
MLA_compare.sort_values('MLA Test Accuracy Mean', ascending=False, inplace=True)
# Display the ranking without the verbose parameter strings; the stored table keeps them
MLA_compare.drop(columns=['MLA Parameters'])

In [None]:
# Bar chart of mean cross-validated test accuracy, one bar per algorithm,
# drawn in the table's current row order
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='MLA Test Accuracy Mean', y='MLA Name', data=MLA_compare, color='b', ax=ax)
ax.set_title('Mean Test Accuracy by Algorithm \n')
# The plotted scores are fractions in [0, 1], not percentages, so the "(%)" suffix
# that used to be on this label was misleading and has been removed
ax.set_xlabel('Accuracy Score')
ax.set_ylabel('Algorithm');

In [None]:
# Bar chart of mean fit time per algorithm, slowest first.
# Note: the shared comparison table is re-sorted in place here, so it stays ordered
# by run time after this cell.
MLA_compare.sort_values('MLA Time', ascending=False, inplace=True)
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=MLA_compare, x='MLA Time', y='MLA Name', color='b', ax=ax)
ax.set_title('Mean Run Time by Algorithm \n')
ax.set_xlabel('Mean Run Time (Seconds)')
ax.set_ylabel('Algorithm')