In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix


In [11]:
game_data = pd.read_csv('D1_2015_Processed_Stats.csv')
print("Number of records= ", game_data.shape[0])
game_data.info()

Number of records=  5149
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5149 entries, 0 to 5148
Data columns (total 13 columns):
points_game           5149 non-null float64
field_goals_pct       5149 non-null float64
offensive_rebounds    5149 non-null float64
free_throws_att       5149 non-null float64
free_throws_pct       5149 non-null float64
turnovers             5149 non-null float64
win_pct               5149 non-null float64
game_id               5149 non-null object
home_team             5149 non-null bool
market                5149 non-null object
opp_market            5149 non-null object
game_result           5149 non-null object
game_date             5149 non-null object
dtypes: bool(1), float64(7), object(5)
memory usage: 487.8+ KB


In [12]:
numeric_feature_to_scale = ['points_game','field_goals_pct','offensive_rebounds','free_throws_att',
                'free_throws_pct','turnovers','win_pct']

game_data_scale = game_data[numeric_feature_to_scale].copy()
scaler =StandardScaler()
scaled_features = scaler.fit_transform(game_data_scale.values)
scaled_df = pd.DataFrame(scaled_features, index=game_data_scale.index, columns=game_data_scale.columns)

# Add the categorical column 
scaled_df['home_team'] = game_data['home_team']
scaled_df.head()

Unnamed: 0,points_game,field_goals_pct,offensive_rebounds,free_throws_att,free_throws_pct,turnovers,win_pct,home_team
0,0.635309,1.534024,-1.091424,0.756443,-0.102739,0.812673,-0.000316,False
1,-0.400047,-0.814191,-1.81426,-2.478703,1.552383,0.405574,-0.000316,True
2,0.635309,-1.036867,-2.537095,-1.146584,3.847301,-2.037022,3.100964,True
3,0.065863,0.623079,3.064879,0.280686,-1.994307,1.626872,-0.000316,True
4,1.049451,0.238457,-0.368589,1.327351,-0.061013,-6.922215,3.100964,True


In [13]:
X =scaled_df
X = pd.get_dummies(X, columns=['home_team'], drop_first=True)
y= game_data['game_result'].copy()
y = pd.get_dummies(y, columns=['game_result'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 10)

In [30]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8,16,8),max_iter=1000)
mlp.fit(X_train,y_train.WIN)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(8, 8, 16, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [31]:
y_pred = mlp.predict(X_test)
print("Neural Net model accuracy is %2.2f" % metrics.accuracy_score(y_test, y_pred))

Neural Net model accuracy is 0.70


In [32]:
print(classification_report(y_test,y_pred))  

             precision    recall  f1-score   support

          0       0.70      0.76      0.73       540
          1       0.70      0.64      0.67       490

avg / total       0.70      0.70      0.70      1030



{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [32]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,  y_train.WIN)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.4min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [33]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_depth': 10}