#### References

https://github.com/eclarson/DataMiningNotebooks/blob/master/04.%20Logits%20and%20SVM.ipynb
https://github.com/jakemdrew/EducationDataNC/blob/master/2017/Models/2017ComparingSegregatedHighSchoolCampuses.ipynb (Logit)



## Create Models

### Data Description


#### Train and Test Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.linear_model import LogisticRegression

#Load the consolidated team-level data set
team = pd.read_csv('~/7331_MiniLab/data/teams2Plus.csv')

#Convert Y/N playoff flag to 1/0 indicator (any other value maps to NaN)
team['Playoff'] = team['Playoff'].map({'Y':1, 'N':0})

#Drop records with missing values in the Playoff column.
#.copy() detaches the filtered frame so the column assignment below
#does not trigger pandas' SettingWithCopyWarning.
team = team.dropna(subset=['Playoff']).copy()
team['Playoff'] = team['Playoff'].astype(int)

#Store all franchise IDs per row for future references
allfranchID = team['franchID']

#Drop Categorical Columns with no predictive ability
team = team.drop(['teamIDBR', 'teamIDlahman45', 'teamIDretro', 'G', 'teamID', 'Ghome', 'name', 'park', 'lgID', 'divID', 'salary'], axis=1)

#Drop Columns which introduce leakage
team = team.drop(['LgWin', 'DivWin', 'WCWin', 'WSWin', 'W', 'L'], axis=1)
#NOTE(review): 'Unnamed: 0' (looks like a saved row index) and 'Rank' are still
#present and end up in teamX as predictors -- confirm that this is intended.

#Create Cross Validation Object: 10 independent random shuffle splits (not K folds)
## Not necessary for this data set, but will code for practice
#NOTE(review): test_size=0.80 trains each split on only 20% of the rows and
#tests on the other 80% -- confirm this was not meant to be 0.20.
cv = ShuffleSplit(n_splits = 10, test_size=0.80, random_state=0)

#Also create Test set for 2017
#NOTE(review): these 2017 rows are NOT removed from `team`, so they are also part
#of the data the models are fitted on -- confirm whether a true hold-out was intended.
team2017 = team.loc[team['yearID'] == 2017]
franchid2017 = team2017['franchID']


#Drop last categorical column now that it has been preserved
team = team.drop(['franchID'], axis=1)
team2017 = team2017.drop(['franchID'], axis=1)

#Create X Explanatory and Y response variables for regression
teamY = team['Playoff']
teamX = team.drop('Playoff', axis=1)
teamX_colNames = teamX.columns.tolist()

print("Team DF")
team.info()
#A bare `team.tail()` in the middle of a cell displays nothing; print it explicitly
print(team.tail())

print("Team 2017")
team2017.info()


Team DF
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 0 to 1323
Data columns (total 40 columns):
Unnamed: 0    1296 non-null int64
yearID        1296 non-null int64
Rank          1296 non-null int64
R             1296 non-null int64
AB            1296 non-null int64
H             1296 non-null int64
2B            1296 non-null int64
3B            1296 non-null int64
HR            1296 non-null int64
BB            1296 non-null float64
SO            1296 non-null float64
SB            1296 non-null float64
CS            1296 non-null float64
HBP           1296 non-null float64
SF            1296 non-null float64
RA            1296 non-null int64
ER            1296 non-null int64
ERA           1296 non-null float64
CG            1296 non-null int64
SHO           1296 non-null int64
SV            1296 non-null int64
IPouts        1296 non-null int64
HA            1296 non-null int64
HRA           1296 non-null int64
BBA           1296 non-null int64
SOA           1296 no

In [2]:
#Final sanity check: count of missing values in every remaining column
team.isna().sum()

Unnamed: 0    0
yearID        0
Rank          0
R             0
AB            0
H             0
2B            0
3B            0
HR            0
BB            0
SO            0
SB            0
CS            0
HBP           0
SF            0
RA            0
ER            0
ERA           0
CG            0
SHO           0
SV            0
IPouts        0
HA            0
HRA           0
BBA           0
SOA           0
E             0
DP            0
FP            0
attendance    0
BPF           0
PPF           0
Playoff       0
WHIP          0
KBB           0
KAB           0
Bavg          0
Slug          0
OBP           0
OPS           0
dtype: int64

### Logistic Regression
------

#### Collinearity

In [20]:
#Credit to:
###https://stats.stackexchange.com/questions/155028/how-to-systematically-remove-collinear-variables-in-python

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

#vif(teamX)

In [19]:
#Logistic Regression Summary table with full model fit prior to scaling, cross validation or recursive 
#feature elimination.

#import statsmodels.api as sm
#logit_model = sm.Logit(teamY, teamX)
#result = logit_model.fit()
#print(result.summary2())
#print("AIC:", result.aic)
#print("BIC:", result.bic)

#### Classifier Evaluation

In [4]:
#Credit To:  https://github.com/jakemdrew/EducationDataNC/blob/master/2017/Models/2017ComparingSegregatedHighSchoolCampuses.ipynb

from sklearn.model_selection import cross_validate
#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def EvaluateClassifierEstimator(classifierEstimator, X, y, cv):
    """Cross-validate a classifier and report accuracy, precision and recall.

    Parameters
    ----------
    classifierEstimator : estimator passed through to ``cross_validate``.
    X : explanatory variables passed through to ``cross_validate``.
    y : response variable passed through to ``cross_validate``.
    cv : cross-validation splitter (or fold count) passed through to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``Accuracy``, ``Precision`` and
        ``Recall`` holding the test-set scores. The averages over all splits
        are printed as a side effect.
    """
    #Perform cross validation on the data handed in by the caller.
    #(Previously the notebook-level teamX/teamY were used and X, y were ignored.)
    scores = cross_validate(classifierEstimator, X, y
                            , scoring=['accuracy', 'precision', 'recall']
                            , cv=cv, return_train_score=True)

    Accavg = scores['test_accuracy'].mean()
    Preavg = scores['test_precision'].mean()
    Recavg = scores['test_recall'].mean()

    print_str = "The average accuracy for all cv folds is: \t\t\t {Accavg:.5}"
    print_str2 = "The average precision for all cv folds is: \t\t\t {Preavg:.5}"
    print_str3 = "The average recall for all cv folds is: \t\t\t {Recavg:.5}"

    print(print_str.format(Accavg=Accavg))
    print(print_str2.format(Preavg=Preavg))
    print(print_str3.format(Recavg=Recavg))
    print('*********************************************************')

    #Per-split test scores, one row per split
    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame({
        'Accuracy': scores['test_accuracy'],
        'Precision': scores['test_precision'],
        'Recall': scores['test_recall'],
    })

    return scoresResults

def EvaluateClassifierEstimator2(classifierEstimator, X, y, cv):
    """Print a classification report, confusion matrix and accuracy from
    out-of-sample cross-validated predictions.

    Parameters
    ----------
    classifierEstimator : estimator passed through to ``cross_val_predict``.
    X : explanatory variables passed through to ``cross_val_predict``.
    y : true labels; also used as the reference when scoring the predictions.
    cv : cross-validation splitter passed through to ``cross_val_predict``.
        NOTE(review): per the scikit-learn documentation ``cross_val_predict``
        needs a splitter that partitions the data (every row in exactly one
        test set, e.g. KFold); a ShuffleSplit object is expected to be
        rejected -- confirm before calling this with the notebook's ``cv``.

    Returns
    -------
    None. The three results are printed.
    """
    #Perform cross validation on the data handed in by the caller.
    #(Previously the notebook-level teamX/teamY were used and X, y were ignored.)
    from sklearn.model_selection import cross_val_predict
    predictions = cross_val_predict(classifierEstimator, X, y, cv=cv)

    #model evaluation
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    #pass true values and predictions to the metric functions
    #(the original referenced an undefined name `Y` here, which raised NameError)
    classReport = classification_report(y, predictions)
    confMat = confusion_matrix(y, predictions)
    acc = accuracy_score(y, predictions)

    print(classReport)
    print(confMat)
    print(acc)

#### Perform Logistic Regression Using Grid Search CV

In [5]:
#Logistic regression tuned by grid search, scored on the shared `cv` splitter
from sklearn.linear_model import LogisticRegression
regEstimator = LogisticRegression()

#Hyper-parameter grid: 7 values of C x 2 class weightings x 2 iteration limits = 28 candidates.
#Unweighted classes are requested with the Python object None; the string 'none'
#is not a documented class_weight value and recent scikit-learn releases reject it.
parameters = { 'penalty':['l2']
              ,'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'class_weight': ['balanced', None]
              ,'random_state': [0]
              ,'solver': ['lbfgs']
              ,'max_iter':[100,500]
             }

#Create a grid search object using the estimator, grid and cv splitter defined above
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=regEstimator
                   , n_jobs=8 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # splitter created in the data-preparation cell
                   , scoring='accuracy')

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(teamX, teamY)

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.1s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    9.2s finished


In [6]:
#Display the fitted coefficient array (coef_) of the best estimator found by the grid search
regGridSearch.best_estimator_.coef_

array([[-1.30865242e-03,  7.28020490e-04, -2.51441234e-04,
         5.23317942e-03, -4.06654300e-04,  2.66439629e-03,
         6.75772476e-04,  1.34743801e-04,  1.38848967e-03,
         2.90960782e-03, -6.47629475e-04,  5.29937900e-04,
        -1.21187933e-04,  2.62024813e-04,  3.60724383e-04,
        -6.81829298e-03, -6.29209862e-03, -3.91064732e-05,
         2.44344786e-04,  2.18724149e-04,  6.08241122e-04,
         9.68473588e-04, -5.32808568e-03, -1.10511298e-03,
        -4.05971636e-03,  2.80982860e-03, -5.14775999e-04,
        -2.44082200e-04,  4.71420473e-07,  1.04324784e-06,
         7.03806100e-06, -7.89040172e-05, -6.35761731e-06,
         2.24963939e-05, -6.34582296e-08,  6.06000666e-07,
         1.60269440e-06,  9.08975084e-07,  2.51166948e-06]])

In [7]:
#Take the best estimator found by the grid search (a LogisticRegression classifier)
classifierEst = regGridSearch.best_estimator_

#Evaluate that classifier using our pre-defined cross validation object and scoring metrics.
EvaluateClassifierEstimator(classifierEst, teamX, teamY, cv)

The average accuracy for all cv folds is: 			 0.86316
The average precision for all cv folds is: 			 0.67994
The average recall for all cv folds is: 			 0.49177
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,Accuracy,Precision,Recall
0,0.862102,0.658915,0.461957
1,0.865959,0.664516,0.542105
2,0.863067,0.645161,0.534759
3,0.859209,0.75,0.371134
4,0.863067,0.66242,0.53886
5,0.865959,0.645161,0.621762
6,0.851495,0.675439,0.396907
7,0.858245,0.7,0.403141
8,0.878496,0.735714,0.536458
9,0.864031,0.662069,0.510638


In [8]:
#Predictions from the GridSearchCV wrapper and from its best estimator, printed
#back to back to check whether .predict and .best_estimator_.predict differ
#(they do not).
prediction_sources = [
    ("Plain GridSearch Prediction", regGridSearch),
    ("Best Estimator GridSearch Prediction", regGridSearch.best_estimator_),
]
for heading, fitted_model in prediction_sources:
    print(heading)
    print(fitted_model.predict(teamX))
    print(fitted_model.predict_proba(teamX))

Plain GridSearch Prediction
[0 1 0 ... 0 0 1]
[[0.95662678 0.04337322]
 [0.31342188 0.68657812]
 [0.8012295  0.1987705 ]
 ...
 [0.98469441 0.01530559]
 [0.94326762 0.05673238]
 [0.44101563 0.55898437]]
Best Estimator GridSearch Prediction
[0 1 0 ... 0 0 1]
[[0.95662678 0.04337322]
 [0.31342188 0.68657812]
 [0.8012295  0.1987705 ]
 ...
 [0.98469441 0.01530559]
 [0.94326762 0.05673238]
 [0.44101563 0.55898437]]


### Testing Scaled vs Unscaled Data

### Recursive Feature Elimination

In [9]:
#Credit to:  Jake Drew NC Education Data Set Analysis

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit

#### Added by GS ####
#First pass: grid search over an unscaled logistic regression to obtain the
#estimator that recursive feature elimination will use.
print("Logistic Regression 1st Pass")
regEstimator = LogisticRegression()

#Unweighted classes are requested with the Python object None; the string 'none'
#is not a documented class_weight value and recent scikit-learn releases reject it.
parameters = { 'penalty':['l2']
              ,'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'class_weight': ['balanced', None]
              ,'random_state': [0]
              ,'solver': ['lbfgs']
              ,'max_iter':[100,500]
             }

#Create a grid search object (GridSearchCV is already imported at the top of this cell)
regGridSearch = GridSearchCV(estimator=regEstimator
                   , n_jobs=8 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # splitter created in the data-preparation cell
                   , scoring='accuracy')

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(teamX, teamY)

#Keep the best logistic regression estimator from the search
classifierEst = regGridSearch.best_estimator_
#### End Added by GS ####

print("Logistic Regression Second Pass")
#Recursive Feature Elimination with cross validation, removing one feature per step
rfecv = RFECV(estimator=classifierEst, step=1, cv=cv, scoring='accuracy', verbose=1)
X_BestFeatures = rfecv.fit_transform(teamX, teamY)

print("Ranking", rfecv.ranking_)
print("Support", rfecv.support_)
print("Number of Features:", rfecv.n_features_)

#create a pipeline to scale all of the data and perform logistic regression during each grid search step.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

#Define a range of hyper parameters for grid search (same fix as above: None, not 'none')
parameters = { 'logisticregression__penalty':['l2']
              ,'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'logisticregression__class_weight': ['balanced', None]
              ,'logisticregression__random_state': [0]
              ,'logisticregression__solver': ['lbfgs']
              ,'logisticregression__max_iter':[100,500]
             }

#Perform the grid search using accuracy as a metric during cross validation.
grid = GridSearchCV(pipe, parameters, cv=cv, scoring='accuracy')

#Fit the scaled pipeline on ALL columns of teamX.
#NOTE(review): X_BestFeatures from the RFECV step above is computed but not used
#here -- confirm whether the search was meant to run on the reduced feature set.
grid.fit(teamX, teamY)

Logistic Regression 1st Pass
Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.2s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    9.3s finished


Logistic Regression Second Pass
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 featur

Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 feat

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.8, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'logisticregression__class_weight': ['balanced', 'none'], 'logisticregression__max_iter': [100, 500], 'logisticregression__solver': ['lbfgs'], 'logisticregression__penalty': ['l2'], 'logisticregression__random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [13]:
#Take the best estimator found by the pipeline grid search (StandardScaler + LogisticRegression)
classifierEst = grid.best_estimator_

#Evaluate that classifier using our pre-defined cross validation object and scoring metrics.
EvaluateClassifierEstimator(classifierEst, teamX, teamY, cv)
#Alternative report based on cross_val_predict, currently disabled:
#EvaluateClassifierEstimator2(classifierEst, teamX, teamY, cv)

The average accuracy for all cv folds is: 			 0.98351
The average precision for all cv folds is: 			 0.95857
The average recall for all cv folds is: 			 0.95162
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,Accuracy,Precision,Recall
0,0.982642,0.956044,0.945652
1,0.991321,1.0,0.952632
2,0.99325,0.983871,0.97861
3,0.992285,0.984375,0.974227
4,0.992285,0.989418,0.968912
5,0.988428,0.945813,0.994819
6,0.978785,0.952632,0.93299
7,0.959499,0.902703,0.874346
8,0.99325,0.984293,0.979167
9,0.963356,0.886598,0.914894


In [11]:
#Print the class predictions, then the class probabilities, of the best pipeline on teamX
for predict_fn in (grid.best_estimator_.predict, grid.best_estimator_.predict_proba):
    print(predict_fn(teamX))


[0 1 0 ... 0 0 1]
[[1.00000000e+00 5.32747065e-11]
 [2.97600599e-02 9.70239940e-01]
 [9.99831989e-01 1.68011289e-04]
 ...
 [9.99999997e-01 3.26795872e-09]
 [9.99999987e-01 1.32818161e-08]
 [3.81823296e-02 9.61817670e-01]]


#### Top Logistic Regression Model

### SVM

#Reference snippet: investigate SVMs on the data and play with the parameters and kernels.
#NOTE(review): X_train_scaled, y_train, X_test_scaled and y_test are not assigned
#anywhere in the visible notebook, and `mt` is only imported in the SVM cell further
#down, so this snippet cannot run as written. The cell below fits the same SVC
#configuration on teamX/teamY instead.
from sklearn.svm import SVC

#train the model just as before
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto') # get object
svm_clf.fit(X_train_scaled, y_train)  # train object

y_hat = svm_clf.predict(X_test_scaled) # get test set predictions

#accuracy and confusion matrix of the predictions against y_test
acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)
print('accuracy:', acc )
print(conf)

In [14]:
#SVM for consolidated team level baseball data created in Lab 1.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics as mt

#Standardise the explanatory variables before fitting the SVM
scaler = StandardScaler()

teamX_scaled = scaler.fit_transform(teamX)

#train an RBF-kernel SVC on the full scaled data set
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto') # get object
svm_clf.fit(teamX_scaled, teamY)  # train object

#NOTE(review): predictions are made on the same rows the model was fitted on,
#so the accuracy/precision/confusion matrix below are in-sample figures, not
#hold-out estimates.
y_hat = svm_clf.predict(teamX_scaled)

acc = mt.accuracy_score(teamY,y_hat)
conf = mt.confusion_matrix(teamY,y_hat)
prec = mt.precision_score(teamY, y_hat)
print('accuracy:', acc )
print('precision:', prec)
print(conf)

accuracy: 0.9567901234567902
precision: 0.9333333333333333
[[1044   14]
 [  42  196]]


In [15]:
#Inspect the fitted SVM: shape of the support vectors, shape of their index
#array, and the n_support_ counts
for support_info in (svm_clf.support_vectors_.shape, svm_clf.support_.shape, svm_clf.n_support_):
    print(support_info)


(444, 39)
(444,)
[236 208]


In [16]:
# SVM based prediction: the y_hat array produced by svm_clf.predict in the SVM cell above
print(y_hat)

[0 1 0 ... 0 0 1]


#### SGD  (Necessary Given Data Size?)

#Linear SVM (hinge loss) trained with stochastic gradient descent, evaluated on
#every split of the shared `cv` splitter.
from sklearn.linear_model import SGDClassifier

regularize_const = 0.1
iterations = 5
#`n_iter` was removed from SGDClassifier; max_iter with tol=None runs the same
#fixed number of passes over the data. random_state makes the shuffling repeatable.
svm_sgd = SGDClassifier(alpha=regularize_const,
        fit_intercept=True, l1_ratio=0.0, learning_rate='optimal',
        loss='hinge', max_iter=iterations, tol=None, n_jobs=-1, penalty='l2',
        random_state=0)

scl = StandardScaler()
fold_acc = []
#The snippet originally indexed undefined X/y arrays; use the notebook's
#teamX/teamY and positional .iloc indexing, as cv.split yields row positions.
for train_idx, test_idx in cv.split(teamX, teamY):
    X_train, y_train = teamX.iloc[train_idx], teamY.iloc[train_idx]
    X_test, y_test = teamX.iloc[test_idx], teamY.iloc[test_idx]

    #fit the scaler on the training rows only, then apply it to the test rows
    svm_sgd.fit(scl.fit_transform(X_train), y_train)
    yhat = svm_sgd.predict(scl.transform(X_test))

    conf = mt.confusion_matrix(y_test, yhat)
    acc = mt.accuracy_score(y_test, yhat)
    fold_acc.append(acc)

#Report every split and the mean instead of only the last split's accuracy
print('SVM accuracy per split:', fold_acc)
print('SVM:', np.mean(fold_acc))