#### References

https://github.com/eclarson/DataMiningNotebooks/blob/master/04.%20Logits%20and%20SVM.ipynb
https://github.com/jakemdrew/EducationDataNC/blob/master/2017/Models/2017ComparingSegregatedHighSchoolCampuses.ipynb (Logit)



## Create Models

### Data Description


#### Train and Test Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.linear_model import LogisticRegression

team = pd.read_csv('teams2Plus.csv')

#Convert Y/N playoff flag to 1/0 indicator (values other than 'Y'/'N' become NaN)
team['Playoff'] = team['Playoff'].map({'Y':1, 'N':0})

#Drop records with missing values in the Playoff column.
#.copy() makes the filtered frame independent, so the cast below is a plain
#column assignment rather than a write into a slice of the original frame.
team = team[team['Playoff'].notna()].copy()
team['Playoff'] = team['Playoff'].astype(int)

#Store all franchise IDs per row for future references
allfranchID = team['franchID']

#Drop Categorical Columns with no predictive ability
team = team.drop(['teamIDBR', 'teamIDlahman45', 'teamIDretro', 'G', 'teamID', 'Ghome', 'name', 'park', 'lgID', 'divID', 'salary'], axis=1)

#Drop Columns which introduce leakage
team = team.drop(['LgWin', 'DivWin', 'WCWin', 'WSWin'], axis=1)

#Create Cross Validation Object with 10 random shuffle splits
## Not necessary for this data set, but will code for practice
#NOTE(review): test_size=0.80 fits on 20% of the rows and scores on the other 80%
#in every split -- confirm this is intended rather than test_size=0.20.
cv = ShuffleSplit(n_splits = 10, test_size=0.80, random_state=0)

#Also create Test set for 2017
#NOTE(review): these rows are a subset of `team`, so they also remain in teamX/teamY.
team2017 = team.loc[team['yearID'] == 2017]
franchid2017 = team2017['franchID']


#Drop last categorical column now that it has been preserved
team = team.drop(['franchID'], axis=1)
team2017 = team2017.drop(['franchID'], axis=1)

#Create X Explanatory and Y response variables for regression
#NOTE(review): the info() listing includes an 'Unnamed: 0' column, presumably the
#index saved with the CSV -- confirm whether it should be used as a feature.
teamY = team['Playoff']
teamX = team.drop('Playoff', axis=1)

print("Team DF")
team.info()
#A bare `team.tail()` in the middle of a cell is discarded; print it so it is shown
print(team.tail())

print("Team 2017")
team2017.info()

Team DF
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 0 to 1323
Data columns (total 42 columns):
Unnamed: 0    1296 non-null int64
yearID        1296 non-null int64
Rank          1296 non-null int64
W             1296 non-null int64
L             1296 non-null int64
R             1296 non-null int64
AB            1296 non-null int64
H             1296 non-null int64
2B            1296 non-null int64
3B            1296 non-null int64
HR            1296 non-null int64
BB            1296 non-null float64
SO            1296 non-null float64
SB            1296 non-null float64
CS            1296 non-null float64
HBP           1296 non-null float64
SF            1296 non-null float64
RA            1296 non-null int64
ER            1296 non-null int64
ERA           1296 non-null float64
CG            1296 non-null int64
SHO           1296 non-null int64
SV            1296 non-null int64
IPouts        1296 non-null int64
HA            1296 non-null int64
HRA           1296 no

In [2]:
#Last check for NA values: count the missing entries in each column of team
team.isnull().sum()

Unnamed: 0    0
yearID        0
Rank          0
W             0
L             0
R             0
AB            0
H             0
2B            0
3B            0
HR            0
BB            0
SO            0
SB            0
CS            0
HBP           0
SF            0
RA            0
ER            0
ERA           0
CG            0
SHO           0
SV            0
IPouts        0
HA            0
HRA           0
BBA           0
SOA           0
E             0
DP            0
FP            0
attendance    0
BPF           0
PPF           0
Playoff       0
WHIP          0
KBB           0
KAB           0
Bavg          0
Slug          0
OBP           0
OPS           0
dtype: int64

### Logistic Regression
#### Grid Search CV
------

#### Classifier Evaluation

In [3]:
#Credit To:  https://github.com/jakemdrew/EducationDataNC/blob/master/2017/Models/2017ComparingSegregatedHighSchoolCampuses.ipynb

from sklearn.model_selection import cross_validate

def EvaluateClassifierEstimator(classifierEstimator, X, y, cv):
    """Cross-validate a classifier and report accuracy, precision and recall.

    Parameters
    ----------
    classifierEstimator : estimator passed to cross_validate
    X : explanatory variables to cross-validate on
    y : response variable matching X
    cv : cross-validation splitter (or fold count) passed to cross_validate

    Returns
    -------
    pandas.DataFrame with one row per cv split and the columns
    'Accuracy', 'Precision' and 'Recall' (test-split scores).

    The averages of the three test scores are also printed.
    """

    #Perform cross validation on the data supplied by the caller
    #(previously the notebook globals teamX/teamY were used and X/y were ignored)
    scores = cross_validate(classifierEstimator, X, y, scoring=['accuracy','precision','recall']
                            , cv=cv, return_train_score=True)

    #Average each test metric over all splits
    Accavg = scores['test_accuracy'].mean()
    Preavg = scores['test_precision'].mean()
    Recavg = scores['test_recall'].mean()

    print_str = "The average accuracy for all cv folds is: \t\t\t {Accavg:.5}"
    print_str2 = "The average precision for all cv folds is: \t\t\t {Preavg:.5}"
    print_str3 = "The average recall for all cv folds is: \t\t\t {Recavg:.5}"

    print(print_str.format(Accavg=Accavg))
    print(print_str2.format(Preavg=Preavg))
    print(print_str3.format(Recavg=Recavg))
    print('*********************************************************')

    #Per-split test scores, one row per split
    print('Cross Validation Fold Mean Error Scores')
    scoresResults = pd.DataFrame()
    scoresResults['Accuracy'] = scores['test_accuracy']
    scoresResults['Precision'] = scores['test_precision']
    scoresResults['Recall'] = scores['test_recall']

    return scoresResults

def EvaluateClassifierEstimator2(classifierEstimator, X, y, cv):
    """Print a classification report, confusion matrix and accuracy from
    cross-validated predictions.

    Parameters
    ----------
    classifierEstimator : estimator passed to cross_val_predict
    X : explanatory variables
    y : true response values matching X
    cv : cross-validation splitter (or fold count) passed to cross_val_predict

    Returns
    -------
    None; the three results are printed.

    NOTE(review): cross_val_predict presumably needs a cv that assigns every row
    to exactly one test split -- verify before passing a ShuffleSplit object.
    """

    #Perform cross validation on the data supplied by the caller
    #(previously the notebook globals teamX/teamY were used and X/y were ignored)
    from sklearn.model_selection import cross_val_predict
    predictions = cross_val_predict(classifierEstimator, X, y, cv=cv)

    #model evaluation
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    #pass true values and predictions to the metric functions
    #(the original referenced an undefined name `Y` here)
    classReport = classification_report(y, predictions)
    confMat = confusion_matrix(y, predictions)
    acc = accuracy_score(y, predictions)

    print (classReport)
    print (confMat)
    print (acc)

#### Perform Logistic Regression Using Grid Search CV

In [4]:
#Logistic regression hyperparameter search, scored over the 10 splits defined by `cv`
from sklearn.linear_model import LogisticRegression
regEstimator = LogisticRegression()

#Candidate hyperparameters.
#class_weight takes 'balanced', a dict, or the Python object None; the string
#'none' is not a documented value, so None is used for "no class weighting".
parameters = { 'penalty':['l2']
              ,'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'class_weight': ['balanced', None]
              ,'random_state': [0]
              ,'solver': ['lbfgs']
              ,'max_iter':[100,500]
             }

#Create a grid search object using the estimator and parameter grid above
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=regEstimator
                   , n_jobs=8 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # splitter created in the data preparation cell (10 splits)
                   , scoring='accuracy')

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(teamX, teamY)

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.0s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    9.1s finished


GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.8, train_size=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'penalty': ['l2'], 'random_state': [0], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [100, 500], 'class_weight': ['balanced', 'none'], 'solver': ['lbfgs']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [16]:
#Display the fitted coefficients (coef_) of the best estimator found by the grid search
regGridSearch.best_estimator_.coef_

array([[-6.94248649e-04, -6.47705199e-05, -1.70403632e-03,
         6.50218861e-03, -6.61468392e-03,  1.92541427e-02,
        -6.30503860e-03,  6.77202773e-03, -9.10675933e-05,
         1.33138918e-03,  6.30086572e-03,  1.22436629e-03,
         2.71654751e-03,  3.78392302e-03,  5.56902667e-06,
         1.75407168e-03,  1.41434989e-03, -1.24676218e-02,
        -1.11756650e-02, -7.54723759e-05, -3.36355058e-04,
         3.30589121e-04,  4.83258121e-03,  5.05353535e-03,
         5.96123108e-04, -2.99727114e-03, -5.72835491e-03,
        -1.04105601e-03, -8.14246079e-04,  2.91857071e-04,
         9.89066714e-08,  7.64938294e-07, -1.25560193e-04,
        -3.38681559e-04, -5.38167226e-06,  2.40880083e-05,
         6.55015504e-07,  1.51165229e-06,  5.77495988e-06,
         1.65180673e-06,  7.42676661e-06]])

In [6]:
#Use the best estimator found by the grid search (a logistic regression classifier)
classifierEst = regGridSearch.best_estimator_

#Evaluate the classifier above using our pre-defined cross validation and scoring metrics.
#The returned per-split score table is the cell's displayed value.
EvaluateClassifierEstimator(classifierEst, teamX, teamY, cv)

The average accuracy for all cv folds is: 			 0.86557
The average precision for all cv folds is: 			 0.68189
The average recall for all cv folds is: 			 0.50894
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,Accuracy,Precision,Recall
0,0.870781,0.678571,0.516304
1,0.883317,0.694915,0.647368
2,0.866924,0.702479,0.454545
3,0.857281,0.694915,0.42268
4,0.863067,0.660377,0.544041
5,0.869817,0.659341,0.621762
6,0.862102,0.697674,0.463918
7,0.853423,0.640288,0.465969
8,0.874638,0.762712,0.46875
9,0.854388,0.627586,0.484043


In [22]:
#Display the fitted coefficients (coef_) of the best estimator held by regGridSearch
regGridSearch.best_estimator_.coef_

array([[-6.94248649e-04, -6.47705199e-05, -1.70403632e-03,
         6.50218861e-03, -6.61468392e-03,  1.92541427e-02,
        -6.30503860e-03,  6.77202773e-03, -9.10675933e-05,
         1.33138918e-03,  6.30086572e-03,  1.22436629e-03,
         2.71654751e-03,  3.78392302e-03,  5.56902667e-06,
         1.75407168e-03,  1.41434989e-03, -1.24676218e-02,
        -1.11756650e-02, -7.54723759e-05, -3.36355058e-04,
         3.30589121e-04,  4.83258121e-03,  5.05353535e-03,
         5.96123108e-04, -2.99727114e-03, -5.72835491e-03,
        -1.04105601e-03, -8.14246079e-04,  2.91857071e-04,
         9.89066714e-08,  7.64938294e-07, -1.25560193e-04,
        -3.38681559e-04, -5.38167226e-06,  2.40880083e-05,
         6.55015504e-07,  1.51165229e-06,  5.77495988e-06,
         1.65180673e-06,  7.42676661e-06]])

In [29]:
#Predictions using Grid Search CV
#Is there a difference between .predict and .best_estimator_.predict?  Nope.
#Print class predictions and class probabilities on teamX, first through the
#grid search object itself and then through its best_estimator_.
for _label, _est in (("Plain GridSearch Prediction", regGridSearch),
                     ("Best Estimator GridSearch Prediction", regGridSearch.best_estimator_)):
    print(_label)
    print(_est.predict(teamX))
    print(_est.predict_proba(teamX))

Plain GridSearch Prediction
[0 1 0 ... 0 0 1]
[[0.99412152 0.00587848]
 [0.0960525  0.9039475 ]
 [0.89725698 0.10274302]
 ...
 [0.95323381 0.04676619]
 [0.99234163 0.00765837]
 [0.35492726 0.64507274]]
Best Estimator GridSearch Prediction
[0 1 0 ... 0 0 1]
[[0.99412152 0.00587848]
 [0.0960525  0.9039475 ]
 [0.89725698 0.10274302]
 ...
 [0.95323381 0.04676619]
 [0.99234163 0.00765837]
 [0.35492726 0.64507274]]


### Testing Scaled vs Unscaled Data

### Recursive Feature Elimination

In [9]:
#Credit to:  Jake Drew NC Education Data Set Analysis

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit

#### Added by GS ####
print("Logistic Regression 1st Pass")
regEstimator = LogisticRegression()

#Candidate hyperparameters for the unscaled logistic regression.
#class_weight takes 'balanced', a dict, or the Python object None; the string
#'none' is not a documented value, so None is used for "no class weighting".
parameters = { 'penalty':['l2']
              ,'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'class_weight': ['balanced', None]
              ,'random_state': [0]
              ,'solver': ['lbfgs']
              ,'max_iter':[100,500]
             }

#Create a grid search object using the estimator and parameter grid above
from sklearn.model_selection import GridSearchCV
regGridSearch = GridSearchCV(estimator=regEstimator
                   , n_jobs=8 # jobs to run in parallel
                   , verbose=1 # low verbosity
                   , param_grid=parameters
                   , cv=cv # splitter created in the data preparation cell (10 splits)
                   , scoring='accuracy')

#Perform hyperparameter search to find the best combination of parameters for our data
regGridSearch.fit(teamX, teamY)

#Use the best estimator found by the grid search (a logistic regression classifier)
classifierEst = regGridSearch.best_estimator_
#### End Added by GS ####

print("Logistic Regression Second Pass")
#Recursive Feature Elimination with cross-validated selection of the feature count
rfecv = RFECV(estimator=classifierEst, step=1, cv=cv, scoring='accuracy')
X_BestFeatures = rfecv.fit_transform(teamX, teamY)

#create a pipeline to scale all of the data and perform logistic regression during each grid search step.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

#Define a range of hyper parameters for grid search
#(keys are prefixed with the pipeline step name 'logisticregression__')
parameters = { 'logisticregression__penalty':['l2']
              ,'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              ,'logisticregression__class_weight': ['balanced', None]
              ,'logisticregression__random_state': [0]
              ,'logisticregression__solver': ['lbfgs']
              ,'logisticregression__max_iter':[100,500]
             }

#Perform the grid search using accuracy as a metric during cross validation.
grid = GridSearchCV(pipe, parameters, cv=cv, scoring='accuracy')

#Fit the scaled pipeline on the full feature set.
#NOTE(review): X_BestFeatures from the RFECV step above is not used here; the
#search runs on every column of teamX. Fitting on X_BestFeatures instead would
#also require later predict/evaluate calls to use the reduced columns.
grid.fit(teamX, teamY)

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    9.5s finished


Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=10, class_weight='none', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False))])


In [15]:
#Use the best estimator found by the pipeline grid search (scaler + logistic regression)
classifierEst = grid.best_estimator_

#Evaluate the classifier above using our pre-defined cross validation and scoring metrics.
#The returned per-split score table is the cell's displayed value.
EvaluateClassifierEstimator(classifierEst, teamX, teamY, cv)

The average accuracy for all cv folds is: 			 0.98042
The average precision for all cv folds is: 			 0.94962
The average recall for all cv folds is: 			 0.94366
*********************************************************
Cross Validation Fold Mean Error Scores


Unnamed: 0,Accuracy,Precision,Recall
0,0.972999,0.943182,0.902174
1,0.991321,1.0,0.952632
2,0.992285,0.97861,0.97861
3,0.991321,0.979275,0.974227
4,0.992285,0.989418,0.968912
5,0.982642,0.926829,0.984456
6,0.974928,0.9375,0.927835
7,0.945998,0.868852,0.832461
8,0.992285,0.979167,0.979167
9,0.968177,0.893401,0.93617


In [17]:
#Class predictions and class probabilities from the best pipeline, computed on teamX
print(grid.best_estimator_.predict(teamX))
print(grid.best_estimator_.predict_proba(teamX))

[0 1 0 ... 0 0 1]
[[1.00000000e+00 5.81096988e-11]
 [2.56733454e-02 9.74326655e-01]
 [9.99835518e-01 1.64482157e-04]
 ...
 [9.99999997e-01 3.37262781e-09]
 [9.99999986e-01 1.36970413e-08]
 [3.68681241e-02 9.63131876e-01]]


#### Logistic Regression - Towards Data Science Approach
https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

#### Logistic Regression Using R

#### Top Logistic Regression Model

### SVM

#let's investigate SVMs on the data and play with the parameters and kernels
#NOTE(review): X_train_scaled, y_train, X_test_scaled, y_test and `mt` are not
#defined in the visible part of this notebook before this point -- presumably a
#template carried over from the referenced notebook; confirm before running.
from sklearn.svm import SVC

#train the model just as before
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto') # get object
svm_clf.fit(X_train_scaled, y_train)  # train object

y_hat = svm_clf.predict(X_test_scaled) # get test set predictions

#accuracy and confusion matrix of the predictions against y_test
acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)
print('accuracy:', acc )
print(conf)

In [24]:
#SVM for consolidated team level baseball data created in Lab 1.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics as mt

scaler = StandardScaler()

#Standardize the features; the scaler is fit on every row of teamX
teamX_scaled = scaler.fit_transform(teamX)

#train the model just as before
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto') # get object
svm_clf.fit(teamX_scaled, teamY)  # train object

#Predict on the same rows the model was fit on, so the metrics below are
#in-sample (training) scores rather than held-out estimates
y_hat = svm_clf.predict(teamX_scaled)

acc = mt.accuracy_score(teamY,y_hat)
conf = mt.confusion_matrix(teamY,y_hat)
prec = mt.precision_score(teamY, y_hat)
print('accuracy:', acc )
print('precision:', prec)
print(conf)

accuracy: 0.9552469135802469
precision: 0.9285714285714286
[[1043   15]
 [  43  195]]


In [20]:
#look at the support vectors of the fitted svm_clf:
#shape of the support vector array, shape of their index array, and the
#n_support_ counts
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_ )


(416, 41)
(416,)
[217 199]


In [25]:
# SVM based Prediction: print the y_hat array produced by svm_clf.predict
print(y_hat)

[0 1 0 ... 0 0 1]


#### SGD

#use some compact notation for creating a linear SVM classifier with stochastic descent
from sklearn.linear_model import SGDClassifier

regularize_const = 0.1
iterations = 5
#`n_iter` is no longer accepted by SGDClassifier; max_iter together with tol=None
#(no early stopping) runs the same fixed number of passes over the data.
svm_sgd = SGDClassifier(alpha=regularize_const,
        fit_intercept=True, l1_ratio=0.0, learning_rate='optimal',
        loss='hinge', max_iter=iterations, tol=None, n_jobs=-1, penalty='l2')

#NOTE(review): X and y are not defined in the visible part of this notebook and are
#indexed positionally below, so they are presumably numpy arrays (e.g. teamX.values
#and teamY.values) -- confirm before running.
scl = StandardScaler()
for train_idx, test_idx in cv.split(X,y):
    #fit the scaler on the training rows only, then reuse it for the test rows
    svm_sgd.fit(scl.fit_transform(X[train_idx]),y[train_idx])
    yhat = svm_sgd.predict(scl.transform(X[test_idx]))

    conf = mt.confusion_matrix(y[test_idx],yhat)
    acc = mt.accuracy_score(y[test_idx],yhat)

#acc holds the value from the last split only
print('SVM:', acc)