# Machine Learning Modeling - NFL data

### Import libraries (comment / uncomment Tensorflow, until fixed)

Problems with Tensorflow (for Neural Network):

* Had to set up separate environment within Anaconda for it. Some other modules, like beautifulsoup weren't supported and other modules had compatibility problems in that environment.
* Could not pickle the Tensorflow model for the Predict script.

Switched to sklearn NN; see: https://blog.eduonix.com/artificial-intelligence/explore-neural-networks-scikit-learn/

NN modeling: https://machinelearningmastery.com/how-to-configure-the-number-of-layers-and-nodes-in-a-neural-network/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import  StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib  
import pickle

# import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)     # only display errors

import sqlite3
from sqlite3 import Error

%matplotlib inline

#### Global Variables

In [2]:
notes = 'Notes: '                 # concatenate notes to this string for report, proceed each by ' **'

#### Key Inputs

In [3]:

features = 13                   # this is feature set # to access the feature set stored in testdb
saveRpt = 'Y'                   # Y/N to save report to testdb
alg = 'NN'                      # algorithmn to use (LR, LR-PCA, NN, RFC, LR-RFE, ETC, SVM, SVMgrid)
save_model = 'Y'                # Y/N to save model for production predictions


### This section includes all the functions to do algorithms

In [4]:
#### Logical Regression, w/o PCA
def LogRegNoPCA(xtrain,ytrain, xtest,ytest,xnote, featList):

    logmodel = LogisticRegression(solver='lbfgs')
    logmodel.fit(xtrain,ytrain)
    y_testpredict = logmodel.predict(xtest)
    x_trainpredict = logmodel.predict(xtrain)

    score = 0
    bestC = 999
    # C values: Inverse of regularization strength; smaller values specify stronger regularization
    for C_option in [.001,.50,.60,.70,.80, .85,.90,.95, 1, 1.05, 2, 5, 100]:
        logregC = LogisticRegression(C=C_option, solver='lbfgs').fit(xtrain, ytrain)
        if  logregC.score(xtest, ytest) > score:
            score = logregC.score(xtest, ytest)
            logmodel = logregC
            bestC = C_option      
            
    xnote = xnote + ' ** C selected, C = ' + str(bestC) + '; score (%):' + str(round(score,2)*100)     
        
    return y_testpredict,x_trainpredict,logmodel,xnote 


#### Logical Regression, w/ Recursive Feature Elimination
def LogRegRFE(xtrain,ytrain, xtest,ytest,xnote, featList):
    
    # create a base classifier used to evaluate a subset of attributes
    logmodel = LogisticRegression(solver='lbfgs')
    # create the RFE model and select n attributes
    num_attr = 10              # number of attributes
    rfe = RFE(logmodel, num_attr)
    rfe = rfe.fit(xtrain, ytrain)
    # summarize the selection of the attributes
    support = rfe.support_
    ranking = rfe.ranking_
           
    featureDF = pd.DataFrame({'Feature': featList, 'Support': support, 'Ranking': ranking})
    featureDF = featureDF.sort_values("Ranking", ascending=[True])
    print('Feature Analysis (contributors):\n', featureDF)
    
    y_testpredict = rfe.predict(xtest)
    x_trainpredict = rfe.predict(xtrain)
            
    xnote = xnote + ' ** RFE # of attributes: ' + str(num_attr)      
        
    return y_testpredict,x_trainpredict,rfe,xnote 



#### Logical Regression, w/ PCA
def LogRegPCA(xtrain,ytrain, xtest,xnotes):
    # Make an instance of the Model
    pca = PCA(.95)                      # alternatively, could force to n PCs; e.g. pca = PCA(2)
    pca.fit(xtrain)                            # fit the Scaled features
    xnotes = xnotes + ' ** PCA components:' + str(pca.n_components_ )
    
    X_trainP = pca.transform(xtrain)       # this creates the PCA components (new / composite features)
    X_testP = pca.transform(xtest)
    
    logR_PCA = LogisticRegression()                     # set up instance (model)
    logR_PCA.fit(X_trainP, ytrain)                     # fit training data

    y_testpredict = logR_PCA.predict(X_testP)                     # predict all test cases
    x_trainpredict = logR_PCA.predict(X_trainP)                     # predict all training cases
    
    print("PCA explained variance:\n", pca.explained_variance_ratio_)
    
    return y_testpredict,x_trainpredict,logR_PCA, xnotes

#### Support Vector Machine, No Grid
def SVM(xtrain,ytrain, xtest,ytest,xnote, featList):

    svc_model = SVC()
    svc_model.fit(xtrain,ytrain)
    y_testpredict = svc_model.predict(xtest)
    x_trainpredict = svc_model.predict(xtrain)

    xnote = xnote + ' ** SVM selected. No Grid.'        
    return y_testpredict,x_trainpredict,svc_model,xnote 

#### Support Vector Machine, w Grid
def SVMgrid(xtrain,ytrain, xtest,ytest,xnote, featList):

    svc_model = SVC()
    svc_model.fit(xtrain,ytrain)
    y_testpredict = svc_model.predict(xtest)
    x_trainpredict = svc_model.predict(xtrain)
    
    # Grid Search
    param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]}
    grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
    grid.fit(xtrain,ytrain)
    
    y_testpredict  = grid.predict(xtest)
    x_trainpredict  = grid.predict(xtrain)

    xnote = xnote + ' ** SVM selected. With Grid'        
    return y_testpredict,x_trainpredict,grid,xnote 

#### Extra Trees Classifier
def ExtraTrees(xtrain,ytrain, xtest,ytest,xnote, featList):
    
    # create a base classifier used to evaluate    
    ETCmodel = ExtraTreesClassifier()
    ETCmodel.fit(xtrain, ytrain)
    
    # display the relative importance of each attribute

    importances = ETCmodel.feature_importances_
    featureDF = pd.DataFrame({'Feature': featList, 'Importance': importances})
    featureDF['Importance'] = featureDF['Importance'].apply(lambda x: round(x*100,2))
    featureDF = featureDF.sort_values("Importance", ascending=[False])
    print('Feature Analysis (contributors) in %:\n', featureDF)
    
    y_testpredict = ETCmodel.predict(xtest)
    x_trainpredict = ETCmodel.predict(xtrain)
            
    xnote = xnote + ' ** ETC - see feature analysis. '      
        
    return y_testpredict,x_trainpredict,ETCmodel,xnote 


#### Random Forest Classifier
def RFC(xtrain,ytrain, xtest,xnotes, featList):
    estimators = 101               # Number of trees in forest; default = 100
    
    #  oob: use out-of-bag (OOB) samples to estimate the generalization accuracy (default = False)
    #  verbose : int, optional (default=0); Controls the verbosity when fitting and predicting
    rfcmodel = RandomForestClassifier(n_estimators=estimators, random_state=242, oob_score=True)
    rfcmodel.fit(xtrain, ytrain)
    y_testpredict = rfcmodel.predict(xtest)      # predict test data with model
    x_trainpredict = rfcmodel.predict(xtrain)    # predict train data with model
    

    importances = rfcmodel.feature_importances_
    featureDF = pd.DataFrame({'Feature': featList, 'Importance': importances})
    featureDF['Importance'] = featureDF['Importance'].apply(lambda x: round(x*100,2))
    featureDF = featureDF.sort_values("Importance", ascending=[False])
    print('Feature Analysis (contributors) in %:\n', featureDF)
 

    
    xnotes = xnotes + ' ** RF classifier, estimators:' + str(estimators) + '  oob_score = True'

    return y_testpredict,x_trainpredict,rfcmodel,xnotes 

#### Neural Network, w/o PCA
def NN(xtrain,ytrain, xtest,xnotes):
    ## The MLPClassifier and MLPRegressor are sklearn implementations of NNs

    # need to model output as an number 0/1  (make a copy; don't modify existing ytrain)
    print('ytrain:', len(ytrain),  type(ytrain))
    print('xtrain:', len(xtrain),  type(xtrain))    

    ytrainx = pd.Series(ytrain.values.copy('C'),index=ytrain.index, name = ytrain.name)
     
    print('ytrainx:', len(ytrainx), type(ytrainx))
    for i in range(len(ytrain)):            
        if ytrain.iloc[i] == 'A':            # A (home team) will be one; B (away team) will be zero
            ytrainx.iloc[i] = 1
        else: ytrainx.iloc[i] = 0

    #### added   https://www.kaggle.com/hhllcks/neural-net-with-gridsearch
    grid_search = 'n'
    if grid_search == 'y':
        mlp = MLPClassifier()
        parameters = {'solver': ['lbfgs'], 'max_iter': [100,500, 800], 'alpha': 10.0 ** -np.arange(1, 7),
                      'hidden_layer_sizes':np.arange(5, 12), 'random_state':[0,1,2]}
        # hidden_layer_sizes':np.arange(5, 12) gives [ 5  6  7  8  9 10 11]  
        gs = GridSearchCV(mlp, parameters)
        gs.fit(xtrain, ytrain)
        y_testpredictN = gs.predict(xtest)                  # will give 0s and 1s
        y_trainpredictN = gs.predict(xtrain)                # will give 0s and 1s
        print('Params:', gs.get_params(deep=True))
    else:
        # Alogorithm will generate input layer (nodes= # features) and output layers (=1, since binary prediction)
        hidden_layers=[12,8]  # define the layers/depth of the NN
        print("Creating a neural network with "+str(len(hidden_layers))+" layers and "+str(2000)+" iterations")
        xnotes = xnotes + ' ** NN, # of layers:' + str(len(hidden_layers)) + ' Nodes/layer:' + str([x for x in hidden_layers])
        mlp = MLPClassifier(hidden_layer_sizes=(hidden_layers),activation='relu', 
                            alpha = .0001, max_iter=2000, random_state=10, solver = 'adam') 

        # an object which represents the neural network
        # Remember to use the pre-processed data and not original values for fit()

        mlp.fit(xtrain, ytrainx)  # fit features over NN
        # predict for both training and testing
        y_testpredictN = mlp.predict(xtest)                  # will give 0s and 1s
        y_trainpredictN = mlp.predict(xtrain)                # will give 0s and 1s
        print('Params:', mlp.get_params(deep=True))
        
              # change 0/1 to A/B for test data
    ydf1 = pd.DataFrame(y_testpredictN, columns=['Num-NN'])
    ydf1.head(2)
    ydf1['AB-NN'] = np.where(ydf1['Num-NN'] == 1 , 'A', 'B')
    print('\nTest ydf1 : ', ydf1.shape)
    y_testpredict = ydf1['AB-NN']
              # change 0/1 to A/B for train data
    ydf = pd.DataFrame(y_trainpredictN, columns=['Num-NN'])
    ydf['AB-NN'] = np.where(ydf['Num-NN'] ==1 , 'A', 'B')
    print('\nTrain ydf : ', ydf.shape)
    y_trainpredict = ydf['AB-NN']             
    
    return y_testpredict, y_trainpredict, mlp, xnotes


# y_testPredict,x_trainPredict,model,notes = NN(X_train,y_train, X_test, notes)



### Connect to TestDB so that test results can be recorded.

In [5]:
# Connect to database
def create_connection(db_file):
    """ 
    Create a database connection to a SQLite database.
    Note: When you connect to an SQLite database file that does not exist, SQLite creates a new database.
    """
    try:
        conn = sqlite3.connect(db_file)
        print('Database created:', db_file, ';  Sqlite3 version:', sqlite3.version)
    except Error as e:
        print(e)
    finally:
        conn.close()

TestDB = "TestDB.sqlite"
create_connection(TestDB)
conn = sqlite3.connect(TestDB)         # make  “connection” to the database
cur = conn.cursor()                            # cursor is like a ﬁle handle

Database created: TestDB.sqlite ;  Sqlite3 version: 2.6.0


### Get data; create X and y sets

In [6]:

project = 'ai_affinity; nfl'                   # this is same for all of our NFL tests
mdf = pd.read_pickle('mdftemp')
inCnt = len(mdf)
print("Number of games in (InCnt):", inCnt)
print('Columns:', mdf.columns.tolist())
mdf.head(2)

Number of games in (InCnt): 146
Columns: ['Year', 'Wk', 'Game', 'Ateam', 'Awin', 'AwinH', 'AptsPos', 'AptsNeg', 'AqbrTot', 'AqbPts', 'AqbPass', 'AqbRun', 'AL5WinR', 'AL5PtsP', 'AL5PtsN', 'AL5Qtr4', 'AL5upSp', 'AL5upSn', 'AoScore', 'AdScore', 'AimpactS1', 'AimpactN1', 'AimpactS2', 'AimpactN2', 'AupSpH', 'AupSnH', 'ATOnet', 'ATOpos', 'ATOneg', 'ApenNegCnt', 'ApenNegYds', 'ApenPosCnt', 'ApenPosYds', 'Aplays', 'AFGgood', 'AFG50plus', 'Bteam', 'Bwin', 'BwinA', 'BptsPos', 'BptsNeg', 'BqbrTot', 'BqbPts', 'BqbPass', 'BqbRun', 'BL5WinR', 'BL5PtsP', 'BL5PtsN', 'BL5upSp', 'BL5upSn', 'BL5Qtr4', 'BoScore', 'BdScore', 'BimpactS1', 'BimpactN1', 'BimpactS2', 'BimpactN2', 'BupSpA', 'BupSnA', 'BTOnet', 'BTOpos', 'BTOneg', 'BpenNegCnt', 'BpenNegYds', 'BpenPosCnt', 'BpenPosYds', 'Bplays', 'BFGgood', 'BFG50plus', 'winner']


Unnamed: 0,Year,Wk,Game,Ateam,Awin,AwinH,AptsPos,AptsNeg,AqbrTot,AqbPts,...,BTOpos,BTOneg,BpenNegCnt,BpenNegYds,BpenPosCnt,BpenPosYds,Bplays,BFGgood,BFG50plus,winner
0,2019,2,1,Panthers,0.438,0.625,376,-382,55.6,13.3,...,1.1,-2.2,-7.3,-60.9,7.9,72.2,181.4,0.1,0.1,B
1,2019,2,2,Ravens,0.625,0.75,389,-287,57.3,11.0,...,1.0,-1.8,-6.3,-49.1,6.2,49.4,171.5,0.1,0.1,A


In [7]:
#############  ADJUSTMENTS     #######
print(mdf['Ateam']== 'Steelers')

0      False
1      False
2      False
3      False
4       True
       ...  
141    False
142    False
143    False
144    False
145    False
Name: Ateam, Length: 146, dtype: bool


In [8]:
# Data characteristics
Years = mdf['Year'].unique()
Weeks = mdf['Wk'].unique()
print ('Years in input:', Years,'  Weeks:', Weeks)

Years in input: [2019]   Weeks: [2 3 4 5 6 7 8 9 10 11]


In [9]:
# Any null values (need to eliminate due to normalization)  --- data should be clean
tempL = len(mdf)            
mdf = mdf.dropna()     # drop missing, nan, etc
print (' *** Rows dropped due to missing (NaN):', tempL - len(mdf)) 

 *** Rows dropped due to missing (NaN): 0


## Features
Feature Sets (list of features to use) are stored in the TestDB in the featureSet table.
Use the featureSet table as the master copy (retrive and use it here).
Manually create new feature sets in the featureSet table.

In [10]:
# Select Feature List

cur.execute('SELECT fsSet From featureSet WHERE fsNum = ?', ( features,))
feat_str = cur.fetchone()[0]
feat_list = feat_str.split(',')
print('\nFeature list length:', len(feat_list),'Features:',  feat_list)

Num_of_feat= len(feat_list)

X = mdf[feat_list]
y = mdf[['Year','winner']]
print(y.head(2))
X.head(3)

# print(X.loc[0:5,['AptsNeg', 'AL5PtsN','AupSnH','AL5upSn']])





Feature list length: 64 Features: ['Awin', 'AwinH', 'AptsPos', 'AptsNeg', 'AqbrTot', 'AqbPts', 'AqbPass', 'AqbRun', 'AL5WinR', 'AL5PtsP', 'AL5PtsN', 'AL5Qtr4', 'AL5upSp', 'AL5upSn', 'AoScore', 'AdScore', 'AimpactS1', 'AimpactN1', 'AimpactS2', 'AimpactN2', 'AupSpH', 'AupSnH', 'Aplays', 'ApenNegCnt', 'ApenNegYds', 'ApenPosCnt', 'ApenPosYds', 'ATOnet', 'ATOpos', 'ATOneg', 'AFGgood', 'AFG50plus', 'Bwin', 'BwinA', 'BptsPos', 'BptsNeg', 'BqbrTot', 'BqbPts', 'BqbPass', 'BqbRun', 'BL5WinR', 'BL5PtsP', 'BL5PtsN', 'BL5Qtr4', 'BL5upSp', 'BL5upSn', 'BoScore', 'BdScore', 'BimpactS1', 'BimpactN1', 'BimpactS2', 'BimpactN2', 'BupSpA', 'BupSnA', 'Bplays', 'BpenNegCnt', 'BpenNegYds', 'BpenPosCnt', 'BpenPosYds', 'BTOnet', 'BTOpos', 'BTOneg', 'BFGgood', 'BFG50plus']
   Year winner
0  2019      B
1  2019      A


Unnamed: 0,Awin,AwinH,AptsPos,AptsNeg,AqbrTot,AqbPts,AqbPass,AqbRun,AL5WinR,AL5PtsP,...,Bplays,BpenNegCnt,BpenNegYds,BpenPosCnt,BpenPosYds,BTOnet,BTOpos,BTOneg,BFGgood,BFG50plus
0,0.438,0.625,376,-382,55.6,13.3,41.7,8.4,0.2,19.8,...,181.4,-7.3,-60.9,7.9,72.2,-1.1,1.1,-2.2,0.1,0.1
1,0.625,0.75,389,-287,57.3,11.0,30.1,2.9,0.8,30.2,...,171.5,-6.3,-49.1,6.2,49.4,-0.8,1.0,-1.8,0.1,0.1
2,0.438,0.375,281,-359,49.4,4.6,17.8,5.5,0.2,15.0,...,173.8,-6.3,-52.7,5.4,42.1,0.2,1.2,-1.1,0.81,0.86


### Normalized the data, as appropriate

In [11]:
#  Try different scalers and options.
#### scaler = MinMaxScaler(feature_range=(-1, 1))      # default is feature_range=(0, 1)
scaler = StandardScaler(with_mean= False)                # mean of zero
cols = X.columns.tolist()
Xscale = pd.DataFrame(scaler.fit_transform(X), columns=cols)
print(Xscale.shape)
Xscale.head(3)

(146, 64)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,Awin,AwinH,AptsPos,AptsNeg,AqbrTot,AqbPts,AqbPass,AqbRun,AL5WinR,AL5PtsP,...,Bplays,BpenNegCnt,BpenNegYds,BpenPosCnt,BpenPosYds,BTOnet,BTOpos,BTOneg,BFGgood,BFG50plus
0,2.485087,3.053798,4.984375,-7.545538,4.681687,0.564561,1.519625,1.261873,0.708932,3.536615,...,38.625577,-10.646226,-10.189863,10.389574,10.068417,-1.889635,2.894521,-6.289823,0.323513,0.355918
1,3.546072,3.664558,5.156707,-5.66903,4.824832,0.46693,1.0969,0.435647,2.835728,5.394231,...,36.517566,-9.187839,-8.215472,8.153843,6.888917,-1.37428,2.631383,-5.146218,0.323513,0.355918
2,2.485087,1.832279,3.725025,-7.091226,4.159628,0.195262,0.648665,0.826227,0.708932,2.679254,...,37.007306,-9.187839,-8.817829,7.101734,5.870919,0.34357,3.157659,-3.144911,2.620453,3.060899


In [12]:
### Create Training and Test datasets by Year

### Create Training and Test datasets (Special Option or Traditional Option)

In [13]:
#            Special: Separate Training and Test data by Years 
special = 'no'
test_percent=.33                   # used in Traditional Approach (not the special)

if special == 'yes':                   ##### SPECIAL APPROACH (by year)
    split_year = 2017
    notes = notes + '**  Train / Test Split special: ' + str(split_year)
    Xscale['Year'] = mdf['Year']
    X_train = Xscale[(Xscale['Year'] < split_year)]    
    X_test = Xscale[(Xscale['Year'] >= split_year)]
    X_train.drop('Year', axis=1,inplace=True)
    X_test.drop('Year', axis=1,inplace=True)

    y['Year'] = mdf['Year']
    print('y:', y.shape, type(y))
    y_train= y[(y['Year'] < split_year)]
    y_test = y[(y['Year'] >= split_year)]
    y_train.drop('Year', axis=1,inplace=True)
    y_test = y_test['winner']

    print('X_train shape:', X_train.shape, type(X_train))
    print('y_train:', len(y_train), type(y_train) )
    y_train = y_train['winner']  #convert to series
    ydf = pd.DataFrame(y_train)

else:                                   ##### TRADITIONAL APPROACH (using algorithm)
    y = mdf['winner']
    notes = notes + '** Train / Test Split traditional at: ' + str(test_percent)
    print('X:', Xscale.shape, type(X))
    print('y:', y.shape, type(y))
    X_train, X_test, y_train, y_test = train_test_split(Xscale, y, test_size=test_percent, random_state=42)
    print('Number of Training games:', len(X_train), 'Test games:', len(X_test))
    print('X_train shape:', X_train.shape, type(X_train))
    print('y_train:', len(y_train), type(y_train) )
    ydf = pd.DataFrame(y_train)


X: (146, 64) <class 'pandas.core.frame.DataFrame'>
y: (146,) <class 'pandas.core.series.Series'>
Number of Training games: 97 Test games: 49
X_train shape: (97, 64) <class 'pandas.core.frame.DataFrame'>
y_train: 97 <class 'pandas.core.series.Series'>


###  Create Model and Make Predictions

In [14]:
# Create Model  ---- variable 'alg' is set in the beginning, Key Inputs, section

model = ' '               # this will be the ML model created by the algorithm. 

if alg == 'LR':           # variable 'alg' is set in the beginning, Key Inputs, section
    y_testPredict,x_trainPredict,model,notes = LogRegNoPCA(X_train,y_train, X_test, y_test, notes,feat_list)

if alg == 'LR-PCA':
    y_testPredict,x_trainPredict,model,notes = LogRegPCA(X_train,y_train, X_test, notes)

if alg == 'NN':
    y_testPredict,x_trainPredict,model,notes = NN(X_train,y_train, X_test, notes)

if alg == 'RFC':
    y_testPredict,x_trainPredict,model,notes = RFC(X_train,y_train, X_test, notes,feat_list)

if alg == 'LR-RFE':
    y_testPredict,x_trainPredict,model,notes = LogRegRFE(X_train,y_train, X_test,y_test, notes, feat_list)

if alg == 'ETC':
    y_testPredict,x_trainPredict,model,notes = ExtraTrees(X_train,y_train, X_test,y_test, notes, feat_list)

if alg == 'SVM':
    y_testPredict,x_trainPredict,model,notes = SVM(X_train,y_train, X_test,y_test, notes, feat_list)

if alg == 'SVMgrid':
    y_testPredict,x_trainPredict,model,notes = SVMgrid(X_train,y_train, X_test,y_test, notes, feat_list)
                
        
print(notes)
print('model type:', type(model))

ytrain: 97 <class 'pandas.core.series.Series'>
xtrain: 97 <class 'pandas.core.frame.DataFrame'>
ytrainx: 97 <class 'pandas.core.series.Series'>
Creating a neural network with 2 layers and 2000 iterations
Params: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': [12, 8], 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 2000, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 10, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}

Test ydf1 :  (49, 2)

Train ydf :  (97, 2)
Notes: ** Train / Test Split traditional at: 0.33 ** NN, # of layers:2 Nodes/layer:[12, 8]
model type: <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>


### Determine Accuracy of both the Train and Test data using model results

In [15]:
# Determine Accuracy of both the Train and Test data using model results
print('x_trainPredict:', len(x_trainPredict))
print('y_train:',len(y_train))
test_acc, train_acc = 0,0
for i in range(len(y_testPredict)):
    if y_testPredict[i] == y_test.iloc[i]: test_acc +=1
for i in range(len(x_trainPredict)):
    if x_trainPredict[i] == y_train.iloc[i]: train_acc +=1 
train_acc = round(train_acc/len(X_train)*100,1)
test_acc = round(test_acc / len(y_test)*100, 1)
print('Train acc:', train_acc, 'Test acc:', test_acc )

x_trainPredict: 97
y_train: 97
Train acc: 48.5 Test acc: 59.2


### Look at predictions

In [16]:
# Look at some of the test predictions.
predict_df = mdf[['Year','Wk','Game','Ateam','AL5WinR', 'Bteam','BL5WinR','winner' ]]
predict_df = predict_df.drop(predict_df.index[0:len(x_trainPredict)])            # drop training games
print('Length of X and y:', len(predict_df),len(y_testPredict))
predict_df = predict_df.assign(predict = pd.Series(y_testPredict).values)   # add predictions for each test game
predict_df['predict_score'] = np.where((predict_df['winner'] == predict_df['predict']), 1,0)  # 1 = prediction correct

predict_Year = 2017
errors_total = 0
games_total = 0
print('Predict Score (all data):', round(predict_df['predict_score'].sum() / len(predict_df)*100,1))
temp = predict_df.filter(['Year','Wk','Ateam', 'Bteam', 'winner','predict'],axis=1)
temp2 = temp.loc[temp['Year'] == predict_Year ]
print('\nPredictions, Year: {}'.format(predict_Year))
for w in Weeks:
    temp3 = temp2.loc[temp2['Wk'] == w]
    temp3['predict_Errors'] =  np.where(temp3['winner']!= temp3['predict'],1,0)
    print('Predict Score (selected data) year:{}, week: {}, Correct %: {}, Missed:{}'.format( predict_Year, w,
                100 -round(temp3['predict_Errors'].sum()/len(temp3)*100, 1),
                temp3['predict_Errors'].sum()))
    games_total = games_total + temp3['predict_Errors'].count()
    errors_total = errors_total + temp3['predict_Errors'].sum()
print('Errors totals: {}, Games total: {}, Error %: {}'.format(errors_total, games_total, 
                    round((errors_total/games_total)*100,1)))
temp3.tail()
temp4 = predict_df[(predict_df.Year ==2017)  ] 
print('\nDetails, Year: {}, Week: {}'.format('2017', '17'))
temp4

Length of X and y: 49 49
Predict Score (all data): 49.0

Predictions, Year: 2017
Predict Score (selected data) year:2017, week: 2, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 3, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 4, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 5, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 6, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 7, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 8, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 9, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 10, Correct %: nan, Missed:0
Predict Score (selected data) year:2017, week: 11, Correct %: nan, Missed:0
Errors totals: 0, Games total: 0, Error %: nan

Details, Year: 2017, Week: 17




Unnamed: 0,Year,Wk,Game,Ateam,AL5WinR,Bteam,BL5WinR,winner,predict,predict_score


### Calculate Performance

In [17]:
matrix = confusion_matrix(y_test,y_testPredict)
print('Confusion Matrix:\n')
true_pos  = int(matrix[0][0])
false_pos = int(matrix[0][1])
false_neg = int(matrix[1][0])
true_neg  = int(matrix[1][1])
print('True Positives : {}   False Positives: {}'.format(true_pos, false_pos))
print('False Negatives: {}   True Negatives:  {}'.format(false_neg, true_neg))
print('\nNote: True Positive: predict = A, actual = A; False Positive: predict = A, actual = B')
print('     False Negative: predict = B, actual = A; True Negative: predict = B, actual = B')
print('\nTest Accuracy: {: .1f}% '.format((true_pos+ true_neg)/ matrix.sum() * 100 ))

precision = round(true_pos / (true_pos + false_pos),2)               # precision = tp / (tp + fp) [best = 1; worst = 0]
recall = round(true_pos / (true_pos + false_neg),2)                  # recall = tp / (tp + fn) [best = 1]
f1 = round(2 * (precision * recall) / (precision + recall),2) # weighted value of precision and recall
print('Precision: {}, Recall: {}, f1-score: {}'.format( precision,recall,f1))

Confusion Matrix:

True Positives : 7   False Positives: 14
False Negatives: 6   True Negatives:  22

Note: True Positive: predict = A, actual = A; False Positive: predict = A, actual = B
     False Negative: predict = B, actual = A; True Negative: predict = B, actual = B

Test Accuracy:  59.2% 
Precision: 0.33, Recall: 0.54, f1-score: 0.41


### Print Report.  Optionally, save Report.  Optionally, save Model.

In [18]:
# Create Test Report
def array2str(yarray):
    """ input is a numpy array, output a comma separate string"""
    ystr = ''
    for iy in range(len(yarray)):   # convert the array to set of items seperated by commas so that it can be in db
        if ystr == '': ystr = str(yarray[iy])
        else: ystr = ystr + ', ' + str(yarray[iy]) 
    return ystr

# Set-up data for recording

runDate = dt.datetime.today().strftime("%m/%d/%Y %H:%M")
script = 'ML-Modeling-v4'
project = 'ai_affinity'
period1 = array2str(Years)   #array of years to string of comma sep years for database
period2 = array2str(Weeks)   #array of weeks to string of comma sep weeks for database
testsize =   round((len(X_test) / (len(X_test)+len(X_train)))*100,1)     # size of test dataset (as a % of all input)

print('Test Report (key fields):')
print('TestAcc, TrainAcc, Notes')
print( test_acc, train_acc, notes)



Test Report (key fields):
TestAcc, TrainAcc, Notes
59.2 48.5 Notes: ** Train / Test Split traditional at: 0.33 ** NN, # of layers:2 Nodes/layer:[12, 8]


In [19]:
#  Save test results to TestDB    

if saveRpt == 'Y':                           # this is set at beginning (key inputs section)
    # Insert into TestDB (as record in 'test' table)
    cur.execute("""INSERT OR IGNORE INTO test 
                (project,date,inCnt,testAcc,trainAcc,
                period1,period2,num_of_feat,features, testsize,alg,script,notes,
                truePos, falsePos, falseNeg, trueNeg,precision,recall,f1) 
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""" , 
                (project,runDate,inCnt,test_acc,train_acc,
                 period1,period2,Num_of_feat,features,testsize,alg,script,notes,
                 true_pos, false_pos, false_neg, true_neg, precision, recall, f1 ))
    print ('\nReport saved.')


conn.commit()                     # flush to database 
conn.close()                      # Close database
print('\nEnded')


Report saved.

Ended


In [20]:
#  Save model  naming convention: 'Model'-'Alg'-'FeatureSet#'-'Date&time'  

if save_model == 'Y':             # This is set at begining of script (key inputs section)
    print('Model type being saved:',  'type:', type(model))
    name = 'Model-' + alg + '-' + str(features) +'-' + str(dt.datetime.today().strftime("%Y%m%d-%H%M"))+ '.sav'  #name of file  
    joblib.dump(model,  name)
    print ('\nModel saved. Name:',  name)

Model type being saved: type: <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>

Model saved. Name: Model-NN-13-20191119-0813.sav
