In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from IPython.display import display
import matplotlib.pyplot as plt

In [2]:
# READING TESTING AND TRAINING DATA
data = pd.read_csv('nflData.csv')

In [3]:
# NUMBER OF ROWS TO PROCESS
totalPlays = data.shape[0]
print ("Total Rows: {}".format(totalPlays))

Total Rows: 172568


In [4]:
# SPLITTING THE DEPENDENT VARIABLES AND INDEPENDENT VARIABLE
iVar = data.drop(columns=['game_date','home_win','total_home_score','total_away_score'])
dVar = data['home_win']

In [5]:
# STANDARDISING THE DATA
# converting categorical data into dummy variables

standard = pd.DataFrame(index = iVar.index)
for c, cData in iVar.iteritems():
    if cData.dtype == object:
        cData = pd.get_dummies(cData, prefix = c)
    standard = standard.join(cData)
iVar = standard

In [6]:
# SPLITTING TRAINING DATA AND TESTING DATA
import sklearn.model_selection as model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iVar, dVar, train_size = 132095, shuffle = False, random_state = 40)
print ("X_train: ", X_train)
print ("y_train: ", y_train)
print ("X_test: ", X_test)
print ("y_test: ", y_test)

X_train:          home_team_ARI  home_team_ATL  home_team_BAL  home_team_BUF  \
0                   0              0              0              0   
1                   0              0              0              0   
2                   0              0              0              0   
3                   0              0              0              0   
4                   0              0              0              0   
...               ...            ...            ...            ...   
132090              0              0              1              0   
132091              0              0              1              0   
132092              0              0              1              0   
132093              0              0              1              0   
132094              0              0              1              0   

        home_team_CAR  home_team_CHI  home_team_CIN  home_team_CLE  \
0                   0              0              0              0   
1        

In [7]:
# TRAINING AND EVALUATING MODELS
from time import time
# f score measure models accuracy from 0(worst) to 1(best)
# precision is the number of correct +ve results divided by the number of all +ve results
# recall is the number of correct +ve results divide by the number of all +ve results
from sklearn.metrics import f1_score

# fits a classifier to the training data
def trainClassifier(clf, X_train, y_train):
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    print ("Trained model in {:.3f} seconds".format(end-start))

# predicts a label using a classifer based on f1 score
def predictLabel(clf, independent, dependent):
    start = time()
    y_pred = clf.predict(independent)
    end = time()
    
    print ("Predicted label in {:.3f} seconds".format(end-start))
    return f1_score(dependent, y_pred, pos_label=1), sum(dependent == y_pred)/float(len(y_pred))

# amalgamated previous two subroutines to train and predict using classifier based on f1 score
def trainPredict(clf, X_train, y_train, X_test, y_test):
    print("Training dataset size: {}".format(len(X_train)))
    trainClassifier(clf, X_train, y_train)
    
    f1, accuracy = predictLabel(clf, X_train, y_train)
    print ("TRAINING SET:")
    print ("F1 score: {:.3f}\nAccuracy: {:.3f}".format(f1, accuracy))
    
    f1, accuracy = predictLabel(clf, X_test, y_test)
    print ("TESTING SET:")
    print ("F1 score: {:.3f}\nAccuracy: {:.3f}".format(f1, accuracy))

In [10]:
# EXECUTING THE CLASSIFIERS
lr = LogisticRegression(random_state = 40, solver = 'saga', max_iter = 5000)
sv = SVC(random_state = 90, kernel = 'rbf', gamma='scale')

print ("Modelling with Logistic Regression:")
trainPredict(lr, X_train, y_train, X_test, y_test)
print(" ")

print ("Modelling with Support Vector Machine:")
trainPredict(sv, X_train, y_train, X_test, y_test)
print(" ")

Modelling with Logistic Regression:
Training dataset size: 132095
Trained model in 183.922 seconds
Predicted label in 0.052 seconds
TRAINING SET:
F1 score: 0.912
Accuracy: 0.916
Predicted label in 0.029 seconds
TESTING SET:
F1 score: 0.917
Accuracy: 0.917
 
Modelling with Support Vector Machine:
Training dataset size: 132095
Trained model in 670.320 seconds
Predicted label in 804.838 seconds
TRAINING SET:
F1 score: 0.913
Accuracy: 0.918
Predicted label in 266.369 seconds
TESTING SET:
F1 score: 0.915
Accuracy: 0.916
 
