In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import re

import hvplot.pandas
import holoviews as hv

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import RandomOverSampler

# Regression using Offense Statistics

In [2]:
# Read the CSV file from the output folder into a Pandas DataFrame
df_data = pd.read_csv("output/nebraska_offense.csv")

# Display the data
df_data.head()

Unnamed: 0,date,home_away,opponent,score,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
0,2000-09-02,home,San Jose State,W (49-13),5,13,38.5,91,1,60,...,0,28,4,31,0,3,3,W,49,13
1,2000-09-09,away,Notre Dame,W (27-24),7,15,46.7,103,0,59,...,1,20,6,44,0,1,1,W,27,24
2,2000-09-23,home,Iowa,W (42-13),10,13,76.9,159,5,51,...,0,23,5,55,1,0,1,W,42,13
3,2000-09-30,home,Missouri,W (42-24),11,23,47.8,173,2,53,...,4,27,7,53,0,0,0,W,42,24
4,2000-10-07,away,Iowa State,W (49-27),7,17,41.2,164,0,60,...,0,20,5,38,0,1,1,W,49,27


In [3]:
# Check the dataset to determine initial processing steps
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              287 non-null    object 
 1   home_away         287 non-null    object 
 2   opponent          287 non-null    object 
 3   score             287 non-null    object 
 4   passing_cmp       287 non-null    int64  
 5   passing_att       287 non-null    int64  
 6   passing_pct       287 non-null    float64
 7   passing_yds       287 non-null    int64  
 8   passing_td        287 non-null    int64  
 9   rushing_att       287 non-null    int64  
 10  rushing_yds       287 non-null    int64  
 11  rushing_avg       287 non-null    float64
 12  rushing_td        287 non-null    int64  
 13  total_plays       287 non-null    int64  
 14  total_yds         287 non-null    int64  
 15  total_avg         287 non-null    float64
 16  first_down_pass   287 non-null    int64  
 1

In [4]:
# Set the index to the date column
df_data = df_data.set_index('date')

# Encode the home_away column (home = 0, away = 1, N = 1)
df_data['home_away'] = df_data['home_away'].replace('home', 0)
df_data['home_away'] = df_data['home_away'].replace('away', 1)
df_data['home_away'] = df_data['home_away'].replace('N', 1)

# Encode the outcome column (win = 0, loss = 1)
df_data['outcome'] = df_data['outcome'].replace('W', 0)
df_data['outcome'] = df_data['outcome'].replace('L', 1)

# Drop the unnecessary columns and set the team column to the index
columns_to_drop = ['opponent', 'score']
df_data = df_data.drop(columns_to_drop, axis=1)

#Preview the dataframe
df_data.head()

Unnamed: 0_level_0,home_away,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,rushing_avg,rushing_td,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,0,5,13,38.5,91,1,60,505,8.4,6,...,0,28,4,31,0,3,3,0,49,13
2000-09-09,1,7,15,46.7,103,0,59,274,4.6,4,...,1,20,6,44,0,1,1,0,27,24
2000-09-23,0,10,13,76.9,159,5,51,331,6.5,0,...,0,23,5,55,1,0,1,0,42,13
2000-09-30,0,11,23,47.8,173,2,53,311,5.9,2,...,4,27,7,53,0,0,0,0,42,24
2000-10-07,1,7,17,41.2,164,0,60,336,5.6,6,...,0,20,5,38,0,1,1,0,49,27


In [5]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data
data_scaled = StandardScaler().fit_transform(df_data)

# Creating a DataFrame with with the scaled data
df_scaled = pd.DataFrame(data_scaled, columns=list(df_data.columns))

# Set the index to the team value
df_scaled['date'] = df_data.index
df_scaled = df_scaled.set_index('date')

# Preview the DataFrame
df_scaled.head()

Unnamed: 0_level_0,home_away,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,rushing_avg,rushing_td,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,-0.897235,-1.54662,-1.386554,-1.483401,-1.221747,-0.369818,1.532764,2.917762,2.240868,2.365558,...,-1.156017,1.246407,-0.902913,-1.000016,-0.894651,1.909233,0.807484,-0.841625,1.383476,-0.746078
2000-09-09,1.114535,-1.261716,-1.195557,-0.831127,-1.092929,-1.180028,1.444055,0.699274,0.0322,1.154291,...,-0.346805,-0.114387,-0.150704,-0.471387,-0.894651,0.026244,-0.64094,-0.841625,-0.213757,-0.016184
2000-09-23,-0.897235,-0.834361,-1.386554,1.571151,-0.491781,2.871026,0.734391,1.246693,1.136534,-1.268243,...,-1.156017,0.39591,-0.526808,-0.024087,0.085367,-0.915251,-0.64094,-0.841625,0.875265,-0.746078
2000-09-30,-0.897235,-0.691909,-0.431572,-0.743627,-0.341494,0.440393,0.911807,1.054616,0.787797,-0.056976,...,2.080831,1.076307,0.225401,-0.105414,-0.894651,-0.915251,-1.365152,-0.841625,0.875265,-0.016184
2000-10-07,1.114535,-1.261716,-1.004561,-1.268628,-0.438107,-1.180028,1.532764,1.294712,0.613429,2.365558,...,-1.156017,-0.114387,-0.526808,-0.71537,-0.894651,0.026244,-0.64094,-0.841625,1.383476,0.182878


In [6]:
# Separate the y variable, the labels
y = df_data['outcome']

# Separate the X variable, the features
X = df_data.drop('outcome', axis=1)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Prediction,Actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-04,0,0
2019-09-21,0,0
2014-10-04,1,1
2007-09-22,0,0
2022-11-25,0,0
...,...,...
2009-10-17,1,1
2014-11-15,1,1
2007-09-29,0,0
2002-11-16,1,1


In [7]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

1.0

In [8]:
# Display the confusiong matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[40,  0],
       [ 0, 32]], dtype=int64)

In [9]:
# Final Report
target_names = ["win", "loss"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

         win       1.00      1.00      1.00        40
        loss       1.00      1.00      1.00        32

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72



### Oversample the data

In [10]:
# Instantiate the random oversampler model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the RandomOverSampler model
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
classifier.fit(X_train_resampled, y_train_resampled)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Prediction,Actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-04,0,0
2019-09-21,0,0
2014-10-04,1,1
2007-09-22,0,0
2022-11-25,0,0
...,...,...
2009-10-17,1,1
2014-11-15,1,1
2007-09-29,0,0
2002-11-16,1,1


In [11]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

# Display the confusiong matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[39,  1],
       [ 0, 32]], dtype=int64)

In [12]:
# Print the classification report for the model
target_names = ["wins", "losses"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

        wins       1.00      0.97      0.99        40
      losses       0.97      1.00      0.98        32

    accuracy                           0.99        72
   macro avg       0.98      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72



### Drop points against to see if that impacts the model

In [13]:
# Separate the y variable, the labels
y = df_data['outcome']

# Separate the X variable, the features
columns_to_drop = ['outcome', 'points_against']
X = df_data.drop(columns_to_drop, axis=1)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Prediction,Actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-04,0,0
2019-09-21,0,0
2014-10-04,1,1
2007-09-22,0,0
2022-11-25,0,0
...,...,...
2009-10-17,1,1
2014-11-15,1,1
2007-09-29,0,0
2002-11-16,1,1


In [14]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.7638888888888888

In [15]:
# Display the confusiong matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[32,  8],
       [ 9, 23]], dtype=int64)

In [16]:
# Final Report
target_names = ["win", "loss"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

         win       0.78      0.80      0.79        40
        loss       0.74      0.72      0.73        32

    accuracy                           0.76        72
   macro avg       0.76      0.76      0.76        72
weighted avg       0.76      0.76      0.76        72



# Regression using Defense Statistics

In [17]:
# Read the CSV file from the output folder into a Pandas DataFrame
df_data = pd.read_csv("output/nebraska_defense.csv")

# Display the data
df_data.head()

Unnamed: 0,date,home_away,opponent,score,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
0,2000-09-02,home,San Jose State,W (49-13),14,37,37.8,153,2,29,...,2,14,12,65,0,1,1,W,49,13
1,2000-09-09,away,Notre Dame,W (27-24),3,15,20.0,40,0,40,...,2,11,8,67,0,1,1,W,27,24
2,2000-09-23,home,Iowa,W (42-13),19,40,47.5,252,1,31,...,2,18,7,57,0,2,2,W,42,13
3,2000-09-30,home,Missouri,W (42-24),19,39,48.7,283,1,31,...,1,22,10,95,1,1,2,W,42,24
4,2000-10-07,away,Iowa State,W (49-27),22,43,51.2,346,2,25,...,2,18,8,71,0,2,2,W,49,27


In [18]:
# Set the index to the date column
df_data = df_data.set_index('date')

# Encode the home_away column (home = 0, away = 1, N = 1)
df_data['home_away'] = df_data['home_away'].replace('home', 0)
df_data['home_away'] = df_data['home_away'].replace('away', 1)
df_data['home_away'] = df_data['home_away'].replace('N', 1)

# Encode the outcome column (win = 0, loss = 1)
df_data['outcome'] = df_data['outcome'].replace('W', 0)
df_data['outcome'] = df_data['outcome'].replace('L', 1)

# Drop the unnecessary columns and set the team column to the index
columns_to_drop = ['opponent', 'score']
df_data = df_data.drop(columns_to_drop, axis=1)

#Preview the dataframe
df_data.head()

Unnamed: 0_level_0,home_away,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,rushing_avg,rushing_td,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,0,14,37,37.8,153,2,29,193,6.7,0,...,2,14,12,65,0,1,1,0,49,13
2000-09-09,1,3,15,20.0,40,0,40,184,4.6,1,...,2,11,8,67,0,1,1,0,27,24
2000-09-23,0,19,40,47.5,252,1,31,47,1.5,0,...,2,18,7,57,0,2,2,0,42,13
2000-09-30,0,19,39,48.7,283,1,31,209,6.7,2,...,1,22,10,95,1,1,2,0,42,24
2000-10-07,1,22,43,51.2,346,2,25,37,1.5,1,...,2,18,8,71,0,2,2,0,49,27


In [19]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data
data_scaled = StandardScaler().fit_transform(df_data)

# Creating a DataFrame with with the scaled data
df_scaled = pd.DataFrame(data_scaled, columns=list(df_data.columns))

# Set the index to the team value
df_scaled['date'] = df_data.index
df_scaled = df_scaled.set_index('date')

# Preview the DataFrame
df_scaled.head()

Unnamed: 0_level_0,home_away,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,rushing_avg,rushing_td,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,-0.897235,-0.496158,0.403824,-1.240222,-0.635032,0.670756,-0.806753,0.441075,1.497189,-0.998071,...,0.24514,-0.855689,2.17639,0.584816,-0.709041,-0.065325,-0.462723,-0.841625,1.383476,-0.746078
2000-09-09,1.114535,-1.975262,-1.649111,-2.641978,-1.866995,-1.003218,0.330772,0.346629,0.379582,-0.374005,...,0.24514,-1.364142,0.747045,0.663101,-0.709041,-0.065325,-0.462723,-0.841625,-0.213757,-0.016184
2000-09-23,-0.897235,0.176162,0.68377,-0.476344,0.444298,-0.166231,-0.59993,-1.091042,-1.270219,-0.998071,...,0.24514,-0.177752,0.389708,0.271678,-0.709041,0.872083,0.283353,-0.841625,0.875265,-0.746078
2000-09-30,-0.897235,0.176162,0.590455,-0.381843,0.78227,-0.166231,-0.59993,0.608978,1.497189,0.250061,...,-0.527993,0.500185,1.461717,1.759086,0.5789,-0.065325,0.283353,-0.841625,0.875265,-0.016184
2000-10-07,1.114535,0.579554,0.963715,-0.184967,1.469117,0.670756,-1.220399,-1.195981,-1.270219,-0.374005,...,0.24514,-0.177752,0.747045,0.81967,-0.709041,0.872083,0.283353,-0.841625,1.383476,0.182878


In [20]:
# Separate the y variable, the labels
y = df_data['outcome']

# Separate the X variable, the features
X = df_data.drop('outcome', axis=1)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Prediction,Actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-04,0,0
2019-09-21,0,0
2014-10-04,1,1
2007-09-22,0,0
2022-11-25,0,0
...,...,...
2009-10-17,1,1
2014-11-15,1,1
2007-09-29,0,0
2002-11-16,1,1


In [21]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9722222222222222

In [22]:
# Display the confusiong matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[39,  1],
       [ 1, 31]], dtype=int64)

In [23]:
# Print the classification report for the model
target_names = ["wins", "losses"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

        wins       0.97      0.97      0.97        40
      losses       0.97      0.97      0.97        32

    accuracy                           0.97        72
   macro avg       0.97      0.97      0.97        72
weighted avg       0.97      0.97      0.97        72



### Drop points for to see if that impacts the model

In [24]:
# Separate the y variable, the labels
y = df_data['outcome']

# Separate the X variable, the features
columns_to_drop = ['outcome', 'points_against']
X = df_data.drop(columns_to_drop, axis=1)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Prediction,Actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-04,0,0
2019-09-21,0,0
2014-10-04,1,1
2007-09-22,1,0
2022-11-25,0,0
...,...,...
2009-10-17,1,1
2014-11-15,1,1
2007-09-29,0,0
2002-11-16,1,1


In [25]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.875

In [26]:
# Display the confusiong matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[33,  7],
       [ 2, 30]], dtype=int64)

In [27]:
# Print the classification report for the model
target_names = ["wins", "losses"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

        wins       0.94      0.82      0.88        40
      losses       0.81      0.94      0.87        32

    accuracy                           0.88        72
   macro avg       0.88      0.88      0.87        72
weighted avg       0.88      0.88      0.88        72

