In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the per-game offense CSV into a DataFrame, using the 'date' column as the index
csv_path = "output/nebraska_offense.csv"
df_raw = pd.read_csv(filepath_or_buffer=csv_path, index_col='date')

# Show the first five rows as a sanity check
df_raw.head()

Unnamed: 0_level_0,home_away,opponent,score,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,home,San Jose State,W (49-13),5,13,38.5,91,1,60,505,...,0,28,4,31,0,3,3,W,49,13
2000-09-09,away,Notre Dame,W (27-24),7,15,46.7,103,0,59,274,...,1,20,6,44,0,1,1,W,27,24
2000-09-23,home,Iowa,W (42-13),10,13,76.9,159,5,51,331,...,0,23,5,55,1,0,1,W,42,13
2000-09-30,home,Missouri,W (42-24),11,23,47.8,173,2,53,311,...,4,27,7,53,0,0,0,W,42,24
2000-10-07,away,Iowa State,W (49-27),7,17,41.2,164,0,60,336,...,0,20,5,38,0,1,1,W,49,27


### Preprocessing

In [3]:
# Print the column inventory (name, non-null count and dtype per column) to review
# before preprocessing
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 287 entries, 2000-09-02 to 2022-11-25
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   home_away         287 non-null    object 
 1   opponent          287 non-null    object 
 2   score             287 non-null    object 
 3   passing_cmp       287 non-null    int64  
 4   passing_att       287 non-null    int64  
 5   passing_pct       287 non-null    float64
 6   passing_yds       287 non-null    int64  
 7   passing_td        287 non-null    int64  
 8   rushing_att       287 non-null    int64  
 9   rushing_yds       287 non-null    int64  
 10  rushing_avg       287 non-null    float64
 11  rushing_td        287 non-null    int64  
 12  total_plays       287 non-null    int64  
 13  total_yds         287 non-null    int64  
 14  total_avg         287 non-null    float64
 15  first_down_pass   287 non-null    int64  
 16  first_down_rush   287 non-null   

In [4]:
# Drop the free-text 'score' column (e.g. "W (49-13)"); the frame also carries
# separate 'outcome', 'points_for' and 'points_against' columns.
# errors='ignore' keeps this cell idempotent: because it reassigns df_raw, a second
# run would otherwise raise KeyError since 'score' is already gone.
df_raw = df_raw.drop(columns='score', errors='ignore')
df_raw.head()

Unnamed: 0_level_0,home_away,opponent,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,rushing_att,rushing_yds,rushing_avg,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,outcome,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,home,San Jose State,5,13,38.5,91,1,60,505,8.4,...,0,28,4,31,0,3,3,W,49,13
2000-09-09,away,Notre Dame,7,15,46.7,103,0,59,274,4.6,...,1,20,6,44,0,1,1,W,27,24
2000-09-23,home,Iowa,10,13,76.9,159,5,51,331,6.5,...,0,23,5,55,1,0,1,W,42,13
2000-09-30,home,Missouri,11,23,47.8,173,2,53,311,5.9,...,4,27,7,53,0,0,0,W,42,24
2000-10-07,away,Iowa State,7,17,41.2,164,0,60,336,5.6,...,0,20,5,38,0,1,1,W,49,27


In [5]:
# One-hot encode the two categorical columns (home/away site and game outcome).
# Selecting just these columns replaces the earlier approach of encoding the whole
# frame and then dropping a hard-coded list of every other column with inplace=True;
# that list had to mirror the CSV schema by hand, and any column missing from it
# would have leaked into the clustering input.
categorical_columns = ['home_away', 'outcome']

# dtype=int keeps the 0/1 integer encoding regardless of pandas version
# (pandas >= 2.0 would otherwise produce boolean dummy columns).
df_dummies = pd.get_dummies(
    df_raw[categorical_columns],
    columns=categorical_columns,
    dtype=int,
)

# Move the 'date' index into a regular column so the frame has a plain positional
# index and can be concatenated column-wise with the scaled features later.
df_dummies = df_dummies.reset_index()

df_dummies.head()

Unnamed: 0,date,home_away_N,home_away_away,home_away_home,outcome_L,outcome_W
0,2000-09-02,0,0,1,0,1
1,2000-09-09,0,1,0,0,1
2,2000-09-23,0,0,1,0,1
3,2000-09-30,0,0,1,0,1
4,2000-10-07,0,1,0,0,1


In [6]:
# Numeric per-game statistics to standardize with scikit-learn's StandardScaler
columns_to_scale = [
    'passing_cmp', 'passing_att', 'passing_pct', 'passing_yds', 'passing_td',
    'rushing_att', 'rushing_yds', 'rushing_avg', 'rushing_td',
    'total_plays', 'total_yds', 'total_avg',
    'first_down_pass', 'first_down_rush', 'first_down_pen', 'first_down_total',
    'penalties', 'penalty_yds',
    'fumbles', 'interceptions', 'turnovers',
    'points_for', 'points_against',
]
data_scaled = StandardScaler().fit_transform(df_raw[columns_to_scale])

# Wrap the scaled array in a DataFrame, place it to the right of the dummy columns
# (both frames share a positional index), then restore 'date' as the index.
df_scaled = (
    pd.concat(
        [df_dummies, pd.DataFrame(data_scaled, columns=columns_to_scale)],
        axis=1,
    )
    .assign(date=df_raw.index)
    .set_index('date')
)

# Preview the combined frame
df_scaled

Unnamed: 0_level_0,home_away_N,home_away_away,home_away_home,outcome_L,outcome_W,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,...,first_down_rush,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,points_for,points_against
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,0,0,1,0,1,-1.546620,-1.386554,-1.483401,-1.221747,-0.369818,...,2.773414,-1.156017,1.246407,-0.902913,-1.000016,-0.894651,1.909233,0.807484,1.383476,-0.746078
2000-09-09,0,1,0,0,1,-1.261716,-1.195557,-0.831127,-1.092929,-1.180028,...,0.775498,-0.346805,-0.114387,-0.150704,-0.471387,-0.894651,0.026244,-0.640940,-0.213757,-0.016184
2000-09-23,0,0,1,0,1,-0.834361,-1.386554,1.571151,-0.491781,2.871026,...,1.374873,-1.156017,0.395910,-0.526808,-0.024087,0.085367,-0.915251,-0.640940,0.875265,-0.746078
2000-09-30,0,0,1,0,1,-0.691909,-0.431572,-0.743627,-0.341494,0.440393,...,0.375915,2.080831,1.076307,0.225401,-0.105414,-0.894651,-0.915251,-1.365152,0.875265,-0.016184
2000-10-07,0,1,0,0,1,-1.261716,-1.004561,-1.268628,-0.438107,-1.180028,...,0.575706,-1.156017,-0.114387,-0.526808,-0.715370,-0.894651,0.026244,-0.640940,1.383476,0.182878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-29,0,0,1,1,0,-0.691909,-0.336074,-0.902718,-0.180472,-0.369818,...,-1.422210,-0.346805,-1.985479,-1.279017,-1.040679,0.085367,1.909233,1.531696,-1.520584,0.116524
2022-11-05,0,0,1,1,0,-0.691909,-0.240576,-1.045900,-0.899703,-1.180028,...,-0.423252,-1.156017,-1.305082,-1.655122,-1.853953,-0.894651,0.026244,-0.640940,-1.230178,-0.281600
2022-11-12,0,1,0,1,0,-0.834361,-0.718066,-0.568627,-1.436442,-1.180028,...,-1.422210,-0.346805,-2.155578,-0.902913,-1.040679,-0.894651,-0.915251,-1.365152,-1.956193,0.647356
2022-11-19,0,0,1,1,0,-0.549457,-0.718066,0.226830,-1.060725,0.440393,...,-1.222419,0.462407,-1.475181,-1.655122,-1.650635,-0.894651,-0.915251,-1.365152,-1.157576,-0.613370


### Determine k value

In [8]:
# Candidate cluster counts to evaluate
k = list(range(1, 11))

# Inertia (within-cluster sum of squares) recorded for each candidate k
inertia = []

# Fit one K-means model per candidate k on df_scaled and record its inertia.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it unset makes the curve depend on the installed version.
# NOTE(review): 10 is the pre-1.4 default; confirm it matches the version this
# notebook was originally run with.
for i in k:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=1)
    k_model.fit(df_scaled)
    inertia.append(k_model.inertia_)

# Collect k and inertia side by side for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow_data = pd.DataFrame(elbow_data)

# Plot inertia against k; the "elbow" of this curve guides the choice of k
df_elbow_data.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)



Using the elbow curve above, k = 3 is selected for the analysis.

### Build the Model

In [9]:
# Initialize the K-Means model with the chosen number of clusters.
# n_init is pinned explicitly because scikit-learn changed its default from 10 to
# 'auto' in version 1.4; without it the cluster assignments depend on the installed version.
# NOTE(review): 10 is the pre-1.4 default; confirm it matches the original run.
model = KMeans(n_clusters=3, n_init=10, random_state=1)

# Fit the K-Means model using the scaled game data
model.fit(df_scaled)

# Assign each game to a cluster.
# Stored under its own name rather than `k`, which the elbow-curve cell uses for
# the list of candidate cluster counts.
cluster_labels = model.predict(df_scaled)

# Copy the scaled frame and attach the cluster label as a 'group' column
df_predictions = df_scaled.copy()
df_predictions['group'] = cluster_labels

# Preview the games with their assigned cluster
df_predictions.head()



Unnamed: 0_level_0,home_away_N,home_away_away,home_away_home,outcome_L,outcome_W,passing_cmp,passing_att,passing_pct,passing_yds,passing_td,...,first_down_pen,first_down_total,penalties,penalty_yds,fumbles,interceptions,turnovers,points_for,points_against,group
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-02,0,0,1,0,1,-1.54662,-1.386554,-1.483401,-1.221747,-0.369818,...,-1.156017,1.246407,-0.902913,-1.000016,-0.894651,1.909233,0.807484,1.383476,-0.746078,0
2000-09-09,0,1,0,0,1,-1.261716,-1.195557,-0.831127,-1.092929,-1.180028,...,-0.346805,-0.114387,-0.150704,-0.471387,-0.894651,0.026244,-0.64094,-0.213757,-0.016184,1
2000-09-23,0,0,1,0,1,-0.834361,-1.386554,1.571151,-0.491781,2.871026,...,-1.156017,0.39591,-0.526808,-0.024087,0.085367,-0.915251,-0.64094,0.875265,-0.746078,0
2000-09-30,0,0,1,0,1,-0.691909,-0.431572,-0.743627,-0.341494,0.440393,...,2.080831,1.076307,0.225401,-0.105414,-0.894651,-0.915251,-1.365152,0.875265,-0.016184,0
2000-10-07,0,1,0,0,1,-1.261716,-1.004561,-1.268628,-0.438107,-1.180028,...,-1.156017,-0.114387,-0.526808,-0.71537,-0.894651,0.026244,-0.64094,1.383476,0.182878,0


In [14]:
# Scatter of points scored vs. points allowed, colored by K-means cluster.
# Both axes are the StandardScaler-transformed values carried in df_predictions
# (z-scores, not raw points), so the axis labels say so explicitly.
df_predictions.hvplot.scatter(
    x="points_for",
    y="points_against",
    by="group",
    hover_cols="date",
    xlabel="Points for (standardized)",
    ylabel="Points against (standardized)",
    title="Game clusters: points for vs. points against",
).opts(yformatter="%.0f")  # y tick labels rendered without decimals

In [15]:
# To Do

In [None]:
# PCA Stuff