# MATH 475 Final Project
## Predicting NFL QB All-Pros
## Authors: Brayan Mauricio-Gonzalez and Graham Swain

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# Load the quarterback data set from a CSV file given by a relative path.
df = pd.read_csv('nfl_qb.csv')

Looking at the first few rows of our data set to ensure there is nothing that immediately seems wrong.

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Keep only rows with at least 75 pass attempts.  The explicit copy makes
# `df` an independent frame, so the later column assignment and in-place drop
# act on real data rather than on a slice (avoids SettingWithCopyWarning).
df = df.loc[df['pass_att'] >= 75].copy()

Checking if any cells are N/A.

In [None]:
# Count the missing values in each column.
df.isna().sum()

Making a new column 'was_ap' that will be one if the player received 1st or 2nd team AP honors that season.
We are using a bitwise or operator to accomplish this.

In [None]:
# Element-wise OR of the two All-Pro flag columns.
# NOTE(review): assumes 'ap_1st' and 'ap_2nd' hold 0/1 (or boolean) values
# with no missing entries — confirm against the CSV.
df['was_ap'] = (df['ap_1st'] | df['ap_2nd'])

This formats all of the Seaborn plots in the manner that we want.

In [None]:
# Set the default figure size through matplotlib's 'figure.figsize' rc value.
sns.set(rc={'figure.figsize':(20,7.55)})
# Switch seaborn to its 'ticks' axes style.
sns.set_style('ticks')

This sorts the players who achieved AP honors to the bottom of the dataframe so they are plotted last and
therefore displayed on the top layer of the graphs.

In [None]:
# Ascending sort on 'was_ap': rows with the larger flag value come last.
graph_df = df.sort_values(by = 'was_ap')

In [None]:
# Stacked scatter plots of the 'g', 'gs', 'wins', 'loses' and 'ties' columns
# against 'year', with colour, size and marker all keyed on 'was_ap'.
fig, axs = plt.subplots(nrows=5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a five-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['g', 'gs', 'wins', 'loses', 'ties']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'pass_rk', 'pass_att', 'cmp', 'pass_yds',
# 'pass_td' and 'int' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=6)
fig.set_size_inches(15, 18)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a six-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['pass_rk', 'pass_att', 'cmp', 'pass_yds', 'pass_td', 'int']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'cmp%', 'td%' and 'int%' columns against
# 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad=-1.5)
# Hide the x ticks on the top two axes; the bottom axis keeps them.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['cmp%', 'td%', 'int%']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'pass_lng', 'pass_y/a', 'ay/a' and 'rate'
# columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a four-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['pass_lng', 'pass_y/a', 'ay/a', 'rate']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'y/c', 'ny/a' and 'any/a' columns against
# 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad=-1.5)
# Hide the x ticks on the top two axes; the bottom axis keeps them.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['y/c', 'ny/a', 'any/a']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'sk', 'sk%' and 'yards_lost_sack' columns
# against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad=-1.5)
# Hide the x ticks on the top two axes; the bottom axis keeps them.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['sk', 'sk%', 'yards_lost_sack']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the '4qc' and 'gwd' columns against 'year',
# styled by 'was_ap'.
fig, axs = plt.subplots(nrows=2)
fig.set_size_inches(15, 6)
fig.tight_layout(pad=-1.5)
# NOTE(review): this figure has only two axes, so hiding the x ticks on the
# "top two" leaves no year ticks at all — confirm that is intended.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['4qc', 'gwd']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the 'rush_rk', 'rush_att', 'rush_yds' and
# 'rush_td' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a four-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['rush_rk', 'rush_att', 'rush_yds', 'rush_td']):
    sns.scatterplot(data=graph_df, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Remove the position, rank, rushing and fumble columns from df in place.
unused_columns = [
    'pos', 'pass_rk', 'rush_rk', 'rush_att', 'rush_yds', 'rush_td',
    'rush_lng', 'rush_y/a', 'rush_y/g', 'fmb',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
def normalize_by_year(df):
    """Min-max scale the numeric columns of *df* separately for each year.

    Rows are grouped by the 'year' column, and every numeric (non-boolean)
    column other than 'year', 'age', 'ap_1st', 'ap_2nd' and 'was_ap' is
    rescaled to ``(value - group_min) / (group_max - group_min)``.

    A column that is constant within a year has a zero range; those cells are
    set to 0.0 instead of the NaN that a 0/0 division would produce.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'year' column.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with the same index, row order and columns, in which
        the selected columns hold the scaled values.

    Raises
    ------
    KeyError
        If *df* has no 'year' column.
    """
    untouched = {'year', 'age', 'ap_1st', 'ap_2nd', 'was_ap'}
    result = df.copy()

    # Select by dtype rather than by inspecting the first cell, so negative
    # integers and columns whose first value is unusual are handled as well.
    targets = [
        column for column in result.columns
        if column not in untouched
        and pd.api.types.is_numeric_dtype(result[column])
        and not pd.api.types.is_bool_dtype(result[column])
    ]
    if not targets:
        # Still require the grouping column, as the grouped version would.
        result['year']
        return result

    # transform() returns frames aligned with the original rows, so the index
    # and row order of the input are preserved.
    by_year = result.groupby('year')[targets]
    lowest = by_year.transform('min')
    span = by_year.transform('max') - lowest

    scaled = (result[targets] - lowest) / span
    # Zero range within a year: every value equals the minimum, so use 0.0.
    scaled = scaled.where(span != 0, 0.0)

    for column in targets:
        result[column] = scaled[column]
    return result

# Rescale the statistics within each year using normalize_by_year.
df_normalized = normalize_by_year(df)

In [None]:
# Ascending sort on 'was_ap': rows with the larger flag value come last.
graph_df_n = df_normalized.sort_values(by = 'was_ap')

In [None]:
# Stacked scatter plots of the year-normalized 'g', 'gs', 'wins', 'loses'
# and 'ties' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a five-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['g', 'gs', 'wins', 'loses', 'ties']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the year-normalized 'pass_att', 'cmp', 'pass_yds',
# 'pass_td' and 'int' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a five-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['pass_att', 'cmp', 'pass_yds', 'pass_td', 'int']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the year-normalized 'cmp%', 'td%' and 'int%'
# columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad=-1.5)
# Hide the x ticks on the top two axes; the bottom axis keeps them.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['cmp%', 'td%', 'int%']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the year-normalized 'pass_lng', 'pass_y/a',
# 'ay/a' and 'rate' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad=-1.5)
# NOTE(review): only the top two axes have their x ticks hidden — confirm
# that is intended for a four-row figure.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['pass_lng', 'pass_y/a', 'ay/a', 'rate']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the year-normalized 'sk', 'sk%' and
# 'yards_lost_sack' columns against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad=-1.5)
# Hide the x ticks on the top two axes; the bottom axis keeps them.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['sk', 'sk%', 'yards_lost_sack']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Stacked scatter plots of the year-normalized '4qc' and 'gwd' columns
# against 'year', styled by 'was_ap'.
fig, axs = plt.subplots(nrows=2)
fig.set_size_inches(15, 6)
fig.tight_layout(pad=-1.5)
# NOTE(review): this figure has only two axes, so hiding the x ticks on the
# "top two" leaves no year ticks at all — confirm that is intended.
for ax in axs[:2]:
    ax.set_xticks([])
# The plots differ only in the y column, so draw them in a loop.
for ax, stat in zip(axs, ['4qc', 'gwd']):
    sns.scatterplot(data=graph_df_n, x='year', y=stat, hue='was_ap', size='was_ap',
                    size_order=[1, 0], style='was_ap', style_order=[1, 0],
                    legend=False, ax=ax)

In [None]:
# Remove, in place, every column that is not used as a model input or as the
# 'was_ap' target.
non_feature_columns = [
    'year', 'tm', 'age', 'gs', 'wins', 'loses', 'ties', 'pass_lng', 'pass_att',
    'int', 'sk', 'sk%', 'yards_lost_sack', '4qc', 'gwd', 'ap_1st', 'ap_2nd',
    'player',
]
df_normalized.drop(columns=non_feature_columns, inplace=True)

# Decision Tree

In [None]:
# Features are every remaining column except the target; the target is 'was_ap'.
X = df_normalized.drop(columns=['was_ap'])
y = df_normalized['was_ap']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — consider whether the
# positive class is rare enough to warrant `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12
)

In [None]:
# Depth-limited decision tree.  scikit-learn permutes the features randomly
# at every split, so the seed is fixed (same value as the train/test split)
# to make the fitted tree reproducible between runs.
DTC = DecisionTreeClassifier(max_depth=4, random_state=12)
DTC.fit(X_train, y_train)

In [None]:
# Report DTC.score on the held-out test split as a percentage.
print(f"Our model's score against the testing data: {DTC.score(X_test, y_test)*100:4.2f}%")

In [None]:
# Plot the confusion matrix of DTC's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(DTC, X_test, y_test)

In [None]:
y_pred = DTC.predict(X_test)
# recall_score measures the share of actual All-Pros that the model flagged,
# so the label says "recall" rather than "accuracy".
print(f"Our model's recall on actual All-Pros: {recall_score(y_test, y_pred)*100:4.2f}%")

In [None]:
# Draw the fitted tree.  feature_names is passed as a plain list because
# scikit-learn 1.3's parameter validation rejects a pandas Index here.
plot_tree(DTC, feature_names=list(X.columns), class_names=['Was NOT AP', 'Was AP'])

In [None]:
# Attach the predicted labels to a copy of the test features, then plot two
# of the features coloured by that prediction.
plot_data = X_test.assign(was_ap=y_pred)
sns.scatterplot(data=plot_data, x="pass_td", y="any/a", hue="was_ap")

# Random Forest Classifier (Grid Search)

In [None]:
# Random forest tuned by grid search.  The forest is seeded (same value as
# the train/test split) so the selected hyperparameters and the fitted model
# are reproducible between runs.
RFC = RandomForestClassifier(random_state=12)
grid_params = {
    "n_estimators": range(1, 15, 3),
    "max_depth": range(1, 6),
}

RFC_GSCV = GridSearchCV(RFC, grid_params)

RFC_GSCV.fit(X_train, y_train)

# Display the hyperparameter combination chosen by the search.
RFC_GSCV.best_params_

In [None]:
# Take the estimator selected by the grid search and report its score on the
# training split as a percentage.
best_RFC = RFC_GSCV.best_estimator_
print(f"Our model's score against the training data: {best_RFC.score(X_train, y_train)*100:4.2f}%")

In [None]:
# Report best_RFC.score on the held-out test split as a percentage.
print(f"Our model's score against the testing data: {best_RFC.score(X_test, y_test)*100:4.2f}%")

In [None]:
# Plot the confusion matrix of best_RFC's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(best_RFC, X_test, y_test)

In [None]:
y_pred = best_RFC.predict(X_test)
# recall_score measures the share of actual All-Pros that the model flagged,
# so the label says "recall" rather than "accuracy".
print(f"Our model's recall on actual All-Pros: {recall_score(y_test, y_pred)*100:4.2f}%")

# Decision Tree (Grid Search)

In [None]:
# Decision tree tuned by grid search over max_depth.  The tree is seeded
# (same value as the train/test split) so the selected depth and the fitted
# model are reproducible between runs.
# NOTE(review): this rebinds DTC, replacing the tree fitted earlier in the
# notebook — re-running the earlier DTC cells after this one will fail.
DTC = DecisionTreeClassifier(random_state=12)
grid_params = {
    "max_depth": range(1, 6),
}

DTC_GSCV = GridSearchCV(DTC, grid_params)

DTC_GSCV.fit(X_train, y_train)

# Display the depth chosen by the search.
DTC_GSCV.best_params_

In [None]:
# Take the estimator selected by the grid search and report its score on the
# training split as a percentage.
best_DTC = DTC_GSCV.best_estimator_
print(f"Our model's score against the training data: {best_DTC.score(X_train, y_train)*100:4.2f}%")

In [None]:
# Report best_DTC.score on the held-out test split as a percentage.
print(f"Our model's score against the testing data: {best_DTC.score(X_test, y_test)*100:4.2f}%")

In [None]:
# Plot the confusion matrix of best_DTC's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(best_DTC, X_test, y_test)

In [None]:
y_pred = best_DTC.predict(X_test)
# recall_score measures the share of actual All-Pros that the model flagged,
# so the label says "recall" rather than "accuracy".
print(f"Our model's recall on actual All-Pros: {recall_score(y_test, y_pred)*100:4.2f}%")

# Stacking Classifier

In [None]:
# Base learners for the stack: a shallow seeded tree, the grid-searched
# random forest, and a logistic regression with a raised iteration cap.
my_estimators = [
    ('DTC', DecisionTreeClassifier(max_depth = 2, random_state = 12)),
    ('RFC', best_RFC),
    ('LR', LogisticRegression(max_iter = 1000))
]

# The grid-searched decision tree combines the base learners' outputs.
# NOTE(review): presumably StackingClassifier refits clones of best_RFC and
# best_DTC rather than reusing the already fitted objects — confirm.
SC = StackingClassifier(estimators = my_estimators, final_estimator = best_DTC)

SC.fit(X_train, y_train)

In [None]:
# Report SC.score on the held-out test split as a percentage.
print(f"Our model's score against the testing data: {SC.score(X_test, y_test)*100:4.2f}%")

In [None]:
# Plot the confusion matrix of SC's predictions on the test split.
ConfusionMatrixDisplay.from_estimator(SC, X_test, y_test)

In [None]:
y_pred = SC.predict(X_test)
# recall_score measures the share of actual All-Pros that the model flagged,
# so the label says "recall" rather than "accuracy".
print(f"Our model's recall on actual All-Pros: {recall_score(y_test, y_pred)*100:4.2f}%")

In [None]:
# Load the 2022 data and keep only the rows whose position is 'QB'.  The
# explicit copy makes df2022 an independent frame, so the later in-place
# dropna/drop calls and the 'year' assignment do not act on a slice
# (avoids SettingWithCopyWarning).
# NOTE(review): the training data was restricted to pass_att >= 75, but no
# attempts threshold is applied here — confirm whether one should be.
df2022 = pd.read_csv('2022_nfl_qb.csv')
df2022 = df2022.loc[df2022['Pos'] == 'QB'].copy()

In [None]:
# A notebook cell only displays its last expression, so the per-column
# missing-value counts are printed explicitly.
print(df2022.isna().sum())
# Show the rows that have no 'Y/C' value.
df2022.loc[df2022['Y/C'].isna()]

In [None]:
# Remove, in place, every row that has a missing value in any column.
df2022.dropna(inplace = True)

In [None]:
# List the training frame's columns for comparison with the 2022 data.
df_normalized.columns

In [None]:
# Remove, in place, the 2022 columns that are not used as model inputs.
unused_2022_columns = [
    'Rk', 'Player', 'Tm', 'Age', 'Pos', 'GS', 'QBrec', 'Att', 'Int', '1D', 'Lng',
    'QBR', 'Sk', 'Yds.1', 'Sk%', '4QC', 'GWD', 'Player-additional',
]
df2022.drop(columns=unused_2022_columns, inplace=True)

In [None]:
# List the columns that remain in the 2022 frame.
df2022.columns

In [None]:
# Rename the remaining 2022 columns positionally.
# NOTE(review): this relies on the CSV's column order lining up with this
# list of fourteen names — confirm against the df2022.columns output.
df2022.columns = ['g', 'cmp', 'cmp%', 'pass_yds', 'pass_td', 'td%', 'int%', 'pass_y/a', 'ay/a', 'y/c', 
                  'pass_y/g', 'rate', 'ny/a', 'any/a']

In [None]:
# normalize_by_year groups on 'year', so give every 2022 row that value,
# scale the statistics, then discard the helper column again.
df2022['year'] = 2022
df2022_normalized = normalize_by_year(df2022)
df2022_normalized.drop(columns=['year'], inplace=True)

In [None]:
# Predict the 'was_ap' label for every 2022 row with the stacking model.
# NOTE(review): assumes df2022_normalized has the same feature columns, in
# the same order, as the training frame X — confirm.
SC.predict(df2022_normalized)

In [None]:
# Predict the 'was_ap' label for every 2022 row with the grid-searched tree.
best_DTC.predict(df2022_normalized)

In [None]:
# Predict the 'was_ap' label for every 2022 row with the grid-searched forest.
best_RFC.predict(df2022_normalized)

In [None]:
# Display the normalized 2022 feature frame.
df2022_normalized

In [None]:
# Predict for the first 2022 row only (a one-row frame, selected by position).
first_row = df2022_normalized.iloc[:1]
best_DTC.predict(first_row)