# Notebook for Creation of Binary Dataset

This notebook creates our binary dataset and defines the thresholds used to build it. It works from `NFL_Merged_Data.csv`, which contains the data currently in use.

In [1]:
import pandas as pd
import numpy as np

## Threshold Creation

In [2]:
# Load the merged NFL data; the threshold and binarization cells below read from this frame.
stats_playoffs = pd.read_csv(filepath_or_buffer="Data/NFL_Merged_Data.csv")

The thresholds and their meanings are explained in more detail in the README, under **"Methodologies for Establishing Thresholds"**.

In [3]:
# ---------------------------------------------------------------------------
# Team-level cut-offs
# ---------------------------------------------------------------------------
# Strength of schedule (authors' note: "easy strength of schedule").
sos_metric = 0.449
# SRS above 0 and Offensive SRS above 2 count as above average.
above_avg_srs, above_avg_osrs = 0, 2
# Defensive SRS is compared against the median of the loaded data.
above_avg_dsrs = stats_playoffs["DSRS"].median()
# Cut-offs for average points allowed / average points scored.
Avg_Points_Allowed_th, Avg_Points_Scored_th = 21, 24
# Margin of victory: 7 points or more is considered above average.
mov_threshold = 7
# Points differential: more than 50 marks a strong team.
pd_threshold = 50
# Win percentage: above 59% is considered good.
wp_threshold = 0.59

# ---------------------------------------------------------------------------
# Offense cut-offs
# ---------------------------------------------------------------------------
# Completion rate of at least 55% (the binarization cell uses `>=` here).
completion_threshold_o = 0.55
# Average yards gained per pass play / per rush play.
ave_passing_o, ave_rushing_o = 6.6, 4.5
# Average EPA on pass / run plays.
epa_pass_o, epa_run_o = 0.15, 0
# Success rate on pass / run plays.
success_rate_pass_o, success_rate_run_o = 0.46, 0.4
# Average WPA on pass / rush plays.
wpa_pass_o, wpa_rush_o = 0.002, 0
# Combined offensive success rate.
# NOTE(review): 0.86 looks like success_rate_pass_o + success_rate_run_o
# (0.46 + 0.40), i.e. this is presumably their mean -- TODO confirm.
avg_succ_rate = 0.86 / 2

# ---------------------------------------------------------------------------
# Defense cut-offs
# ---------------------------------------------------------------------------
# Completion percentage measured on the defensive side.
completion_d = 0.55
# Average yards gained against the defense on pass / rush plays.
pass_gain_def, rush_gain_def = 5.9, 4.5
# Success rate against the defense on pass / run plays.
succ_rate_pass_def, succ_rate_rush_def = 0.43, 0.4
# Average EPA against the defense on pass / run plays.
epa_pass_d, epa_run_d = -0.02, -0.06
# More than 16 interceptions.
interception_d = 16
# Average WPA against the defense on pass / rush plays.
wpa_pass_d, wpa_rush_d = 0, -0.0018

In [4]:
# Start the binary dataset from an independent (deep) copy so the raw frame is never modified.
binary_dataset = stats_playoffs.copy(deep=True)

In [5]:
# Replace each metric column in `binary_dataset` with a 0/1 flag: 1 when the
# raw value clears its threshold, 0 otherwise.
#
# Raw values are always read from `stats_playoffs` (the untouched source
# frame), never from `binary_dataset` itself. Reading back the already
# binarized column would make this cell non-idempotent: re-running it would
# compare 0/1 flags against the thresholds (e.g. `1 >= 7` is False, and
# `0 > -0.02` is True) and silently change the dataset.
#
# Each rule is (column, threshold, inclusive):
#   inclusive=True  -> flag is 1 when value >= threshold
#   inclusive=False -> flag is 1 when value >  threshold
binarization_rules = [
    # offense: completion and yards gained
    ("offense_completion_percentage", completion_threshold_o, True),
    ("offense_ave_yards_gained_pass", ave_passing_o, False),
    ("offense_ave_yards_gained_run", ave_rushing_o, False),
    # defense: completion, yards gained, success rate
    ("defense_completion_percentage", completion_d, False),
    ("defense_ave_yards_gained_pass", pass_gain_def, False),
    ("defense_ave_yards_gained_run", rush_gain_def, False),
    ("defense_success_rate_pass", succ_rate_pass_def, False),
    ("defense_success_rate_run", succ_rate_rush_def, False),
    # EPA
    ("offense_ave_epa_pass", epa_pass_o, False),
    ("offense_ave_epa_run", epa_run_o, False),
    ("defense_ave_epa_pass", epa_pass_d, False),
    ("defense_ave_epa_run", epa_run_d, False),
    # offense success rate, interceptions
    ("offense_success_rate_pass", success_rate_pass_o, False),
    ("offense_success_rate_run", success_rate_run_o, False),
    ("defense_n_interceptions", interception_d, False),
    # WPA
    ("offense_ave_wpa_pass", wpa_pass_o, False),
    ("offense_ave_wpa_run", wpa_rush_o, False),
    ("defense_ave_wpa_pass", wpa_pass_d, False),
    ("defense_ave_wpa_run", wpa_rush_d, False),
    # team-level results
    ("MoV", mov_threshold, True),
    ("PD", pd_threshold, False),
    ("W-L%", wp_threshold, False),
    # schedule strength and SRS family
    ("SoS", sos_metric, False),
    ("SRS", above_avg_srs, False),
    ("OSRS", above_avg_osrs, False),
    ("DSRS", above_avg_dsrs, False),
    # calculated values
    ("Avg_Points_Allowed", Avg_Points_Allowed_th, True),
    ("Avg_Points_Scored", Avg_Points_Scored_th, True),
    ("Avg_offense_success_rate", avg_succ_rate, True),
]

for rule_column, rule_threshold, rule_inclusive in binarization_rules:
    raw_values = stats_playoffs[rule_column]
    if rule_inclusive:
        clears_threshold = raw_values >= rule_threshold
    else:
        clears_threshold = raw_values > rule_threshold
    # NaN compares as False, so missing raw values are flagged 0.
    binary_dataset[rule_column] = np.where(clears_threshold, 1, 0)

In [6]:
# Columns kept in the exported binary dataset, in output order.
# "Playoffs" and "Strong_Start" are not thresholded in this notebook --
# presumably they are already 0/1 in the merged data; TODO confirm.
binary_dataset_columns = [
    "Playoffs",
    # offense / defense: completion and yards gained
    "offense_completion_percentage",
    "offense_ave_yards_gained_pass",
    "offense_ave_yards_gained_run",
    "defense_completion_percentage",
    "defense_ave_yards_gained_pass",
    "defense_ave_yards_gained_run",
    # defense success rate
    "defense_success_rate_pass",
    "defense_success_rate_run",
    # EPA
    "offense_ave_epa_pass",
    "offense_ave_epa_run",
    "defense_ave_epa_pass",
    "defense_ave_epa_run",
    # offense success rate, interceptions
    "offense_success_rate_pass",
    "offense_success_rate_run",
    "defense_n_interceptions",
    # WPA
    "offense_ave_wpa_pass",
    "offense_ave_wpa_run",
    "defense_ave_wpa_pass",
    "defense_ave_wpa_run",
    # team-level results
    "PD",
    "W-L%",
    "MoV",
    "Strong_Start",
    "SoS",
    "SRS",
    "OSRS",
    "DSRS",
    # calculated values
    "Avg_Points_Allowed",
    "Avg_Points_Scored",
    "Avg_offense_success_rate",
]
# Keep only the listed columns (raises KeyError if any is missing).
binary_dataset = binary_dataset.loc[:, binary_dataset_columns]

In [7]:
# Persist the binary dataset. The row index is written too (pandas default),
# so the file gets an extra unnamed first column.
binary_dataset.to_csv("Data/Binary_Dataset.csv", index=True)

## Correlation matrix to support our findings and identify new ones

In [8]:
# Display the pairwise Pearson correlation (the pandas default) between all binary columns.
binary_dataset.corr(method="pearson")

Unnamed: 0,Playoffs,offense_completion_percentage,offense_ave_yards_gained_pass,offense_ave_yards_gained_run,defense_completion_percentage,defense_ave_yards_gained_pass,defense_ave_yards_gained_run,defense_success_rate_pass,defense_success_rate_run,offense_ave_epa_pass,...,W-L%,MoV,Strong_Start,SoS,SRS,OSRS,DSRS,Avg_Points_Allowed,Avg_Points_Scored,Avg_offense_success_rate
Playoffs,1.0,0.286311,0.354939,0.181826,-0.195358,-0.300987,-0.070669,-0.247765,-0.071149,0.380925,...,0.830031,0.519173,0.287828,-0.116944,0.610701,0.434823,0.413561,-0.416409,0.468894,0.346876
offense_completion_percentage,0.286311,1.0,0.40504,0.162928,0.105735,0.042485,0.136234,0.070287,0.150505,0.272325,...,0.267663,0.208431,0.151669,-0.044146,0.298105,0.338025,0.142658,-0.075966,0.394858,0.480454
offense_ave_yards_gained_pass,0.354939,0.40504,1.0,0.163457,0.07883,0.054609,0.115683,0.082655,0.158666,0.620872,...,0.378845,0.334805,0.194489,-0.062605,0.34754,0.575322,0.040106,-0.011324,0.616522,0.537854
offense_ave_yards_gained_run,0.181826,0.162928,0.163457,1.0,0.123302,0.067926,0.11045,0.08,0.157311,0.119444,...,0.191214,0.149166,0.032133,-0.067189,0.154337,0.202446,0.074191,0.024358,0.237175,0.365659
defense_completion_percentage,-0.195358,0.105735,0.07883,0.123302,1.0,0.466583,0.17906,0.664835,0.293752,0.028495,...,-0.225095,-0.159931,-0.117734,0.010865,-0.247426,-0.081317,-0.260695,0.381595,-0.02496,0.128806
defense_ave_yards_gained_pass,-0.300987,0.042485,0.054609,0.067926,0.466583,1.0,0.210784,0.559193,0.232692,-0.004296,...,-0.334782,-0.283096,-0.123627,0.019466,-0.351984,-0.106969,-0.475674,0.568085,-0.064257,0.070532
defense_ave_yards_gained_run,-0.070669,0.136234,0.115683,0.11045,0.17906,0.210784,1.0,0.208104,0.479521,0.083996,...,-0.087188,-0.055866,0.006698,0.05451,-0.154553,0.068626,-0.209933,0.268981,0.099669,0.155776
defense_success_rate_pass,-0.247765,0.070287,0.082655,0.08,0.664835,0.559193,0.208104,1.0,0.295592,0.040204,...,-0.271595,-0.226391,-0.085735,0.009857,-0.318783,-0.081836,-0.400439,0.485495,-0.037688,0.105117
defense_success_rate_run,-0.071149,0.150505,0.158666,0.157311,0.293752,0.232692,0.479521,0.295592,1.0,0.146576,...,-0.080987,-0.094475,0.01131,-0.004334,-0.126604,0.057759,-0.174047,0.255205,0.104503,0.163601
offense_ave_epa_pass,0.380925,0.272325,0.620872,0.119444,0.028495,-0.004296,0.083996,0.040204,0.146576,1.0,...,0.421627,0.439978,0.261085,-0.048882,0.336712,0.509162,0.09497,-0.067851,0.511181,0.457528


In [9]:
# Pairwise Spearman rank correlation between all columns of the binary dataset.
spearman_corr = binary_dataset.corr(method="spearman")

In [10]:
# Save the Spearman matrix; the index (column names as row labels) is written as the first column.
spearman_corr.to_csv("Data/result_matrix.csv", index=True)