# Notebook for Conversion Data EDA

This notebook was used to merge conversion data with nfl_first.csv. This data was unused as conversion data was not too meaningful

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../Data/nfl_first.csv")

In [3]:
df['Team'] = df['Team'].str.replace(r"[*+]", "", regex=True)

In [4]:
# Filters data since 1999
df99 = df[df["Year"] >= 1999]

In [5]:
conversion_df = pd.read_csv("NFL_Conversion_Data.csv")

In [6]:
# average of points allowed and points scored
# ignore warnings
df99["Avg_Points_Allowed"] = df99["PA"] / (df99["W"] + df99["L"] + df99["T"])
df99["Avg_Points_Scored"] = df99["PF"] / (df99["W"] + df99["L"] + df99["T"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df99["Avg_Points_Allowed"] = df99["PA"] / (df99["W"] + df99["L"] + df99["T"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df99["Avg_Points_Scored"] = df99["PF"] / (df99["W"] + df99["L"] + df99["T"])


In [7]:
df99.head()

Unnamed: 0,Year,Team,W,L,T,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS,Playoffs,Strong_Start,Avg_Points_Allowed,Avg_Points_Scored
260,1999,Indianapolis Colts,13,3,0,0.813,423,333,90,5.6,0.5,6.1,6.6,-0.5,1,0.0,20.8125,26.4375
261,1999,Buffalo Bills,11,5,0,0.688,320,229,91,5.7,1.5,7.1,-0.2,7.4,1,0.0,14.3125,20.0
262,1999,Miami Dolphins,9,7,0,0.563,326,336,-10,-0.6,2.5,1.9,1.2,0.7,1,0.0,21.0,20.375
263,1999,New York Jets,8,8,0,0.5,308,309,-1,-0.1,3.1,3.1,0.2,2.9,0,0.0,19.3125,19.25
264,1999,New England Patriots,8,8,0,0.5,299,284,15,0.9,1.3,2.2,-1.0,3.2,0,1.0,17.75,18.6875


In [8]:
full_df = pd.merge(df99, conversion_df, on="Team", how="inner")
clean_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]

In [9]:
clean_df.to_csv("NFL_Full_Conversion_Data.csv", index=False)

In [10]:
# easy strength of schedule
sos_metric = 0.449

In [11]:
# An above average Simple Rating System
above_avg_srs = 0

In [12]:
# above average Offensive SRS and Defensive SRS
above_avg_osrs = 2
# used median for DSRS
above_avg_dsrs = clean_df["DSRS"].median()

In [13]:
# 39% conversion rate is considered average
third_down_conv = 0.39

In [14]:
# above 51% conversion rate for fourth down is considered good
fourth_down_conv = 0.55

In [15]:
# above 50% red zone conversion rate is considered good
red_zone_eff = clean_df["RZPct"].quantile([0.75]).values[0]
# 7 pts or above for MoV is considered above average
mov_threshold = 7
# above 50 pts differential means a strong team
pd_threshold = 50
# above 59% is a good win percentage
wp_threshold = 0.59

In [16]:
Avg_Points_Allowed_th = 21
Avg_Points_Scored_th = 24

## Creating Binary Dataset for analysis and Correlation Matrix

In [17]:
binary_df = clean_df.copy()

binary_df["MoV"] = np.where(binary_df["MoV"] >= mov_threshold, 1, 0)
binary_df["PD"] = np.where(binary_df["PD"] > pd_threshold, 1, 0)
binary_df["W-L%"] = np.where(binary_df["W-L%"] > wp_threshold, 1, 0)

binary_df["SoS"] = np.where(binary_df["SoS"] > sos_metric, 1, 0)
binary_df["SRS"] = np.where(binary_df["SRS"] > above_avg_srs, 1, 0)
binary_df["OSRS"] = np.where(binary_df["OSRS"] > above_avg_osrs, 1, 0)
binary_df["DSRS"] = np.where(binary_df["DSRS"] > above_avg_dsrs, 1, 0)
binary_df["3D%"] = np.where(binary_df["3D%"] > third_down_conv, 1, 0)
binary_df["4D%"] = np.where(binary_df["4D%"] > fourth_down_conv, 1, 0)
binary_df["RZPct"] = np.where(binary_df["RZPct"] > red_zone_eff, 1, 0)
binary_df["Avg_Points_Allowed"] = np.where(binary_df["Avg_Points_Allowed"] >= Avg_Points_Allowed_th, 1, 0)
binary_df["Avg_Points_Scored"] = np.where(binary_df["Avg_Points_Scored"] >= Avg_Points_Scored_th, 1, 0)

In [18]:
binary_columns = ["Playoffs", "PD", "W-L%", "MoV", "Strong_Start", "SoS", "SRS", "OSRS", "DSRS", "3D%", "4D%", "RZPct", "Avg_Points_Allowed", "Avg_Points_Scored"]
binary_df = binary_df[binary_columns]

In [19]:
binary_df.corr()

Unnamed: 0,Playoffs,PD,W-L%,MoV,Strong_Start,SoS,SRS,OSRS,DSRS,3D%,4D%,RZPct,Avg_Points_Allowed,Avg_Points_Scored
Playoffs,1.0,0.674491,0.828313,0.520505,0.288053,-0.124104,0.62307,0.459519,0.380991,0.072244,0.023902,0.033401,-0.422542,0.49384
PD,0.674491,1.0,0.751387,0.622978,0.275647,-0.103087,0.675838,0.609475,0.433445,0.087915,0.033362,0.042192,-0.469008,0.646398
W-L%,0.828313,0.751387,1.0,0.558249,0.292724,-0.105445,0.629998,0.52734,0.415446,0.077557,0.025102,0.036393,-0.447142,0.551376
MoV,0.520505,0.622978,0.558249,1.0,0.24112,-0.074834,0.421032,0.497432,0.334928,0.066056,0.01955,0.029532,-0.363037,0.485488
Strong_Start,0.288053,0.275647,0.292724,0.24112,1.0,-0.052534,0.266689,0.22872,0.16803,0.025814,0.010791,0.017527,-0.182765,0.213848
SoS,-0.124104,-0.103087,-0.105445,-0.074834,-0.052534,1.0,0.05695,0.000214,0.081826,-0.037295,-0.018519,-0.018048,0.024399,-0.143237
SRS,0.62307,0.675838,0.629998,0.421032,0.266689,0.05695,1.0,0.544592,0.543615,0.080934,0.023044,0.036147,-0.516877,0.522069
OSRS,0.459519,0.609475,0.52734,0.497432,0.22872,0.000214,0.544592,1.0,0.120453,0.09348,0.02585,0.052255,-0.15663,0.767018
DSRS,0.380991,0.433445,0.415446,0.334928,0.16803,0.081826,0.543615,0.120453,1.0,0.020762,0.008143,-0.005994,-0.686806,0.135028
3D%,0.072244,0.087915,0.077557,0.066056,0.025814,-0.037295,0.080934,0.09348,0.020762,1.0,0.195453,0.317913,-0.029816,0.099839


Alongside our correlation matrix, there were other indicators that yielded no significant statistical relationship between conversions of various teams and the variables that influence a team's playoff chances.