In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport


In [2]:
# Continuous feature columns to select from the dataset for profiling.
# NOTE(review): the "D_" prefix presumably marks a home-minus-visitor
# differential (see 'D_datediff') — confirm against the feature-engineering step.
# Commented-out entries are excluded from the selection.
cont_cols = [
    'D_datediff',              # Days since last game (Home - visitor)
    
    # First downs
    'D_First_Downs',
    
    # Basic Stats
    'D_Rush',                  # Number of running plays attempted
    'D_Yds',                   # Yards gained through running plays
    'D_TDs',                   # Touchdowns scored via running plays
    'D_Cmp',                   # Completions (# of successful passes)
    'D_Att',                   # Attempts (# of passes thrown, completed or not)
    'D_Yd',                    # Yards (Yards the passes have covered)
    'D_TD',                    # Touchdowns
    'D_INT',                   # Interceptions
    'D_Sacked',                # Number of times quarterback was tackled behind line of scrimmage
    'D_Yards',                 # Yards lost from sacks
    'D_Net_Pass_Yards',        # Net passing yards (total yds - yards lost due to sacks)
    'D_Total_Yards',           # Total yards gained (net pass yards + rushing yds)
    'D_Fumbles',               # Number of times ball was fumbled
    'D_Lost',                  # Number of times the team lost possession of the ball due to a fumble
    'D_Turnovers',             # Total number of turnovers, includes interceptions & fumbles lost
    'D_Penalties',             # Number of penalties committed by the team
    'D_Third_Down_Conv',       # 3rd down conversion percentage
    'D_Fourth_Down_Conv',      # 4th down conversion percentage
    'D_Time_of_Possession',    # Time of possession in minutes
    
    
    # Passing Detailed
    'D_passing_att',           # Passes attempted
    'D_passing_cmp',           # Passes completed
    'D_passing_int',           # Interceptions thrown
    'D_passing_lng',           # Longest completed pass
    'D_passing_sk',            # Passing times sacked
    'D_passing_td',            # Passing touchdowns
    # 'D_passing_yds',           # Yards gained by passing
    
    # Receiving
    'D_receiving_lng',         # Longest reception
    # 'D_receiving_td',          # Receiving touchdowns
    # 'D_receiving_yds',         # Receiving yards
    
    # Rushing Detailed
    'D_rushing_att',           # Rushing attempts (sacks not included)
    'D_rushing_lng',           # Longest rushing attempt (sacks not included)
    'D_rushing_td',            # Rushing touchdowns
    'D_rushing_yds',           # Rushing yards
    
    # Defense interceptions
    'D_def_interceptions_int', # Passes intercepted on defense
    # 'D_def_interceptions_lng', # Longest interception returned
    'D_def_interceptions_td',  # Interceptions returned for touchdown
    'D_def_interceptions_yds', # Yards interceptions were returned
    
    # Defense fumbles
    'D_fumbles_ff',            # Num of times forced a fumble by the opposition recovered by either team
    'D_fumbles_fr',            # Fumbles recovered by player or team
    'D_fumbles_td',            # Fumbles recovered resulting in touchdown for receiver
    'D_fumbles_yds',           # Yards recovered fumbles were returned
    
    # Defense tackles
    'D_sk',                    # Sacks
    'D_tackles_ast',           # Assists on tackles
    'D_tackles_comb',          # Solo + ast tackles
    'D_tackles_solo',          # Tackles

    # ----------------- Kick & Punt returns are combined in EDA ----------------
    ## Kick Returns
    #'D_kick_returns_lng',      # Longest kickoff return
    #'D_kick_returns_rt',       # Kickoff returns
    #'D_kick_returns_td',       # Kickoffs returned for a touchdown
    #'D_kick_returns_yds',      # Yardage for kickoffs returned
    ## Punt Returns
    #'D_punt_returns_lng',      # Longest punt return
    #'D_punt_returns_ret',      # Punts returned
    #'D_punt_returns_td',       # Punts returned for touchdown
    #'D_punt_returns_yds',      # Punts return yardage
    
    # Kick & Punt returns combined (Created as a result of EDA)
    #'kick_punt_returns_lng',   # Does not appear on final CSV (UMAP)
    #'kick_punt_returns_rt',    # Does not appear on final CSV (UMAP)
    #'kick_punt_returns_td',    # Does not appear on final CSV (UMAP)
    #'kick_punt_returns_yds',   # Does not appear on final CSV (UMAP)
    'kick_punt_umap_dim_1',  # Appears on final CSV (UMAP)
    'kick_punt_umap_dim_2',  # Appears on final CSV (UMAP)
    
    # Punting / Scoring
    # 'D_punting_lng',         # Longest punt
    
    'D_punting_pnt',           # Times punted
    # 'D_punting_yds',         # Total punt yardage
    'D_punting_avg',           # Total punt yardage / number of punts
    
    'D_scoring_fga',           # Field goals attempted
    # 'D_scoring_fgm',         # Field goals made
    'D_scoring_fgp',           # Field goals made / Field goals attempted

    'D_scoring_xpa',           # Extra points attempted
    # 'D_scoring_xpm',         # Extra points made
    'D_scoring_xpp',           # Extra points made / Extra points attempted
    
    # Additional, calculated metrics
    'D_pythagorean',           # NFL variation of Bill James pythagorean expectation (from wikipedia)
]

In [3]:
# Sanity check: report how many feature columns are in the selection list.
n_selected_cols = len(cont_cols)
print(n_selected_cols)

52


### Import dataset & select relevant columns
Load `CombinedSlidingWindow4.csv` and keep only the columns that will be passed to the prediction algorithms.

In [4]:
# Load the full dataset under its own name so the raw frame stays available
# for inspection after the feature subset is taken (previously `df` was
# overwritten in place, discarding the raw columns).
raw_df = pd.read_csv("./footballData/CombinedSlidingWindow4.csv", index_col=False, low_memory=False)
raw_df.info()

# Fail early, naming the file, if any requested feature column is absent.
# KeyError is the same exception type the plain column selection would raise.
missing_cols = [col for col in cont_cols if col not in raw_df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from CombinedSlidingWindow4.csv: {missing_cols}")

# Keep only the selected feature columns; `df` is the frame the profiling
# cells consume.
df = raw_df[cont_cols]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5308 entries, 0 to 5307
Columns: 193 entries, Unnamed: 0 to kick_punt_umap_dim_2
dtypes: float64(116), int64(74), object(3)
memory usage: 7.8+ MB


### Perform exploratory data analysis via ydata-profiling
Use the report to explore the dataset and develop hypotheses that can be tested later.

In [5]:
# Build the profiling report over the selected feature columns.
profile = ProfileReport(df=df, title="Profiling Report")

# Configuration: set the skewness threshold for numeric variables to 1.
# NOTE(review): presumably so that moderately skewed features get flagged — confirm intent.
numeric_settings = profile.config.vars.num
numeric_settings.skewness_threshold = 1

In [None]:
# Show the profiling report as notebook widgets. Running this cell is what
# starts the "Summarize dataset" progress output.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

### Save profile to file

In [None]:
# Write the report to an HTML file; the path is relative, so it lands in the
# current working directory.
profile.to_file("EdaReportThree.html")