In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import hdbscan

In [None]:
##Helper Functions :)
#convert a player's height from 'feet-inches' (e.g. '6-2') to total inches
def ft_in(x):
    '''
    Convert a player height to whole inches.

    Parameters:
    -----------
    x - height as a 'feet-inches' string (e.g. '6-2'), a string holding
        inches only (e.g. '74'), or an already-numeric inch value

    Returns:
    -----------
    int - the height in inches

    Raises:
    -----------
    ValueError - if a string part is not an integer literal, or x is NaN
    '''
    if not isinstance(x, str):
        # Already numeric, e.g. the column was converted by an earlier run of
        # the cell. The substring test below would raise TypeError on it.
        return int(x)
    if '-' in x:
        #this will be a list ['ft','in']
        meas = x.split('-')
        return int(meas[0]) * 12 + int(meas[1])
    return int(x)
    
#convert Game Clock from MM:SS:00 to Seconds
def clock(x,df):
    '''
    Turn the game clock of one play into a single seconds value that also
    encodes the quarter.

    Parameters:
    -----------
    x - index label of the play's row in df
    df - dataframe with a 'gameClock' column (colon-separated string whose
         first two fields are minutes and seconds) and a 'quarter' column

    Returns:
    -----------
    (minutes + 15 * (quarter - 1)) * 60 + seconds
    '''
    # NOTE(review): if gameClock is the time *remaining* in the quarter, this
    # value is not elapsed game time - confirm that is what is wanted.
    row = df.loc[x]
    raw_clock, quarter = row['gameClock'], row['quarter']

    clock_parts = raw_clock.split(':')

    # 15 minutes are added for every quarter before the current one
    whole_minutes = int(clock_parts[0]) + 15 * (quarter - 1)

    return (whole_minutes * 60) + int(clock_parts[1])

In [None]:
##preprocess by dataset
#players Dataset
def preprocess_players(players_df):
    '''
    Preprocess the players dataframe.

    Parameters:
    -----------
    players_df - players dataframe with a 'height' column

    Returns:
    -----------
    players_df - the same dataframe object, with 'height' replaced by the
                 result of ft_in on every value (modified in place)
    '''
    heights_in_inches = players_df['height'].apply(ft_in)
    players_df['height'] = heights_in_inches
    return players_df


Note: run `preprocess_play` before `preprocess_tracking`.

In [None]:
#preprocess tracking
def _football_rows(track):
    '''Keep only the football's rows and drop every column that has a null in them.'''
    return track.loc[track['displayName'] == 'football'].dropna(axis = 'columns')


def preprocess_tracking(track18, track19, track20, play_df):
    '''
    This function creates the tracking dataframes.

    NOTE: track18, track19 and track20 are re-oriented IN PLACE. Calling this
    function a second time on the same objects flips the 'left' plays back,
    so reload the tracking data before re-running.

    Parameters:
    -----------
    track18, track19, track20 - trackYY.csv dataframes
    play_df - Preprocessed play.csv dataframe

    Returns:
    -----------
    track_ep18 - Tracking ExtraPoint 2018 dataframe
    track_ep19 - Tracking ExtraPoint 2019 dataframe
    track_ep20 - Tracking ExtraPoint 2020 dataframe
    track_fg18 - Tracking FieldGoal 2018 dataframe
    track_fg19 - Tracking FieldGoal 2019 dataframe
    track_fg20 - Tracking FieldGoal 2020 dataframe
    track_punt18 - Tracking Punt 2018 dataframe
    track_punt19 - Tracking Punt 2019 dataframe
    track_punt20 - Tracking Punt 2020 dataframe
    track_ko18 - Tracking Kickoff 2018 dataframe
    track_ko19 - Tracking Kickoff 2019 dataframe
    track_ko20 - Tracking Kickoff 2020 dataframe
    track_fep - Tracking Football ExtraPoint dataframe
    track_ffg - Tracking Football Fieldgoal dataframe
    track_fpunt - Tracking Football Punt dataframe
    track_fko - Tracking Football Kickoff dataframe

    '''
    tracking_by_year = (track18, track19, track20)

    #re-orient direction of play by offensive team direction :
    #We must reorient this to reflect movement in the offense direction instead of the on-field coordinates
    #(reorient the origin from the bottom left to top right for a change in direction).
    for track in tracking_by_year:
        going_left = track['playDirection'] == 'left'
        track.loc[going_left, 'x'] = 120 - track.loc[going_left, 'x']
        #note that we have 160/3 for the y direction since the football field is 160ft, but our units are yards
        track.loc[going_left, 'y'] = 160/3 - track.loc[going_left, 'y']

    play_keys = ['gameId', 'playId']
    #the order here fixes the order of the returned tuple
    play_types = ('Extra Point', 'Field Goal', 'Punt', 'Kickoff')

    tracking_frames = []   #play type x year, in return order
    football_frames = []   #one per play type, all years concatenated
    for play_type in play_types:
        #divide play dataset by type of play
        plays_of_type = play_df.loc[play_df['specialTeamsPlayType'] == play_type, play_keys]

        #merge play_type with tracking for each year
        per_year = [pd.merge(plays_of_type, track, on = play_keys) for track in tracking_by_year]
        tracking_frames.extend(per_year)

        #separate out the football data and concatenate to one dataframe per type of play
        football_frames.append(
            pd.concat([_football_rows(track) for track in per_year], ignore_index = True)
        )

    return tuple(tracking_frames) + tuple(football_frames)


In [None]:
#preprocess play data
def preprocess_play(play_df):
    '''
    This function preprocesses the play dataframe.

    Parameters:
    -----------
    play_df - play.csv dataframe with 'penaltyYards', 'penaltyCodes',
              'gameClock' and 'quarter' columns

    Returns:
    -----------
    play_df - the same dataframe object (modified in place) with nulls in the
              penalty columns filled and a new 'gameClockSeconds' column
    '''
    #null penalty yards = 0
    play_df['penaltyYards']=play_df['penaltyYards'].fillna(0)

    #clock: MM:SS to Seconds
    #pass play_df itself: the original passed an undefined name `df`, which
    #either raised NameError or silently read a stale notebook global
    play_df['gameClockSeconds'] = play_df.index.map(lambda x: clock(x,play_df))

    #redefine nulls in penalty as no penalty
    play_df['penaltyCodes']=play_df['penaltyCodes'].fillna('no penalty')

    #TO-DO: Address null values on kickerId and kickBlockerId
    #(note their height & weight comes in too)
    return play_df

#### We need to make some data frames for analysis: Weather, ExtraPoint, FieldGoal, Punts, Kickoffs

In [None]:
def get_weather_data():
    '''
    Download the public weather datasets, join them, and split the result by
    the year of each weather measurement.

    Returns:
    -----------
    weather2018, weather2019, weather2020 - Weather dataframes by year

    '''
    # Fetch the three source tables
    games = pd.read_csv('https://raw.githubusercontent.com/ThompsonJamesBliss/WeatherData/master/data/games.csv')
    stadium_coordinates = pd.read_csv('https://raw.githubusercontent.com/ThompsonJamesBliss/WeatherData/master/data/stadium_coordinates.csv')
    games_weather = pd.read_csv('https://raw.githubusercontent.com/ThompsonJamesBliss/WeatherData/master/data/games_weather.csv')

    # weather -> games on game_id, then -> stadiums on StadiumName
    weather = (
        games_weather
        .merge(games, on='game_id')
        .merge(stadium_coordinates, on='StadiumName')
    )

    # Parse the timestamp columns
    for time_col in ('TimeMeasure', 'TimeStartGame', 'TimeEndGame'):
        weather[time_col] = pd.to_datetime(weather[time_col], format='%m/%d/%Y %H:%M')

    # One slice per year of the measurement timestamp
    measured_year = weather['TimeMeasure'].dt.year
    weather2018, weather2019, weather2020 = (
        weather[measured_year == year] for year in (2018, 2019, 2020)
    )

    return weather2018, weather2019, weather2020

In [2]:
#make the ExtraPoint dataframe
#this runs AFTER play and players are preprocessed
def make_extraPoint(play_df, players_df):
    '''
    This function creates the ExtraPoint dataframe.

    Parameters:
    -----------
    play_df - play dataframe; must contain 'specialTeamsPlayType', 'kickerId'
              and the columns dropped below
    players_df - players dataframe with 'nflId', 'height', 'weight',
                 'Position' and 'displayName'

    Returns:
    -----------
    ep_plays - ExtraPoint dataframe: the 'Extra Point' plays with the kicker's
               height, weight, position and name attached

    '''
    #columns removed from the extra-point plays
    extraneous_cols = ['kickReturnYardage', 'kickLength', 'playResult', 'returnerId',
                       'yardsToGo', 'down', 'specialTeamsPlayType']
    kicker_names = {"height": 'kicker_height', "weight": 'kicker_weight',
                    "Position": 'kicker_position', "displayName": 'kicker_name'}

    is_extra_point = play_df['specialTeamsPlayType']=='Extra Point'
    kicker_info = players_df[['nflId', 'height', 'weight','Position', 'displayName']]

    #add in Kickers (left join keeps plays whose kickerId has no match)
    ep_plays = (
        play_df.loc[is_extra_point]
        .drop(columns = extraneous_cols)
        .merge(kicker_info, how = 'left', left_on = 'kickerId', right_on = 'nflId')
        .rename(columns = kicker_names)
        .drop(columns = ['nflId'])
    )

    #TO-DO: add in Blockers the same way (join on 'kickBlockerId', rename to
    #blocker_height / blocker_weight / blocker_position / blocker_name) once
    #the nulls are figured out
    return ep_plays
    

In [None]:
#make FieldGoal dataframe
def make_fieldGoal(play_df, players_df):
    '''
    This function creates the FieldGoal dataframe.

    Parameters:
    -----------
    play_df - play dataframe; must contain 'specialTeamsPlayType', 'kickerId'
              and 'kickReturnYardage'
    players_df - players dataframe with 'nflId', 'height', 'weight',
                 'Position' and 'displayName'

    Returns:
    -----------
    fg_plays - FieldGoal dataframe: the 'Field Goal' plays with the kicker's
               height, weight, position and name attached

    '''
    field_goal_mask = play_df['specialTeamsPlayType']=='Field Goal'
    #remove extraneous columns
    field_goals = play_df.loc[field_goal_mask].drop(columns =['kickReturnYardage', 'specialTeamsPlayType'])

    #add in Kickers (left join keeps plays whose kickerId has no match)
    kicker_info = players_df[['nflId', 'height', 'weight','Position', 'displayName']]
    with_kickers = pd.merge(field_goals, kicker_info, how = 'left',
                            left_on = 'kickerId', right_on = 'nflId')
    kicker_names = {"height": 'kicker_height', "weight": 'kicker_weight',
                    "Position": 'kicker_position', "displayName": 'kicker_name'}
    fg_plays = with_kickers.rename(columns = kicker_names).drop(columns=['nflId'])

    #TO-DO: add in Blockers the same way (join on 'kickBlockerId', rename to
    #blocker_height / blocker_weight / blocker_position / blocker_name) once
    #the nulls are figured out
    return fg_plays


#### Preprocessing functions for actual modeling or clustering.

In [None]:
def preprocess_ep(ep_plays):
    '''
    This function prepares the ExtraPoint dataframe for clustering.

    Parameters:
    -----------
    ep_plays - ExtraPoint dataframe

    Returns:
    -----------
    ep_scale - processed ExtraPoint dataframe: the numeric columns
               standardised with StandardScaler, followed by
               'specialTeamsResult' and 'penaltyCodes' as LabelEncoder
               integer codes (these two are not scaled)

    '''
    #reduce number of columns to the numeric ones plus the two categoricals
    useful_cols = ['specialTeamsResult', 'yardlineNumber', 'gameClockSeconds',
                   'penaltyCodes', 'penaltyYards', 'preSnapHomeScore',
                   'preSnapVisitorScore', 'kicker_height', 'kicker_weight']
    #TO-DO: add 'blocker_height' and 'blocker_weight' once blockers are merged in
    ep_df = ep_plays[useful_cols]

    #label-encode specialTeamsResult and penaltyCodes (integer codes, not one-hot)
    #NOTE(review): integer codes impose an arbitrary order on the categories,
    #which a distance-based clusterer will treat as meaningful - confirm intent
    str_codes = LabelEncoder().fit_transform(ep_df['specialTeamsResult'])
    pc_codes = LabelEncoder().fit_transform(ep_df['penaltyCodes'])
    numeric_eps = ep_df.drop(['specialTeamsResult', 'penaltyCodes'], axis=1)

    #scale data, but only non-categorical columns.
    #fit_transform returns a plain NumPy array, so wrap it back into a
    #DataFrame; assigning a column by name to the bare array raised IndexError
    ep_scale = pd.DataFrame(StandardScaler().fit_transform(numeric_eps),
                            columns=numeric_eps.columns, index=numeric_eps.index)

    #add categorical columns back (positional: the codes are arrays in row order)
    ep_scale['specialTeamsResult'] = str_codes
    ep_scale['penaltyCodes'] = pc_codes
    #we are running a distance dependent algorithm
    return ep_scale
    

In [None]:
def preprocess_fg(fg_plays):
    '''
    This function prepares the FieldGoal dataframe for clustering.

    Parameters:
    -----------
    fg_plays - FieldGoal dataframe

    Returns:
    -----------
    fg_scale - processed FieldGoal dataframe: the numeric columns
               standardised with StandardScaler, followed by
               'specialTeamsResult' and 'penaltyCodes' as LabelEncoder
               integer codes (these two are not scaled)
    fg_df - the selected columns of fg_plays before scaling/encoding, as an
            independent copy

    '''
    #reduce number of columns to the numeric ones plus the two categoricals
    useful_cols = ['specialTeamsResult', 'yardlineNumber',
                   'gameClockSeconds', 'penaltyCodes',
                   'penaltyYards', 'preSnapHomeScore',
                   'preSnapVisitorScore', 'kicker_height',
                   'kicker_weight', 'down',
                   'yardsToGo', 'kickLength',
                   'playResult']
    #TO-DO: add 'blocker_height' and 'blocker_weight' once blockers are merged in

    #copy so that columns added to fg_df later (e.g. a cluster id) do not
    #trigger SettingWithCopyWarning against fg_plays
    fg_df = fg_plays[useful_cols].copy()

    #label-encode specialTeamsResult and penaltyCodes (integer codes, not one-hot)
    #NOTE(review): integer codes impose an arbitrary order on the categories,
    #which a distance-based clusterer will treat as meaningful - confirm intent
    str_codes = LabelEncoder().fit_transform(fg_df['specialTeamsResult'])
    pc_codes = LabelEncoder().fit_transform(fg_df['penaltyCodes'])
    numeric_fgs = fg_df.drop(['specialTeamsResult', 'penaltyCodes'], axis=1)

    #scale data, but only non-categorical columns.
    #fit_transform returns a plain NumPy array, so wrap it back into a
    #DataFrame; assigning a column by name to the bare array raised IndexError
    fg_scale = pd.DataFrame(StandardScaler().fit_transform(numeric_fgs),
                            columns=numeric_fgs.columns, index=numeric_fgs.index)

    #add categorical columns back (positional: the codes are arrays in row order)
    fg_scale['specialTeamsResult'] = str_codes
    fg_scale['penaltyCodes'] = pc_codes
    #we are running a distance dependent algorithm
    return fg_scale, fg_df

Let's make the clustering pipeline.

In [None]:
def cluster_df(df_scale, df):
    '''
    Fit HDBSCAN (default settings) on the scaled data and record each row's
    cluster label on df.

    Parameters:
    -----------
    df_scale - the scaled/encoded data to cluster on
    df - the preprocessed dataframe, prior to encoding; it receives a
         'cluster_id' column in place, so its rows must be in the same order
         as df_scale

    Returns:
    ---------
    cls - the fit cluster object to make trees, etc.
    df - the same df object, now with column 'cluster_id'
    '''
    cls = hdbscan.HDBSCAN().fit(df_scale)
    df['cluster_id'] = cls.labels_

    return cls, df
    