### feature generation notebook 

In [3]:
import pandas as pd
import numpy as np
import re
import warnings; warnings.simplefilter('ignore')
import math
from multiprocessing import Pool, cpu_count

In [4]:
# This dataset can be downloaded from https://www.kaggle.com/c/nfl-big-data-bowl-2020/data after an agreement is digitally signed through kaggle. 
data = pd.read_csv('train.csv')
# set all columns to lower
data.columns = map(str.lower, data.columns)

#### Player height feature

In [5]:
# create function to convert playerHeight to inches
def heightToInches(x):
    try:
        x,x1 = x.split('-')
        x = int(x)
        x1 = int(x1)
        x = (12*x) + x1
        return(x)
    except:
        return int(x*12)

In [6]:
# run function over column
data["playerheight"] = data["playerheight"].apply(heightToInches)

In [7]:
# convert TimeHandoff to dates and times
data['timehandoff'] = pd.to_datetime(data['timehandoff'])
data['timesnap'] = pd.to_datetime(data['timesnap'])
data['playerbirthdate'] = pd.to_datetime(data["playerbirthdate"])

#### player age

In [8]:
# generate age of each player
def birthDayCalc(df):
    birthDay = df[0].year
    timeHandoff = df[1].year
    age = (timeHandoff - birthDay) 
    return age
data["age"] = data[["playerbirthdate","timehandoff"]].apply(birthDayCalc, axis=1)

#### time since snap in seconds

In [9]:
# find time from snap to handoff
def time_since_snap(df):
    handoff = df[0]
    snap = df[1]
    time = df[0] - df[1]
    return time.seconds
data['time_since_snap'] = data[['timehandoff', 'timesnap']].apply(time_since_snap, axis = 1)

#### windspeed

In [10]:
# convert windspeed
def windConvert(x):
    try:
        x = re.match('[0-9]+',x)
        x = x.groups()
        for i in x:
            y += i
            return(y)
    except:
        return(x)
data["windspeed"] = data["windspeed"].apply(windConvert)

#### oreintation and direction

In [11]:
# create functions to fix oreintation and direction
# Refrenced this notebook to help understand how to modify https://www.kaggle.com/ben519/understanding-x-y-dir-and-orientation
def fixDir(x):
    y = (360-x+90) % 360
    return y
def fixOri(x):
    y = (360-x) % 360
    return y

In [12]:
# apply oreintation fix
data["orientation"][data["season"] == 2017] = data["orientation"][data["season"] == 2017].apply(fixOri)
# apply direction fix
data["dir"] = data["dir"].apply(fixDir)

#### distance to the rusher

In [13]:
# create a rusher dummy variable
data['rusher'] = 0
data['rusher'][data['nflid'] == data['nflidrusher']] = 1# create a rusher dummy variable


In [14]:
# get Rusher X and Y
pos_df = pd.DataFrame(zip(data['x'][data['rusher'] == 1], data['y'][data['rusher'] == 1]), columns = ['x', 'y'])

In [15]:
posdf = pd.DataFrame(zip(pos_df['x'].repeat(22), pos_df['y'].repeat(22)), index = range(len(data)), columns = ['rushx', 'rushy'])
# put the rushX and rushY in the main df
data['rushx'] = posdf['rushx']
data['rushy'] = posdf['rushy']
del posdf

In [16]:
# euclidean dist = sqrt((x1 - x2)^2 + (y1 - y2)^2) - straight line distance
data['disttorush'] = np.sqrt((data['x'] - data['rushx'])**2 + (data['y'] - data['rushy'])**2)


####  Team name for each player

In [17]:
def teamIdentitySet(x):
    if x["team"] == "home":
        return x["hometeamabbr"]
    else:
        return x["visitorteamabbr"]
# applying function entire data df
data["teamidentity"] = data.apply(teamIdentitySet, axis=1)

#### min distance from defender to rusher by play

In [18]:
# create attribute for minimum defensive player from runningback
def minDistanceFromDef(x):
    # get playid 
    x = x["playid"]
    # return back only play with that id
    x = data[data["playid"] == x]
    # Return back minmimum distance of players not on rusher team
    x = min(x["disttorush"][x["teamidentity"] != x["possessionteam"]])
    return x
data["min_distancetorusher"]=data.apply(minDistanceFromDef,axis=1)

#### match home, vistior and possession team abbreviation 

In [19]:
# find teams with mismatched abbreviations
for home_abbr, pos_abbr in zip(sorted(data['hometeamabbr'].unique()),
                                     sorted(data['possessionteam'].unique())):
    if home_abbr != pos_abbr:
        print(home_abbr + ' ' + pos_abbr )

# define a mapping to fix these
def map_abbr(df):
    '''
    input: dataframe
    function: fixes incorrect abbreviations in possession team
    returns: correct abbreviations
    '''
    home_abbr = df['hometeamabbr']
    pos_abbr = df['possessionteam']
    if home_abbr != pos_abbr:
        pos_abbr = home_abbr
    return pos_abbr

data['possessionteam'] = data.apply(map_abbr, axis = 1)

ARI ARZ
BAL BLT
CLE CLV
HOU HST


#### stadium type

In [20]:
# Transform stadium type
def clean_stad_type(txt):
    if pd.isna(txt):
        return np.nan
    txt = txt.lower()
    #txt = ''.join([c for c in txt if c not in punctuation])
    txt = re.sub(' +', ' ', txt)
    txt = txt.strip()
    txt = txt.replace('outside', 'outdoor')
    txt = txt.replace('outdor', 'outdoor')
    txt = txt.replace('outddors', 'outdoor')
    txt = txt.replace('outdoors', 'outdoor')
    txt = txt.replace('oudoor', 'outdoor')
    txt = txt.replace('indoors', 'indoor')
    txt = txt.replace('ourdoor', 'outdoor')
    txt = txt.replace('retractable', 'rtr.')
    return txt
data['stadiumtype'] = data['stadiumtype'].apply(clean_stad_type)

In [21]:
# focus only on outdoor, indoor, open, and closed
def num_stad_type(txt):
    if pd.isna(txt):
        return np.nan
    if 'outdoor' in txt or 'open' in txt:
        return 1
    if 'indoor' in txt or 'closed' in txt:
        return 0
    # else
    return np.nan

data['stadiumtype'] = data['stadiumtype'].apply(num_stad_type)

#### offensive and defensive personnel counts 

In [22]:
# count defensive position
# differentiate defensive ol and offensive ol - JM
data["ol_d"] = data["defensepersonnel"].str.extract(pat="([1-9] OL)")
data["ol_d"] = data["ol_d"].str.extract(pat="([1-9])")
data["dl"] = data["defensepersonnel"].str.extract(pat="([1-9] DL)")
data["dl"] = data["dl"].str.extract(pat="([1-9])")
data["lb"] = data["defensepersonnel"].str.extract(pat="([1-9] LB)")
data["lb"] = data["lb"].str.extract(pat="([1-9])")
data["db"] = data["defensepersonnel"].str.extract(pat="([1-9] DB)")
data["db"] = data["db"].str.extract(pat="([1-9])")

# differentiate defensive rb and offensive rb - JM
data["rb_d"] = data["defensepersonnel"].str.extract(pat="([1-9] RB)")
data["rb_d"] = data["rb_d"].str.extract(pat="([1-9])")

# count offensive position
# QB, OL, RB, TE, WR, DL, LB, DB
data['qb'] = data['offensepersonnel'].str.extract(pat='([1-9] QB)')
data['qb'] = data['qb'].str.extract(pat='([1-9])')
data['ol'] = data['offensepersonnel'].str.extract(pat='([1-9] OL)')
data['ol'] = data['ol'].str.extract(pat='([1-9])')
data['rb'] = data['offensepersonnel'].str.extract(pat='(\d RB)')
data['rb'] = data['rb'].str.extract(pat='(\d)')
data['te'] = data['offensepersonnel'].str.extract(pat='(\d TE)')
data['te'] = data['te'].str.extract(pat='(\d)')
data['wr'] = data['offensepersonnel'].str.extract(pat='(\d WR)')
data['wr'] = data['wr'].str.extract(pat='(\d)')

# differentiate offensive dl 
data['dl_o'] = data['offensepersonnel'].str.extract(pat='(\d DL)')
data['dl_o'] = data['dl_o'].str.extract(pat='(\d)')
data['lb_o'] = data['offensepersonnel'].str.extract(pat='(\d LB)')
data['lb_o'] = data['lb_o'].str.extract(pat='(\d)')
data['db_o'] = data['offensepersonnel'].str.extract(pat='(\d DB)')
data['db_o'] = data['db_o'].str.extract(pat='(\d)')

# replace nas with 0
data['ol'].fillna(0, inplace = True)
data['dl'].fillna(0, inplace = True)
data['db'].fillna(0, inplace = True)
data['lb'].fillna(0, inplace = True)
data['rb'].fillna(0, inplace = True)
data['ol_d'].fillna(0, inplace = True)
data['rb_d'].fillna(0, inplace = True)
data['te'].fillna(0, inplace = True)
data['wr'].fillna(0, inplace = True)
data['dl_o'].fillna(0, inplace = True)
data['lb_o'].fillna(0, inplace = True)
data['db_o'].fillna(0, inplace = True)

# replace qb nas with 1 - WILDCAT has a qb, but the ball is snapped to someone else - JM
data['qb'].fillna(1, inplace = True)

# set ol == 0 to be 11 - sum(offensive personnel)
off = ['rb', 'qb', 'te', 'wr', 'dl_o', 'lb_o', 'db_o'] 
data[off] = data[off].astype(str).astype(int)
data['ol'][data['ol'] == 0] = 11 - data[off].sum(axis = 1)

# convert rest of personnel to int
defp = ['dl', 'lb', 'rb_d', 'ol_d', 'db']
data[defp] = data[defp].astype(str).astype(int)

# fill lb na with 11 - sum(everyone else on defense)
data['lb'][data['lb'] == 0] = 11 - data[defp].sum(axis = 1)# count defensive position
# differentiate defensive ol and offensive ol - JM
data["ol_d"] = data["defensepersonnel"].str.extract(pat="([1-9] OL)")
data["ol_d"] = data["ol_d"].str.extract(pat="([1-9])")
data["dl"] = data["defensepersonnel"].str.extract(pat="([1-9] DL)")
data["dl"] = data["dl"].str.extract(pat="([1-9])")
data["lb"] = data["defensepersonnel"].str.extract(pat="([1-9] LB)")
data["lb"] = data["lb"].str.extract(pat="([1-9])")
data["db"] = data["defensepersonnel"].str.extract(pat="([1-9] DB)")
data["db"] = data["db"].str.extract(pat="([1-9])")

# differentiate defensive rb and offensive rb - JM
data["rb_d"] = data["defensepersonnel"].str.extract(pat="([1-9] RB)")
data["rb_d"] = data["rb_d"].str.extract(pat="([1-9])")

# count offensive position
# QB, OL, RB, TE, WR, DL, LB, DB
data['qb'] = data['offensepersonnel'].str.extract(pat='([1-9] QB)')
data['qb'] = data['qb'].str.extract(pat='([1-9])')
data['ol'] = data['offensepersonnel'].str.extract(pat='([1-9] OL)')
data['ol'] = data['ol'].str.extract(pat='([1-9])')
data['rb'] = data['offensepersonnel'].str.extract(pat='(\d RB)')
data['rb'] = data['rb'].str.extract(pat='(\d)')
data['te'] = data['offensepersonnel'].str.extract(pat='(\d TE)')
data['te'] = data['te'].str.extract(pat='(\d)')
data['wr'] = data['offensepersonnel'].str.extract(pat='(\d WR)')
data['wr'] = data['wr'].str.extract(pat='(\d)')

# differentiate offensive dl 
data['dl_o'] = data['offensepersonnel'].str.extract(pat='(\d DL)')
data['dl_o'] = data['dl_o'].str.extract(pat='(\d)')
data['lb_o'] = data['offensepersonnel'].str.extract(pat='(\d LB)')
data['lb_o'] = data['lb_o'].str.extract(pat='(\d)')
data['db_o'] = data['offensepersonnel'].str.extract(pat='(\d DB)')
data['db_o'] = data['db_o'].str.extract(pat='(\d)')

# replace nas with 0
data['ol'].fillna(0, inplace = True)
data['dl'].fillna(0, inplace = True)
data['db'].fillna(0, inplace = True)
data['lb'].fillna(0, inplace = True)
data['rb'].fillna(0, inplace = True)
data['ol_d'].fillna(0, inplace = True)
data['rb_d'].fillna(0, inplace = True)
data['te'].fillna(0, inplace = True)
data['wr'].fillna(0, inplace = True)
data['dl_o'].fillna(0, inplace = True)
data['lb_o'].fillna(0, inplace = True)
data['db_o'].fillna(0, inplace = True)

# replace qb nas with 1 - WILDCAT has a qb, but the ball is snapped to someone else - JM
data['qb'].fillna(1, inplace = True)

# set ol == 0 to be 11 - sum(offensive personnel)
off = ['rb', 'qb', 'te', 'wr', 'dl_o', 'lb_o', 'db_o'] 
data[off] = data[off].astype(str).astype(int)
data['ol'][data['ol'] == 0] = 11 - data[off].sum(axis = 1)

# convert rest of personnel to int
defp = ['dl', 'lb', 'rb_d', 'ol_d', 'db']
data[defp] = data[defp].astype(str).astype(int)

# fill lb na with 11 - sum(everyone else on defense)
data['lb'][data['lb'] == 0] = 11 - data[defp].sum(axis = 1)

#### yards from own goal

In [23]:
def createYardsFromOwnGoal(x):
    # if the ball is on the possession teams side, just return yardline
    if x["fieldposition"] == x["possessionteam"]:
        return x["yardline"]
    else:
    # if the ball is not on the possesion team side, convert
        return (50 + (50-x["yardline"]))
data["yardfromowngoal"] = data.apply(createYardsFromOwnGoal,axis = 1)

#### standardize cooridnates - all plays will have a direction from left to right

In [24]:
# standardize coordinates so all plays are left to right (playdirection = right)
def standardizeXCoordinates(x):
    if x["playdirection"] == "left":
        return 120 - x["x"] 
    else:
        return x["x"] 
def standardizeYCoordinates(x):
    if x["playdirection"] == "left":
        return 160/3 - x["y"]
    else:
        return x["y"]
data["x_standard"] = data.apply(standardizeXCoordinates,axis=1)
data["y_standard"] = data.apply(standardizeYCoordinates,axis=1)

#### Run success , sucess being gain of half yards to first down

In [25]:
def runSuccess(x):
    x = x["playid"]
    x = data[data["playid"] == x]
    if sum(x["yards"])/22 >= sum(x["distance"])/22/2:
        return 1
    else:
        return 0
runSuccessLookup = data.drop_duplicates(subset=["playid"])
runSuccessLookup["runsuccess"] = runSuccessLookup.apply(runSuccess,axis=1)
data = data.merge(runSuccessLookup[['playid','runsuccess']],on='playid', how='left')
del runSuccessLookup

#### average yards gained by team each game (prior name MeanPerGame)


In [26]:
def averageYardsPerGame(x):
    game = data[data["gameid"]== x["gameid"]]
    team = game[game["teamidentity"]==x["teamidentity"]]
    x_return = team["yards"].mean()
    return x_return

avgYardsLookup = data
avgYardsLookup["temp"] = avgYardsLookup["gameid"].astype(str)+ avgYardsLookup["teamidentity"]
avgYardsLookup = avgYardsLookup.drop_duplicates(subset=["temp"])

avgYardsLookup["avgyardspergame"] = avgYardsLookup[["gameid","teamidentity"]].apply(averageYardsPerGame,axis=1)
data = data.merge(avgYardsLookup[["gameid","teamidentity","avgyardspergame"]],how='left',left_on=["gameid","teamidentity"],right_on=["gameid","teamidentity"])
del avgYardsLookup

#### mean yards per attempt... by season

In [27]:
#### Mean Yards per Attempt (by Season)
# get each season
df17 = data.loc[data['season'] == 2017]
df18 = data.loc[data['season'] == 2018]
df19 = data.loc[data['season'] == 2019]

# get avg yards per attempt by team
avg_yards_17 = df17.groupby('possessionteam')['yards'].mean()
avg_yards_18 = df18.groupby('possessionteam')['yards'].mean()
avg_yards_19 = df19.groupby('possessionteam')['yards'].mean()

# get a list of teams
team_list = list(data['teamidentity'].unique())

del df17, df18, df19#### Mean Yards per Attempt (by Season)
# get each season
df17 = data.loc[data['season'] == 2017]
df18 = data.loc[data['season'] == 2018]
df19 = data.loc[data['season'] == 2019]

# get avg yards per attempt by team
avg_yards_17 = df17.groupby('possessionteam')['yards'].mean()
avg_yards_18 = df18.groupby('possessionteam')['yards'].mean()
avg_yards_19 = df19.groupby('possessionteam')['yards'].mean()

# get a list of teams
team_list = list(data['teamidentity'].unique())

del df17, df18, df19

#### average yards per rush attempt by season

In [28]:
# average yards per rush attempt for each team for each season
def mean_ypa(df):
    '''
    input: dataframe
    operation: map the mean yards per attempt of each team and season
    returns: mean yards per attempt for that team in that season
    '''
    for team in team_list:
        # 2017 season
        if (df['teamidentity'] == team) & (df['season'] == 2017):
            ypa = avg_yards_17[team]
        # 2018 season
        elif (df['teamidentity'] == team) & (df['season'] == 2018):
            ypa = avg_yards_18[team]
        # 2019 season
        elif (df['teamidentity'] == team) & (df['season'] == 2019):
            ypa = avg_yards_19[team]
        else:
            continue
        
        return ypa
data['mean_yards_per_rush'] = data.apply(mean_ypa, axis = 1)

#### future play position 

In [38]:
# get future x coordinate
def future_x(df,t):
    '''
    input: df - designed to be used with apply()
    calculates: new x coordinate with speed, acceleration, and direction
    Returns: new x-coordinate
    '''
    if t != "missing":
        # distance traveled = speed(time) + (acceleration/2)(time)^2
        distance = df['s']*t + df['a']/2*t**2 
        # delta x = cos(direction)  convert to radians
        dx = distance * math.cos(math.radians(df['dir'])) 
        # future = initial position + change
        new_x = df['x_standard'] + dx
        # can't have coordinates outside the boundaries
        if new_x < 0:
            new_x = 0
        elif new_x > 120:
            new_x = 120
        return new_x
    else:
        # distance traveled = speed(time) + (acceleration/2)(time)^2
        distance = df['s']*df['time_since_snap'] + df['a']/2*df['time_since_snap']**2 
        # delta x = cos(direction)  convert to radians
        dx = distance * math.cos(math.radians(df['dir'])) 
        # future = initial position + change
        new_x = df['x_standard'] + dx
        # can't have coordinates outside the boundaries
        if new_x < 0:
            new_x = 0
        elif new_x > 120:
            new_x = 120
        return new_x
# get future y coordinate
def future_y(df,t):
    '''
    input: df - designed to be used with apply()
    calculates: new y coordinate with speed, acceleration, and direction
    Returns: new y-coordinate
    '''
    if t != "missing":
        # distance traveled = speed(time) + (acceleration/2)(time)^2
        distance = df['s']*t + df['a']/2*t**2 
        # delta y = sin(direction)  convert to radians
        dy = distance * math.sin(math.radians(df['dir'])) 
        # future = initial position + change 
        new_y = df['y_standard'] + dy
        # can't have coordinates outside the boundaries
        if new_y < 0:
            new_y = 0
        elif new_y > 53.3:
            new_y = 53.3
        return new_y
    else:
        # distance traveled = speed(time) + (acceleration/2)(time)^2
        distance = df['s']*df['time_since_snap'] + df['a']/2*df['time_since_snap']**2 
        # delta y = sin(direction)  convert to radians
        dy = distance * math.sin(math.radians(df['dir'])) 
        # future = initial position + change 
        new_y = df['y_standard'] + dy
        # can't have coordinates outside the boundaries
        if new_y < 0:
            new_y = 0
        elif new_y > 53.3:
            new_y = 53.3
        return new_y

In [39]:
#apply functions for time since snap
p = Pool(os.cpu_count())
data['x_handoff'] = data.apply(future_x, axis = 1, t="missing") #50 represents time_since_snap
p.close()
p.join()
p = Pool(os.cpu_count())
data['y_handoff'] = data.apply(future_y, axis = 1, t="missing") #50 represents time_since_snap
p.close()
p.join()

In [31]:
# apply functions
p = Pool(os.cpu_count())
data["1_sec_x"] = data.apply(future_x,axis=1,t=1)
data["1_5_sec_x"] = data.apply(future_x,axis=1,t=1.5)
data["2_sec_x"] = data.apply(future_x,axis=1,t=2)
data["2_5_sec_x"] = data.apply(future_x,axis=1,t=2.5)
data["3_sec_x"] = data.apply(future_x,axis=1,t=3.0)
data["3_5_sec_x"] = data.apply(future_x,axis=1,t=3.5)
data["4_sec_x"] = data.apply(future_x,axis=1,t=4)
data["4_5_sec_x"] = data.apply(future_x,axis=1,t=4.5)
data["5_sec_x"] = data.apply(future_x,axis=1,t=5)
data["5_5_sec_x"] = data.apply(future_x,axis=1,t=5.5)
data["6_sec_x"] = data.apply(future_x,axis=1,t=6)
p.close()
p.join()


In [32]:
# apply functions
p = Pool(os.cpu_count())
data["1_sec_y"] = data.apply(future_y,axis=1,t=1)
data["1_5_sec_y"] = data.apply(future_y,axis=1,t=1.5)
data["2_sec_y"] = data.apply(future_y,axis=1,t=2)
data["2_5_sec_y"] = data.apply(future_y,axis=1,t=2.5)
data["3_sec_y"] = data.apply(future_y,axis=1,t=3.0)
data["3_5_sec_y"] = data.apply(future_y,axis=1,t=3.5)
data["4_sec_y"] = data.apply(future_y,axis=1,t=4)
data["4_5_sec_y"] = data.apply(future_y,axis=1,t=4.5)
data["5_sec_y"] = data.apply(future_y,axis=1,t=5)
data["5_5_sec_y"] = data.apply(future_y,axis=1,t=5.5)
data["6_sec_y"] = data.apply(future_y,axis=1,t=6)
p.close()
p.join()


Unnamed: 0,gameid,playid,team,x,y,s,a,dis,orientation,dir,...,1_5_sec_y,2_sec_y,2_5_sec_y,3_sec_y,3_5_sec_y,4_sec_y,4_5_sec_y,5_sec_y,5_5_sec_y,6_sec_y
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,278.01,272.82,...,14.691693,12.860163,10.746476,8.350631,5.672628,2.712467,0.0,0.0,0.0,0.0
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,332.39,251.3,...,18.658015,17.340209,15.702719,13.745546,11.468689,8.872149,5.955925,2.720018,0.0,0.0
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,356.99,247.27,...,17.833258,16.794477,15.619652,14.308782,12.861868,11.278909,9.559905,7.704857,5.713764,3.586627
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,0.23,344.36,...,25.299713,25.115716,24.895325,24.638538,24.345357,24.01578,23.649809,23.247443,22.808681,22.333525
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,347.37,285.69,...,12.653168,9.730052,6.222072,2.129228,0.0,0.0,0.0,0.0,0.0,0.0


#### indicate if defender is blocked - To be determined

#### offense dummy variable

In [40]:
#### # offense dummy variable
# 1 = offense, 0 = defense
data['offense'] = 0
data["offense"][data["possessionteam"] == data["teamidentity"]]=1 

#### effective directional acceleration

In [41]:
def eff_df_acc(df):
    '''
    input: dataframe
    calculates: x component of acceleration
    returns: x component
    positive -> left to right
    negative -> right to left
    '''
    downfield_acc = df['a'] * math.cos(math.radians(df['dir']))
    return downfield_acc

# effective crossfield accleration
def eff_cf_acc(df):
    '''
    input: dataframe
    calculates: y component of acceleration
    returns: y component
    positive -> up toward "far sideline"
    negative -> down toward "near sideline"
    '''
    crossfield_acc = df['a'] * math.sin(math.radians(df['dir']))
    return crossfield_acc
p = Pool(os.cpu_count())
data['eff_downfield_acc'] = data.apply(eff_df_acc, axis = 1)
p.close()
p.join()
p = Pool(os.cpu_count())
data['eff_crossfield_acc'] = data.apply(eff_cf_acc, axis = 1)
p.close()
p.join()

#### is team winning

In [42]:
def rushteamwinning(df):
    if df['possessionteam'] == df['hometeamabbr']:
        if df['homescorebeforeplay'] > df['visitorscorebeforeplay']:
            return 1
        else:
            return 0
    if df['possessionteam'] == df['visitorteamabbr']:
        if df['visitorscorebeforeplay'] > df['homescorebeforeplay']:
            return 1
        else:
            return 0
data["rushteamwinning"] = data.apply(rushteamwinning,axis=1)

#### playersBlocked

In [None]:
def playerBlocked(x):
    # generate players distance from everyone
    # player's coordinates
    player_x = x.x_handoff
    player_y = x.y_handoff
    # get play id
    playid = x["playid"]
    # grab df from data table with matching play id
    distance_df = data[data["playid"] == playid]
    # remove offense players
    distance_df = distance_df[data["offense"] != 1]
    # add distance to dataframe column
    distance_df["euc_distance"] = np.sqrt((player_x - distance_df["x_handoff"])**2 + (player_y - distance_df["y_handoff"])**2)
    # if any of the distances have 1 or less, return 1
    distance_df = distance_df[distance_df["euc_distance"]<=1]
    if len(distance_df.index.values == 0):
        return 0
    else:
        return 1

In [None]:
data["playerblocked"] = data.apply(playerBlocked,axis=1)

In [None]:
def sumPlayersBlocked(x):
    df = data[data["playid"] == x["playid"]]
    return df["playerblocked"].sum()
playersBlockedLookup = data.drop_duplicates(subset=["playid"])
playersBlockedLookup["total_playersblocked"] = data.apply(sumPlayersBlocked,axis=1)
data = data.merge(playersBlockedLookup[['playid','total_playersblocked']],on='playid', how='left')

In [None]:
# dump file to pickle storage
data.to_pickle('data.pkl')