In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from fuzzywuzzy import fuzz
import datetime

#for image generation
from scipy import stats
from scipy.special import expit
import matplotlib.image as mpimg

import time
from tqdm import tqdm_notebook

import pickle

In [12]:
# Training data is in the competition dataset
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids columns ending up with mixed inferred dtypes.
train_df = pd.read_csv('data/train.csv', low_memory=False)

In [13]:
# who needs tidyverse? it's all just SQL in the end

## TODO: fix player direction mapping

# standardize co-ordinates, courtesy of Michael Lopez's R implementation
# https://www.kaggle.com/statsbymichaellopez/nfl-tracking-wrangling-voronoi-and-sonars

def clean_df(df):
    """Standardise raw tracking rows and derive per-row helper columns.

    The input is not modified; every step returns a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tracking data. Must contain the columns read below
        (PossessionTeam, FieldPosition, PlayDirection, NflId, NflIdRusher,
        HomeTeamAbbr, Team, YardLine, X, Y, Dir, TimeHandoff, TimeSnap) as
        well as every column listed in the final ``drop`` call.

    Returns
    -------
    pandas.DataFrame
        Frame with the added columns ToLeft, BallCarrier, TeamOnOffense,
        IsOnOffense, YardsFromOwnGoal and Duration, with X, Y and Dir
        re-expressed in the standardised frame, and with the unused columns
        removed.

    Raises
    ------
    KeyError
        If a required column (including one of the dropped columns) is missing.
    """
    # first, re-map a few team names
    # (presumably so they match the abbreviations used in HomeTeamAbbr - confirm)
    di = {"ARZ":"ARI", "BLT":"BAL", "CLV":"CLE", "HST":"HOU"}
    df = df.replace({'PossessionTeam':di, 'FieldPosition':di})

    df = (df
            .assign(ToLeft=df['PlayDirection']=='left')
            # compare against this frame's own rusher column; the previous
            # version read the notebook-global train_df here, which breaks
            # (or silently misaligns) for any other input frame
            .assign(BallCarrier=df['NflId']==df['NflIdRusher'])
           )

    df = df.assign(TeamOnOffense=np.where(df['PossessionTeam']==df['HomeTeamAbbr'],'home','away'))

    df = (df
            .assign(IsOnOffense=df['Team']==df['TeamOnOffense'])
            # YardLine counts from the nearer goal line; mirror it when the
            # ball is on the other side of the field from the possession team
            .assign(YardsFromOwnGoal=np.where(df['FieldPosition']==df['PossessionTeam'], df['YardLine'], 50 + (50-df['YardLine'])))
           )

    # standardize field positions
    df = (df
            # rows with YardLine == 50 are pinned to 50 regardless of FieldPosition
            .assign(YardsFromOwnGoal=np.where(df['YardLine']==50, 50, df['YardsFromOwnGoal']))
            # mirror plays going left, then subtract 10 from X
            # (presumably the end-zone depth - confirm)
            .assign(X=np.where(df['ToLeft'], 120-df['X'], df['X'])-10)
            .assign(Y=np.where(df['ToLeft'], 160/3-df['Y'], df['Y']))
           )

    # standardize player directions (- to switch from cw to ccw, + 90 to rotate so 0 = x-axis, -180 if going left to flip field)
    # the result is stored in radians
    df = (df
            .assign(Dir=np.radians(np.where(~df['ToLeft'], -df['Dir'], -df['Dir']-180)+90))
           )

    # play duration so far: seconds between snap and handoff
    df = (df
             .assign(Duration=(pd.to_datetime(df['TimeHandoff']) - pd.to_datetime(df['TimeSnap']))/np.timedelta64(1,'s'))
         )

    # drop columns that we will not use
    df = (df
             .drop(columns=['Temperature', 'WindSpeed', 'WindDirection', 'Stadium', 'DisplayName', 'JerseyNumber',
                           'Season', 'Orientation', 'Humidity', 'Week', 'PlayerCollegeName', 'TimeSnap', 'TimeHandoff',
                           'Location', 'PlayerBirthDate', 'PlayerHeight', 'Position', 'GameWeather']))

    return df

In [14]:
# standardised copy of the training data; clean_df returns a new frame, so
# train_df itself is left as loaded
cleandf = clean_df(train_df)

In [15]:
## helper codes to retrieve game state information

def split_personnel(s):
    """Split a comma-separated personnel string into whitespace-trimmed parts.

    For example ``"1 RB, 1 TE, 3 WR"`` becomes ``['1 RB', '1 TE', '3 WR']``.
    """
    return [chunk.strip() for chunk in s.split(',')]

def defense_formation(l):
    """Count defenders per position group from entries such as ``'4 DL'``.

    Each entry is ``'<count> <position>'``. 'DL' is counted as defensive line,
    'LB' and 'OL' as linebackers, and every other label as defensive back.

    Returns the tuple ``(dl, lb, db, other)``; the ``other`` slot is never
    incremented and is therefore always 0.
    """
    tally = {'dl': 0, 'lb': 0, 'db': 0}

    for entry in l:
        tokens = entry.split(' ')
        label = tokens[1]
        if label == 'DL':
            group = 'dl'
        elif label in ('LB', 'OL'):
            group = 'lb'
        else:
            group = 'db'
        tally[group] += int(tokens[0])

    return (tally['dl'], tally['lb'], tally['db'], 0)

def offense_formation(l):
    """Count offensive players per role from entries such as ``'1 RB'``.

    Labels that are not offensive roles are folded into one:
    'LB' counts as RB (assumed to be a linebacker lined up as full back),
    'DB' counts as WR, and anything that is not QB/RB/LB/WR/DB/TE counts as
    OL (e.g. a defensive lineman used as an extra lineman).

    When fewer than 11 players are listed, one QB is assumed if none was
    listed and the remaining unlisted players are assumed to be OL.

    Returns the tuple ``(qb, rb, wr, te, ol)``.
    """
    role_of = {'QB': 'QB', 'RB': 'RB', 'LB': 'RB', 'WR': 'WR', 'DB': 'WR', 'TE': 'TE'}
    tally = dict.fromkeys(('QB', 'RB', 'WR', 'TE', 'OL'), 0)
    saw_qb = False

    for entry in l:
        tokens = entry.split(' ')
        label = tokens[1]
        headcount = int(tokens[0])

        tally[role_of.get(label, 'OL')] += headcount
        saw_qb = saw_qb or label == 'QB'

    # Fill the unlisted spots: first the implied QB, then the line
    unlisted = 11 - sum(tally.values())
    if unlisted > 0:
        if not saw_qb:
            tally['QB'] += 1
            unlisted -= 1
        tally['OL'] += unlisted

    return (tally['QB'], tally['RB'], tally['WR'], tally['TE'], tally['OL'])

def personnel_features(df):
    """Build one row of personnel-count features per (GameId, PlayId).

    Reads GameId, PlayId, OffensePersonnel and DefensePersonnel from ``df``,
    de-duplicates them, and parses the two personnel strings with
    split_personnel followed by defense_formation / offense_formation.

    Returns a frame with GameId, PlayId, the counts DL, LB, DB, QB, RB, WR,
    TE, OL and the derived OL_diff, OL_TE_diff and run_def columns. The raw
    personnel strings are not part of the result.
    """
    per_play = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()

    # Defensive side: string -> list of entries -> (dl, lb, db, other)
    defense_counts = per_play['DefensePersonnel'].apply(
        lambda text: defense_formation(split_personnel(text)))
    for slot, column in enumerate(('DL', 'LB', 'DB')):
        per_play[column] = defense_counts.apply(lambda counts, slot=slot: counts[slot])

    # Offensive side: string -> list of entries -> (qb, rb, wr, te, ol)
    offense_counts = per_play['OffensePersonnel'].apply(
        lambda text: offense_formation(split_personnel(text)))
    for slot, column in enumerate(('QB', 'RB', 'WR', 'TE', 'OL')):
        per_play[column] = offense_counts.apply(lambda counts, slot=slot: counts[slot])

    # How many more blockers than defensive linemen (without / with tight ends)
    per_play['OL_diff'] = per_play['OL'].sub(per_play['DL'])
    per_play['OL_TE_diff'] = per_play['OL'].add(per_play['TE']).sub(per_play['DL'])
    # Run-prevention flag: 7 or more DL and LB combined
    per_play['run_def'] = per_play['DL'].add(per_play['LB']).gt(6).astype(int)

    return per_play.drop(columns=['OffensePersonnel','DefensePersonnel'])

def clean_stadium_type(row):
    """Reduce a row's free-text StadiumType to 'outdoor' or 'indoor'.

    A value whose fuzzy partial match against 'outdoor' scores above 75 is
    'outdoor'; everything else, including a missing value, is 'indoor'.
    """
    label = row['StadiumType']
    if pd.isnull(label):
        return 'indoor'
    return 'outdoor' if fuzz.partial_ratio(label,'outdoor') > 75 else 'indoor'

def clean_field_type(row):
    """Reduce a row's free-text Turf to 'natural' or 'artificial'.

    A value whose fuzzy partial match against 'natural grass' scores above 75
    is 'natural'; everything else, including a missing value, is 'artificial'.
    """
    label = row['Turf']
    if pd.isnull(label):
        return 'artificial'
    return 'natural' if fuzz.partial_ratio(label,'natural grass') > 75 else 'artificial'

In [16]:
# additional cleaning steps
# collapse the free-text Turf / StadiumType labels into two categories each
cleandf['Turf'] = cleandf.apply(clean_field_type, axis=1)
cleandf['StadiumType'] = cleandf.apply(clean_stadium_type, axis=1)

# attach the per-play personnel counts to every player row of that play
cleandf = pd.merge(cleandf,personnel_features(cleandf),on=['GameId','PlayId'],how='inner')

# the raw personnel strings are not needed once the counts are merged in
# NOTE: this cell cannot be re-run on its own output, because
# personnel_features reads the two columns dropped here
cleandf = cleandf.drop(columns=['OffensePersonnel','DefensePersonnel'])

In [21]:
### Define functions that generate spatial control fields for each play in input dataframe
def generateImages(df,nx,ny,alpha):
    """Build one spatial-control image per play.

    Parameters
    ----------
    df : pandas.DataFrame
        Player rows; grouped by 'PlayId' and passed group by group to makeFields.
    nx, ny : int
        Number of grid points along x (0 to 100) and y (0 to 53.3).
    alpha : float
        Forwarded to makeTensor.

    Returns
    -------
    dict
        Maps each PlayId group key to the array returned by makeTensor.

    The id of any play whose density fields contain NaN, or contain a field
    that is zero everywhere, is printed.
    """
    plays = df.groupby('PlayId')
    playDict = {}

    xg = np.linspace(0,100,nx)
    yg = np.linspace(0,53.3,ny)
    x, y = np.meshgrid(xg,yg)
    # grid[j, i] = (x_i, y_j); shape (ny, nx, 2)
    grid = np.stack((x, y), axis=-1)

    for playId, playData in tqdm_notebook(plays):
        playDensities = makeFields(playData,grid)
        # Check the float fields, not the tensor: makeTensor returns int8,
        # which cannot represent NaN, so a check on the tensor never fires.
        # An all-zero field is flagged too because normalising it by its
        # maximum is a 0/0 division.
        if any(np.isnan(rho).any() or not np.any(rho) for rho in playDensities):
            print(playId)
        playDict[playId] = makeTensor(playDensities,alpha)

    return playDict

def makeTensor(rho,alpha):
    """Convert the density fields of one play into a 4-channel int8 image.

    Parameters
    ----------
    rho : sequence of three equally-shaped float arrays
        rho[0] = defense, rho[1] = offense, rho[2] = ball carrier.
    alpha : float
        Steepness of the logistic used for the composite channel;
        should be in the range [1e-3, 1e2].

    Returns
    -------
    numpy.ndarray
        Array of dtype int8 whose last axis holds the channels
        (defense, offense, ball carrier, composite). The first three are the
        inputs rescaled so their maximum is 127; the composite is
        expit(alpha * (offense - defense)) * 127 computed on the rescaled fields.
    """
    def _scale(field):
        # Rescale so the peak is 127. A field that is zero everywhere stays
        # zero instead of becoming 0/0 = NaN, which the int8 cast below would
        # turn into an arbitrary value.
        peak = np.max(field)
        if peak == 0:
            return np.zeros(np.shape(field))
        return field/peak*127

    rho_def = _scale(rho[0])
    rho_off = _scale(rho[1])
    rho_bc = _scale(rho[2])
    rho_comp = (expit(alpha*(rho_off-rho_def)))*127
    playTensor = np.stack([rho_def,rho_off,rho_bc,rho_comp], axis = -1)
    # converting to int8 to save memory (all values lie in [0, 127])
    playTensor = playTensor.astype('int8')

    return playTensor

def makeFields(df,grid):
    """Accumulate the density fields of one play on ``grid``.

    Every row of ``df`` contributes dens(...) evaluated at (X, 53.3 - Y) with
    speed S and direction Dir (a NaN direction is treated as 0). Rows with a
    truthy IsOnOffense add to the offense field, and also to the ball-carrier
    field when BallCarrier is truthy; all other rows add to the defense field.

    ``grid`` must be 3-dimensional; the fields take its first two dimensions.

    Returns the list ``[rho_def, rho_off, rho_bc]``.
    """
    rows, cols, _ = grid.shape
    rho_def, rho_off, rho_bc = (np.zeros((rows,cols)) for _ in range(3))

    for player in df.to_dict('records'):
        location = [player['X'],53.3 - player['Y']]
        speed = player['S']
        heading = player['Dir']

        # a missing direction falls back to 0
        heading = 0 if np.isnan(heading) else heading

        contribution = dens(location,speed,heading,grid)

        if not player['IsOnOffense']:
            rho_def += contribution
            continue

        rho_off += contribution
        if player['BallCarrier']:
            rho_bc += contribution

    return [rho_def,rho_off,rho_bc]

def dens(pos,spe,ori,grid):
    """Evaluate one player's 2-D Gaussian influence density on ``grid``.

    Parameters
    ----------
    pos : sequence of two floats
        Player position (x, y).
    spe : float
        Player speed.
    ori : float
        Direction of travel in radians (used with np.cos / np.sin).
    grid : numpy.ndarray
        Evaluation points; the last axis holds (x, y).

    Returns
    -------
    numpy.ndarray or float
        Density of N(mu, sigma) at every grid point, where mu is the position
        shifted by 0.5 * speed along the direction of travel and sigma is a
        diagonal covariance rotated by ``ori``.
    """
    #need to convert units on parameters and estimate proper values for football vs. soccer
    roc = 4
    srat = spe**2/13**2
    # keeps both variances strictly positive: the first term is exactly 0 at
    # spe == 13 (srat == 1), which made the covariance singular and caused
    # scipy to reject it
    eps = 1e-8

    R = np.array([[np.cos(ori),-np.sin(ori)], [np.sin(ori),np.cos(ori)]])
    # NOTE(review): the axis aligned with the heading (first entry) is the one
    # that narrows as speed rises - confirm this is the intended orientation
    S2 = np.array([[((roc-roc*srat)/2)**2+eps,0],[0,((roc+roc*srat)/2)**2+eps]])
    sigma = np.matmul(np.matmul(R,S2),np.transpose(R))
    mu = (pos[0]+spe*np.cos(ori)*0.5, pos[1]+spe*np.sin(ori)*0.5)

    return stats.multivariate_normal.pdf(grid, mean = mu, cov = sigma)

In [22]:
## game state information for each row

# one row per play: take the first non-null value of every column within each
# PlayId group, then drop the per-player tracking columns and the
# identifier/raw fields listed below
plays = cleandf.groupby('PlayId').first().drop(columns=['Team', 'X', 'Y', 'Dir', 'NflId', 'PossessionTeam',
                                                        'ToLeft', 'IsOnOffense', 'BallCarrier', 'HomeTeamAbbr',
                                                       'VisitorTeamAbbr', 'PlayDirection', 'YardLine', 
                                                       'A', 'S', 'NflIdRusher', 'PlayerWeight', 'FieldPosition',
                                                       'Dis', 'GameId'])




In [23]:
plays
# TODO: add a two-score differential feature
# (or a plain score differential feature?)

Unnamed: 0_level_0,Quarter,GameClock,Down,Distance,HomeScoreBeforePlay,VisitorScoreBeforePlay,OffenseFormation,DefendersInTheBox,Yards,StadiumType,Turf,TeamOnOffense,YardsFromOwnGoal,Duration,DL,LB,DB,QB,RB,WR,TE,OL,OL_diff,OL_TE_diff,run_def
PlayId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
20170907000118,1,14:14:00,3,2,0,0,SHOTGUN,6.0,8,outdoor,artificial,home,35,1.0,2,3,6,1,1,3,1,5,3,4,0
20170907000139,1,13:52:00,1,10,0,0,SHOTGUN,6.0,3,outdoor,artificial,home,43,1.0,2,3,6,1,1,3,1,5,3,4,0
20170907000189,1,13:02:00,1,10,0,0,SINGLEBACK,7.0,5,outdoor,artificial,home,65,2.0,2,3,6,1,1,3,1,5,3,4,0
20170907000345,1,12:12:00,2,2,0,0,JUMBO,9.0,2,outdoor,artificial,home,98,2.0,4,4,3,1,2,0,2,6,2,4,1
20170907000395,1,12:08:00,1,10,7,0,SHOTGUN,7.0,7,outdoor,artificial,away,25,1.0,3,2,6,1,1,1,3,5,2,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20181230153910,4,03:03:00,1,10,24,21,I_FORM,8.0,1,outdoor,artificial,away,65,1.0,4,3,4,1,2,2,1,5,1,2,1
20181230154035,4,01:49:00,1,10,24,24,SHOTGUN,6.0,4,outdoor,artificial,home,25,1.0,4,2,5,1,1,3,1,5,1,2,0
20181230154082,4,01:24:00,3,1,24,24,SHOTGUN,7.0,4,outdoor,artificial,home,34,2.0,4,2,5,1,1,3,1,5,1,2,0
20181230154135,4,00:56:00,1,10,24,24,SHOTGUN,7.0,2,outdoor,artificial,home,75,1.0,4,2,5,1,1,3,1,5,1,2,0


In [24]:
# render every play on a grid of nx=200 by ny=100 points with alpha=1
image_dict = generateImages(cleandf,200,100,1)
#np.save('data/image_dict.npy',image_dict)

HBox(children=(IntProgress(value=0, max=23171), HTML(value='')))




In [50]:
def saveToFile(gameStates,imageDict,IDs,partition,split):
    """Write one game-state / image / yardage .npy triple per sample.

    Parameters
    ----------
    gameStates : numpy.ndarray
        2-D array with one row per sample. Column 0 must be convertible to
        int; column 1 is what currently gets written out.
    imageDict : object
        Currently unused: the image lookup is still commented out and a
        constant 0 is saved instead.
    IDs : sequence
        IDs[ii] names the files written for row ii.
    partition : dict
        Updated in place: partition[split] becomes the list of IDs written.
    split : hashable
        Key under which the IDs are recorded in ``partition``.

    Returns
    -------
    None

    Files are written to 'data/files/', which is created if missing.
    """
    import os  # local import so the function does not depend on the import cell

    os.makedirs('data/files', exist_ok=True)

    partition[split] = []
    n = gameStates.shape[0]
    for ii in range(n):
        gameState = gameStates[ii,:]
        # NOTE(review): playID is a str, while generateImages keys its dict by
        # the PlayId group key - confirm the key type before enabling the lookup
        playID = str(int(gameState[0]))
        image = 0#imageDict[playID]
        # placeholder column indices: x and y both read column 1 for now
        y = gameState[1] ##indices here
        x = gameState[1]

        np.save('data/files/gameState'+str(IDs[ii])+'.npy',x)
        np.save('data/files/image'+str(IDs[ii])+'.npy',image)
        np.save('data/files/yardage'+str(IDs[ii])+'.npy',y)

        # record the sample; previously the list was created but never filled
        partition[split].append(IDs[ii])
    return

In [51]:
# smoke test for saveToFile with placeholder inputs
gameStates = np.zeros((10,5))
IDs = [1,2,3,4,5,6,7,8,9,10]
partition = {}
split = 'test'

# saveToFile does not read its imageDict argument yet, so pass None explicitly
# instead of IPython's `_` (the previous cell output), which depends on
# execution history
saveToFile(gameStates,None,IDs,partition,split)

In [27]:
# scratch cell: `a` is not used by any other cell shown in this notebook
a = [1,2,3]

In [28]:
# scratch cell: displays the second element of the list defined in the previous cell
a[1]

2