In [12]:
import numpy as np # linear algebra
from numpy.matlib import repmat
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from fuzzywuzzy import fuzz
import datetime
import sklearn

#for image generation
from scipy import stats
from scipy.special import expit
import matplotlib.image as mpimg

import time
from tqdm import tqdm_notebook

import pickle
from sklearn.model_selection import train_test_split

In [13]:
# Training data is in the competition dataset
train_df = pd.read_csv('data/train.csv', low_memory=False)

In [14]:
# who needs tidyverse? it's all just SQL in the end

## TODO: fix player direction mapping

# standardize co-ordinates, courtesy of Michael Lopez's R implementation
# https://www.kaggle.com/statsbymichaellopez/nfl-tracking-wrangling-voronoi-and-sonars

def clean_df(df):
    # first, re-map a few team names
    di = {"ARZ":"ARI", "BLT":"BAL", "CLV":"CLE", "HST":"HOU"}
    df = df.replace({'PossessionTeam':di, 'FieldPosition':di})
    di = {"ACE":"SINGLEBACK", np.nan:"NONE"}
    df = df.replace({'OffenseFormation':di})

    df = (df 
            .assign(ToLeft=df['PlayDirection']=='left')
            .assign(BallCarrier=df['NflId']==train_df['NflIdRusher'])
           )

    df = df.assign(TeamOnOffense=np.where(df['PossessionTeam']==df['HomeTeamAbbr'],'home','away'))

    df = (df
            .assign(IsOnOffense=df['Team']==df['TeamOnOffense'])
            .assign(YardsFromOwnGoal=np.where(df['FieldPosition']==df['PossessionTeam'], df['YardLine'], 50 + (50-df['YardLine'])))
           )

    # standardize field positions
    df = (df
            .assign(YardsFromOwnGoal=np.where(df['YardLine']==50, 50, df['YardsFromOwnGoal']))
            .assign(X=np.where(df['ToLeft'], 120-df['X'], df['X'])-10)
            .assign(Y=np.where(df['ToLeft'], 160/3-df['Y'], df['Y']))
           )

    # standardize player directions (- to swtich from cw to ccw, + 90 to rotate so 0 = x-axis, -180 if going left to flip field)
    df = (df
            .assign(Dir=np.radians(np.where(~df['ToLeft'], -df['Dir'], -df['Dir']-180)+90))
           )
    
    # play duration so far
    df = (df
             .assign(Duration=(pd.to_datetime(df['TimeHandoff']) - pd.to_datetime(df['TimeSnap']))/np.timedelta64(1,'s'))
         )
    
    # drop columns that we will not use
    df = (df
             .drop(columns=['Temperature', 'WindSpeed', 'WindDirection', 'Stadium', 'DisplayName', 'JerseyNumber',
                           'Season', 'Orientation', 'Humidity', 'Week', 'PlayerCollegeName', 'TimeSnap', 'TimeHandoff',
                           'Location', 'PlayerBirthDate', 'PlayerHeight', 'Position', 'GameWeather']))
    
    return df

In [15]:
cleandf = clean_df(train_df)

In [16]:
## helper codes to retrieve game state information

def split_personnel(s):
    splits = s.split(',')
    for i in range(len(splits)):
        splits[i] = splits[i].strip()

    return splits

def defense_formation(l):
    dl = 0
    lb = 0
    db = 0
    other = 0

    for position in l:
        sub_string = position.split(' ')
        if sub_string[1] == 'DL':
            dl += int(sub_string[0])
        elif sub_string[1] in ['LB','OL']:
            lb += int(sub_string[0])
        else:
            db += int(sub_string[0])

    counts = (dl,lb,db,other)

    return counts

def offense_formation(l):
    qb = 0
    rb = 0
    wr = 0
    te = 0
    ol = 0

    sub_total = 0
    qb_listed = False
    for position in l:
        sub_string = position.split(' ')
        pos = sub_string[1]
        cnt = int(sub_string[0])

        if pos == 'QB':
            qb += cnt
            sub_total += cnt
            qb_listed = True
        # Assuming LB is a line backer lined up as full back
        elif pos in ['RB','LB']:
            rb += cnt
            sub_total += cnt
        # Assuming DB is a defensive back and lined up as WR
        elif pos in ['WR','DB']:
            wr += cnt
            sub_total += cnt
        elif pos == 'TE':
            te += cnt
            sub_total += cnt
        # Assuming DL is a defensive lineman lined up as an additional line man
        else:
            ol += cnt
            sub_total += cnt

    # If not all 11 players were noted at given positions we need to make some assumptions
    # I will assume if a QB is not listed then there was 1 QB on the play
    # If a QB is listed then I'm going to assume the rest of the positions are at OL
    # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
    if sub_total < 11:
        diff = 11 - sub_total
        if not qb_listed:
            qb += 1
            diff -= 1
        ol += diff

    counts = (qb,rb,wr,te,ol)

    return counts

def personnel_features(df):
    personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: split_personnel(x))
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation(x))
    personnel['DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
    personnel['LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
    personnel['DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: split_personnel(x))
    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation(x))
    personnel['QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
    personnel['RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
    personnel['WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
    personnel['TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
    personnel['OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

    # Let's create some features to specify if the OL is covered
    personnel['OL_diff'] = personnel['OL'] - personnel['DL']
    personnel['OL_TE_diff'] = (personnel['OL'] + personnel['TE']) - personnel['DL']
    # Let's create a feature to specify if the defense is preventing the run
    # Let's just assume 7 or more DL and LB is run prevention
    personnel['run_def'] = (personnel['DL'] + personnel['LB'] > 6).astype(int)

    personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)

    return personnel

def clean_stadium_type(row):
    if not pd.isnull(row['StadiumType']):
        if fuzz.partial_ratio(row['StadiumType'],'outdoor') > 75:
            st = 'outdoor'
        else:
            st = 'indoor'
    else:
        st = 'indoor'
    return st

def clean_field_type(row):
    if not pd.isnull(row['Turf']):
        if fuzz.partial_ratio(row['Turf'],'natural grass') > 75:
            ft = 'natural'
        else:
            ft = 'artificial'
    else:
        ft = 'artificial'
    return ft

def time_remaining(row):
    gc = row['GameClock']
    tmp = gc.split(':')[:-1]
    tr = (int(tmp[0])*3600) + (int(tmp[1]))
    tr = tr/3600/15
    return tr

def get_score_diff(row):
    if row['TeamOnOffense'] == 'home':
        scoreDiff = row['HomeScoreBeforePlay'] - row['VisitorScoreBeforePlay']
    else: 
        scoreDiff = row['VisitorScoreBeforePlay'] - row['HomeScoreBeforePlay']
    return scoreDiff

def one_hot_enc(df, var):
    one_hot = pd.get_dummies(df[var])
    df = (df
              .drop(var, axis=1)
              .join(one_hot)
         )
    return df

In [17]:
# additional cleaning steps
cleandf['Turf'] = cleandf.apply(clean_field_type, axis=1)
cleandf['StadiumType'] = cleandf.apply(clean_stadium_type, axis=1)

cleandf = pd.merge(cleandf,personnel_features(cleandf),on=['GameId','PlayId'],how='inner')

cleandf = cleandf.drop(columns=['OffensePersonnel','DefensePersonnel'])

cleandf['ScoreDiff'] = cleandf.apply(get_score_diff, axis=1)

cleandf['GameClock'] = cleandf.apply(time_remaining, axis=1)

In [20]:
## game state information for each row

plays = cleandf.groupby('PlayId').first().drop(columns=['Team', 'X', 'Y', 'Dir', 'NflId', 'PossessionTeam',
                                                        'ToLeft', 'IsOnOffense', 'BallCarrier', 'HomeTeamAbbr',
                                                       'VisitorTeamAbbr', 'PlayDirection', 'YardLine', 
                                                       'A', 'S', 'PlayerWeight', 'FieldPosition',
                                                       'Dis', 'GameId', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay'])
# one-hot categoricals
plays = one_hot_enc(plays, 'OffenseFormation')
di = {"outdoor":1, "indoor":0, "artificial":1, "natural":0, "home":1, "away":0}
plays = plays.replace({'StadiumType':di, 'Turf':di, 'TeamOnOffense':di})
di = {1:"D1", 2:"D2", 3:"D3", 4:"D4"}
plays = plays.replace({'Down':di})
plays = one_hot_enc(plays, 'Down')
di = {1:"Q1", 2:"Q2", 3:"Q3", 4:"Q4", 5:"OT"}
plays = plays.replace({'Quarter':di})
plays = one_hot_enc(plays, 'Quarter')

# add play id back to groupby dataframe
pids = plays.index.tolist()
plays['PlayId'] = pids

In [21]:
plays

Unnamed: 0_level_0,GameClock,Distance,NflIdRusher,DefendersInTheBox,Yards,StadiumType,Turf,TeamOnOffense,YardsFromOwnGoal,Duration,DL,LB,DB,QB,RB,WR,TE,OL,OL_diff,OL_TE_diff,run_def,ScoreDiff,EMPTY,I_FORM,JUMBO,NONE,PISTOL,SHOTGUN,SINGLEBACK,WILDCAT,D1,D2,D3,D4,OT,Q1,Q2,Q3,Q4,PlayId
PlayId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
20170907000118,0.933593,2,2543773,6.0,8,1,1,1,35,1.0,2,3,6,1,1,3,1,5,3,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,20170907000118
20170907000139,0.867630,10,2543773,6.0,3,1,1,1,43,1.0,2,3,6,1,1,3,1,5,3,4,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,20170907000139
20170907000189,0.866704,10,2543773,7.0,5,1,1,1,65,2.0,2,3,6,1,1,3,1,5,3,4,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,20170907000189
20170907000345,0.800222,2,2539663,9.0,2,1,1,1,98,2.0,4,4,3,1,2,0,2,6,2,4,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,20170907000345
20170907000395,0.800148,10,2557917,7.0,7,1,1,0,25,1.0,3,2,6,1,1,1,3,5,2,5,0,-7,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,20170907000395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20181230153910,0.200056,10,2553435,8.0,1,1,1,0,65,1.0,4,3,4,1,2,2,1,5,1,2,1,-3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,20181230153910
20181230154035,0.067574,10,2553439,6.0,4,1,1,1,25,1.0,4,2,5,1,1,3,1,5,1,2,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,20181230154035
20181230154082,0.067111,1,2553439,7.0,4,1,1,1,34,2.0,4,2,5,1,1,3,1,5,1,2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,20181230154082
20181230154135,0.001037,10,2558865,7.0,2,1,1,1,75,1.0,4,2,5,1,1,3,1,5,1,2,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,20181230154135


In [25]:
rusherDict = {}
pids = plays.index.tolist()
plays['PlayId'] = pids
rushers = plays.groupby('NflIdRusher')

In [35]:
temp = rushers.get_group(2543773)
ls = temp["PlayId"].tolist()
ls

[20170907000118,
 20170907000139,
 20170907000189,
 20170907000473,
 20170907001156,
 20170907001177,
 20170907003444,
 20170907003465,
 20170907003507,
 20170917051736,
 20170917053087,
 20170924070169,
 20170924070604,
 20170924070801,
 20170924072204,
 20170924072923,
 20171001071294,
 20171005000839,
 20171005002056,
 20171015052139,
 20171015053399,
 20171015053853,
 20171022120562,
 20171022121806,
 20171022121851,
 20171022121969,
 20171029031889,
 20171112111371,
 20171112113605,
 20171119090322,
 20171119090367,
 20171119091356,
 20171119093359,
 20171119093380,
 20171126042950,
 20171126042993,
 20171126043017,
 20171203020390,
 20171203020439,
 20171203021873,
 20171203021953,
 20171203024273,
 20180909050072,
 20180909050093,
 20180909052734,
 20180909053424,
 20180909053728,
 20180916120925,
 20180916121364,
 20180916121414,
 20180916123186,
 20180923131533,
 20180923133680,
 20180923133701,
 20180923133722,
 20180930060388,
 20180930060409,
 20180930060816,
 2018093006107

In [36]:
for rusherId, rusherData in tqdm_notebook(rushers):
    rusherDict[rusherId] = rusherData["PlayId"].tolist()

rusherDict

HBox(children=(IntProgress(value=0, max=371), HTML(value='')))




{234: [20170910002293,
  20170910002477,
  20170910002754,
  20170910003531,
  20170917090204,
  20170917090639,
  20170917091352,
  20170917091586,
  20170917091607,
  20170917091628,
  20170917093010,
  20170917093079,
  20170917093100,
  20170924080051,
  20170924080298,
  20170924080832,
  20170924081212,
  20170924081550,
  20170924082195,
  20170924082216,
  20170924082310,
  20171015050126,
  20171015050325,
  20171015050599,
  20171015051170,
  20171015051564,
  20171015051585,
  20171015052798,
  20171015052918,
  20171015053044,
  20171022060389,
  20171022061124,
  20171022062458,
  20171022062676,
  20171022063144,
  20171022063699,
  20171022063741,
  20171029050078,
  20171029050788,
  20171029052260,
  20171029052410,
  20171102001598,
  20171102003529,
  20171126050514,
  20171126050885,
  20171126051212,
  20171126051702,
  20171126051928,
  20171126052585,
  20171126052645,
  20171126052846,
  20171126053417,
  20171126054002,
  20171203080244,
  20171203080771,
  201

In [None]:
def saveToFile(gameStates,yards,playIds,fileIds,imageDict,partition,split):
    partition[split] = []
    n = gameStates.shape[0]
    print('Saving files for '+split+' split...')
    for ii in tqdm_notebook(range(n)):
        gameState = gameStates[ii,:]
        y = int(yards[ii]) 
        yvec = np.concatenate((np.zeros((1,y+99)),np.ones((1,100-y))), axis = 1)
        
        playId = playIds[ii]
        fileId = fileIds[ii]
        image = imageDict[playId]
        
        np.save('data_fixed/files/gameState'+str(fileId)+'.npy',gameState)
        np.save('data_fixed/files/image'+str(fileId)+'.npy',image)
        np.save('data_fixed/files/yardage'+str(fileId)+'.npy',yvec)
        
        partition[split].append(str(fileId))
    return partition

def augment(gameStates):
    n = gameStates.shape[0]
    nAug = 3*n
    gsAug = repmat(gameStates,3,1)
    yards = gsAug[:,3]
    playIds = gsAug[:,-1].astype('int')
    
    gsAug = np.delete(gsAug, -1, axis=1)
    gsAug = np.delete(gsAug, 3, axis=1)
    
    aug = np.concatenate((np.zeros(n),np.ones(n),-np.ones(n)))
    yards = np.clip(yards+aug,-99,99)
    
    gsAug = gsAug.astype('float')
    return gsAug, yards, playIds

In [None]:
playMat = plays.to_numpy()
playMat = playMat.astype('float')
nanInd = np.argwhere(np.isnan(playMat))
playMat[nanInd[:,0],nanInd[:,1]] = np.zeros((1,3))

In [None]:
gs = playMat

yards = gs[:,3]
pids = gs[:,-1].astype('int')
    
gs[:,7] = (100-gs[:,7])/100.
        
gs = gs.astype('float')

In [None]:

gsTrain, gsTV = train_test_split(gs,train_size = 0.7)

gsVal, gsTest = train_test_split(gsTV, test_size = 0.5)

In [None]:
gsTrainAug, yardsTrainAug, pidTrainAug = augment(gsTrain)

yardsTest = gsTest[:,3]
pidTest = gsTest[:,-1]
gsTest = np.delete(gsTest, -1, axis=1)
gsTest = np.delete(gsTest, 3, axis=1)

yardsVal = gsVal[:,3]
pidVal = gsVal[:,-1]
gsVal = np.delete(gsVal, -1, axis=1)
gsVal = np.delete(gsVal, 3, axis=1)

In [None]:


n = gsTrainAug.shape[0]+gsTest.shape[0]+gsVal.shape[0]
fileIds = np.arange(n)+1
fileIdTrain = fileIds[0:gsTrainAug.shape[0]]
fileIdTest = fileIds[gsTrainAug.shape[0]:gsTrainAug.shape[0]+gsTest.shape[0]]
fileIdVal = fileIds[gsTrainAug.shape[0]+gsTest.shape[0]:gsTrainAug.shape[0]+gsTest.shape[0]+gsVal.shape[0]]


image_dict = np.load('data/image_dict.npy',allow_pickle='TRUE').item()
partition = {}
partition = saveToFile(gsTrainAug, yardsTrainAug, pidTrainAug, fileIdTrain, image_dict, partition,'train')
partition = saveToFile(gsTest, yardsTest, pidTest, fileIdTest, image_dict, partition,'test')
partition = saveToFile(gsVal, yardsVal, pidVal, fileIdVal, image_dict, partition,'validation')
np.save('data_fixed/partition_dict.npy',partition)

In [None]:
gs = np.load('data/files/gameState1.npy')
gs

In [None]:
n = gameStates.shape[0]
fileIds = np.arange(n)+1
gameStates, yards, playIds = augment(playMat)

In [None]:
np.isnan(playMat.astype('float')).any()

In [None]:
nanInd = np.argwhere(np.isnan(playMat.astype('float')))

In [None]:
playMat[nanInd[:,0],nanInd[:,1]]

In [None]:
np.argwhere(np.isnan(playMat))

In [None]:
np.isnan(playMat).any()

In [None]:
pid1 = 20170910000081
pid2 = 20170910001102

In [None]:
print(np.argwhere(playMat == pid2))

In [None]:
gs1 = playMat[63,:]
y = gs1[3].astype(int)
yards = np.concatenate((np.zeros((1,y+99)),np.ones((1,100-y))), axis = 1)
gsAug = np.delete(gs1, -1)
gsAug = np.delete(gsAug, 3)
gsAug[7] = gsAug[7]/100.

In [None]:
np.save('play2Yardage.npy',yards)
np.save('play2GameState.npy',gsAug)

In [None]:
gs1a.shape

In [None]:
asdf = np.load('data/files/gameState1.npy')

In [None]:
asdf.shape

In [None]:
gsAug.shape

In [None]:
playMat.shape

In [None]:
def saveOGToFile(gameStates,yards,playIds,imageDict):
    n = gameStates.shape[0]
    print('Saving files for each play...')
    for ii in tqdm_notebook(range(n)):
        gameState = gameStates[ii,:]
        y = int(yards[ii]) 
        yvec = np.concatenate((np.zeros((1,y+99)),np.ones((1,100-y))), axis = 1)
        
        playId = playIds[ii]
        image = imageDict[playId]
        
        np.save('data_fixed/og_files/gameState'+str(playId)+'.npy',gameState)
        np.save('data_fixed/og_files/image'+str(playId)+'.npy',image)
        np.save('data_fixed/og_files/yardage'+str(playId)+'.npy',yvec)
        
    return 

def splitGS(gs):
    n = gs.shape[0]
    yards = gs[:,3]
    pids = gs[:,-1].astype('int')
    
    gs = np.delete(gs, -1, axis=1)
    gs = np.delete(gs, 3, axis=1)
    gs[:,7] = (100-gs[:,7])/100.
        
    gs = gs.astype('float')
    return gs, yards, pids

In [None]:
imageDict = np.load('data/image_dict.npy',allow_pickle='TRUE').item()
gs, y, pids = splitGS(playMat)
saveOGToFile(gs,y,pids,imageDict)

In [None]:
temp = list(pids)
np.save('data/PlayIds.npy',temp)

In [None]:
np.load('data/playIDs.npy')

In [None]:
n = gsTrainAug.shape[0]+gsTest.shape[0]+gsVal.shape[0]

In [None]:
n

In [None]:
n = gsTrainAug.shape[0]+gsTest.shape[0]+gsVal.shape[0]
fileIds = np.arange(n)+1
fileIdTrain = fileIds[0:gsTrainAug.shape[0]]

In [None]:
gsTrainAug.shape

In [None]:
fileIdTrain.shape

In [None]:
gsVal.shape

In [None]:
fileIdVal.shape

In [None]:
fileIdTrain

In [None]:
fileIdTest

In [None]:
fileIdVal

In [None]:
n

In [None]:
asdf = np.load('data_fixed/partition_dict.npy',allow_pickle = True).item()

In [None]:
len(asdf['train'])

In [None]:
len(asdf['validation'])