In [22]:
import numpy as np # linear algebra
from numpy.matlib import repmat
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from fuzzywuzzy import fuzz
import datetime
import sklearn

#for image generation
from scipy import stats
from scipy.special import expit
import matplotlib.image as mpimg

import time
from tqdm import tqdm_notebook

import pickle
from sklearn.model_selection import train_test_split

In [23]:
# Training data is in the competition dataset
train_df = pd.read_csv('data/train.csv', low_memory=False)

In [24]:
# who needs tidyverse? it's all just SQL in the end

## TODO: fix player direction mapping

# standardize co-ordinates, courtesy of Michael Lopez's R implementation
# https://www.kaggle.com/statsbymichaellopez/nfl-tracking-wrangling-voronoi-and-sonars

def clean_df(df):
    # first, re-map a few team names
    di = {"ARZ":"ARI", "BLT":"BAL", "CLV":"CLE", "HST":"HOU"}
    df = df.replace({'PossessionTeam':di, 'FieldPosition':di})
    di = {"ACE":"SINGLEBACK", np.nan:"NONE"}
    df = df.replace({'OffenseFormation':di})

    df = (df 
            .assign(ToLeft=df['PlayDirection']=='left')
            .assign(BallCarrier=df['NflId']==train_df['NflIdRusher'])
           )

    df = df.assign(TeamOnOffense=np.where(df['PossessionTeam']==df['HomeTeamAbbr'],'home','away'))

    df = (df
            .assign(IsOnOffense=df['Team']==df['TeamOnOffense'])
            .assign(YardsFromOwnGoal=np.where(df['FieldPosition']==df['PossessionTeam'], df['YardLine'], 50 + (50-df['YardLine'])))
           )

    # standardize field positions
    df = (df
            .assign(YardsFromOwnGoal=np.where(df['YardLine']==50, 50, df['YardsFromOwnGoal']))
            .assign(X=np.where(df['ToLeft'], 120-df['X'], df['X'])-10)
            .assign(Y=np.where(df['ToLeft'], 160/3-df['Y'], df['Y']))
           )

    # standardize player directions (- to swtich from cw to ccw, + 90 to rotate so 0 = x-axis, -180 if going left to flip field)
    df = (df
            .assign(Dir=np.radians(np.where(~df['ToLeft'], -df['Dir'], -df['Dir']-180)+90))
           )
    
    # play duration so far
    df = (df
             .assign(Duration=(pd.to_datetime(df['TimeHandoff']) - pd.to_datetime(df['TimeSnap']))/np.timedelta64(1,'s'))
         )
    
    # drop columns that we will not use
    df = (df
             .drop(columns=['Temperature', 'WindSpeed', 'WindDirection', 'Stadium', 'DisplayName', 'JerseyNumber',
                           'Season', 'Orientation', 'Humidity', 'Week', 'PlayerCollegeName', 'TimeSnap', 'TimeHandoff',
                           'Location', 'PlayerBirthDate', 'PlayerHeight', 'Position', 'GameWeather']))
    
    return df

In [25]:
cleandf = clean_df(train_df)

In [26]:
## helper codes to retrieve game state information

def split_personnel(s):
    splits = s.split(',')
    for i in range(len(splits)):
        splits[i] = splits[i].strip()

    return splits

def defense_formation(l):
    dl = 0
    lb = 0
    db = 0
    other = 0

    for position in l:
        sub_string = position.split(' ')
        if sub_string[1] == 'DL':
            dl += int(sub_string[0])
        elif sub_string[1] in ['LB','OL']:
            lb += int(sub_string[0])
        else:
            db += int(sub_string[0])

    counts = (dl,lb,db,other)

    return counts

def offense_formation(l):
    qb = 0
    rb = 0
    wr = 0
    te = 0
    ol = 0

    sub_total = 0
    qb_listed = False
    for position in l:
        sub_string = position.split(' ')
        pos = sub_string[1]
        cnt = int(sub_string[0])

        if pos == 'QB':
            qb += cnt
            sub_total += cnt
            qb_listed = True
        # Assuming LB is a line backer lined up as full back
        elif pos in ['RB','LB']:
            rb += cnt
            sub_total += cnt
        # Assuming DB is a defensive back and lined up as WR
        elif pos in ['WR','DB']:
            wr += cnt
            sub_total += cnt
        elif pos == 'TE':
            te += cnt
            sub_total += cnt
        # Assuming DL is a defensive lineman lined up as an additional line man
        else:
            ol += cnt
            sub_total += cnt

    # If not all 11 players were noted at given positions we need to make some assumptions
    # I will assume if a QB is not listed then there was 1 QB on the play
    # If a QB is listed then I'm going to assume the rest of the positions are at OL
    # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
    if sub_total < 11:
        diff = 11 - sub_total
        if not qb_listed:
            qb += 1
            diff -= 1
        ol += diff

    counts = (qb,rb,wr,te,ol)

    return counts

def personnel_features(df):
    personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: split_personnel(x))
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation(x))
    personnel['DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
    personnel['LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
    personnel['DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: split_personnel(x))
    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation(x))
    personnel['QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
    personnel['RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
    personnel['WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
    personnel['TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
    personnel['OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

    # Let's create some features to specify if the OL is covered
    personnel['OL_diff'] = personnel['OL'] - personnel['DL']
    personnel['OL_TE_diff'] = (personnel['OL'] + personnel['TE']) - personnel['DL']
    # Let's create a feature to specify if the defense is preventing the run
    # Let's just assume 7 or more DL and LB is run prevention
    personnel['run_def'] = (personnel['DL'] + personnel['LB'] > 6).astype(int)

    personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)

    return personnel

def clean_stadium_type(row):
    if not pd.isnull(row['StadiumType']):
        if fuzz.partial_ratio(row['StadiumType'],'outdoor') > 75:
            st = 'outdoor'
        else:
            st = 'indoor'
    else:
        st = 'indoor'
    return st

def clean_field_type(row):
    if not pd.isnull(row['Turf']):
        if fuzz.partial_ratio(row['Turf'],'natural grass') > 75:
            ft = 'natural'
        else:
            ft = 'artificial'
    else:
        ft = 'artificial'
    return ft

def time_remaining(row):
    gc = row['GameClock']
    tmp = gc.split(':')[:-1]
    tr = (int(tmp[0])*3600) + (int(tmp[1]))
    tr = tr/3600/15
    return tr

def get_score_diff(row):
    if row['TeamOnOffense'] == 'home':
        scoreDiff = row['HomeScoreBeforePlay'] - row['VisitorScoreBeforePlay']
    else: 
        scoreDiff = row['VisitorScoreBeforePlay'] - row['HomeScoreBeforePlay']
    return scoreDiff

def distance_remaining(row):
    dist_rem = (100 - row['YardsFromOwnGoal'])/100
    return dist_rem

def one_hot_enc(df, var):
    one_hot = pd.get_dummies(df[var])
    df = (df
              .drop(var, axis=1)
              .join(one_hot)
         )
    return df

In [27]:
# additional cleaning steps
cleandf['Turf'] = cleandf.apply(clean_field_type, axis=1)
cleandf['StadiumType'] = cleandf.apply(clean_stadium_type, axis=1)
cleandf['DistanceRemaining'] = cleandf.apply(distance_remaining, axis=1)

cleandf = pd.merge(cleandf,personnel_features(cleandf),on=['GameId','PlayId'],how='inner')

cleandf = cleandf.drop(columns=['OffensePersonnel','DefensePersonnel'])

cleandf['ScoreDiff'] = cleandf.apply(get_score_diff, axis=1)

cleandf['GameClock'] = cleandf.apply(time_remaining, axis=1)

In [28]:
# game state information for each row

plays = cleandf.groupby('PlayId').first().drop(columns=['Team', 'X', 'Y', 'Dir', 'NflId', 'PossessionTeam',
                                                        'ToLeft', 'IsOnOffense', 'BallCarrier', 'HomeTeamAbbr',
                                                       'VisitorTeamAbbr', 'PlayDirection', 'YardLine', 
                                                       'A', 'S', 'NflIdRusher', 'PlayerWeight', 'FieldPosition',
                                                       'Dis', 'GameId', 'HomeScoreBeforePlay', 
                                                        'VisitorScoreBeforePlay', 'YardsFromOwnGoal'])
# one-hot categoricals
plays = one_hot_enc(plays, 'OffenseFormation')
di = {"outdoor":1, "indoor":0, "artificial":1, "natural":0, "home":1, "away":0}
plays = plays.replace({'StadiumType':di, 'Turf':di, 'TeamOnOffense':di})
di = {1:"D1", 2:"D2", 3:"D3", 4:"D4"}
plays = plays.replace({'Down':di})
plays = one_hot_enc(plays, 'Down')
di = {1:"Q1", 2:"Q2", 3:"Q3", 4:"Q4", 5:"OT"}
plays = plays.replace({'Quarter':di})
plays = one_hot_enc(plays, 'Quarter')

# add play id back to groupby dataframe
pids = plays.index.tolist()
plays['PlayId'] = pids

In [29]:
outcomes = pd.DataFrame.to_numpy(plays['Yards'])
plays = plays.drop(columns=['Yards', 'PlayId'])
plays = pd.DataFrame.to_numpy(plays)
# concat x,y-data values for all players
positions = train_df[['X','Y']]
positions = np.array(positions).reshape(-1, len(positions.columns)*22)
data = np.concatenate((plays, positions), axis=1)
data = data.astype('float')

# handle remaining nans
nanInd = np.argwhere(np.isnan(data))
data[nanInd[:,0],nanInd[:,1]] = np.zeros((1,3))

In [9]:
# map true outcomes to correct representation
n = len(outcomes)
ystar = np.zeros((n,199))
for ii in range(n):
    y = int(outcomes[ii]) 
    yvec = np.concatenate((np.zeros((1,y+99)),np.ones((1,100-y))), axis = 1)
    ystar[ii,:] = yvec

In [10]:
# required packages for MLP model
import tensorflow as tf
import keras
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

from keras.optimizers import TFOptimizer, Adam

import keras.backend as kb

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [11]:
def CRPS(yTrue, yPred):
    yPred = kb.cumsum(yPred, axis=1)
    return kb.mean(kb.sum(kb.square(yPred - yTrue), axis=1)) / 199

In [12]:
# get test/val/train splits
X_train, X_test, y_train, y_test = train_test_split(data, ystar, test_size=0.3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)

# model definition
model = Sequential()
model.add(Dense(128, input_dim=data.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(256, input_dim=data.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(199, input_dim=data.shape[1], activation='softmax'))

model.compile(loss=CRPS,
              metrics = [CRPS],
              optimizer=Adam())

mlp_json = model.to_json()
with open("./mlp/mlp.json", "w") as json_file:
    json_file.write(mlp_json)

In [16]:
# model training
es = EarlyStopping(monitor='val_loss', 
                   mode='min',
                   restore_best_weights=True, 
                   verbose=1, 
                   patience=21)
es.set_model(model)

lr = ReduceLROnPlateau(monitor='val_loss', 
                       factor=0.5,
                       patience=10,
                       verbose=1,
                       mode='min',
                       min_delta=0.00001)

every5 = keras.callbacks.ModelCheckpoint(filepath='./mlp/weights/mlpWeights{epoch:02d}.h5', 
                                         verbose=1, save_best_only=False, period = 5)
saveBest = keras.callbacks.ModelCheckpoint(filepath='./mlp/weights/mlpBestWeights.h5', 
                                           verbose=1, save_best_only=True)

batch_size = 50
epochs = 100

history = model.fit(x=X_train, y=y_train,
            validation_data = (X_val, y_val),
            epochs = epochs,
            batch_size = batch_size,
            callbacks = [lr, every5, saveBest],
            verbose = 1)

Train on 11353 samples, validate on 4866 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.01391, saving model to ./mlp/weights/mlpBestWeights.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.01391 to 0.01390, saving model to ./mlp/weights/mlpBestWeights.h5
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.01390
Epoch 4/100

Epoch 00004: val_loss improved from 0.01390 to 0.01390, saving model to ./mlp/weights/mlpBestWeights.h5
Epoch 5/100

Epoch 00005: saving model to ./mlp/weights/mlpWeights05.h5

Epoch 00005: val_loss did not improve from 0.01390
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.01390
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.01390
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.01390
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.01390
Epoch 10/100

Epoch 00010: saving model to ./mlp/weights/mlpWeights10.h5

Epoch 00010: val_loss improved from 0.01390 to 0.01390, saving model to ./mlp/weights/


Epoch 00038: val_loss did not improve from 0.01390
Epoch 39/100

Epoch 00039: val_loss did not improve from 0.01390
Epoch 40/100

Epoch 00040: saving model to ./mlp/weights/mlpWeights40.h5

Epoch 00040: val_loss did not improve from 0.01390
Epoch 41/100

Epoch 00041: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.

Epoch 00041: val_loss did not improve from 0.01390
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.01390
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.01390
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.01390
Epoch 45/100

Epoch 00045: saving model to ./mlp/weights/mlpWeights45.h5

Epoch 00045: val_loss did not improve from 0.01390
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.01390
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.01390
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.01390
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.01390
Epoch 50/100

Epoch 00050: savin


Epoch 00077: val_loss did not improve from 0.01390
Epoch 78/100

Epoch 00078: val_loss did not improve from 0.01390
Epoch 79/100

Epoch 00079: val_loss did not improve from 0.01390
Epoch 80/100

Epoch 00080: saving model to ./mlp/weights/mlpWeights80.h5

Epoch 00080: val_loss did not improve from 0.01390
Epoch 81/100

Epoch 00081: ReduceLROnPlateau reducing learning rate to 2.4414063659605745e-07.

Epoch 00081: val_loss did not improve from 0.01390
Epoch 82/100

Epoch 00082: val_loss did not improve from 0.01390
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.01390
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.01390
Epoch 85/100

Epoch 00085: saving model to ./mlp/weights/mlpWeights85.h5

Epoch 00085: val_loss did not improve from 0.01390
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.01390
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.01390
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.01390
Epoch 89/100

Epoch 00089: val_

In [14]:
score = model.evaluate(X_test, y_test)
np.save('./mlp/mlp_test_score.npy', score)
np.save('./mlp/mlp_hist.npy', history)



In [19]:
np.save('data/mlp_data.npy',data)

In [37]:
pids.index(20170910000081)

52

In [38]:
pids.index(20181206001238)

20328