In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime
from kaggle.competitions import nflrush
import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder

from scipy.spatial import Voronoi, voronoi_plot_2d, Delaunay, ConvexHull, convex_hull_plot_2d

import keras

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]

Using TensorFlow backend.


In [2]:
pd.options.display.max_rows = 999

In [3]:
useless = ['JerseyNumber', 'PlayerCollegeName', 'Week', 'Turf', 'GameWeather', 'Humidity', 'WindSpeed', 'WindDirection']

In [4]:
train_init = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv').drop(useless, axis=1)
print(train_init.shape)
train_init.head()

(509762, 41)


Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,PlayerHeight,PlayerWeight,PlayerBirthDate,Position,HomeTeamAbbr,VisitorTeamAbbr,Stadium,Location,StadiumType,Temperature
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,6-0,212,12/29/1988,SS,NE,KC,Gillette Stadium,"Foxborough, MA",Outdoor,63.0
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,6-3,288,03/25/1989,DE,NE,KC,Gillette Stadium,"Foxborough, MA",Outdoor,63.0
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,6-3,270,01/21/1989,DE,NE,KC,Gillette Stadium,"Foxborough, MA",Outdoor,63.0
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,6-3,245,11/22/1982,ILB,NE,KC,Gillette Stadium,"Foxborough, MA",Outdoor,63.0
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,6-0,206,08/17/1987,FS,NE,KC,Gillette Stadium,"Foxborough, MA",Outdoor,63.0


In [5]:
cat_features_init = []
dense_features_init = []
for col in train_init.columns:
    if train_init[col].dtype =='object':
        cat_features_init.append(col)
#         print("*cat*", col, len(train_init[col].unique()))
    else:
        dense_features_init.append(col)
#         print("!dense!", col, len(train_init[col].unique()))
dense_features_init.remove("PlayId")
dense_features_init.remove("Yards")
dense_features_init.remove("GameId")
dense_features_init.remove("NflId")
dense_features_init.remove("NflIdRusher")
dense_features_init.remove("Season")

In [6]:
# dict(train_init[dense_features_init].max())
max_init={'X': 120.0,
 'Y': 60.0,
 'S': 12.0,
 'A': 17.0,
 'Dis': 2.0,
 'Orientation': 360.0,
 'Dir': 360.0,
 'YardLine': 50.0,
 'Quarter': 5.0,
 'Down': 4.0,
 'Distance': 40.0,
 'HomeScoreBeforePlay': 75.0,
 'VisitorScoreBeforePlay': 75.0,
 'DefendersInTheBox': 11.0,
 'PlayerWeight': 400.0,
 'Temperature': 100.0}

In [7]:
# dict(train_init[dense_features_init].min())
min_init={'X': 0.0,
 'Y': 0.0,
 'S': 0.01,
 'A': 0.01,
 'Dis': 0.0,
 'Orientation': 0.0,
 'Dir': 0.0,
 'YardLine': 0.0,
 'Quarter': 1.0,
 'Down': 1.0,
 'Distance': 0.0,
 'HomeScoreBeforePlay': 0.0,
 'VisitorScoreBeforePlay': 0.0,
 'DefendersInTheBox': 0.0,
 'PlayerWeight': 140.0,
 'Temperature': 0.0}

In [8]:
# dict(train_init[dense_features_init].median())
median_init={'X': 60.29,
 'Y': 26.79,
 'S': 2.47,
 'A': 1.54,
 'Dis': 0.27,
 'Orientation': 179.7,
 'Dir': 180.09,
 'YardLine': 29.0,
 'Quarter': 3.0,
 'Down': 1.0,
 'Distance': 10.0,
 'HomeScoreBeforePlay': 9.0,
 'VisitorScoreBeforePlay': 7.0,
 'DefendersInTheBox': 7.0,
 'PlayerWeight': 245.0,
 'Temperature': 62.0}

In [9]:
def clean_dense(df):
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(median_init[col])
        df[col] = np.clip(df[col].values, min_init[col], max_init[col])
    return df

In [10]:
train_init[dense_features_init] = clean_dense(train_init[dense_features_init])

## Feature engineering

In [11]:
def clean_location(df):
    import re
    df.Location = df.Location.str.lower() #to lower case
    df.Location = df.Location.map(lambda x: re.sub(r'\W+', '', x)) #remove special characters
    df.Location = df.Location.str[:5] #taking first 5 characters solves the problem
    return df


In [12]:
def split_personnel(s, defense = True):
        splits = s.split(',')   
        
        try:
            for i in range(len(splits)):
                splits[i] = splits[i].strip()
                if splits[i][0].isdigit() == False: # first number should be numeric
                    splits[i] = '3' + splits[i][1:] #hard coded
                    
                if defense:    
                    if splits[i][2] + splits[i][3] not in ['DB','OL','DL','LB','OB']: #should be one of these
                        splits[i] = splits[i][0:1] + ' DB'
                else:
                    if splits[i][2] + splits[i][3] not in ['RB','OL','QB','TE','WR']: #should be one of these
                        splits[i] = splits[i][0:1] + ' RB'
        
        except:
            if defense:
                splits = ['3 DB', '3 LB', '6 DB'] #hard coded random existing personnel
            else:
                splits = ['1 RB, 1 TE, 3 WR'] #hard coded random existing personnel
                
        return splits

def defense_formation(l):
    dl = 0
    lb = 0
    db = 0
    other = 0
    for position in l:
        sub_string = position.split(' ')
        if len(sub_string) > 1:
            if sub_string[1] == 'DL':
                dl += int(sub_string[0])
            elif sub_string[1] in ['LB','OL']:
                lb += int(sub_string[0])
            else:
                db += int(sub_string[0])

    counts = (dl,lb,db,other)

    return counts

def offense_formation(l):
    qb = 0
    rb = 0
    wr = 0
    te = 0
    ol = 0

    sub_total = 0
    qb_listed = False
    for position in l:
        sub_string = position.split(' ')
        if len(sub_string) > 1:
            pos = sub_string[1]
            cnt = int(sub_string[0])

            if pos == 'QB':
                qb += cnt
                sub_total += cnt
                qb_listed = True
            # Assuming LB is a line backer lined up as full back
            elif pos in ['RB','LB']:
                rb += cnt
                sub_total += cnt
            # Assuming DB is a defensive back and lined up as WR
            elif pos in ['WR','DB']:
                wr += cnt
                sub_total += cnt
            elif pos == 'TE':
                te += cnt
                sub_total += cnt
            # Assuming DL is a defensive lineman lined up as an additional line man
            else:
                ol += cnt
                sub_total += cnt

    # If not all 11 players were noted at given positions we need to make some assumptions
    # I will assume if a QB is not listed then there was 1 QB on the play
    # If a QB is listed then I'm going to assume the rest of the positions are at OL
    # This might be flawed but it looks like RB, TE and WR are always listed in the personnel
    if sub_total < 11:
        diff = 11 - sub_total
        if not qb_listed:
            qb += 1
            diff -= 1
        ol += diff

    counts = (qb,rb,wr,te,ol)

    return counts

def personnel_features(data):
    
    df = data.copy(deep=True)
    
    personnel = df[['GameId','PlayId','OffensePersonnel','DefensePersonnel']].drop_duplicates()
       
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: split_personnel(str(x)))
    personnel['DefensePersonnel'] = personnel['DefensePersonnel'].apply(lambda x: defense_formation((x)))
    personnel['num_DL'] = personnel['DefensePersonnel'].apply(lambda x: x[0])
    personnel['num_LB'] = personnel['DefensePersonnel'].apply(lambda x: x[1])
    personnel['num_DB'] = personnel['DefensePersonnel'].apply(lambda x: x[2])

    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: split_personnel(str(x), defense=False))
    personnel['OffensePersonnel'] = personnel['OffensePersonnel'].apply(lambda x: offense_formation((x)))
    personnel['num_QB'] = personnel['OffensePersonnel'].apply(lambda x: x[0])
    personnel['num_RB'] = personnel['OffensePersonnel'].apply(lambda x: x[1])
    personnel['num_WR'] = personnel['OffensePersonnel'].apply(lambda x: x[2])
    personnel['num_TE'] = personnel['OffensePersonnel'].apply(lambda x: x[3])
    personnel['num_OL'] = personnel['OffensePersonnel'].apply(lambda x: x[4])

    # Let's create some features to specify if the OL is covered
    personnel['OL_diff'] = personnel['num_OL'] - personnel['num_DL']
    personnel['OL_TE_diff'] = (personnel['num_OL'] + personnel['num_TE']) - personnel['num_DL']
    # Let's create a feature to specify if the defense is preventing the run
    # Let's just assume 7 or more DL and LB is run prevention
    personnel['run_def'] = (personnel['num_DL'] + personnel['num_LB'] > 6).astype(int)

    personnel.drop(['OffensePersonnel','DefensePersonnel'], axis=1, inplace=True)

    df_ = pd.merge(df,personnel,on=['GameId','PlayId'],how='inner')
    df_.index=df.index
    return df_

In [13]:
def fix_position(df):
    S = ['S', 'SAF', 'FS', 'SS']
    L = ['LB', 'ILB', 'MLB', 'OLB']
    RB = ['RB', 'HB', 'FB']
    G = ['G', 'OG']
    T = ['T', 'OT']

    df['Position'] = df['Position'].replace(S, 'S')
    df['Position'] = df['Position'].replace(L, 'L')
    df['Position'] = df['Position'].replace(RB, 'RB')
    df['Position'] = df['Position'].replace(G, 'G')
    df['Position'] = df['Position'].replace(T, 'T')
    
    return df

In [14]:
def fix_stadium_type(df):
    df_train = df.loc[df['BallCarrier'], ['PlayId', 'StadiumType']].copy(deep=True)
    
    
    outdoor = ['Outdoor', 'Outdoors', 'Cloudy', 'Heinz Field', 'Outdor', 'Ourdoor', 
           'Outside', 'Outddors','Outdoor Retr Roof-Open', 'Oudoor', 'Bowl']

    indoor_closed = ['Indoors', 'Indoor', 'Indoor, Roof Closed', 'Indoor, Roof Closed', 'Retractable Roof',
                     'Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed']

    indoor_open   = ['Indoor, Open Roof', 'Open', 'Retr. Roof-Open', 'Retr. Roof - Open']
    dome_closed   = ['Dome', 'Domed, closed', 'Closed Dome', 'Domed', 'Dome, closed']
    dome_open     = ['Domed, Open', 'Domed, open']
    
    df_train['StadiumType'] = df_train['StadiumType'].replace(outdoor,'outdoor')
    df_train['StadiumType'] = df_train['StadiumType'].replace(indoor_closed,'indoor_closed')
    df_train['StadiumType'] = df_train['StadiumType'].replace(indoor_open,'indoor_open')
    df_train['StadiumType'] = df_train['StadiumType'].replace(dome_closed,'dome_closed')
    df_train['StadiumType'] = df_train['StadiumType'].replace(dome_open,'dome_open')
    
    df = df.drop(['StadiumType'], axis=1).merge(df_train, on = "PlayId", copy=False)
    return df

In [15]:
def fix_stadium(df):
    train = df.loc[df['BallCarrier'], ['PlayId', 'Stadium']].copy(deep=True)
    ## Stadium clean
    map_stad = {'Broncos Stadium at Mile High': 'Broncos Stadium At Mile High', 'CenturyField': 'CenturyLink Field', 
                'CenturyLink': 'CenturyLink Field', 'Everbank Field': 'EverBank Field', 'FirstEnergy': 'First Energy Stadium',
                'FirstEnergy Stadium': 'First Energy Stadium', 'FirstEnergyStadium': 'First Energy Stadium',
                'Lambeau field': 'Lambeau Field', 'Los Angeles Memorial Coliesum': 'Los Angeles Memorial Coliseum',
                'M & T Bank Stadium': 'M&T Bank Stadium', 'M&T Stadium': 'M&T Bank Stadium', 'Mercedes-Benz Dome': 'Mercedes-Benz Superdome',
                'MetLife': 'MetLife Stadium', 'Metlife Stadium': 'MetLife Stadium', 'NRG': 'NRG Stadium',
                'Oakland Alameda-County Coliseum': 'Oakland-Alameda County Coliseum', 'Paul Brown Stdium': 'Paul Brown Stadium', 'Twickenham': 'Twickenham Stadium'}
    
    for stad in train['Stadium'].unique():
        if stad in map_stad.keys():
            pass
        else:
            map_stad[stad]=stad

    train['Stadium'] = train['Stadium'].map(map_stad)
    
    df = df.drop(['Stadium'], axis=1).merge(train, on = "PlayId", copy=False)
    return df

In [16]:
def fix_team(df):
    train = df.copy(deep=True)
    train.loc[train.VisitorTeamAbbr == "ARI", 'VisitorTeamAbbr'] = "ARZ"
    train.loc[train.HomeTeamAbbr == "ARI", 'HomeTeamAbbr'] = "ARZ"

    train.loc[train.VisitorTeamAbbr == "BAL", 'VisitorTeamAbbr'] = "BLT"
    train.loc[train.HomeTeamAbbr == "BAL", 'HomeTeamAbbr'] = "BLT"

    train.loc[train.VisitorTeamAbbr == "CLE", 'VisitorTeamAbbr'] = "CLV"
    train.loc[train.HomeTeamAbbr == "CLE", 'HomeTeamAbbr'] = "CLV"

    train.loc[train.VisitorTeamAbbr == "HOU", 'VisitorTeamAbbr'] = "HST"
    train.loc[train.HomeTeamAbbr == "HOU", 'HomeTeamAbbr'] = "HST"
    
    return train

In [17]:
def standardize_dataset(df):

    train = df.copy(deep=True)
    
    train.loc[train['S']== 0, 'S'] = 0.01
    train.loc[train['A']== 0, 'A'] = 0.01
    
    train['ToLeft'] = train.PlayDirection == "left"
    train['TeamOnOffense'] = "home"
    train.loc[train.PossessionTeam != train.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    train['IsOnOffense'] = (train.Team == train.TeamOnOffense).astype(int) # Is player on offense?
    
    train['DefenseTeam'] = train['HomeTeamAbbr']
    train.loc[train['PossessionTeam']==train['HomeTeamAbbr'], 'DefenseTeam'] = train[train['PossessionTeam']==train['HomeTeamAbbr']].VisitorTeamAbbr
    
    
    train['YardLine_std'] = 100 - train.YardLine
    train.loc[train.FieldPosition.fillna('') == train.PossessionTeam,  
            'YardLine_std'
             ] = train.loc[train.FieldPosition.fillna('') == train.PossessionTeam,  
              'YardLine']
    train['YardLine_std'] = np.where(train['YardLine'] == 50, 50,  train['YardLine_std'])     

    
    train['X_std'] = train.X - 10
    train.loc[train.ToLeft, 'X_std'] = 120 - train.loc[train.ToLeft, 'X']
    
    train['Y_std'] = train.Y
    train.loc[train.ToLeft, 'Y_std'] = 53.3 - train.loc[train.ToLeft, 'Y'] 
    
    train = standardize_angles(train, 'Orientation')
    train = standardize_angles(train, 'Dir')
    train.loc[train['Season'] == 2017, 'Orientation_std'] = np.mod(90 + train.loc[train['Season'] == 2017, 'Orientation_std'], 360)
    
    train.loc[train['Season'] == 2017, 'S'] = (train['S'][train['Season'] == 2017] - 2.4355) / 1.2930 * 1.4551 + 2.7570 #standardize for season
    
    train.drop(['X', 'Y', 'YardLine', 'Orientation', 'Dir'],axis=1, inplace=True)
    
    return train

In [18]:
def standardize_angles(data, column_name):
    '''
    #Function to standardize angles,like orientation and direction
    '''
    df = data.copy(deep=True)
    
    df[column_name].fillna(0, inplace=True) #temporary solution to solve ConvexHull on NaN // missing replaced by 0
    conditions = [
        (df['ToLeft'] & (df[column_name] < 90) ),
        (df['ToLeft'] == False & (df[column_name] > 270) ),
        (5 > 4)] #should be changed
    choices = [ df[column_name] + 360, df[column_name] - 360, df[column_name] ]
    
    new_name = column_name + '_std'
    df[new_name] = np.select(conditions, choices)
    df[new_name] = np.where(df['ToLeft'], df[new_name] - 180  , df[new_name])
    
    return df

In [19]:
def process_angles(df):
    train = df.copy()
    train["Orientation_sin"] = train["Orientation_std"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Orientation_cos"] = train["Orientation_std"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
    train["Dir_sin"] = train["Dir_std"].apply(lambda x : np.sin(x/360 * 2 * np.pi))
    train["Dir_cos"] = train["Dir_std"].apply(lambda x : np.cos(x/360 * 2 * np.pi))
    return train

In [20]:
def add_time_features(df):
    
    X = df.copy(deep=True)
    
    X['TimeHandoff'] = pd.to_datetime(X['TimeHandoff'])
    X['PassDuration'] = (X['TimeHandoff'] - pd.to_datetime(X['TimeSnap'])).dt.seconds
    # from https://www.kaggle.com/zero92/best-lbgm-new-features
    pd.to_datetime(train_init['TimeHandoff']).dt.month
    X['Month'] = X['TimeHandoff'].dt.month
    X['Afternoon'] = X['TimeHandoff'].dt.hour.apply(lambda x : 1 if (x <18 and x >=12) else 0)
    X['Evening'] = X['TimeHandoff'].dt.hour.apply(lambda x : 1 if (x >= 18 and x < 24) else 0)
    
    time=pd.to_datetime(X['GameClock'])
    
    X["GameClock_minute_total"] = time.dt.hour + X.Quarter*15
    
    X['PlayerAge'] = X['TimeHandoff']-pd.to_datetime(X['PlayerBirthDate'], utc=True)
    X['PlayerAge'] = X['PlayerAge'].dt.days/365
    
    return X

In [21]:
def projection_features(data):
    
    df = data.copy(deep=True)
    
    radian_angle = (90 - df['Dir_std']) * np.pi / 180.0
    df['v_horizontal'] = np.abs(df['S'] * np.cos(radian_angle))
    df['v_vertical'] = np.abs(df['S'] * np.sin(radian_angle))
    df['a_horizontal'] = np.abs(df['A'] * np.cos(radian_angle))
    df['a_vertical'] = np.abs(df['A'] * np.sin(radian_angle))
    df['dis_horizontal'] = np.abs(df['Dis'] * np.cos(radian_angle))
    df['dis_vertical'] = np.abs(df['Dis'] * np.sin(radian_angle))    
    df['Dist_YardLine'] = df['YardLine_std'] - df['X_std']   

    return df

In [22]:
def euclidean_distance(x1,y1,x2,y2):
    x_diff = (x1-x2)**2
    y_diff = (y1-y2)**2

    return np.sqrt(x_diff + y_diff)

In [23]:
def defense_features(data):
    
    df = data.copy(deep=True)
    
    
    rusher = df[df['NflId'] == df['NflIdRusher']][['GameId','PlayId','Team','X_std','Y_std']]
    rusher.columns = ['GameId','PlayId','RusherTeam','RusherX','RusherY']

    defense = pd.merge(df,rusher,on=['GameId','PlayId'],how='inner')
    defense = defense[defense['Team'] != defense['RusherTeam']][['GameId','PlayId','X_std','Y_std','RusherX','RusherY']]
    defense['def_dist_to_back'] = defense[['X_std','Y_std','RusherX','RusherY']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

    defense_agg = defense.groupby(['GameId','PlayId'])\
                     .agg({'def_dist_to_back':['min','max','mean','std']})\
                     .reset_index()
    defense_agg.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist']
    
    
    df = pd.merge(df,defense_agg,on=['GameId','PlayId'],how='inner')

    df['def_dist_to_back']=0
    df.loc[defense.index, 'def_dist_to_back']=defense['def_dist_to_back']
    
    df['DefendersInTheBox_vs_Distance'] = df['DefendersInTheBox'] / df['Distance']

    return df

In [24]:
def back_direction(orientation):
    if orientation > 180.0:
        return 1
    else:
        return 0

In [25]:
def features_relative_to_back(data, person):
    """
    The same dunction for Quarterback and BallCarried.
    """
    df = data.copy(deep=True)
    
    carriers = df[df[person]][['GameId','PlayId', 'Team', 'NflId','X_std','Y_std','Orientation_std','Dir_std','YardLine_std']]
        
    carriers['back_from_scrimmage'] = carriers['YardLine_std'] - carriers['X_std']
    carriers['back_oriented_down_field'] = carriers['Orientation_std'].apply(lambda x: back_direction(x))
    carriers['back_moving_down_field'] = carriers['Dir_std'].apply(lambda x: back_direction(x))
    carriers = carriers.rename(columns={'X_std':'back_X',
                                        'Y_std':'back_Y',
                                        'NflId':'PersonNflId',
                                        'Team': 'PersonTeam'})
    
    carriers = carriers[['GameId','PlayId', 'PersonTeam', 'PersonNflId','back_X','back_Y','back_from_scrimmage','back_oriented_down_field','back_moving_down_field']]




    player_distance = df[['GameId','PlayId', 'Team', 'NflId','X_std','Y_std']]
    player_distance = pd.merge(player_distance, carriers, on=['GameId','PlayId'], how='inner')
    player_distance = player_distance[player_distance['NflId'] != player_distance['PersonNflId']]
    player_distance = player_distance[player_distance['Team'] == player_distance['PersonTeam']]
    player_distance['dist_to_back'] = player_distance[['X_std','Y_std','back_X','back_Y']].apply(lambda x: euclidean_distance(x[0],x[1],x[2],x[3]), axis=1)

    player_distance_agg = player_distance.groupby(['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field'])\
                                     .agg({'dist_to_back':['min','max','mean','std']})\
                                     .reset_index()
    
    agg_names = ['back_from_scrimmage','back_oriented_down_field','back_moving_down_field', 'min_dist','max_dist','mean_dist','std_dist']
    agg_names = [person + '_' + name for name in agg_names]
    player_distance_agg.columns = ['GameId','PlayId'] + agg_names
    
    
    df = pd.merge(df,player_distance_agg,on=['GameId','PlayId'],how='inner')
    
    dist_toback_name = person + '_dist_to_back'
    df[dist_toback_name]=0
    df.loc[player_distance.index, dist_toback_name]=player_distance['dist_to_back']
    
    return df

In [26]:
def create_helper_variables(data):
    '''
    Function to create a bunch of helper variables for feature construction
    '''
    
    df = data.copy(deep=True)
    
    
    #Compute coordinates for Or/Dir arrows, Mostly for plotting purposes 
    df['X_std_end_Dir'] = df['S'] * np.cos((90-df['Dir_std']) * np.pi/180) + df['X_std'] 
    df['Y_std_end_Dir'] = df['S'] * np.sin((90-df['Dir_std']) * np.pi/180) + df['Y_std']
    df['X_std_end_Or'] = df['S'] * np.cos((90-df['Orientation_std']) * np.pi/180) + df['X_std'] #does this one make sense?
    df['Y_std_end_Or'] = df['S'] * np.sin((90-df['Orientation_std']) * np.pi/180) + df['Y_std'] #does this one make sense?
    
    df['MyTeam'] = np.where(df['Team'] == 'home', df['HomeTeamAbbr'], df['VisitorTeamAbbr']) ### glukhove: keep it for now, but I separated defense and offense teams for getting different embedings
    df['Quarterback'] = np.where(df['Position'] == 'QB', True, False) #is it 1 for each play?    
    df['BallCarrier'] = df.NflId == df.NflIdRusher
    
    #to fix case when we do not have any QB: then we will set BallCarrier as QB too
    qb_check=(df.groupby(['PlayId'])['Quarterback'].any()).reset_index()
    bad_plays=qb_check[qb_check.Quarterback==False]['PlayId'].values
    df.loc[(df.PlayId.isin(bad_plays)) & (df.BallCarrier==True), 'Quarterback']=True
    
    #clean cases with several QB
    qb_more=(df.groupby(['PlayId'])['Quarterback'].sum()>1).reset_index()
    plays_more = qb_more[qb_more.Quarterback==True]['PlayId'].values
    df.loc[(df.PlayId.isin(plays_more)), 'Quarterback']=False
    first_qb = df[(df['Position'] == 'QB') & (df.PlayId.isin(plays_more))].drop_duplicates('PlayId')
    df.loc[first_qb.index, 'Quarterback']=True

    df['ScoreOffense'] = np.where(df['HomeTeamAbbr'] == df['PossessionTeam'] , df['HomeScoreBeforePlay'], df['VisitorScoreBeforePlay'])
    df['ScoreDefense'] = np.where(df['HomeTeamAbbr'] == df['PossessionTeam'] , df['VisitorScoreBeforePlay'], df['HomeScoreBeforePlay'])
    
    df['Season_2017'] = np.where(df['Season'] == 2017, 1, 0)
    df['Season_2018'] = np.where(df['Season'] == 2018, 1, 0)
    df['Season_2019'] = np.where(df['Season'] == 2019, 1, 0)
#     df['Season_2020'] = np.where(df['Season'] == 2020, 1, 0)
    
    df['Quarter_Down'] = pd.to_numeric( df["Quarter"].map(int).map(str) + '.'  + df["Down"].map(int).map(str) )
    
    return df

In [27]:
def dist_between_ball_centroid(df, agg):
        carriers = df[df['BallCarrier']][['GameId','PlayId','X_std','Y_std']]
                
        carriers['Distance_Ball_Centroid_defense'] = euclidean_distance( carriers['X_std'].values, carriers['Y_std'].values, agg['X_std_mean_defense'].values, agg['Y_std_mean_defense'].values)

        carriers = carriers[['PlayId', 'Distance_Ball_Centroid_defense']].set_index('PlayId')

        return carriers

In [28]:
def diff_features(df, t = True):
    '''
    Calculates distance between two players
    '''  
    position_diffs = ""
    team_spread_diffs_std = ""
    if t:
        position_diffs = df[['X_std_mean','Y_std_mean']].diff(1).iloc[1::2, :]
        team_spread_diffs_std = df[['X_std_std','Y_std_std']].diff(1).iloc[1::2, :]   #instead of variance (std) we could use diff(minY ,maxY) and diff(minX ,maxX)
    else:
        position_diffs = df[['X_std_end_Dir_mean','Y_std_end_Dir_mean']].diff(1).iloc[1::2, :]
        team_spread_diffs_std = df[['X_std_end_Dir_std','Y_std_end_Dir_std']].diff(1).iloc[1::2, :]   #instead of variance (std) we could use diff(minY ,maxY) and diff(minX ,maxX)
   
    diffs = pd.concat([position_diffs, team_spread_diffs_std], axis=1)
    diffs.columns = diffs.columns + "_team_diff"
    return diffs

def aggregation_features(data):
    """
    Function that constructs aggregaiton features (min,max,mean, std etc.)
    """
    
    df = data.copy(deep=True)
    
    df['TEMP'] = np.where(df['IsOnOffense'], 'offense', 'defense') #is it 1 for each play?
    df_plays = df.groupby(['PlayId','TEMP'])

    df = df_plays[["PlayerHeight", "PlayerWeight","PlayerAge", "Dis", "X_std", "Y_std", "X_std_end_Dir", "Y_std_end_Dir"]].agg(["max", "min", "mean", "std"])
    df.columns = ["_".join(x) for x in df.columns.ravel()]
    
    df['Spread_abs_y'] =  df['Y_std_max'] - df['Y_std_min']
    df['Spread_abs_x'] =  df['X_std_max'] - df['X_std_min']

    diffs = diff_features(df)        
    diffs2 = diff_features(df, t = False)
    
    diffs = pd.concat([diffs, diffs2], axis=1)

    new = df.pivot_table(values=df.columns, index='PlayId', columns='TEMP')
    new.columns = ["_".join(x) for x in new.columns.ravel()]

    aggregated_features = new.join(diffs)

    aggregated_features.reset_index(inplace=True)
    aggregated_features.set_index('PlayId', inplace=True)
    aggregated_features = aggregated_features.drop('TEMP', axis=1)
    
    return aggregated_features



In [30]:
def voronoid_space_features(data):
    """
    Function that creates voronoid space for ball at different points (takes some time: ~2 minutes)
    """
    
    df = data.copy(deep=True)
    
    voronoid_space_ball = df.groupby(df['PlayId']).apply(calculate_space).to_frame(name = "VoronoidSpaceBall")
    voronoid_predicted_space_ball = df.groupby(df['PlayId']).apply(calculate_space, False, False).to_frame(name = "VoronoidPredictedSpaceBall")
    voronoid_predicted_corrected_space_ball = df.groupby(df['PlayId']).apply(calculate_space, False, False, True).to_frame(name = "VoronoidPredictedCorrectedSpaceBall")
    voronoid_set = voronoid_space_ball.join(voronoid_predicted_space_ball).join(voronoid_predicted_corrected_space_ball)
    for col in ['VoronoidSpaceBall', 'VoronoidPredictedSpaceBall', 'VoronoidPredictedCorrectedSpaceBall']:
        voronoid_set[col] = np.minimum(voronoid_set[col].values, 30)
    return voronoid_set

#calculates voronoids and space for ball (could easily add voronoi ofense here)
def calculate_space(play, all_results = False, current = True, pred_correction = False):
    try:
        ball = play.loc[play['BallCarrier'] == True]    
        #if statement should be improved by just setting variable name to use. 
        if current:
            ball_x = float(ball['X_std'].values) 
            ball_y = float(ball['Y_std'].values)            
            df_ball= pd.DataFrame([[ball_x, ball_y], [ball_x-1, ball_y], [ball_x, -ball_y], [ball_x, 53+ball_y]], columns=['X_std','Y_std']) #add point 1 yard left to ball to close region behind player.
            play = play.loc[play['IsOnOffense']==False]
            points = play[['X_std','Y_std']]

        else:
            ball_x = float(ball['X_std_end_Dir'].values) 
            ball_y = float(ball['Y_std_end_Dir'].values)            
            df_ball= pd.DataFrame([[ball_x, ball_y], [ball_x-1, ball_y], [ball_x, -ball_y], [ball_x, 53+ball_y]], columns=['X_std_end_Dir','Y_std_end_Dir']) #add point 1 yard left to ball to close region behind player.

            if pred_correction:
                play.loc[:,'X_std_end_Dir'] = np.maximum(play.X_std_end_Dir, ball_x)  #update that they won't go passed ball, doesnt work yet      
                #alter direction slightly towards ball
                #collision? X%less distance

            play = play.loc[play.loc[ : , 'IsOnOffense']==False]
            points = play[['X_std_end_Dir','Y_std_end_Dir']] #create extra if-block where we correct for collisions

        points = df_ball.append(points)
        points = points.values

        vor = Voronoi(points)
        #http://www.qhull.org/html/qh-faq.htm#volume: The volume of each convex hull is the volume of the corresponding Vornoi region 
        volume = float(ConvexHull(vor.vertices[vor.regions[vor.point_region[0]]], qhull_options = "QJ").volume)  #calculates convexhull for ball (QJ doesn't crash, no idea what it does)
    except:
        print("Vor problem")
        volume=-1
    return volume

In [31]:
def calculate_distance_features(data): #also do this for X_std_Dir??? #CLEAN THIS FUNCTION
    """
    Function that calculates distance features and corresponding aggregations
    """

    df = data.copy(deep=True)
    
    df = df.sort_values(['PlayId','IsOnOffense']) 
    df = df[['PlayId','X_std','Y_std','S','A','BallCarrier','IsOnOffense']]
    df.set_index('PlayId', inplace=True)

    ball = df[df['BallCarrier']][['X_std','Y_std']]
    ball.columns = ball.columns + '_ball'
    df = df.join(ball)
    df = calculate_tackle_time(df)
    
    ind_player_dist = individual_player_distances(df)
        
    df = aggregate_distance_features(df)    
    df = df.join(ind_player_dist)    
 
    df = add_misc_distance_features(df)
    
    return df


def calculate_tackle_time(df):
    """
    Function that solves a quadratic equation to find tackle time
    """
    df['Distance_ball'] = np.linalg.norm( np.subtract( [ df['X_std'], df['Y_std']] , [df['X_std_ball'], df['Y_std_ball']] ) , axis =0)    
    df['Tackle_Time'] = df['Distance_ball'] / df['S'] 
    #df['Tackle_Time_corrected'] = df['Distance_ball'] / (df['S'] * df['A']) #tackle time that takes into account, in some pretty random way, acceleration as well
        
    a = (1/2 * df['A'].values)
    b = df['S'].values 
    c =  - df['Distance_ball'].values 
        
    #this could lead to infinite valuesI think, so I replace infinitevalues later
    df['Tackle_Time_calculated']  = (-b + np.sqrt(b**2 - 4*a*c)) / (2 * a) #solve quadratic equation to find approx tackletime #assumes constant acceleration 
    
    return df
    
    
def add_misc_distance_features(df): 
    """
    Function that adds some tailor made ideas for features
    """
    df['Mean_dist_closest_3_def'] = ( df['1_Distance_defense'] + df['2_Distance_defense'] + df['3_Distance_defense'] ) / 3
    df['Mean_dist_closest_6_def'] = ( df['1_Distance_defense'] + df['2_Distance_defense'] + df['3_Distance_defense'] + 
                                     df['4_Distance_defense'] + df['5_Distance_defense'] + df['6_Distance_defense']) / 6
    
    df['Mean_dist_closest_3_off'] = ( df['1_Distance_offense'] + df['2_Distance_offense'] + df['3_Distance_offense'] ) / 3
    df['Diff_mean_dist_closest_3'] = df['Mean_dist_closest_3_def'] - df['Mean_dist_closest_3_off']
    
    df['Mean_S_closest_3_def'] = ( df['1_Speed_defense'] + df['2_Speed_defense'] + df['3_Speed_defense'] ) / 3
    df['Mean_S_closest_6_def'] = ( df['1_Speed_defense'] + df['2_Speed_defense'] + df['3_Speed_defense'] + 
                                     df['4_Speed_defense'] + df['5_Speed_defense'] + df['6_Speed_defense']) / 6
    
    df['Mean_TackleTime_closest_3_def'] = ( df['1_TackleTime_defense'] + df['2_TackleTime_defense'] + df['3_TackleTime_defense'] ) / 3
    df['Mean_TackleTime_closest_6_def'] = ( df['1_TackleTime_defense'] + df['2_TackleTime_defense'] + df['3_TackleTime_defense'] + 
                                     df['4_TackleTime_defense'] + df['5_TackleTime_defense'] + df['6_TackleTime_defense']) / 6  
    
    return df

def individual_player_distances(df):
    """
    Function that solves a quadratic equation to find tackle time
    """
    distance_df = df
    distance_df = distance_df[distance_df['BallCarrier'] == False].copy()
    distance_df['TEMP'] = np.where(distance_df['IsOnOffense'], 'offense', 'defense')    
    distance_df = distance_df.sort_values(['PlayId','TEMP','Distance_ball'],ascending=True)
    
    distance_df['Defender_Number'] = distance_df.groupby(['PlayId','TEMP']).cumcount()+1  
    
    distance_player = features_by_player_closeness(distance_df.copy(), "Distance", "Distance_ball")
    speed_player = features_by_player_closeness(distance_df.copy(), "Speed", "S")
    tackle_player = features_by_player_closeness(distance_df.copy(), "TackleTime", "Tackle_Time")

    added_features = distance_player.join(speed_player).join(tackle_player)
    added_features['1_Distance_by_Speed'] = added_features['1_Distance_defense'] / added_features['1_Speed_defense'] 
    added_features['2_Distance_by_Speed'] = added_features['2_Distance_defense'] / added_features['2_Speed_defense']
    added_features['3_Distance_by_Speed'] = added_features['3_Distance_defense'] / added_features['3_Speed_defense']
    
    return added_features

def features_by_player_closeness(df, name, values):
    df['Defender_Number'] = df['Defender_Number'].apply(str) + "_" + name
    df = df.pivot_table(index=['PlayId','TEMP'], columns='Defender_Number', values= values)
    df = df.pivot_table(values=df.columns, index='PlayId', columns='TEMP')
    df.columns = ["_".join(x) for x in df.columns.ravel()] 
    return df

def aggregate_distance_features(df): #add trimmed mean here?

    df['TEMP'] = np.where(df['IsOnOffense'], 'offense', 'defense') #is it 1 for each play?
    df =df[df['BallCarrier'] == False]
    df_plays = df.groupby(['PlayId','TEMP'])

    df = df_plays.agg(["min", "mean","median","std"])[["Distance_ball", "Tackle_Time","Tackle_Time_calculated"]] #idea; use trimmed mean
    df.columns = ["_".join(x) for x in df.columns.ravel()]
        
    new = df.pivot_table(values=df.columns, index='PlayId', columns='TEMP')
    new.columns = ["_".join(x) for x in new.columns.ravel()]
    
    df=new
    
    return df

In [32]:
def speed_diff(df, features):
        
    carriers = df[df['BallCarrier']][['GameId','PlayId','S']]

    features['Ballspeed_by_def1speed'] = carriers['S'].values/ features['1_Speed_defense'].values
    features['Ballspeed_by_def2speed'] = carriers['S'].values/ features['2_Speed_defense'].values
    features['Ballspeed_by_def3speed'] = carriers['S'].values/ features['3_Speed_defense'].values
    
    return features

In [33]:
def collect_game_featrues(data):
    
    df = data.copy(deep=True)
    
    vor = voronoid_space_features(df) #should be enough try_except inside
    features = vor
      
    try:
        agg = aggregation_features(df)
        dist_ball_teams = dist_between_ball_centroid(df, agg)
        features = features.join(agg).join(dist_ball_teams)
    except:
        print("aggregation problem")
        pass
    try:        
        dist_f = calculate_distance_features(df)
        features = features.join(dist_f)
    except:
        print("calculate_distance_features problem")
        pass
        
    features = speed_diff(df, features)
    
    try:
        features['VoronoidSpaceBall'][features['VoronoidSpaceBall']==0]=0.01
        features['VoronoiDivided'] = features['VoronoidPredictedSpaceBall'] / features['VoronoidSpaceBall']
        features['VoronoiDividedCorrected'] = features['VoronoidPredictedCorrectedSpaceBall'] / features['VoronoidSpaceBall']

        features['VoronoiMultiplied'] = features['VoronoidPredictedSpaceBall'] * features['VoronoidSpaceBall']
        features['VoronoiMultipliedCorrected'] = features['VoronoidPredictedCorrectedSpaceBall'] * features['VoronoidSpaceBall']
    except:
        features['VoronoiDivided'] = 1
        features['VoronoiDividedCorrected'] = 1

        features['VoronoiMultiplied'] = 1
        features['VoronoiMultipliedCorrected'] = 1
    
    
    df = pd.merge(df,features,on=['PlayId'],how='inner')
    
    return df


### The main function

In [34]:
def preprocess(df):
    
    train = df.copy(deep=True)
    try:
        train['PlayerHeight'] = train['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
        train['PlayerHeight'] = pd.to_numeric(train['PlayerHeight'], errors='coerce').fillna(75.0)
        train['PlayerHeight'] = np.clip(train['PlayerHeight'].values, 60.0, 90.0)
        train['BallCarrier'] = (train['NflId'] == train['NflIdRusher'])
    except:
        print("First problem")
        pass
    try:
        train = fix_team(train)
    except:
        print("Team fix problem")
        pass
    try:
        train = standardize_dataset(train)
    except:
        print("standardize_dataset problem")
        pass
    try:
        train = create_helper_variables(train)
    except:
        print("create_helper_variables problem")
        pass
    try:
        train = process_angles(train)
    except:
        print("process_angles problem")
        pass
    try:
        train = fix_position(train)
    except:
        print("fix_position problem")
        pass
    try:
        train = personnel_features(train)
    except:
        print("personnel_features problem")
        pass
    try:
        train = fix_stadium(train)
    except:
        print("fix_stadium problem")
        pass
    try:
        train = clean_location(train)
    except:
        print("clean_location problem")
        pass
    try:
        train = fix_stadium_type(train)
    except:
        print("fix_stadium_type problem")
        pass
    try:
        train = add_time_features(train)
    except:
        print("add_time_features problem")
        pass
    try:
        train = projection_features(train)
    except:
        print("projection_features problem")
        pass
    try:
        train = defense_features(train)
    except:
        print("defense_features problem")
        pass
    try:
        train = features_relative_to_back(train, "BallCarrier")
    except:
        print("features_relative_to_back problem")
        pass
    try:
        train = features_relative_to_back(train, "Quarterback")
    except:
        print("features_relative_to_back problem")
        pass
    try:
        train = collect_game_featrues(train)
    except:
        print("collect_game_featrues problem")
        pass
    
    try:
        train['BallCarrier'] = train['BallCarrier'].astype(int)
        train['Quarterback'] = train['Quarterback'].astype(int)
    except:
        pass


    ## sort
    try:
        train = train.sort_values(by = ['X_std']).sort_values(by = ['Dis']).sort_values(by=['PlayId', 'IsOnOffense', 'BallCarrier']).reset_index(drop = True)
    except:
        pass
    train = train.replace([np.inf, -np.inf], np.nan)
    return train

In [35]:
%%time
success=False
try:
    train = preprocess(train_init)#.iloc[:22*20])
except:
    print("did not work for all data, revert to 2017-2018")
else:
    if train.shape[1] == 293:
        success=True
    print("STATUS: data was created without errors (probably)")
# success=True
if (not success):
    print("WARNING: Drop 2019 data and try again")
    train_init = train_init[train_init.Season<2019]
    train = preprocess(train_init)#.iloc[:22*2000])

STATUS: data was created without errors (probably)
CPU times: user 9min 48s, sys: 33.2 s, total: 10min 22s
Wall time: 10min 12s


In [36]:
group_split = train.drop_duplicates('PlayId').reset_index(drop=True).GameId

In [37]:
## DisplayName remove Outlier
v = train["DisplayName"].value_counts()
missing_values = list(v[v < 5].index)
train["DisplayName"] = train["DisplayName"].where(~train["DisplayName"].isin(missing_values), "nan")

In [38]:
def drop(train):
    drop_cols = ['10_Speed_defense', '10_Speed_offense', '10_TackleTime_defense',
       '10_TackleTime_offense', '11_Distance_defense',
       '1_Distance_by_Speed', '1_Distance_defense', '1_Distance_offense',
       '1_Speed_defense', '1_TackleTime_offense', '2_Distance_defense',
       '2_Distance_offense', '2_Speed_offense', '2_TackleTime_defense',
       '2_TackleTime_offense', '3_Distance_by_Speed',
       '3_Distance_defense', '3_Distance_offense', '3_Speed_defense',
       '3_Speed_offense', '4_Speed_defense', '4_TackleTime_defense',
       '4_TackleTime_offense', '5_Speed_offense', '5_TackleTime_offense',
       '6_Distance_offense', '6_TackleTime_defense',
       '6_TackleTime_offense', '7_Distance_defense', '7_Speed_defense',
       '7_Speed_offense', '8_Speed_defense', '8_TackleTime_defense',
       '8_TackleTime_offense', '9_Distance_defense', '9_Speed_defense',
       '9_Speed_offense', 'Afternoon',
       'BallCarrier_back_oriented_down_field', 'BallCarrier_max_dist',
       'BallCarrier_mean_dist', 'DefensePersonnel', 'Dis_min_defense',
       'Dis_std_defense', 'Dis_std_offense', 'Distance',
       'Distance_ball_mean_offense', 'Distance_ball_median_defense',
       'Distance_ball_median_offense', 'Distance_ball_min_offense',
       'Down', 'FieldPosition', 'GameClock', 'GameId',
       'HomeScoreBeforePlay', 'HomeTeamAbbr', 'Mean_S_closest_3_def',
       'Mean_S_closest_6_def', 'Mean_TackleTime_closest_6_def',
       'Mean_dist_closest_3_def', 'Mean_dist_closest_6_def', 'MyTeam',
       'NflId', 'NflIdRusher', 'OffensePersonnel', 'PassDuration',
       'PlayDirection', 'PlayerAge_max_defense', 'PlayerAge_max_offense',
       'PlayerAge_min_defense', 'PlayerAge_min_offense',
       'PlayerBirthDate', 'PlayerHeight_max_defense',
       'PlayerHeight_max_offense', 'PlayerHeight_mean_defense',
       'PlayerHeight_min_defense', 'PlayerHeight_min_offense',
       'PlayerHeight_std_defense', 'PlayerWeight_max_defense',
       'PlayerWeight_max_offense', 'PlayerWeight_mean_defense',
       'PlayerWeight_min_offense', 'Quarter',
       'Quarterback_back_oriented_down_field', 'Quarterback_max_dist',
       'Quarterback_mean_dist', 'Quarterback_min_dist', 'ScoreDefense',
       'Season', 'Spread_abs_x_defense', 'Spread_abs_x_offense',
       'Spread_abs_y_offense', 'Tackle_Time_calculated_min_offense',
       'Tackle_Time_calculated_std_offense', 'Tackle_Time_mean_defense',
       'Tackle_Time_mean_offense', 'Tackle_Time_median_offense',
       'Tackle_Time_min_defense', 'Tackle_Time_min_offense',
       'Tackle_Time_std_defense', 'Team', 'TeamOnOffense', 'TimeHandoff',
       'TimeSnap', 'VisitorScoreBeforePlay', 'VisitorTeamAbbr',
       'X_std_end_Or', 'X_std_max_defense', 'X_std_min_defense',
       'X_std_min_offense', 'X_std_std_offense', 'X_std_std_team_diff',
       'Y_std_end_Or', 'Y_std_max_defense', 'Y_std_max_offense',
       'Y_std_mean_offense', 'Y_std_mean_team_diff', 'Y_std_min_defense',
       'Y_std_std_defense', 'Y_std_std_offense', 'YardLine_std',
       'def_max_dist', 'num_DL', 'num_LB', 'num_OL', 'num_QB', 'run_def']

    drop_cols = list(set(drop_cols)&set(train.columns))
    train = train.drop(drop_cols, axis = 1)
    return train

In [39]:
train = drop(train)

In [40]:
cat_features = []
dense_features = []
for col in train.columns:
    if train[col].dtype =='object':
        cat_features.append(col)
        print("*cat*", col, len(train[col].unique()))
    else:
        dense_features.append(col)
        print("!dense!", col, len(train[col].unique()))
dense_features.remove("PlayId")
dense_features.remove("Yards")

!dense! PlayId 23171
!dense! S 1714
!dense! A 902
!dense! Dis 105
*cat* DisplayName 2128
*cat* PossessionTeam 32
*cat* OffenseFormation 9
!dense! DefendersInTheBox 11
!dense! Yards 94
!dense! PlayerHeight 16
!dense! PlayerWeight 182
*cat* Position 15
*cat* Location 34
!dense! Temperature 78
!dense! BallCarrier 2
!dense! ToLeft 2
!dense! IsOnOffense 2
*cat* DefenseTeam 32
!dense! X_std 15326
!dense! Y_std 6948
!dense! Orientation_std 86160
!dense! Dir_std 68073
!dense! X_std_end_Dir 509761
!dense! Y_std_end_Dir 509757
!dense! Quarterback 2
!dense! ScoreOffense 51
!dense! Season_2017 2
!dense! Season_2018 2
!dense! Season_2019 1
!dense! Quarter_Down 20
!dense! Orientation_sin 63807
!dense! Orientation_cos 45437
!dense! Dir_sin 54309
!dense! Dir_cos 41685
!dense! num_DB 8
!dense! num_RB 4
!dense! num_WR 6
!dense! num_TE 5
!dense! OL_diff 7
!dense! OL_TE_diff 8
*cat* Stadium 37
*cat* StadiumType 6
!dense! Month 5
!dense! Evening 2
!dense! GameClock_minute_total 71
!dense! PlayerAge 5395
!d

## categorical

In [41]:
#this hardcode allow to make shore that at least we will have all necessary columns with some dummy value 
cat_features= ['PossessionTeam', 'DefenseTeam', 'OffenseFormation', 'Stadium', 'Location', 'StadiumType', 'DisplayName',  'Position']

In [42]:
train_cat = train.loc[:,cat_features]
categories = {}
most_appear_each_categories = {}
for col in tqdm_notebook(train_cat.columns):
    train_cat.loc[:,col] = train_cat[col].fillna("nan")
    train_cat.loc[:,col] = col + "__" + train_cat[col].astype(str)
    most_appear_each_categories[col] = list(train_cat[col].value_counts().index)[0]
    categories[col] = train_cat[col].unique()
# categories = np.hstack(categories)
print(len(categories))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


8


In [43]:
labels_encoders = {}
for col in tqdm_notebook(train_cat.columns):
    le = LabelEncoder()
    le.fit(categories[col])
    train_cat.loc[:, col] = le.transform(train_cat[col])
    labels_encoders[col] = le
num_classes = len(le.classes_)
print(num_classes)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


15


In [44]:
labels_encoders

{'PossessionTeam': LabelEncoder(),
 'DefenseTeam': LabelEncoder(),
 'OffenseFormation': LabelEncoder(),
 'Stadium': LabelEncoder(),
 'Location': LabelEncoder(),
 'StadiumType': LabelEncoder(),
 'DisplayName': LabelEncoder(),
 'Position': LabelEncoder()}

## Dense

In [46]:
minimals={'S': 0.007,
 'A': 0.007,
 'Dis': 0.0,
 'DefendersInTheBox': 0.7,
 'PlayerHeight': 46.2,
 'PlayerWeight': 107.1,
 'Temperature': 6.300000000000001,
 'BallCarrier': 0.0,
 'ToLeft': 0.0,
 'IsOnOffense': 0.0,
 'X_std': -8.19,
 'Y_std': 1.883,
 'Orientation_std': -467.987,
 'Dir_std': -468.0,
 'X_std_end_Dir': -10.787807237503912,
 'Y_std_end_Dir': 0.0882287648290582,
 'Quarterback': 0.0,
 'ScoreOffense': 0.0,
 'Season_2017': 0.0,
 'Season_2018': 0.0,
 'Season_2019': 0.0,
 'Quarter_Down': 0.77,
 'Orientation_sin': -1.3,
 'Orientation_cos': -1.3,
 'Dir_sin': -1.3,
 'Dir_cos': -1.3,
 'num_DB': 0.7,
 'num_RB': 0.0,
 'num_WR': 0.0,
 'num_TE': 0.0,
 'OL_diff': -1.3,
 'OL_TE_diff': 0.0,
 'Month': 0.7,
 'Evening': 0.0,
 'GameClock_minute_total': 10.5,
 'PlayerAge': 14.258904109589041,
 'v_horizontal': 1.1144285872240916e-17,
 'v_vertical': 0.0,
 'a_horizontal': 3.8576374173141625e-18,
 'a_vertical': 0.0,
 'dis_horizontal': 0.0,
 'dis_vertical': 0.0,
 'Dist_YardLine': -73.151,
 'def_min_dist': 0.1616075493286056,
 'def_mean_dist': 1.6440124016955324,
 'def_std_dist': 0.5111474249695855,
 'def_dist_to_back': 0.0,
 'DefendersInTheBox_vs_Distance': 0.025,
 'BallCarrier_back_from_scrimmage': -14.092000000000004,
 'BallCarrier_back_moving_down_field': 0.0,
 'BallCarrier_min_dist': 0.040816663263917946,
 'BallCarrier_std_dist': 0.3623361942451159,
 'BallCarrier_dist_to_back': 0.0,
 'Quarterback_back_from_scrimmage': -15.846999999999998,
 'Quarterback_back_moving_down_field': 0.0,
 'Quarterback_std_dist': 0.4560925299510584,
 'Quarterback_dist_to_back': 0.0,
 'VoronoidSpaceBall': 1.1455845901707098,
 'VoronoidPredictedSpaceBall': 0.06836300511160717,
 'VoronoidPredictedCorrectedSpaceBall': 0.0007778785215131101,
 'Dis_max_defense': 0.0,
 'Dis_max_offense': 0.0,
 'Dis_mean_defense': 0.0,
 'Dis_mean_offense': 0.0,
 'Dis_min_offense': 0.0,
 'PlayerAge_mean_defense': 16.554520547945206,
 'PlayerAge_mean_offense': 16.980273972602742,
 'PlayerAge_std_defense': 0.6143161043696947,
 'PlayerAge_std_offense': 0.642599335567203,
 'PlayerHeight_mean_offense': 51.1,
 'PlayerHeight_std_offense': 0.6331738236133047,
 'PlayerWeight_mean_offense': 172.0090909090909,
 'PlayerWeight_min_defense': 111.30000000000001,
 'PlayerWeight_std_defense': 18.770872019265283,
 'PlayerWeight_std_offense': 22.18097464864058,
 'Spread_abs_y_defense': 4.361000000000002,
 'X_std_end_Dir_max_defense': 5.349756382495535,
 'X_std_end_Dir_max_offense': 1.2138855507449198,
 'X_std_end_Dir_mean_defense': 0.4067023577319442,
 'X_std_end_Dir_mean_offense': -0.8021117329472253,
 'X_std_end_Dir_min_defense': -6.564463250519841,
 'X_std_end_Dir_min_offense': -10.787807237503912,
 'X_std_end_Dir_std_defense': 0.4434553478938701,
 'X_std_end_Dir_std_offense': 0.8178162007616498,
 'X_std_max_offense': 0.3430000000000002,
 'X_std_mean_defense': 1.296272727272727,
 'X_std_mean_offense': -1.4264545454545456,
 'X_std_std_defense': 0.4199936363154242,
 'Y_std_end_Dir_max_defense': 12.9182459223158,
 'Y_std_end_Dir_max_offense': 13.337763319979665,
 'Y_std_end_Dir_mean_defense': 9.048774075780202,
 'Y_std_end_Dir_mean_offense': 8.887596767741812,
 'Y_std_end_Dir_min_defense': 0.5401376111029357,
 'Y_std_end_Dir_min_offense': 0.0882287648290582,
 'Y_std_end_Dir_std_defense': 0.6132344987289681,
 'Y_std_end_Dir_std_offense': 0.4329041130591371,
 'Y_std_mean_defense': 12.49690909090909,
 'Y_std_min_offense': 1.883,
 'X_std_mean_team_diff': -41.46645454545455,
 'Y_std_std_team_diff': -10.793060243816283,
 'X_std_end_Dir_mean_team_diff': -39.02880685885748,
 'Y_std_end_Dir_mean_team_diff': -6.95477768578049,
 'X_std_end_Dir_std_team_diff': -22.30103056224646,
 'Y_std_end_Dir_std_team_diff': -9.376518095789017,
 'Distance_Ball_Centroid_defense': 0.9687730887677826,
 'Distance_ball_mean_defense': 1.6440124016955324,
 'Distance_ball_min_defense': 0.1616075493286056,
 'Distance_ball_std_defense': 0.5111474249695855,
 'Distance_ball_std_offense': 0.3623361942451159,
 'Tackle_Time_calculated_mean_defense': 0.8093984908252967,
 'Tackle_Time_calculated_mean_offense': 0.6120812365642119,
 'Tackle_Time_calculated_median_defense': 0.6038544085075295,
 'Tackle_Time_calculated_median_offense': 0.45343496344348055,
 'Tackle_Time_calculated_min_defense': 0.044018241528307536,
 'Tackle_Time_calculated_std_defense': 0.16241623794114474,
 'Tackle_Time_median_defense': 0.7029314540179782,
 'Tackle_Time_std_offense': 0.17124404065033766,
 '10_Distance_defense': 2.7122628928627037,
 '10_Distance_offense': 1.9225038361470215,
 '4_Distance_defense': 1.2364861503470226,
 '4_Distance_offense': 1.01471030348568,
 '5_Distance_defense': 1.3241846547970582,
 '5_Distance_offense': 1.1480853626799692,
 '6_Distance_defense': 1.9756996229184227,
 '7_Distance_offense': 1.4039319784092057,
 '8_Distance_defense': 2.106860460495664,
 '8_Distance_offense': 1.6532008347445255,
 '9_Distance_offense': 1.8926943757511476,
 '11_Speed_defense': 0.007,
 '1_Speed_offense': 0.007,
 '2_Speed_defense': 0.007,
 '4_Speed_offense': 0.007,
 '5_Speed_defense': 0.007,
 '6_Speed_defense': 0.007,
 '6_Speed_offense': 0.007,
 '8_Speed_offense': 0.007,
 '11_TackleTime_defense': 0.7470839546240906,
 '1_TackleTime_defense': 0.044558101241065504,
 '3_TackleTime_defense': 0.3525612276799387,
 '3_TackleTime_offense': 0.17206408402684303,
 '5_TackleTime_defense': 0.5212795640419514,
 '7_TackleTime_defense': 0.6042674620507817,
 '7_TackleTime_offense': 0.42030785733089027,
 '9_TackleTime_defense': 0.620454708148166,
 '9_TackleTime_offense': 0.39934005867080424,
 '2_Distance_by_Speed': 0.2863602116584891,
 'Mean_dist_closest_3_off': 0.657364462744435,
 'Diff_mean_dist_closest_3': -5.095150099211924,
 'Mean_TackleTime_closest_3_def': 0.42944222662267584,
 'Ballspeed_by_def1speed': 0.0014028056112224446,
 'Ballspeed_by_def2speed': 0.0035714285714285718,
 'Ballspeed_by_def3speed': 0.002389078498293515,
 'VoronoiDivided': 0.002370105812801961,
 'VoronoiDividedCorrected': 4.921827039116882e-05,
 'VoronoiMultiplied': 0.8939048347735492,
 'VoronoiMultipliedCorrected': 0.012294113332759326}

In [47]:
maximals={'S': 13.202446044083526,
 'A': 19.123,
 'Dis': 1.807,
 'DefendersInTheBox': 14.3,
 'PlayerHeight': 105.3,
 'PlayerWeight': 494.0,
 'Temperature': 126.1,
 'BallCarrier': 1.3,
 'ToLeft': 1.3,
 'IsOnOffense': 1.3,
 'X_std': 155.142,
 'Y_std': 73.385,
 'Orientation_std': 467.987,
 'Dir_std': 350.987,
 'X_std_end_Dir': 154.44810960377373,
 'Y_std_end_Dir': 75.32283028894615,
 'Quarterback': 1.3,
 'ScoreOffense': 74.1,
 'Season_2017': 1.3,
 'Season_2018': 1.3,
 'Season_2019': 0.0,
 'Quarter_Down': 7.0200000000000005,
 'Orientation_sin': 1.3,
 'Orientation_cos': 1.3,
 'Dir_sin': 1.3,
 'Dir_cos': 1.3,
 'num_DB': 10.4,
 'num_RB': 3.9,
 'num_WR': 6.5,
 'num_TE': 5.2,
 'OL_diff': 6.5,
 'OL_TE_diff': 9.1,
 'Month': 15.6,
 'Evening': 1.3,
 'GameClock_minute_total': 110.5,
 'PlayerAge': 53.86630136986301,
 'v_horizontal': 12.340348282735247,
 'v_vertical': 13.063504822421166,
 'a_horizontal': 15.740857666312891,
 'a_vertical': 19.115544229916697,
 'dis_horizontal': 1.4630005668067736,
 'dis_vertical': 1.7383840025560569,
 'Dist_YardLine': 13.935999999999998,
 'def_min_dist': 11.973853932631718,
 'def_mean_dist': 48.209018968649985,
 'def_std_dist': 25.168450224245195,
 'def_dist_to_back': 72.03447688433643,
 'DefendersInTheBox_vs_Distance': 14.3,
 'BallCarrier_back_from_scrimmage': 13.598,
 'BallCarrier_back_moving_down_field': 1.3,
 'BallCarrier_min_dist': 15.748495324950891,
 'BallCarrier_std_dist': 19.51892766841324,
 'BallCarrier_dist_to_back': 55.53751695025626,
 'Quarterback_back_from_scrimmage': 11.505000000000003,
 'Quarterback_back_moving_down_field': 1.3,
 'Quarterback_std_dist': 19.51892766841324,
 'Quarterback_dist_to_back': 55.53751695025626,
 'VoronoidSpaceBall': 39.0,
 'VoronoidPredictedSpaceBall': 39.0,
 'VoronoidPredictedCorrectedSpaceBall': 39.0,
 'Dis_max_defense': 1.807,
 'Dis_max_offense': 1.508,
 'Dis_mean_defense': 0.7587272727272727,
 'Dis_mean_offense': 0.8296363636363636,
 'Dis_min_offense': 0.5980000000000001,
 'PlayerAge_mean_defense': 40.148792029887915,
 'PlayerAge_mean_offense': 39.80234122042341,
 'PlayerAge_std_defense': 7.738221898786854,
 'PlayerAge_std_offense': 8.098862078738975,
 'PlayerHeight_mean_offense': 100.33636363636364,
 'PlayerHeight_std_offense': 6.052016944036845,
 'PlayerWeight_mean_offense': 383.1454545454546,
 'PlayerWeight_min_defense': 279.5,
 'PlayerWeight_std_defense': 76.94199113618,
 'PlayerWeight_std_offense': 88.76906914214902,
 'Spread_abs_y_defense': 60.814,
 'X_std_end_Dir_max_defense': 154.44810960377373,
 'X_std_end_Dir_max_offense': 149.91143762207605,
 'X_std_end_Dir_mean_defense': 144.27283323406778,
 'X_std_end_Dir_mean_offense': 143.20709220583623,
 'X_std_end_Dir_min_defense': 142.08874208052995,
 'X_std_end_Dir_min_offense': 138.46807921514736,
 'X_std_end_Dir_std_defense': 26.525356964233186,
 'X_std_end_Dir_std_offense': 19.80169239290661,
 'X_std_max_offense': 145.275,
 'X_std_mean_defense': 144.87081818181818,
 'X_std_mean_offense': 141.80872727272725,
 'X_std_std_defense': 25.36183487375542,
 'Y_std_end_Dir_max_defense': 75.32283028894615,
 'Y_std_end_Dir_max_offense': 68.68397280994532,
 'Y_std_end_Dir_mean_defense': 51.49932179750935,
 'Y_std_end_Dir_mean_offense': 52.03436653654356,
 'Y_std_end_Dir_min_defense': 45.08689015121851,
 'Y_std_end_Dir_min_offense': 45.552010485023395,
 'Y_std_end_Dir_std_defense': 19.38103321748543,
 'Y_std_end_Dir_std_offense': 22.26121556732894,
 'Y_std_mean_defense': 45.711545454545444,
 'Y_std_min_offense': 39.532999999999994,
 'X_std_mean_team_diff': -0.2730000000000004,
 'Y_std_std_team_diff': 3.78634623426951,
 'X_std_end_Dir_mean_team_diff': 1.737864638861673,
 'Y_std_end_Dir_mean_team_diff': 8.884617434038242,
 'X_std_end_Dir_std_team_diff': 3.232961037359399,
 'Y_std_end_Dir_std_team_diff': 4.995100326398471,
 'Distance_Ball_Centroid_defense': 45.18305166132266,
 'Distance_ball_mean_defense': 48.20901896864998,
 'Distance_ball_min_defense': 11.973853932631716,
 'Distance_ball_std_defense': 25.168450224245195,
 'Distance_ball_std_offense': 19.51892766841324,
 'Tackle_Time_calculated_mean_defense': 61.97039123571207,
 'Tackle_Time_calculated_mean_offense': 51.03466820664923,
 'Tackle_Time_calculated_median_defense': 63.53681874209167,
 'Tackle_Time_calculated_median_offense': 47.15986775490071,
 'Tackle_Time_calculated_min_defense': 51.68240080274859,
 'Tackle_Time_calculated_std_defense': 28.128749683058974,
 'Tackle_Time_median_defense': 1623.7915906913672,
 'Tackle_Time_std_offense': 656.3077872386906,
 '10_Distance_defense': 71.35915992358655,
 '10_Distance_offense': 55.53751695025626,
 '4_Distance_defense': 56.199088711472896,
 '4_Distance_offense': 23.757630710994736,
 '5_Distance_defense': 57.492452600319645,
 '5_Distance_offense': 25.16676443248119,
 '6_Distance_defense': 59.987748857579255,
 '7_Distance_offense': 37.527352437921856,
 '8_Distance_defense': 63.21612485750769,
 '8_Distance_offense': 44.17907460551885,
 '9_Distance_offense': 47.95070045369514,
 '11_Speed_defense': 11.051869013921115,
 '1_Speed_offense': 12.129,
 '2_Speed_defense': 10.880999999999998,
 '4_Speed_offense': 11.359094303944314,
 '5_Speed_defense': 10.335010003866977,
 '6_Speed_defense': 10.817792602474865,
 '6_Speed_offense': 12.427067931167828,
 '8_Speed_offense': 12.397808379737048,
 '11_TackleTime_defense': 3623.090437734063,
 '1_TackleTime_defense': 1079.0172287781133,
 '3_TackleTime_defense': 1239.113465345285,
 '3_TackleTime_offense': 835.9889532762971,
 '5_TackleTime_defense': 1616.2011786903267,
 '7_TackleTime_defense': 1665.8392959706537,
 '7_TackleTime_offense': 1056.1569959054386,
 '9_TackleTime_defense': 2666.22520616695,
 '9_TackleTime_offense': 2261.6432189892384,
 '2_Distance_by_Speed': 1218.4551243275237,
 'Mean_dist_closest_3_off': 17.19388383973217,
 'Diff_mean_dist_closest_3': 7.745755161725263,
 'Mean_TackleTime_closest_3_def': 1178.8619394836405,
 'Ballspeed_by_def1speed': 444.6,
 'Ballspeed_by_def2speed': 298.99999999999994,
 'Ballspeed_by_def3speed': 630.4999999999999,
 'VoronoiDivided': 15.113452152384685,
 'VoronoiDividedCorrected': 7.439795458758944,
 'VoronoiMultiplied': 1170.0,
 'VoronoiMultipliedCorrected': 1170.0}

In [48]:
train_dense = train[dense_features]
sss = {}
medians = {}
for col in tqdm_notebook(train_dense.columns):
    print(col)
    train_dense[col] = pd.to_numeric(train_dense[col], errors='coerce')
    medians[col] = np.nanmedian(train_dense[col])
    train_dense.loc[:, col] = train_dense[col].fillna(medians[col])
    try:
        train_dense[col] = np.clip(train_dense[col].values, minimals[col], maximals[col])
    except:
        pass
    ss = StandardScaler()
    train_dense.loc[:, col] = ss.fit_transform(train_dense[col].values[:,None])
    sss[col] = ss

HBox(children=(IntProgress(value=0, max=156), HTML(value='')))

S
A
Dis
DefendersInTheBox
PlayerHeight
PlayerWeight
Temperature
BallCarrier
ToLeft
IsOnOffense
X_std
Y_std
Orientation_std
Dir_std
X_std_end_Dir
Y_std_end_Dir
Quarterback
ScoreOffense
Season_2017
Season_2018
Season_2019
Quarter_Down
Orientation_sin
Orientation_cos
Dir_sin
Dir_cos
num_DB
num_RB
num_WR
num_TE
OL_diff
OL_TE_diff
Month
Evening
GameClock_minute_total
PlayerAge
v_horizontal
v_vertical
a_horizontal
a_vertical
dis_horizontal
dis_vertical
Dist_YardLine
def_min_dist
def_mean_dist
def_std_dist
def_dist_to_back
DefendersInTheBox_vs_Distance
BallCarrier_back_from_scrimmage
BallCarrier_back_moving_down_field
BallCarrier_min_dist
BallCarrier_std_dist
BallCarrier_dist_to_back
Quarterback_back_from_scrimmage
Quarterback_back_moving_down_field
Quarterback_std_dist
Quarterback_dist_to_back
VoronoidSpaceBall
VoronoidPredictedSpaceBall
VoronoidPredictedCorrectedSpaceBall
Dis_max_defense
Dis_max_offense
Dis_mean_defense
Dis_mean_offense
Dis_min_offense
PlayerAge_mean_defense
PlayerAge_mean_

## Divide features into groups

In [49]:
presonal_drop = ['IsOnOffense', 'BallCarrier', 'Quarterback']

In [50]:
dense_player_features = ['S', 'A', 'Dis',  'PlayerHeight', 'PlayerWeight', 
       'BallCarrier', 'X_std',  'Orientation_std', 
       'X_std_end_Dir', 'Y_std_end_Dir', 
       'Quarterback', 'Orientation_sin', 'Orientation_cos', 'Dir_sin',
       'Dir_cos', 'PlayerAge', 'Dist_YardLine',
        'v_horizontal', 'v_vertical', 'a_horizontal', 'a_vertical', 'dis_horizontal', 'dis_vertical',
       'BallCarrier_dist_to_back', 'Quarterback_dist_to_back', 'def_dist_to_back',  'IsOnOffense', 'Y_std', 'Dir_std']

#add this to make sure that there will be no mismatch later
dense_player_features = list(set(dense_player_features)&set(train_dense.columns))

dense_game_features = list(set(train_dense.columns) - set(dense_player_features))


cat_game_features = ['PossessionTeam', 'DefenseTeam', 'OffenseFormation', 'Stadium', 'Location', 'StadiumType']

cat_player_features = ['DisplayName',  'Position']

In [51]:
(train_dense[dense_game_features].head(22).std()>0.000001).sum()

0

In [52]:
game_dense=train_dense[dense_game_features].drop_duplicates().values
game_cat=train_cat.loc[train_dense[dense_game_features].drop_duplicates().index, cat_game_features].values

shape = train_dense.loc[train_dense['IsOnOffense']==1, dense_player_features].values.shape[1]


offense_dense=train_dense.loc[train_dense['IsOnOffense']==1, dense_player_features].values.reshape(-1,11,shape)
offence_cat=train_cat.loc[train_dense['IsOnOffense']==1, cat_player_features].values.reshape(-1,11,2)

defense_dense=train_dense.loc[train_dense['IsOnOffense']!=1, dense_player_features].values.reshape(-1,11,shape)
deffence_cat=train_cat.loc[train_dense['IsOnOffense']!=1, cat_player_features].values.reshape(-1,11,2)

# rusher
rusher_dense=train_dense.loc[train_dense['BallCarrier']>0, dense_player_features].drop(presonal_drop,axis=1).values
rusher_cat=train_cat.loc[train_dense['BallCarrier']>0, cat_player_features].values

# qb
qb_dense=train_dense.loc[train_dense['Quarterback']>0, dense_player_features].drop(presonal_drop,axis=1).values
qb_cat=train_cat.loc[train_dense['Quarterback']>0, cat_player_features].values

In [53]:
game_dense.shape, game_cat.shape, offense_dense.shape, offence_cat.shape, defense_dense.shape, deffence_cat.shape, rusher_dense.shape, rusher_cat.shape, qb_dense.shape, qb_cat.shape

((23171, 127),
 (23171, 6),
 (23171, 11, 29),
 (23171, 11, 2),
 (23171, 11, 29),
 (23171, 11, 2),
 (23171, 26),
 (23171, 2),
 (23171, 26),
 (23171, 2))

In [54]:
# extra security for any nan. it should prevent
for f in [game_dense, game_cat, offense_dense, offence_cat, defense_dense, deffence_cat, rusher_dense, rusher_cat, qb_dense, qb_cat]:
    np.nan_to_num(f, copy=False)

### COX

In [55]:
def extract_feature(play, is_train=True):
    
    if play['PlayDirection'].iloc[0] == 'right':
        direction = 1
    else:
        direction = -1
        
    home, away = play['Team'].values == 'home', play['Team'].values == 'away'
    indRusher = np.where(play['NflId'].values == play['NflIdRusher'].iloc[0])[0][0]

    if play['FieldPosition'].iloc[0] == play['PossessionTeam'].iloc[0]:
        yardToGoal = 100 - play['YardLine'].iloc[0]
        start = np.array([120 + (play['YardLine'].iloc[0] + 10) * direction, 53.3 / 2]) % 120
    else:
        yardToGoal = play['YardLine'].iloc[0]
        start = np.array([120 - (play['YardLine'].iloc[0] + 10) * direction, 53.3 / 2]) % 120

    Dir = play['Dir'].values
    rad = np.nan_to_num(2 * np.pi * (90 - Dir) / 360)
    x, y = play['X'].values, play['Y'].values
    S = play['S'].values * np.logical_not(np.isnan(Dir))
    A = play['A'].values * np.logical_not(np.isnan(Dir))

    loc = np.vstack([x - start[0], y - start[1]]).T * direction
    vel = (S * np.vstack([np.cos(rad), np.sin(rad)])).T * direction
    acc = (A * np.vstack([np.cos(rad), np.sin(rad)])).T * direction
    locRusher, velRusher, accRusher = loc[indRusher], vel[indRusher], acc[indRusher]

    diff = np.hstack([np.square(loc - locRusher), np.square(vel - velRusher), np.square(acc - accRusher)])

    scrimWidth = 5
    inTheBox = (play['NflId'].values != play['NflIdRusher'].iloc[0]) * (np.abs(loc[:, 0]) < scrimWidth)

    locDet = np.linalg.slogdet(np.exp(- np.square(loc[inTheBox, np.newaxis] - loc[inTheBox][np.newaxis]).sum(2) / 2.))[1]
    locHomeDet = np.linalg.slogdet(np.exp(- np.square(loc[home * inTheBox][:, np.newaxis] - loc[home * inTheBox][np.newaxis]).sum(2) / 2.))[1]
    locAwayDet = np.linalg.slogdet(np.exp(- np.square(loc[away * inTheBox][:, np.newaxis] - loc[away * inTheBox][np.newaxis]).sum(2) / 2.))[1]

    if play['PossessionTeam'].iloc[0] == play['HomeTeamAbbr'].iloc[0]:
        x = np.hstack([diff[home].sum(0), diff[away].sum(0), locDet, locHomeDet, locAwayDet])
    else: 
        x = np.hstack([diff[away].sum(0), diff[home].sum(0), locDet, locAwayDet, locHomeDet])
    
#     vor = voronoid_space_features(play)
    x = np.hstack([locRusher, velRusher, accRusher, x, (downs == play['Down'].iloc[0]).astype(np.float)])#, (teams == play['PossessionTeam'].iloc[0]).astype(np.float)])    
    
    offset = locRusher[0] - 5
    threshold = play['Distance'].iloc[0] - offset
    
    if is_train:
         
        yard = play['Yards'].iloc[0] - offset
        
        c = yard < threshold
        y = np.minimum(yard, threshold)
        
        return x, y, c, offset
    
    else:
        return x, offset

In [56]:
data = train_init.copy(deep=True)# pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)

data.loc[data.HomeTeamAbbr.values == "ARI", 'HomeTeamAbbr'] = "ARZ"
data.loc[data.HomeTeamAbbr.values == "BAL", 'HomeTeamAbbr'] = "BLT"
data.loc[data.HomeTeamAbbr.values == "CLE", 'HomeTeamAbbr'] = "CLV"
data.loc[data.HomeTeamAbbr.values == "HOU", 'HomeTeamAbbr'] = "HST"

data.loc[data['Season'] == 2017, 'S'] = (data['S'][data['Season'] == 2017] - 2.4355) / 1.2930 * 1.4551 + 2.7570

downs = np.array([1, 2, 3])
teams = pd.get_dummies(data['PossessionTeam']).columns[:-1]

# train = data
n_train = data.shape[0] // 22

In [57]:
import statsmodels.api as sm

In [58]:
inds = list(data.groupby('PlayId').groups.values())

xs, ys, cs, os = [], [], [], []

for i in range(n_train):
    try:
        ind = inds[i]
        play = data.loc[ind]
        x, y, c, o = extract_feature(play)
    except:
        print("ERROR: problem with COX")
        x = np.array([-4.88000000e+00,  1.20000000e-01,  3.20315977e+00, -3.08452002e-02,
        1.80993108e+00, -2.02125176e-02,  2.01013000e+02,  4.60299800e+02,
        9.92685007e+01,  5.66515048e+01,  3.14532910e+01,  2.08711101e+01,
        9.51740400e+02,  5.75563600e+02,  1.67962734e+02,  5.03570419e+01,
        5.96471237e+01,  2.17650753e+01, -5.55762769e+00, -1.00027062e+00,
       -1.28013294e-01,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00])
        y=12.46
        c=True
        o=-9.879999999999995
        
        
    xs.append(x)
    ys.append(y)
    cs.append(c)
    os.append(o)

xs, ys, cs, os = np.vstack(xs), np.hstack(ys), np.array(cs).astype(np.int), np.hstack(os)
ys = np.maximum(0, ys)

In [59]:
model = sm.PHReg(ys, xs, cs)
result = model.fit()

baseline_cum_hazard_func = result.baseline_cumulative_hazard_function[0]
pred_index = np.arange(-99, 100)

In [60]:
cox_xb = []
cox_preds = []
for i in range(n_train):
    ind = inds[i]
    play = data.loc[ind]
    
    xb = np.exp(result.params.dot(xs[i]))
    cum_hazard =  xb * baseline_cum_hazard_func(pred_index - os[i])
    pred = 1 - np.exp(- cum_hazard)

    if play['FieldPosition'].iloc[0] == play['PossessionTeam'].iloc[0]:
        yardToGoal = 100 - play['YardLine'].iloc[0]
    else:
        yardToGoal = play['YardLine'].iloc[0]

    pred /= pred[pred_index <= yardToGoal][-1]
    pred[pred_index > yardToGoal] = 1.
    
    cox_xb.append(xb)
    cox_preds.append(pred)
    
cox_xb, cox_preds = np.hstack(cox_xb), np.vstack(cox_preds)
cox_features = np.hstack([cox_preds[:,95:115], cox_xb.reshape(-1,1)])

result_cox_main = result

In [61]:
game_dense.shape, cox_features.shape, cox_preds.shape

((23171, 127), (23171, 21), (23171, 199))

In [62]:
game_dense = np.hstack([game_dense, cox_features])

In [63]:
def return_step(x):
    temp = np.zeros(199)
    temp[x + 99:] = 1
    return temp

def cross_entropy_target(x):
    temp = np.zeros(199)
    temp[x + 99] = 1
    return temp

train_y_raw = train["Yards"].iloc[np.arange(0, len(train), 22)].reset_index(drop = True)
train_y = np.vstack(train_y_raw.apply(return_step).values)
train_y_cross = np.vstack(train_y_raw.apply(cross_entropy_target).values)

## Model

In [64]:
def extract_feature(play, is_train=True):
    
    if play['PlayDirection'].iloc[0] == 'right':
        direction = 1
    else:
        direction = -1
        
    home, away = play['Team'].values == 'home', play['Team'].values == 'away'
    indRusher = np.where(play['NflId'].values == play['NflIdRusher'].iloc[0])[0][0]

    if play['FieldPosition'].iloc[0] == play['PossessionTeam'].iloc[0]:
        yardToGoal = 100 - play['YardLine'].iloc[0]
        start = np.array([120 + (play['YardLine'].iloc[0] + 10) * direction, 53.3 / 2]) % 120
    else:
        yardToGoal = play['YardLine'].iloc[0]
        start = np.array([120 - (play['YardLine'].iloc[0] + 10) * direction, 53.3 / 2]) % 120

    Dir = play['Dir'].values
    rad = np.nan_to_num(2 * np.pi * (90 - Dir) / 360)
    x, y = play['X'].values, play['Y'].values
    S = play['S'].values * np.logical_not(np.isnan(Dir))
    A = play['A'].values * np.logical_not(np.isnan(Dir))

    loc = np.vstack([x - start[0], y - start[1]]).T * direction
    vel = (S * np.vstack([np.cos(rad), np.sin(rad)])).T * direction
    acc = (A * np.vstack([np.cos(rad), np.sin(rad)])).T * direction
    locRusher, velRusher, accRusher = loc[indRusher], vel[indRusher], acc[indRusher]

    diff = np.hstack([np.square(loc - locRusher), np.square(vel - velRusher), np.square(acc - accRusher)])

    scrimWidth = 5
    inTheBox = (play['NflId'].values != play['NflIdRusher'].iloc[0]) * (np.abs(loc[:, 0]) < scrimWidth)

    locDet = np.linalg.slogdet(np.exp(- np.square(loc[inTheBox, np.newaxis] - loc[inTheBox][np.newaxis]).sum(2) / 2.))[1]
    locHomeDet = np.linalg.slogdet(np.exp(- np.square(loc[home * inTheBox][:, np.newaxis] - loc[home * inTheBox][np.newaxis]).sum(2) / 2.))[1]
    locAwayDet = np.linalg.slogdet(np.exp(- np.square(loc[away * inTheBox][:, np.newaxis] - loc[away * inTheBox][np.newaxis]).sum(2) / 2.))[1]

    if play['PossessionTeam'].iloc[0] == play['HomeTeamAbbr'].iloc[0]:
        x = np.hstack([diff[home].sum(0), diff[away].sum(0), locDet, locHomeDet, locAwayDet])
    else: 
        x = np.hstack([diff[away].sum(0), diff[home].sum(0), locDet, locAwayDet, locHomeDet])
    
#     vor = voronoid_space_features(play)
    x = np.hstack([locRusher, velRusher, accRusher, x, (downs == play['Down'].iloc[0]).astype(np.float)])#, (teams == play['PossessionTeam'].iloc[0]).astype(np.float)])    
    
    offset = locRusher[0] - 5
    threshold = play['Distance'].iloc[0] - offset
    
    if is_train:
         
        yard = play['Yards'].iloc[0] - offset
        
        c = yard < threshold
        y = np.minimum(yard, threshold)
        
        return x, y, c, offset
    
    else:
        return x, offset

In [65]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras import regularizers
from keras.callbacks import Callback, EarlyStopping
from keras.layers import Activation
import tensorflow as tf
from keras.utils.generic_utils import get_custom_objects

In [66]:
def crop(dimension, start, end):
    # Crops (or slices) a Tensor on a given dimension from start to end
    # example : to crop tensor x[:, :, 5:10]
    # call slice(2, 5, 10) as you want to crop on the second dimension
    def func(x):
        if dimension == 0:
            return x[start: end]
        if dimension == 1:
            return x[:, start: end]
        if dimension == 2:
            return x[:, :, start: end]
        if dimension == 3:
            return x[:, :, :, start: end]
        if dimension == 4:
            return x[:, :, :, :, start: end]
    return keras.layers.Lambda(func)

In [67]:
def get_emb_one_dim(input_cat, embeding, dim):
    emb_cat = crop(1, dim, dim+1)(input_cat)
    emb_cat = embeding(emb_cat)
    emb_cat =keras.layers.Flatten()(emb_cat)
    return emb_cat

def get_emb_two_dim(input_cat, embeding, dim):
    emb_cat = crop(2, dim, dim+1)(input_cat)
    emb_cat = embeding(emb_cat)
    emb_cat = keras.layers.Reshape((int(emb_cat.shape[1]), int(emb_cat.shape[2]) * int(emb_cat.shape[3])))(emb_cat)
    return emb_cat

def get_emb_person(input_cat, embedding_names, embedding_position):
    emb_name = get_emb_one_dim(input_cat, embedding_names, dim=0)
    emb_position = get_emb_one_dim(input_cat, embedding_position, dim=1)
    emb_person = keras.layers.Concatenate()([emb_name, emb_position])
    return emb_person

def get_emb_team(input_cat, embedding_names, embedding_position):
    emb_name = get_emb_two_dim(input_cat, embedding_names, dim=0)
    emb_position = get_emb_two_dim(input_cat, embedding_position, dim=1)
    emb_person = keras.layers.Concatenate()([emb_name, emb_position])
    return emb_person

In [68]:
keras.backend.clear_session()
def crps(y_true, y_pred):
    loss = K.mean((K.cumsum(y_pred, axis = 1) - y_true)**2)
    return loss
def crps_np(y_true, y_pred):
    loss = np.mean((np.cumsum(y_pred, axis = 1) - y_true)**2)
    return loss

def get_model(batch_size = 32, epochs = 10):
    
    activation="relu"
    drop_out_rate=0.5
    
    ## inputs   
    input_dense_game = keras.layers.Input(shape=(game_dense.shape[1],), name = "numerical_general_inputs")
    input_cat_game = keras.layers.Input(shape=(game_cat.shape[1], ), name = "categorical_general_inputs")
    input_dense_offense = keras.layers.Input(shape=(offense_dense.shape[1],offense_dense.shape[2]), name = "numerical_offense_inputs")
    input_cat_offense = keras.layers.Input(shape=(offence_cat.shape[1], offence_cat.shape[2]), name = "categorical_offense_input")
    input_dense_deffense = keras.layers.Input(shape=(defense_dense.shape[1],defense_dense.shape[2]), name = "numerical_deffense_inputs")
    input_cat_deffense = keras.layers.Input(shape=(deffence_cat.shape[1], deffence_cat.shape[2]), name = "categorical_deffense_input")
    input_dense_rusher = keras.layers.Input(shape=(rusher_dense.shape[1],), name = "numerical_rusher_inputs")
    input_cat_rusher = keras.layers.Input(shape=(rusher_cat.shape[1], ), name = "categorical_rusher_inputs")
    input_dense_qb = keras.layers.Input(shape=(qb_dense.shape[1],), name = "numerical_qb_inputs")
    input_cat_qb = keras.layers.Input(shape=(qb_cat.shape[1], ), name = "categorical_qb_inputs")
    
    # embadings_game
    embedding_offense = keras.layers.Embedding(len(categories['PossessionTeam']), 32, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_deffense = keras.layers.Embedding(len(categories['DefenseTeam']), 32, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_formation = keras.layers.Embedding(len(categories['OffenseFormation']), 8, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_s = keras.layers.Embedding(len(categories['Stadium']), 12, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_location = keras.layers.Embedding(len(categories['Location']), 12, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_s_type = keras.layers.Embedding(len(categories['StadiumType']), 3, embeddings_regularizer=regularizers.l2(1e-4))
    ## embeddings_players
    embedding_names = keras.layers.Embedding(len(categories['DisplayName']), 40, embeddings_regularizer=regularizers.l2(1e-4))
    embedding_position = keras.layers.Embedding(len(categories['Position']), 12, embeddings_regularizer=regularizers.l2(1e-4))

    emb_cat_game_offense = get_emb_one_dim(input_cat_game, embedding_offense, dim=0)
    emb_cat_game_defense = get_emb_one_dim(input_cat_game, embedding_deffense, dim=1)
    emb_cat_game_formation = get_emb_one_dim(input_cat_game, embedding_formation, dim=2)
    emb_cat_game_s = get_emb_one_dim(input_cat_game, embedding_s, dim=3)
    emb_cat_game_location = get_emb_one_dim(input_cat_game, embedding_location, dim=4)
    emb_cat_game_s_type = get_emb_one_dim(input_cat_game, embedding_s_type, dim=5)
    
    emb_cat_game = keras.layers.Concatenate(name = "emb_general_features")([emb_cat_game_offense, emb_cat_game_defense, emb_cat_game_formation, emb_cat_game_s, emb_cat_game_location, emb_cat_game_s_type])
    
    emb_cat_rusher = get_emb_person(input_cat_rusher, embedding_names, embedding_position)
    emb_cat_qb = get_emb_person(input_cat_qb, embedding_names, embedding_position)
    
    emb_cat_offense = get_emb_team(input_cat_offense, embedding_names, embedding_position)
    emb_cat_deffense = get_emb_team(input_cat_deffense, embedding_names, embedding_position)    
    
    ## general game features
    game = keras.layers.Concatenate(name = "general_features")([input_dense_game, emb_cat_game])
#     game = keras.layers.Dropout(0.5)(game)
    game = keras.layers.Dense(32, activation=activation)(game)
#     game = keras.layers.normalization.BatchNormalization()(game)
    game = keras.layers.Dropout(drop_out_rate)(game)
#     game = keras.layers.Dense(128, activation=activation)(game)
#     game = keras.layers.Dropout(0.5)(game)
    
    ## rusher features
    rusher = keras.layers.Concatenate(name = "rusher_features")([input_dense_rusher, emb_cat_rusher])
#     rusher = keras.layers.Dropout(0.5)(rusher)
    rusher = keras.layers.Dense(32, activation=activation)(rusher)
#     rusher = keras.layers.normalization.BatchNormalization()(rusher)
    rusher = keras.layers.Dropout(drop_out_rate)(rusher)
#     rusher = keras.layers.Dense(64, activation=activation)(rusher)
#     rusher = keras.layers.Dropout(0.5)(rusher)
    
     ## rusher features
    qb = keras.layers.Concatenate(name = "qb_features")([input_dense_qb, emb_cat_qb])
#     qb = keras.layers.Dropout(0.5)(qb)
    qb = keras.layers.Dense(32, activation=activation)(qb)
#     qb = keras.layers.normalization.BatchNormalization()(qb)
    qb = keras.layers.Dropout(drop_out_rate)(qb)
#     qb = keras.layers.Dense(64, activation=activation)(qb)
#     qb = keras.layers.Dropout(0.5)(qb)
    
        ## players features
    offense = keras.layers.Concatenate(name = "offense_features")([input_dense_offense, emb_cat_offense])   
    n_unit = 16
    offense_aves = []
    for k in range(3):
        offense = keras.layers.Dense(16, activation=None)(offense)
        offense_aves.append(keras.layers.GlobalAveragePooling1D()(offense))
        offense = keras.layers.Activation(activation)(offense)
    offense = keras.layers.Concatenate(name = "deep_offense_features")(offense_aves)
    offense = keras.layers.Dropout(drop_out_rate)(offense)

    deffense = keras.layers.Concatenate(name = "deffense_features")([input_dense_deffense, emb_cat_deffense])
    n_unit = 16
    deffenses_aves = []
    for k in range(3):
        deffense = keras.layers.Dense(16, activation=None)(deffense)
        deffenses_aves.append(keras.layers.GlobalAveragePooling1D()(deffense))
        deffense = keras.layers.Activation(activation)(deffense)
    deffense = keras.layers.Concatenate(name = "deep_defense_features")(deffenses_aves)
    deffense = keras.layers.Dropout(drop_out_rate)(deffense)
    

    ### concat all
    x_concat = keras.layers.Concatenate(name = "general_and_players")([game, rusher, offense, deffense])#qb,
#     x_concat = keras.layers.Dropout(0.5)(x_concat)
    print(x_concat.shape)
    x_concats = []
    n_unit = 128
    decay_rate = 0.5
    for k in range(3):
        x_concat = keras.layers.Dense(n_unit, activation=activation)(x_concat)
#         x_concat = keras.layers.normalization.BatchNormalization()(x_concat)
        x_concats.append(x_concat)
        n_unit = int(n_unit * decay_rate)
    x_concat = keras.layers.Concatenate(name = "deep_features")(x_concats)
    x_concat = keras.layers.Dropout(drop_out_rate)(x_concat)
    
    ## concat
#     x_concat = keras.layers.Concatenate(name = "all_concat")([game, rusher, offense, deffense, x_concat])#qb,
#     x_concat = keras.layers.normalization.BatchNormalization()(x_concat)
#     x_concat = keras.layers.GaussianNoise(0.2)(x_concat)
#     x_concat = keras.layers.Dense(256, activation=activation)(x_concat)
#     x_concat = keras.layers.Dropout(drop_out_rate)(x_concat)
#     x_concat = keras.layers.GaussianNoise(0.2)(x_concat)
    out_soft = keras.layers.Dense(199, activation="softmax", name = "out_soft")(x_concat)
    out_soft_entropy = keras.layers.Dense(199, activation="softmax", name = "out_soft_entropy")(x_concat)
    out_reg = keras.layers.Dense(1, activation=None, name = "out_reg")(x_concat)
    model = keras.models.Model(inputs = [input_dense_game, input_cat_game, input_dense_offense, input_cat_offense, input_dense_deffense, input_cat_deffense,
                                         input_dense_rusher, input_cat_rusher, input_dense_qb, input_cat_qb],
                               outputs = [out_soft, out_soft_entropy]) #out_reg

    ## compile
    model.compile(loss=[crps, keras.losses.categorical_crossentropy],  #, keras.losses.mae
                  loss_weights=[1, 0.005], #, 0
                  optimizer=keras.optimizers.Adam(learning_rate=0.0005, decay = 1e-4))

    ## train
    tr_x = [game_dense[tr_inds], game_cat[tr_inds], offense_dense[tr_inds], offence_cat[tr_inds], defense_dense[tr_inds], deffence_cat[tr_inds],rusher_dense[tr_inds], rusher_cat[tr_inds], qb_dense[tr_inds], qb_cat[tr_inds]]
    tr_y = [train_y[tr_inds], train_y_cross[tr_inds]] #, train_y_raw[tr_inds]/100
    val_x = [game_dense[_val_inds], game_cat[_val_inds], offense_dense[_val_inds], offence_cat[_val_inds], defense_dense[_val_inds], deffence_cat[_val_inds], 
             rusher_dense[_val_inds], rusher_cat[_val_inds], qb_dense[_val_inds], qb_cat[_val_inds]]
    val_y = [train_y[_val_inds], train_y_cross[_val_inds]] #, train_y_raw[val_inds]/100
    es = EarlyStopping(monitor='val_out_soft_loss', 
               mode='min',
               restore_best_weights=True, 
               verbose=1, 
               patience=10)
    model.fit(tr_x,
              tr_y,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(val_x, val_y),
             callbacks=[es])
    preds = model.predict(val_x)[0]
    loss = crps_np(val_y[0],preds)
    return model, loss, (np.cumsum(preds, axis = 1))

In [69]:
def train_cox(tr_inds, _val_inds):
    ys_tr, xs_tr, cs_tr = ys[tr_inds], xs[tr_inds], cs[tr_inds]
    cox_model = sm.PHReg(ys_tr, xs_tr, cs_tr)
    result = cox_model.fit()

    baseline_cum_hazard_func = result.baseline_cumulative_hazard_function[0]
    pred_index = np.arange(-99, 100)

    cox_xb = []
    cox_preds = []
    for i in _val_inds:
        ind = inds[i]
        play = data.loc[ind]

        xb = np.exp(result.params.dot(xs[i]))
        cum_hazard =  xb * baseline_cum_hazard_func(pred_index - os[i])
        pred = 1 - np.exp(- cum_hazard)

        if play['FieldPosition'].iloc[0] == play['PossessionTeam'].iloc[0]:
            yardToGoal = 100 - play['YardLine'].iloc[0]
        else:
            yardToGoal = play['YardLine'].iloc[0]

        pred /= pred[pred_index <= yardToGoal][-1]
        pred[pred_index > yardToGoal] = 1.

        cox_xb.append(xb)
        cox_preds.append(pred)

    cox_xb, cox_preds = np.hstack(cox_xb), np.vstack(cox_preds)
    y_true = train_y[_val_inds]

    cox_loss = np.mean((cox_preds - y_true)**2)

    return result, cox_loss, cox_preds, y_true

In [70]:
def set_session():
    os.environ['PYTHONHASHSEED'] = '914'                      
    np.random.seed(914)
    rn.seed(914)
    tf.set_random_seed(914)
    num_cores=1
    num_GPU = 0
    num_CPU = 1
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                                          inter_op_parallelism_threads=num_cores, 
                                          allow_soft_placement=True,
                                          device_count = {'CPU' : num_CPU, 'GPU' : num_GPU})
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)

In [71]:
def find_weight(nn_preds, cox_preds, y_true):
    minimal_loss=1
    for i in np.arange(0, 1.01, 0.05):
        loss = np.mean((nn_preds*i + cox_preds*(1-i) - y_true)**2)
#         print(i, loss)
        if loss < minimal_loss:
            minimal_loss = loss
            best_i = i
#     print("BEST: ", best_i, minimal_loss)
    return best_i, minimal_loss 
    

In [72]:
from sklearn.model_selection import KFold, GroupKFold
rkf = GroupKFold(10)
FOLD_LIST = list(rkf.split(group_split,group_split,group_split))

losses = []
models = []
cox_losses = []
cox_models = []
weights = []
min_losses=[]
count=0
for k_fold, (tr_inds, _val_inds) in enumerate(FOLD_LIST):
    set_session
    print("-----------")
    print("-----------")
    model, loss, nn_preds = get_model(32, 100)
    models.append(model)
    losses.append(loss)
    print("working on Cox model")
    cox_model, cox_loss, cox_preds, y_true = train_cox(tr_inds, _val_inds)
    cox_losses.append(cox_loss)
    cox_models.append(cox_model)
    best_i, minimal_loss = find_weight(nn_preds, cox_preds, y_true)
    weights.append(best_i)
    print(k_fold, loss, cox_loss, minimal_loss)
    min_losses.append(minimal_loss)
#     count+=1
#     if count == 1:
#         break
print("-------")
print(min_losses)
print(np.mean(min_losses))

-----------
-----------
(?, 160)
Train on 20832 samples, validate on 2339 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Restoring model weights from the end of the best epoch
Epoch 00025: early stopping
working on Cox model
0 0.012196951522258396 0.01307013571157025 0.012191974032891175
-----------
-----------
(?, 160)
Train on 20859 samples, validate on 2312 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/1

In [73]:
print(min_losses)
print(np.mean(min_losses))
print("-------")
print(losses)
print(np.mean(losses))

[0.012191974032891175, 0.012590070337647127, 0.011260637665613514, 0.012542546903262161, 0.012178906923265294, 0.012242231348637219, 0.011963296572526769, 0.013207071200148409, 0.01166568775573237, 0.013079033951736005]
0.012292145669146005
-------
[0.012196951522258396, 0.012593319486172051, 0.011279060586687047, 0.012567029701982459, 0.012192048317505198, 0.01224618020625405, 0.011966212859958527, 0.013219704723647658, 0.01166568775573237, 0.013083665887669934]
0.012300986104786768


## Prediction

In [78]:
def cox_predict(x, offset, result):
   
    baseline_cum_hazard_func = result.baseline_cumulative_hazard_function[0]

    xb = np.exp(result.params.dot(x))
    cum_hazard = xb * baseline_cum_hazard_func(pred_index - offset)
    pred = 1 - np.exp(- cum_hazard)

    if play['FieldPosition'].iloc[0] == play['PossessionTeam'].iloc[0]:
        yardToGoal = 100 - play['YardLine'].iloc[0]
    else:
        yardToGoal = play['YardLine'].iloc[0]

    pred /= pred[pred_index <= yardToGoal][-1]
    pred[pred_index > yardToGoal] = 1.
    
    return np.hstack([pred[95:115], xb]), pred

In [79]:
def make_pred(test, sample, env, model, cox_models, weights):
    test = test.drop(useless, axis=1)
    test[dense_features_init] = clean_dense(test[dense_features_init])
    
    #COX
    play = copy.deepcopy(test)    
    play.loc[play.HomeTeamAbbr.values == "ARI", 'HomeTeamAbbr'] = "ARZ"
    play.loc[play.HomeTeamAbbr.values == "BAL", 'HomeTeamAbbr'] = "BLT"
    play.loc[play.HomeTeamAbbr.values == "CLE", 'HomeTeamAbbr'] = "CLV"
    play.loc[play.HomeTeamAbbr.values == "HOU", 'HomeTeamAbbr'] = "HST"
    try:
        x, offset = extract_feature(play, False)
    except:
        x = np.array([-4.88000000e+00,  1.20000000e-01,  3.20315977e+00, -3.08452002e-02,
        1.80993108e+00, -2.02125176e-02,  2.01013000e+02,  4.60299800e+02,
        9.92685007e+01,  5.66515048e+01,  3.14532910e+01,  2.08711101e+01,
        9.51740400e+02,  5.75563600e+02,  1.67962734e+02,  5.03570419e+01,
        5.96471237e+01,  2.17650753e+01, -5.55762769e+00, -1.00027062e+00,
       -1.28013294e-01,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00])
        offset=-9.879999999999995
        
    cox_data, _ = cox_predict(x, offset, result_cox_main)
    
    
    test = preprocess(test)
    test = test.replace([np.inf, -np.inf], np.nan) 
    test = drop(test)

    ### categorical
    test_cat = test.loc[:,cat_features]
    for col in (test_cat.columns):
        test_cat.loc[:,col] = test_cat[col].fillna("nan")
        test_cat.loc[:,col] = col + "__" + test_cat[col].astype(str)
        isnan = ~test_cat.loc[:,col].isin(categories)
        if np.sum(isnan) > 0:
            if not ((col + "__nan") in categories):
                test_cat.loc[isnan,col] = most_appear_each_categories[col]
            else:
                test_cat.loc[isnan,col] = col + "__nan"
    for col in (test_cat.columns):
        le = labels_encoders[col]
        test_cat.loc[:, col] = le.transform(test_cat[col])

    ### dense
    test_dense = test[dense_features]
    for col in (test_dense.columns):
        test_dense[col] = pd.to_numeric(test_dense[col], errors='coerce')
        test_dense.loc[:, col] = test_dense[col].fillna(medians[col])
        try:
            test_dense[col] = np.clip(test_dense[col].values, minimals[col], maximals[col])
        except:
            pass
        test_dense.loc[:, col] = sss[col].transform(test_dense[col].values[:,None])
        
    ### divide
  
    
    test_game_dense=test_dense[dense_game_features].drop_duplicates().values
    test_game_cat=test_cat.loc[test_dense[dense_game_features].drop_duplicates().index, cat_game_features].values

    test_offense_dense=test_dense.loc[test_dense['IsOnOffense']==1, dense_player_features].values.reshape(-1,11,shape)
    test_offence_cat=test_cat.loc[test_dense['IsOnOffense']==1, cat_player_features].values.reshape(-1,11,2)

    test_defense_dense=test_dense.loc[test_dense['IsOnOffense']!=1, dense_player_features].values.reshape(-1,11,shape)
    test_deffence_cat=test_cat.loc[test_dense['IsOnOffense']!=1, cat_player_features].values.reshape(-1,11,2)

    # rusher
    test_rusher_dense=test_dense.loc[test_dense['BallCarrier']>0, dense_player_features].drop(presonal_drop,axis=1).values
    test_rusher_cat=test_cat.loc[test_dense['BallCarrier']>0, cat_player_features].values
    
    test_qb_dense=test_dense.loc[test_dense['Quarterback']>0, dense_player_features].drop(presonal_drop,axis=1).values
    test_qbr_cat=test_cat.loc[test_dense['Quarterback']>0, cat_player_features].values
        
    test_game_dense = np.hstack([test_game_dense, cox_data.reshape(1,-1)])
    for f in [test_game_dense, test_game_cat, test_offense_dense, test_offence_cat, test_defense_dense, test_deffence_cat, test_rusher_dense, test_rusher_cat, test_qb_dense, test_qbr_cat]:
        np.nan_to_num(f, copy=False)
    
    test_inp = [test_game_dense, test_game_cat, test_offense_dense, test_offence_cat, test_defense_dense, test_deffence_cat, test_rusher_dense, test_rusher_cat, test_qb_dense, test_qbr_cat]

    
    ## pred
    pred = 0
    for k in range(len(models)):
        model, cox_model, w = models[k], cox_models[k], weights[k]
        _pred = model.predict(test_inp)[0]
        _pred = np.cumsum(_pred, axis = 1)
        _, _cox_pred = cox_predict(x, offset, cox_model)
        pred += _pred * w + _cox_pred*(1-w)
    pred /= len(models)
    pred = np.clip(pred, 0, 1)
    env.predict(pd.DataFrame(data=pred,columns=sample.columns))
    return pred

In [80]:
env = nflrush.make_env()
preds = []
for test_init, sample in tqdm_notebook(env.iter_test()):
    try:
        pred = make_pred(test_init, sample, env, models, cox_models, weights)
    except:
        print("ERROR: problem with predictions with sample # ", len(preds))
        env.predict(sample)
        preds.append(sample)
    else:
        preds.append(pred)
    
env.write_submission_file()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.


In [81]:
preds = np.vstack(preds)
## check whether prediction is submittable
print(np.mean(np.diff(preds, axis = 1) >= 0) == 1.0)
print(np.mean(preds > 1) == 0)

True
True


In [82]:
print(losses)
print(np.mean(losses))

[0.012196951522258396, 0.012593319486172051, 0.011279060586687047, 0.012567029701982459, 0.012192048317505198, 0.01224618020625405, 0.011966212859958527, 0.013219704723647658, 0.01166568775573237, 0.013083665887669934]
0.012300986104786768
