In [90]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [330]:
df = pd.read_csv('df_for_stats.csv')

In [92]:
df.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1,Linebacker,29,4,Indoors,Synthetic,84,Cloudy,Punt,32,OLB,LB
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0,Linebacker,50,7,Open,Natural,76,Partly Cloudy,Punt,26,OLB,LB
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1,Safety,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,70,SS,DB
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0,Linebacker,15,3,Outdoor,Natural,80,Cloudy,Punt,30,LB,LB
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1,Wide Receiver,32,5,Retractable Roof,Synthetic,89,Partly Cloudy,Kickoff,69,WR,WR


# Data cleaning
* fit traditional statistical model (regression statsmodel) to see the interpret coefficients (positive means positively correlated to outcome, p-value shows statistical significance)
* label encode field type
* play type hot encode
* bin/reduce sizes of categories in StatidumType and weather
* hot encode weather and stadium type
* get rid of DM_M1 to DM_M42
* target: severity

In [331]:
df['Severity'] = df['DM_M1'] + df['DM_M7'] + df['DM_M28'] + df['DM_M42']

In [332]:
df = df.drop(['PlayerKey', 'GameID', 'PlayKey', 'DM_M1', 'DM_M7', 'DM_M28', 'DM_M42', 'FieldType'], axis=1)

In [333]:
df.head()

Unnamed: 0,BodyPart,Surface,RosterPosition,PlayerDay,PlayerGame,StadiumType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,Severity
0,Knee,Synthetic,Linebacker,29,4,Indoors,84,Cloudy,Punt,32,OLB,LB,4
1,Knee,Natural,Linebacker,50,7,Open,76,Partly Cloudy,Punt,26,OLB,LB,2
2,Ankle,Synthetic,Safety,1,1,Outdoor,63,Clear and warm,Pass,70,SS,DB,4
3,Ankle,Natural,Linebacker,15,3,Outdoor,80,Cloudy,Punt,30,LB,LB,1
4,Ankle,Synthetic,Wide Receiver,32,5,Retractable Roof,89,Partly Cloudy,Kickoff,69,WR,WR,4


In [96]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [152]:
df.isnull().sum()

BodyPart          0
Surface           0
RosterPosition    0
PlayerDay         0
PlayerGame        0
StadiumType       4
Temperature       0
Weather           3
PlayType          0
PlayerGamePlay    0
Position          0
PositionGroup     0
Severity          0
dtype: int64

In [401]:
# start with label encoding, do all the hot encoding after
le = LabelEncoder()
surface = le.fit(df['Surface'])
print(surface.classes_)

['Natural' 'Synthetic']


In [402]:
surface_transformed = le.transform(df['Surface'])    #### ADD TO FINAL TABLE
print(surface_transformed) # 1 = Synthetic, 0 = Natural

[1 0 1 0 1 0 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 1
 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 1 0 1 1
 0 1 1]


In [155]:
print(len(df['StadiumType'].unique()))
print(len(df['Weather'].unique()))
# need to bin these into simpler categories

17
24


In [156]:
df['StadiumType'].unique()

array(['Indoors', 'Open', 'Outdoor', 'Retractable Roof', 'Indoor', 'Dome',
       'Outddors', 'Outdoors', 'Retr. Roof-Closed', nan, 'Closed Dome',
       'Oudoor', 'Indoor, Roof Closed', 'Indoor, Open Roof',
       'Retr. Roof - Open', 'Domed, closed', 'Retr. Roof - Closed'],
      dtype=object)

In [334]:
df['StadiumSimplified'] = ""  # drop later
df['StadiumType'] = df['StadiumType'].fillna('unknown') # placing unknowns in outdoors list, most common type
indoors = ['Indoors', 'Retractable Roof', 'Indoor', 'Dome', 'Retr. Roof-Closed','Closed Dome', 'Indoor, Roof Closed', 'Domed, closed', 'Retr. Roof - Closed']
outdoors = ['Open', 'Outdoor', 'Outddors', 'Outdoors', 'Oudoor', 'Indoor, Open Roof', 'Retr. Roof - Open', 'unknown'] 
len(indoors) + len(outdoors)

17

In [335]:
for index, row in df.iterrows():
    for stadtype in indoors:
        if stadtype == row['StadiumType']:
            df.loc[df.index[index], 'StadiumSimplified'] = 'Indoors'
    for stadtype in outdoors:
        if stadtype == row['StadiumType']:
            df.loc[df.index[index], 'StadiumSimplified'] = 'Outdoors'

In [336]:
df.head()

Unnamed: 0,BodyPart,Surface,RosterPosition,PlayerDay,PlayerGame,StadiumType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,Severity,StadiumSimplified
0,Knee,Synthetic,Linebacker,29,4,Indoors,84,Cloudy,Punt,32,OLB,LB,4,Indoors
1,Knee,Natural,Linebacker,50,7,Open,76,Partly Cloudy,Punt,26,OLB,LB,2,Outdoors
2,Ankle,Synthetic,Safety,1,1,Outdoor,63,Clear and warm,Pass,70,SS,DB,4,Outdoors
3,Ankle,Natural,Linebacker,15,3,Outdoor,80,Cloudy,Punt,30,LB,LB,1,Outdoors
4,Ankle,Synthetic,Wide Receiver,32,5,Retractable Roof,89,Partly Cloudy,Kickoff,69,WR,WR,4,Indoors


In [337]:
df = df.drop('StadiumType', axis=1)

In [403]:
stadium = le.fit(df['StadiumSimplified'])
stadium.classes_

array(['Indoors', 'Outdoors'], dtype=object)

In [404]:
stadium_transformed = le.transform(df['StadiumSimplified']) #### ADD TO FINAL TABLE
stadium_transformed # 0 = Indoors, 1 = Outdoors

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [172]:
df['Weather'].unique()

array(['Cloudy', 'Partly Cloudy', 'Clear and warm', 'Sunny', 'Indoor',
       'Clear', nan, 'Controlled Climate', 'Cold',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Sun & clouds', 'Indoors', 'Rain', 'Coudy', 'Mostly sunny', 'Fair',
       'Cloudy, 50% change of rain', 'Light Rain', 'Mostly Sunny',
       'Clear Skies', 'Rain shower', 'Clear skies', 'Cloudy and Cool',
       'Mostly cloudy'], dtype=object)

In [109]:
df['Weather'].value_counts()

Cloudy                                                                              13
Partly Cloudy                                                                       11
Sunny                                                                               11
Clear                                                                                7
Indoor                                                                               5
Rain                                                                                 4
Indoors                                                                              3
Cold                                                                                 3
Cloudy, 50% change of rain                                                           2
Clear skies                                                                          2
Coudy                                                                                1
Mostly Sunny                               

In [347]:
df['WeatherSimplified'] = ""  # hot encode, drop 'Weather'
df['Weather'] = df['Weather'].fillna('unknown') # placing unknowns in cloudy, most common 
cloudy = ['Cloudy', 'Partly Cloudy', 'Cloudy, 50% change of rain', 'Coudy', 'Mostly cloudy', 'Cloudy and Cool', 
          'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.', 'unknown']
clear = ['Sunny' , 'Clear', 'Clear skies', 'Mostly Sunny', 'Clear and warm', 'Fair', 'Sun & clouds', 'Mostly sunny', 'Clear Skies']
rain = ['Rain', 'Cold', 'Rain shower', 'Light Rain']
controlled = ['Indoor', 'Indoors', 'Controlled Climate']
len(cloudy) + len(clear) + len(rain) + len(controlled)

24

In [348]:
for index, row in df.iterrows():
    for weather in cloudy:
        if weather == row['Weather']:
            df.loc[df.index[index], 'WeatherSimplified'] = 'Cloudy'
    for weather in clear:
        if weather == row['Weather']:
            df.loc[df.index[index], 'WeatherSimplified'] = 'Clear'
    for weather in rain:
        if weather == row['Weather']:
            df.loc[df.index[index], 'WeatherSimplified'] = 'Rain'
    for weather in controlled:
        if weather == row['Weather']:
            df.loc[df.index[index], 'WeatherSimplified'] = 'Controlled'

In [349]:
df = df.drop('Weather', axis=1)

In [350]:
df.head()

Unnamed: 0,BodyPart,Surface,RosterPosition,PlayerDay,PlayerGame,Temperature,PlayType,PlayerGamePlay,Position,PositionGroup,Severity,StadiumSimplified,WeatherSimplified
0,Knee,Synthetic,Linebacker,29,4,84,Punt,32,OLB,LB,4,Indoors,Cloudy
1,Knee,Natural,Linebacker,50,7,76,Punt,26,OLB,LB,2,Outdoors,Cloudy
2,Ankle,Synthetic,Safety,1,1,63,Pass,70,SS,DB,4,Outdoors,Clear
3,Ankle,Natural,Linebacker,15,3,80,Punt,30,LB,LB,1,Outdoors,Cloudy
4,Ankle,Synthetic,Wide Receiver,32,5,89,Kickoff,69,WR,WR,4,Indoors,Cloudy


In [114]:
df['Position'].unique()

array(['OLB', 'SS', 'LB', 'WR', 'RB', 'ILB', 'T', 'C', 'FS', 'DE', 'MLB',
       'CB', 'DT', 'TE', 'DB'], dtype=object)

In [115]:
df['PositionGroup'].unique()

array(['LB', 'DB', 'WR', 'RB', 'OL', 'DL', 'TE'], dtype=object)

In [351]:
# getting rid of RosterPosition and PositionGroup; only interested in position for the actual play
df = df.drop(['RosterPosition', 'PositionGroup'], axis=1)

In [352]:
# remove PlayerGame, it's an identifier
df = df.drop(['PlayerGame'], axis=1)

In [353]:
filter_temp = df['Temperature'] < -100
df[filter_temp]

Unnamed: 0,BodyPart,Surface,PlayerDay,Temperature,PlayType,PlayerGamePlay,Position,Severity,StadiumSimplified,WeatherSimplified
8,Knee,Synthetic,43,-999,Rush,61,ILB,1,Indoors,Controlled
16,Knee,Synthetic,7,-999,Pass,38,WR,2,Indoors,Controlled
20,Knee,Synthetic,21,-999,Rush,27,RB,4,Indoors,Controlled
25,Knee,Synthetic,64,-999,Pass,12,RB,2,Indoors,Cloudy
26,Ankle,Synthetic,36,-999,Pass,25,CB,4,Indoors,Controlled
36,Ankle,Synthetic,-34,-999,Pass,18,CB,1,Indoors,Controlled
50,Ankle,Synthetic,18,-999,Rush,34,LB,1,Indoors,Cloudy


In [354]:
for index, row in df.iterrows():
    if row['Temperature'] == -999:
        df.loc[df.index[index], 'Temperature'] = df['Temperature'].median()

In [355]:
df['Temperature'].mean()

65.6103896103896

In [356]:
df.head() # THESE ARE YOUR FINAL VARIABLES, BEFORE TRANSFORMATION

Unnamed: 0,BodyPart,Surface,PlayerDay,Temperature,PlayType,PlayerGamePlay,Position,Severity,StadiumSimplified,WeatherSimplified
0,Knee,Synthetic,29,84.0,Punt,32,OLB,4,Indoors,Cloudy
1,Knee,Natural,50,76.0,Punt,26,OLB,2,Outdoors,Cloudy
2,Ankle,Synthetic,1,63.0,Pass,70,SS,4,Outdoors,Clear
3,Ankle,Natural,15,80.0,Punt,30,LB,1,Outdoors,Cloudy
4,Ankle,Synthetic,32,89.0,Kickoff,69,WR,4,Indoors,Cloudy


In [119]:
# Order positions by 'offensiveness', or bin them into defensive/offensive groups, or directly hot encode?

Positions:
* WR = Wide Receiver (O)
* OLB = Outside Linebackers (D)
* CB = Cornerback (D)
* RB = Running Back (O)
* SS = Strong safety (D)
* FS = Free safety (D)
* DE = Defensive end (D)
* MLB = Middle linebacker (D)
* C = Center (O)
* ILB = Inside linenacker (D)
* TE = Tight end (O)
* T = Tight end (O)
* LB = Linebacker (D)
* DT = Defensive tackle (D)
* DB = Defensive Back (D)

In [357]:
# categories for hot encode
cat = ['BodyPart', 'PlayType', 'Position', 'WeatherSimplified']
df_cat = df[cat]
df_cat.head()

Unnamed: 0,BodyPart,PlayType,Position,WeatherSimplified
0,Knee,Punt,OLB,Cloudy
1,Knee,Punt,OLB,Cloudy
2,Ankle,Pass,SS,Clear
3,Ankle,Punt,LB,Cloudy
4,Ankle,Kickoff,WR,Cloudy


In [358]:
df_cat['BodyPart'].value_counts()

Knee     36
Ankle    35
Foot      6
Name: BodyPart, dtype: int64

In [359]:
df_cat['PlayType'].unique()

array(['Punt', 'Pass', 'Kickoff', 'Rush', 'Punt Not Returned',
       'Punt Returned', 'Kickoff Returned', 'Kickoff Not Returned'],
      dtype=object)

In [None]:
# get dummies or hot encode?

# Note: The encoding here is done without binning the position feature

In [137]:
# hot encode
enc = OneHotEncoder(sparse=False)
fit = enc.fit(df_cat)
fit.categories_

[array(['Ankle', 'Foot', 'Knee'], dtype=object),
 array(['Kickoff', 'Kickoff Not Returned', 'Kickoff Returned', 'Pass',
        'Punt', 'Punt Not Returned', 'Punt Returned', 'Rush'], dtype=object),
 array(['C', 'CB', 'DB', 'DE', 'DT', 'FS', 'ILB', 'LB', 'MLB', 'OLB', 'RB',
        'SS', 'T', 'TE', 'WR'], dtype=object),
 array(['', 'Clear', 'Cloudy', 'Controlled', 'Rain'], dtype=object)]

In [142]:
hot_encoded_array = fit.transform(df_cat)
print(hot_encoded_array)
print(hot_encoded_array.shape)

[[0. 0. 1. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
(77, 31)


In [360]:
# dummies
df_dummy = pd.get_dummies(df[cat])

In [362]:
df_dummy

Unnamed: 0,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,Position_C,Position_CB,Position_DB,Position_DE,Position_DT,Position_FS,Position_ILB,Position_LB,Position_MLB,Position_OLB,Position_RB,Position_SS,Position_T,Position_TE,Position_WR,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain
0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
74,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
75,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [182]:
df.head()

Unnamed: 0,BodyPart,Surface,PlayerDay,Temperature,PlayType,PlayerGamePlay,Position,Severity,StadiumSimplified,WeatherSimplified
0,Knee,Synthetic,29,84,Punt,32,OLB,4,Indoors,Cloudy
1,Knee,Natural,50,76,Punt,26,OLB,2,Outdoors,Cloudy
2,Ankle,Synthetic,1,63,Pass,70,SS,4,Outdoors,Clear
3,Ankle,Natural,15,80,Punt,30,LB,1,Outdoors,Cloudy
4,Ankle,Synthetic,32,89,Kickoff,69,WR,4,Indoors,Cloudy


In [408]:
df_final = df_dummy
df_final['Surface (encoded)'] = surface_transformed
df_final['Stadium (encoded)'] = stadium_transformed
df_final['PlayerDay'] = df['PlayerDay']
df_final['Temperature'] = df['Temperature']
df_final['PlayerGamePlay'] = df['PlayerGamePlay']
df_final['Severity'] = df['Severity']
y = df_final['Severity']

In [364]:
df_final = df_final.drop('Severity', axis=1)

In [365]:
df_final.head()

Unnamed: 0,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,Position_C,Position_CB,Position_DB,Position_DE,Position_DT,Position_FS,Position_ILB,Position_LB,Position_MLB,Position_OLB,Position_RB,Position_SS,Position_T,Position_TE,Position_WR,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain,Surface (encoded),Stadium (encoded),PlayerDay,Temperature,PlayerGamePlay
0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,29,84.0,32
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,50,76.0,26
2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,63.0,70
3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,15,80.0,30
4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,32,89.0,69


In [294]:
print(df_final.shape)
print(df_final.dtypes)

(77, 32)
BodyPart_Foot                      uint8
BodyPart_Knee                      uint8
PlayType_Kickoff Not Returned      uint8
PlayType_Kickoff Returned          uint8
PlayType_Pass                      uint8
PlayType_Punt                      uint8
PlayType_Punt Not Returned         uint8
PlayType_Punt Returned             uint8
PlayType_Rush                      uint8
Position_CB                        uint8
Position_DB                        uint8
Position_DE                        uint8
Position_DT                        uint8
Position_FS                        uint8
Position_ILB                       uint8
Position_LB                        uint8
Position_MLB                       uint8
Position_OLB                       uint8
Position_RB                        uint8
Position_SS                        uint8
Position_T                         uint8
Position_TE                        uint8
Position_WR                        uint8
WeatherSimplified_Clear            uint8
Weather

In [290]:
df_final = df_final.astype('float64') # don't need

In [239]:
df_final.dtypes

BodyPart_Ankle                   float64
BodyPart_Foot                    float64
BodyPart_Knee                    float64
PlayType_Kickoff                 float64
PlayType_Kickoff Not Returned    float64
PlayType_Kickoff Returned        float64
PlayType_Pass                    float64
PlayType_Punt                    float64
PlayType_Punt Not Returned       float64
PlayType_Punt Returned           float64
PlayType_Rush                    float64
Position_C                       float64
Position_CB                      float64
Position_DB                      float64
Position_DE                      float64
Position_DT                      float64
Position_FS                      float64
Position_ILB                     float64
Position_LB                      float64
Position_MLB                     float64
Position_OLB                     float64
Position_RB                      float64
Position_SS                      float64
Position_T                       float64
Position_TE     

In [219]:
import statsmodels.api as sm # not using

In [300]:
df_final.describe()

Unnamed: 0,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,Position_CB,Position_DB,Position_DE,Position_DT,Position_FS,Position_ILB,Position_LB,Position_MLB,Position_OLB,Position_RB,Position_SS,Position_T,Position_TE,Position_WR,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain,Surface (encoded),Stadium (encoded),PlayerDay,Temperature,PlayerGamePlay
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,0.077922,0.467532,0.012987,0.012987,0.415584,0.116883,0.012987,0.038961,0.298701,0.103896,0.012987,0.064935,0.025974,0.064935,0.038961,0.025974,0.051948,0.155844,0.077922,0.064935,0.025974,0.025974,0.207792,0.337662,0.415584,0.116883,0.116883,0.532468,0.727273,139.727273,65.61039,25.766234
std,0.269807,0.502217,0.113961,0.113961,0.496054,0.323388,0.113961,0.194771,0.46069,0.307127,0.113961,0.248027,0.160101,0.248027,0.194771,0.160101,0.223377,0.365086,0.269807,0.248027,0.160101,0.160101,0.408388,0.476014,0.496054,0.323388,0.323388,0.502217,0.448282,168.401122,14.569395,19.918846
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-34.0,33.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,57.0,10.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,50.0,68.0,19.0
75%,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,365.0,75.0,38.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,471.0,89.0,76.0


In [246]:
# how to use model? have to use multinomial logit
df_final_model = df_final.copy()
df_final_model.drop('Severity', axis=1, inplace=True)
df_final_model.shape

(77, 36)

In [241]:
df_final_model = sm.add_constant(df_final_model, prepend=False)
df_final_model.shape

(77, 37)

In [308]:
mlogit_mod = sm.MNLogit(y, sm.add_constant(df_final, prepend=False))

In [309]:
mlogit_res = mlogit_mod.fit_regularized()

  eXB = np.column_stack((np.ones(len(X)), np.exp(X)))
  return eXB/eXB.sum(1)[:,None]
  logprob = np.log(self.cdf(np.dot(self.exog,params)))
  return np.sum(d * logprob)


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.006922051305579761
            Iterations: 82
            Function evaluations: 307
            Gradient evaluations: 82


In [310]:
mlogit_res.summary() # doesn't work

  bse = np.sqrt(np.diag(self.cov_params()))


0,1,2,3
Dep. Variable:,Severity,No. Observations:,77.0
Model:,MNLogit,Df Residuals:,-25.0
Method:,MLE,Df Model:,99.0
Date:,"Thu, 20 May 2021",Pseudo R-squ.:,0.9946
Time:,09:01:09,Log-Likelihood:,-0.533
converged:,True,LL-Null:,-98.762
Covariance Type:,nonrobust,LLR p-value:,2.057e-08

Severity=2,coef,std err,z,P>|z|,[0.025,0.975]
BodyPart_Foot,-19.8580,4.48e+18,-4.44e-18,1.000,-8.78e+18,8.78e+18
BodyPart_Knee,28.1503,5.25e+09,5.36e-09,1.000,-1.03e+10,1.03e+10
PlayType_Kickoff Not Returned,8.4982,8.13e+76,1.05e-76,1.000,-1.59e+77,1.59e+77
PlayType_Kickoff Returned,9.5473,1.38e+17,6.9e-17,1.000,-2.71e+17,2.71e+17
PlayType_Pass,13.1302,,,,,
PlayType_Punt,-2.9690,7.4e+09,-4.01e-10,1.000,-1.45e+10,1.45e+10
PlayType_Punt Not Returned,-2.6904,,,,,
PlayType_Punt Returned,15.5475,1.7e+09,9.15e-09,1.000,-3.33e+09,3.33e+09
PlayType_Rush,-12.9198,,,,,
Position_CB,18.0714,,,,,


In [244]:
from sklearn.linear_model import LogisticRegression

In [367]:
df_final.head()

Unnamed: 0,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,Position_C,Position_CB,Position_DB,Position_DE,Position_DT,Position_FS,Position_ILB,Position_LB,Position_MLB,Position_OLB,Position_RB,Position_SS,Position_T,Position_TE,Position_WR,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain,Surface (encoded),Stadium (encoded),PlayerDay,Temperature,PlayerGamePlay
0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,29,84.0,32
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,50,76.0,26
2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,63.0,70
3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,15,80.0,30
4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,32,89.0,69


In [368]:
model = LogisticRegression()
model.fit(df_final, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [369]:
log_ceof = model.coef_[0] # make sure drop_first for dummies is not set to True if using this
df_coef = pd.DataFrame(log_ceof, df_final.columns)
df_coef.sort_values(by=0, ascending=False)

Unnamed: 0,0
Position_WR,0.588866
BodyPart_Ankle,0.578025
PlayType_Rush,0.416079
Position_LB,0.39599
WeatherSimplified_Rain,0.339478
Position_C,0.260625
WeatherSimplified_Controlled,0.227929
Position_ILB,0.137386
PlayType_Punt,0.061302
Position_SS,0.060709


# Note: the encoding here is done after binning position into offensive or defensive

In [370]:
df.head()

Unnamed: 0,BodyPart,Surface,PlayerDay,Temperature,PlayType,PlayerGamePlay,Position,Severity,StadiumSimplified,WeatherSimplified
0,Knee,Synthetic,29,84.0,Punt,32,OLB,4,Indoors,Cloudy
1,Knee,Natural,50,76.0,Punt,26,OLB,2,Outdoors,Cloudy
2,Ankle,Synthetic,1,63.0,Pass,70,SS,4,Outdoors,Clear
3,Ankle,Natural,15,80.0,Punt,30,LB,1,Outdoors,Cloudy
4,Ankle,Synthetic,32,89.0,Kickoff,69,WR,4,Indoors,Cloudy


In [375]:
df['Offense/Defense'] = ''
offense = ['WR', 'RB', 'C', 'TE', 'T']
defense = ['OLB', 'CB', 'SS', 'FS', 'DE','MLB', 'ILB', 'LB', 'DT', 'DB']
print(len(offense) + len(defense))
print(len(df['Position'].unique()))

15
15


In [378]:
for index, row in df.iterrows():
    for position in offense:
        if position == row['Position']:
            df.loc[df.index[index], 'Offense/Defense'] = 'Offense'
    for position in defense:
        if position == row['Position']:
            df.loc[df.index[index], 'Offense/Defense'] = 'Defense'

In [381]:
df = df.drop('Position', axis=1)

In [382]:
df.head()

Unnamed: 0,BodyPart,Surface,PlayerDay,Temperature,PlayType,PlayerGamePlay,Severity,StadiumSimplified,WeatherSimplified,Offense/Defense
0,Knee,Synthetic,29,84.0,Punt,32,4,Indoors,Cloudy,Defense
1,Knee,Natural,50,76.0,Punt,26,2,Outdoors,Cloudy,Defense
2,Ankle,Synthetic,1,63.0,Pass,70,4,Outdoors,Clear,Defense
3,Ankle,Natural,15,80.0,Punt,30,1,Outdoors,Cloudy,Defense
4,Ankle,Synthetic,32,89.0,Kickoff,69,4,Indoors,Cloudy,Offense


In [405]:
offdef = le.fit(df['Offense/Defense'])
offdef.classes_

array(['Defense', 'Offense'], dtype=object)

In [406]:
offdef_transformed = le.transform(df['Offense/Defense']) #### ADD TO FINAL TABLE
offdef_transformed # 0 = defensive, 1 = offensive

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [396]:
cat2 = ['BodyPart', 'PlayType', 'WeatherSimplified']
df_cat2 = df[cat2]
df_cat2.head()

Unnamed: 0,BodyPart,PlayType,WeatherSimplified
0,Knee,Punt,Cloudy
1,Knee,Punt,Cloudy
2,Ankle,Pass,Clear
3,Ankle,Punt,Cloudy
4,Ankle,Kickoff,Cloudy


In [398]:
df_dummy_2 = pd.get_dummies(df[cat2])
df_dummy_2

Unnamed: 0,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
4,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
73,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
74,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
75,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0


In [409]:
df_final_2 = df_dummy_2
df_final_2['Surface (encoded)'] = surface_transformed
df_final_2['Stadium (encoded)'] = stadium_transformed
df_final_2['Offense/Defense (encoded)'] = offdef_transformed
df_final_2['PlayerDay'] = df['PlayerDay']
df_final_2['Temperature'] = df['Temperature']
df_final_2['PlayerGamePlay'] = df['PlayerGamePlay']
df_final_2['Severity'] = df['Severity']
y = df_final['Severity']

In [410]:
df_final_2.drop('Severity', axis=1, inplace=True)

In [412]:
df_final_2

Unnamed: 0,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,WeatherSimplified_Clear,WeatherSimplified_Cloudy,WeatherSimplified_Controlled,WeatherSimplified_Rain,Surface (encoded),Stadium (encoded),Offense/Defense (encoded),PlayerDay,Temperature,PlayerGamePlay
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,29,84.0,32
1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,50,76.0,26
2,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1,63.0,70
3,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,15,80.0,30
4,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,32,89.0,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,442,57.0,15
73,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,393,45.0,3
74,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,327,81.0,4
75,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,379,78.0,3


In [414]:
model2 = LogisticRegression()
model2.fit(df_final_2, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [415]:
log_ceof_2 = model2.coef_[0] # make sure drop_first for dummies is not set to True if using this
df_coef_2 = pd.DataFrame(log_ceof_2, df_final_2.columns)
df_coef_2.sort_values(by=0, ascending=False)

Unnamed: 0,0
BodyPart_Ankle,0.659255
WeatherSimplified_Rain,0.393942
PlayType_Rush,0.376678
Offense/Defense (encoded),0.357502
WeatherSimplified_Controlled,0.230479
PlayType_Punt,0.087376
Temperature,0.008875
PlayerGamePlay,0.002103
PlayerDay,-0.000833
PlayType_Pass,-0.030686


In [None]:
# 0 = defensive, 1 = offensive
# 1 = Synthetic, 0 = Natural
# 0 = Indoors, 1 = Outdoors