In [1]:
import pandas as pd

import featuretools as ft
from featuretools import Feature

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
game_df = pd.read_csv('game.csv', nrows=1000)
game_df.head()

Unnamed: 0,game_id,season,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2016020045,20162017,R,2016-10-19T00:30:00Z,4,16,4,7,home win REG,right,United Center,/api/v1/venues/null,America/Chicago,-5,CDT
1,2017020812,20172018,R,2018-02-07T00:00:00Z,24,7,4,3,away win OT,left,KeyBank Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2015020314,20152016,R,2015-11-24T01:00:00Z,21,52,4,1,away win REG,right,MTS Centre,/api/v1/venues/null,America/Winnipeg,-5,CDT
3,2015020849,20152016,R,2016-02-17T00:00:00Z,52,12,1,2,home win REG,right,PNC Arena,/api/v1/venues/null,America/New_York,-4,EDT
4,2017020586,20172018,R,2017-12-30T03:00:00Z,20,24,1,2,home win REG,left,Honda Center,/api/v1/venues/null,America/Los_Angeles,-7,PDT


In [8]:
plays_df = pd.read_csv('game_plays.csv', nrows=100000)

plays_df = plays_df.drop(['secondaryType', 'periodType', 'dateTime'], axis=1).fillna(0)
plays_df.head()

Unnamed: 0,play_id,game_id,team_id_for,team_id_against,event,x,y,period,periodTime,periodTimeRemaining,goals_away,goals_home,description,st_x,st_y
0,2016020045_1,2016020045,0.0,0.0,Game Scheduled,0.0,0.0,1,0,1200,0,0,Game Scheduled,0.0,0.0
1,2016020045_2,2016020045,0.0,0.0,Period Ready,0.0,0.0,1,0,1200,0,0,Period Ready,0.0,0.0
2,2016020045_3,2016020045,0.0,0.0,Period Start,0.0,0.0,1,0,1200,0,0,Period Start,0.0,0.0
3,2016020045_4,2016020045,16.0,4.0,Faceoff,0.0,0.0,1,0,1200,0,0,Jonathan Toews faceoff won against Claude Giroux,0.0,0.0
4,2016020045_5,2016020045,16.0,4.0,Shot,-71.0,9.0,1,54,1146,0,0,Artem Anisimov Wrist Shot saved by Michal Neuv...,71.0,-9.0


In [9]:
es = ft.EntitySet(id='plays')
es = es.entity_from_dataframe(entity_id="plays", 
                              dataframe=plays_df, 
                              index="play_id", 
                              variable_types={"event": ft.variable_types.Categorical,
                                              "description": ft.variable_types.Categorical})
f1 = Feature(es["plays"]["event"])
f2 = Feature(es["plays"]["description"])

encoded, defs = ft.encode_features(plays_df, [f1, f2], top_n=10)
encoded.reset_index(inplace=True)
encoded.head()

Unnamed: 0,index,event = Faceoff,event = Shot,event = Stoppage,event = Hit,event = Blocked Shot,event = Missed Shot,event = Giveaway,event = Takeaway,event = Penalty,event = Goal,event is unknown,description = Goalie Stopped,description = Icing,description = Offside,description = Puck in Netting,description = Period Start,description = Period Ready,description = Period Official,description = Puck Frozen,description = Puck in Benches,description = Puck in Crowd,description is unknown,play_id,game_id,team_id_for,team_id_against,x,y,period,periodTime,periodTimeRemaining,goals_away,goals_home,st_x,st_y
0,0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,2016020045_1,2016020045,0.0,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
1,1,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,2016020045_2,2016020045,0.0,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
2,2,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,2016020045_3,2016020045,0.0,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
3,3,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2016020045_4,2016020045,16.0,4.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
4,4,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2016020045_5,2016020045,16.0,4.0,-71.0,9.0,1,54,1146,0,0,71.0,-9.0


In [10]:
es = ft.EntitySet(id='plays')
es = es.entity_from_dataframe(entity_id='plays', dataframe=encoded, index='play_id')
es = es.normalize_entity(base_entity_id='plays', new_entity_id='games', index='game_id')

features, transform = ft.dfs(entityset=es, target_entity="games", max_depth=2)
features.reset_index(inplace=True)
features.head()

Unnamed: 0,game_id,COUNT(plays),MAX(plays.goals_away),MAX(plays.goals_home),MAX(plays.index),MAX(plays.period),MAX(plays.periodTime),MAX(plays.periodTimeRemaining),MAX(plays.st_x),MAX(plays.st_y),MAX(plays.team_id_against),MAX(plays.team_id_for),MAX(plays.x),MAX(plays.y),MEAN(plays.goals_away),MEAN(plays.goals_home),MEAN(plays.index),MEAN(plays.period),MEAN(plays.periodTime),MEAN(plays.periodTimeRemaining),MEAN(plays.st_x),MEAN(plays.st_y),MEAN(plays.team_id_against),MEAN(plays.team_id_for),MEAN(plays.x),MEAN(plays.y),MIN(plays.goals_away),MIN(plays.goals_home),MIN(plays.index),MIN(plays.period),MIN(plays.periodTime),MIN(plays.periodTimeRemaining),MIN(plays.st_x),MIN(plays.st_y),MIN(plays.team_id_against),MIN(plays.team_id_for),MIN(plays.x),MIN(plays.y),PERCENT_TRUE(plays.description = Goalie Stopped),PERCENT_TRUE(plays.description = Icing),PERCENT_TRUE(plays.description = Offside),PERCENT_TRUE(plays.description = Period Official),PERCENT_TRUE(plays.description = Period Ready),PERCENT_TRUE(plays.description = Period Start),PERCENT_TRUE(plays.description = Puck Frozen),PERCENT_TRUE(plays.description = Puck in Benches),PERCENT_TRUE(plays.description = Puck in Crowd),PERCENT_TRUE(plays.description = Puck in Netting),PERCENT_TRUE(plays.description is unknown),PERCENT_TRUE(plays.event = Blocked Shot),PERCENT_TRUE(plays.event = Faceoff),PERCENT_TRUE(plays.event = Giveaway),PERCENT_TRUE(plays.event = Goal),PERCENT_TRUE(plays.event = Hit),PERCENT_TRUE(plays.event = Missed Shot),PERCENT_TRUE(plays.event = Penalty),PERCENT_TRUE(plays.event = Shot),PERCENT_TRUE(plays.event = Stoppage),PERCENT_TRUE(plays.event = Takeaway),PERCENT_TRUE(plays.event is unknown),SKEW(plays.goals_away),SKEW(plays.goals_home),SKEW(plays.index),SKEW(plays.period),SKEW(plays.periodTime),SKEW(plays.periodTimeRemaining),SKEW(plays.st_x),SKEW(plays.st_y),SKEW(plays.team_id_against),SKEW(plays.team_id_for),SKEW(plays.x),SKEW(plays.y),STD(plays.goals_away),STD(plays.goals_home),STD(plays.index),STD(plays.period),STD(plays.periodTime),STD(plays.periodTimeRemaining),STD(plays.st_x),STD(plays.st_y),STD(plays.team_id_against),STD(plays.team_id_for),STD(plays.x),STD(plays.y),SUM(plays.goals_away),SUM(plays.goals_home),SUM(plays.index),SUM(plays.period),SUM(plays.periodTime),SUM(plays.periodTimeRemaining),SUM(plays.st_x),SUM(plays.st_y),SUM(plays.team_id_against),SUM(plays.team_id_for),SUM(plays.x),SUM(plays.y)
0,2016020045,299,4,7,298,3,1200,1200,98.0,41.0,16.0,16.0,99.0,40.0,1.12709,3.080268,149.0,1.929766,590.769231,609.230769,-0.391304,0.220736,8.468227,7.986622,6.852843,0.361204,0,0,0,1,0,0,-99.0,-40.0,0.0,0.0,-97.0,-41.0,0.043478,0.026756,0.026756,0.010033,0.010033,0.010033,0.0,0.010033,0.003344,0.003344,0.856187,0.06689,0.190635,0.093645,0.036789,0.167224,0.040134,0.023411,0.147157,0.130435,0.056856,0.046823,0.975225,-0.024071,0.0,0.130381,0.020171,-0.020171,-0.028413,0.001353,0.139132,0.295399,-0.144796,-0.122695,1.694019,1.674997,86.458082,0.818259,376.58643,376.58643,60.491588,21.216638,6.719312,6.590604,60.102137,21.214705,337,921,44551,577,176640,182160,-117.0,66.0,2532.0,2388.0,2049.0,108.0
1,2017020812,312,4,3,613,4,1200,1200,99.0,41.0,24.0,24.0,99.0,41.0,1.224359,1.375,455.596154,2.048077,588.701923,588.221154,9.288462,0.086538,12.153846,11.990385,6.865385,0.086538,0,0,299,1,0,0,-99.0,-41.0,0.0,0.0,-97.0,-41.0,0.051282,0.038462,0.032051,0.012821,0.012821,0.012821,0.012821,0.003205,0.003205,0.009615,0.810897,0.089744,0.205128,0.038462,0.022436,0.105769,0.070513,0.022436,0.192308,0.163462,0.032051,0.057692,0.487668,0.238289,0.001591,0.141167,0.058741,0.018086,-0.243346,-0.141594,0.21235,0.247879,-0.195384,0.15287,1.137345,0.66884,90.992546,0.87163,371.271004,364.998567,55.257132,20.00793,9.925869,9.869074,55.611316,20.00793,382,429,142146,639,183675,183525,2898.0,27.0,3792.0,3741.0,2142.0,27.0
2,2017020240,301,1,4,2492,3,1200,1200,97.0,41.0,24.0,24.0,99.0,41.0,0.807309,1.647841,2325.255814,1.946844,599.45515,600.54485,5.591362,-0.179402,18.930233,19.013289,5.152824,3.581395,0,0,381,1,0,0,-99.0,-40.0,0.0,0.0,-98.0,-40.0,0.056478,0.036545,0.016611,0.009967,0.009967,0.009967,0.0,0.006645,0.0,0.016611,0.837209,0.089701,0.202658,0.079734,0.016611,0.069767,0.099668,0.046512,0.166113,0.146179,0.036545,0.046512,-1.566128,0.549355,-7.340775,0.099883,-0.048536,0.048536,-0.188965,-0.081945,-1.557206,-1.557335,-0.101777,-0.133994,0.395069,1.623444,201.594687,0.830962,355.398962,355.398962,60.963288,18.864103,9.274613,9.315211,61.002048,18.520732,243,496,699902,586,180436,180764,1683.0,-54.0,5698.0,5723.0,1551.0,1078.0
3,2015020314,283,4,1,899,3,1200,1200,97.0,41.0,52.0,52.0,97.0,41.0,0.720848,0.734982,756.166078,1.95053,598.484099,601.515901,7.826855,0.007067,29.85159,30.508834,1.989399,0.628975,0,0,600,1,0,0,-98.0,-41.0,0.0,0.0,-98.0,-41.0,0.035336,0.028269,0.024735,0.010601,0.010601,0.010601,0.003534,0.0,0.0,0.017668,0.858657,0.102473,0.162544,0.091873,0.017668,0.137809,0.091873,0.021201,0.159011,0.123675,0.042403,0.04947,1.522353,-1.070535,-0.007528,0.092206,0.008882,-0.008882,-0.186906,-0.022457,-0.069603,-0.135813,-0.035019,0.052598,1.09956,0.442124,83.082186,0.823647,367.588049,367.588049,56.917338,21.908577,19.659312,19.870211,57.42028,21.899516,204,208,213995,552,169371,170229,2215.0,2.0,8448.0,8634.0,563.0,178.0
4,2017020624,307,0,2,2797,3,1200,1200,98.0,41.0,29.0,29.0,98.0,41.0,0.0,0.436482,2633.387622,1.925081,605.117264,594.882736,9.094463,0.80456,19.224756,19.537459,-2.697068,1.228013,0,0,689,1,0,0,-98.0,-41.0,0.0,0.0,-97.0,-40.0,0.065147,0.039088,0.013029,0.009772,0.009772,0.009772,0.019544,0.003257,0.003257,0.026059,0.801303,0.087948,0.214984,0.026059,0.006515,0.104235,0.058632,0.026059,0.208469,0.179153,0.042345,0.045603,0.0,0.818735,-7.310205,0.138618,-0.018098,0.018098,-0.297153,-0.074535,-0.961977,-0.979348,0.090503,-0.056357,0.0,0.558685,153.398205,0.815048,348.79592,348.79592,56.00666,19.622605,10.950496,11.110308,56.678281,19.60059,0,134,808450,591,185771,182629,2792.0,247.0,5902.0,5998.0,-828.0,377.0


In [14]:
feat = features.merge(game_df, how='inner', on='game_id')
feat['label'] = 0
#feat.loc[feat['type'] == 'P', 'label'] = 1
feat.loc[feat['COUNT(plays)'] > 350, 'label'] = 1
print(feat['label'].value_counts())

c = features.columns.to_list()
feat = feat[c + ['label']]

display(feat.head(3))

0    295
1     60
Name: label, dtype: int64


Unnamed: 0,game_id,COUNT(plays),MAX(plays.goals_away),MAX(plays.goals_home),MAX(plays.index),MAX(plays.period),MAX(plays.periodTime),MAX(plays.periodTimeRemaining),MAX(plays.st_x),MAX(plays.st_y),MAX(plays.team_id_against),MAX(plays.team_id_for),MAX(plays.x),MAX(plays.y),MEAN(plays.goals_away),MEAN(plays.goals_home),MEAN(plays.index),MEAN(plays.period),MEAN(plays.periodTime),MEAN(plays.periodTimeRemaining),MEAN(plays.st_x),MEAN(plays.st_y),MEAN(plays.team_id_against),MEAN(plays.team_id_for),MEAN(plays.x),MEAN(plays.y),MIN(plays.goals_away),MIN(plays.goals_home),MIN(plays.index),MIN(plays.period),MIN(plays.periodTime),MIN(plays.periodTimeRemaining),MIN(plays.st_x),MIN(plays.st_y),MIN(plays.team_id_against),MIN(plays.team_id_for),MIN(plays.x),MIN(plays.y),PERCENT_TRUE(plays.description = Goalie Stopped),PERCENT_TRUE(plays.description = Icing),PERCENT_TRUE(plays.description = Offside),PERCENT_TRUE(plays.description = Period Official),PERCENT_TRUE(plays.description = Period Ready),PERCENT_TRUE(plays.description = Period Start),PERCENT_TRUE(plays.description = Puck Frozen),PERCENT_TRUE(plays.description = Puck in Benches),PERCENT_TRUE(plays.description = Puck in Crowd),PERCENT_TRUE(plays.description = Puck in Netting),PERCENT_TRUE(plays.description is unknown),PERCENT_TRUE(plays.event = Blocked Shot),PERCENT_TRUE(plays.event = Faceoff),PERCENT_TRUE(plays.event = Giveaway),PERCENT_TRUE(plays.event = Goal),PERCENT_TRUE(plays.event = Hit),PERCENT_TRUE(plays.event = Missed Shot),PERCENT_TRUE(plays.event = Penalty),PERCENT_TRUE(plays.event = Shot),PERCENT_TRUE(plays.event = Stoppage),PERCENT_TRUE(plays.event = Takeaway),PERCENT_TRUE(plays.event is unknown),SKEW(plays.goals_away),SKEW(plays.goals_home),SKEW(plays.index),SKEW(plays.period),SKEW(plays.periodTime),SKEW(plays.periodTimeRemaining),SKEW(plays.st_x),SKEW(plays.st_y),SKEW(plays.team_id_against),SKEW(plays.team_id_for),SKEW(plays.x),SKEW(plays.y),STD(plays.goals_away),STD(plays.goals_home),STD(plays.index),STD(plays.period),STD(plays.periodTime),STD(plays.periodTimeRemaining),STD(plays.st_x),STD(plays.st_y),STD(plays.team_id_against),STD(plays.team_id_for),STD(plays.x),STD(plays.y),SUM(plays.goals_away),SUM(plays.goals_home),SUM(plays.index),SUM(plays.period),SUM(plays.periodTime),SUM(plays.periodTimeRemaining),SUM(plays.st_x),SUM(plays.st_y),SUM(plays.team_id_against),SUM(plays.team_id_for),SUM(plays.x),SUM(plays.y),label
0,2016020045,299,4,7,298,3,1200,1200,98.0,41.0,16.0,16.0,99.0,40.0,1.12709,3.080268,149.0,1.929766,590.769231,609.230769,-0.391304,0.220736,8.468227,7.986622,6.852843,0.361204,0,0,0,1,0,0,-99.0,-40.0,0.0,0.0,-97.0,-41.0,0.043478,0.026756,0.026756,0.010033,0.010033,0.010033,0.0,0.010033,0.003344,0.003344,0.856187,0.06689,0.190635,0.093645,0.036789,0.167224,0.040134,0.023411,0.147157,0.130435,0.056856,0.046823,0.975225,-0.024071,0.0,0.130381,0.020171,-0.020171,-0.028413,0.001353,0.139132,0.295399,-0.144796,-0.122695,1.694019,1.674997,86.458082,0.818259,376.58643,376.58643,60.491588,21.216638,6.719312,6.590604,60.102137,21.214705,337,921,44551,577,176640,182160,-117.0,66.0,2532.0,2388.0,2049.0,108.0,0
1,2017020812,312,4,3,613,4,1200,1200,99.0,41.0,24.0,24.0,99.0,41.0,1.224359,1.375,455.596154,2.048077,588.701923,588.221154,9.288462,0.086538,12.153846,11.990385,6.865385,0.086538,0,0,299,1,0,0,-99.0,-41.0,0.0,0.0,-97.0,-41.0,0.051282,0.038462,0.032051,0.012821,0.012821,0.012821,0.012821,0.003205,0.003205,0.009615,0.810897,0.089744,0.205128,0.038462,0.022436,0.105769,0.070513,0.022436,0.192308,0.163462,0.032051,0.057692,0.487668,0.238289,0.001591,0.141167,0.058741,0.018086,-0.243346,-0.141594,0.21235,0.247879,-0.195384,0.15287,1.137345,0.66884,90.992546,0.87163,371.271004,364.998567,55.257132,20.00793,9.925869,9.869074,55.611316,20.00793,382,429,142146,639,183675,183525,2898.0,27.0,3792.0,3741.0,2142.0,27.0,0
2,2017020240,301,1,4,2492,3,1200,1200,97.0,41.0,24.0,24.0,99.0,41.0,0.807309,1.647841,2325.255814,1.946844,599.45515,600.54485,5.591362,-0.179402,18.930233,19.013289,5.152824,3.581395,0,0,381,1,0,0,-99.0,-40.0,0.0,0.0,-98.0,-40.0,0.056478,0.036545,0.016611,0.009967,0.009967,0.009967,0.0,0.006645,0.0,0.016611,0.837209,0.089701,0.202658,0.079734,0.016611,0.069767,0.099668,0.046512,0.166113,0.146179,0.036545,0.046512,-1.566128,0.549355,-7.340775,0.099883,-0.048536,0.048536,-0.188965,-0.081945,-1.557206,-1.557335,-0.101777,-0.133994,0.395069,1.623444,201.594687,0.830962,355.398962,355.398962,60.963288,18.864103,9.274613,9.315211,61.002048,18.520732,243,496,699902,586,180436,180764,1683.0,-54.0,5698.0,5723.0,1551.0,1078.0,0


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

y = feat['label']
X = feat.drop(['label', 'game_id'], axis=1).fillna(0)

lr = LogisticRegression()
model = lr.fit(X, y)

print('Acc: ' + str(model.score(X, y)))
print('ROC: ' + str(roc_auc_score(y, model.predict_proba(X)[:,1])))

Acc: 0.9943661971830986
ROC: 0.9997175141242939


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
