How often do teams come back to win a game, and when do they do it most often?

In [1]:
import pandas as pd

In [2]:
# Relative path to the play-by-play CSV read by pd.read_csv in the next cell
# (presumably the 2000-01 season, judging by the file name).
csv = "./data/eightthirtyfour.com/pbp/2000-01_pbp.csv"

In [3]:
# Load the whole play-by-play file into one DataFrame; the frames built in the
# cells below are derived from `data`.
data = pd.read_csv(csv)

In [4]:
data.dtypes

Unnamed: 0                     int64
EVENTMSGACTIONTYPE             int64
EVENTMSGTYPE                   int64
EVENTNUM                       int64
GAME_ID                        int64
HOMEDESCRIPTION               object
NEUTRALDESCRIPTION           float64
PCTIMESTRING                  object
PERIOD                         int64
PERSON1TYPE                  float64
PERSON2TYPE                    int64
PERSON3TYPE                    int64
PLAYER1_ID                     int64
PLAYER1_NAME                  object
PLAYER1_TEAM_ABBREVIATION     object
PLAYER1_TEAM_CITY             object
PLAYER1_TEAM_ID              float64
PLAYER1_TEAM_NICKNAME         object
PLAYER2_ID                     int64
PLAYER2_NAME                  object
PLAYER2_TEAM_ABBREVIATION     object
PLAYER2_TEAM_CITY             object
PLAYER2_TEAM_ID              float64
PLAYER2_TEAM_NICKNAME         object
PLAYER3_ID                     int64
PLAYER3_NAME                  object
PLAYER3_TEAM_ABBREVIATION     object
P

In [5]:
len(data)

537157

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,EVENTMSGACTIONTYPE,EVENTMSGTYPE,EVENTNUM,GAME_ID,HOMEDESCRIPTION,NEUTRALDESCRIPTION,PCTIMESTRING,PERIOD,PERSON1TYPE,...,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ABBREVIATION,PLAYER3_TEAM_CITY,PLAYER3_TEAM_ID,PLAYER3_TEAM_NICKNAME,SCORE,SCOREMARGIN,VISITORDESCRIPTION,WCTIMESTRING
0,0,0,12,0,20001116,,,12:00,1,0.0,...,0,,,,,,,,,12:11 PM
1,1,0,10,1,20001116,Jump Ball Grant vs. Mutombo: Tip to Hill,,12:00,1,4.0,...,238,Tyrone Hill,PHI,Philadelphia,1610613000.0,76ers,,,,12:12 PM
2,2,1,1,2,20001116,,,11:45,1,5.0,...,0,,,,,,2 - 0,-2,Iverson 21' Jump Shot (2 PTS),12:13 PM
3,3,1,6,3,20001116,,,11:28,1,5.0,...,0,,,,,,,,Hill P.FOUL (P1.T1),12:13 PM
4,4,1,1,4,20001116,Mason 17' Jump Shot (2 PTS),,11:19,1,4.0,...,0,,,,,,2 - 2,TIE,,12:13 PM


### How many individual games?

In [7]:
# Number of distinct games present in the play-by-play table.
data['GAME_ID'].nunique()

1189

### How many events per game?

In [8]:
len(data[(data['GAME_ID'] == 20000001)])

429

In [9]:
data['GAME_ID'].value_counts().max()

606

In [10]:
data['GAME_ID'].value_counts().min()

371

In [11]:
data['GAME_ID'].value_counts().mean()

451.7720773759462

Let's remove NaN values in the score columns, as they are non-scoring events (though we may want to keep these if we want a proper percentage of how long a team is behind during a game).

In [12]:
# Keep only the rows whose SCOREMARGIN is populated.
# .copy() makes `dropped` an independent frame: a column is assigned to it
# later in the notebook, and assigning into a frame derived from `data`
# without copying raises pandas' SettingWithCopyWarning.
dropped = data.dropna(subset=['SCOREMARGIN']).copy()

In [13]:
len(dropped)

137590

In [14]:
dropped['GAME_ID'].value_counts().max()

167

In [15]:
dropped['GAME_ID'].value_counts().min()

77

In [16]:
dropped['GAME_ID'].value_counts().mean()

115.71909167367535

In [17]:
dropped.head()

Unnamed: 0.1,Unnamed: 0,EVENTMSGACTIONTYPE,EVENTMSGTYPE,EVENTNUM,GAME_ID,HOMEDESCRIPTION,NEUTRALDESCRIPTION,PCTIMESTRING,PERIOD,PERSON1TYPE,...,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ABBREVIATION,PLAYER3_TEAM_CITY,PLAYER3_TEAM_ID,PLAYER3_TEAM_NICKNAME,SCORE,SCOREMARGIN,VISITORDESCRIPTION,WCTIMESTRING
2,2,1,1,2,20001116,,,11:45,1,5.0,...,0,,,,,,2 - 0,-2,Iverson 21' Jump Shot (2 PTS),12:13 PM
4,4,1,1,4,20001116,Mason 17' Jump Shot (2 PTS),,11:19,1,4.0,...,0,,,,,,2 - 2,TIE,,12:13 PM
9,9,5,1,9,20001116,,,10:44,1,5.0,...,0,,,,,,4 - 2,-2,Lynch Layup (2 PTS) (Snow 1 AST),12:14 PM
19,19,42,1,19,20001116,,,9:45,1,5.0,...,0,,,,,,6 - 2,-4,Iverson Driving Layup (4 PTS),12:15 PM
23,23,1,1,24,20001116,,,9:15,1,5.0,...,0,,,,,,9 - 2,-7,Iverson 24' 3PT Jump Shot (7 PTS) (Mutombo 1 AST),12:17 PM


Now we want to create columns for whether the home team won and, for each scoring event, whether the home team was ahead. This is easy: if the SCOREMARGIN is negative, the home team is losing, and if the last SCOREMARGIN entry for a GAME_ID is negative, the home team lost.

In [18]:
# Take the last margin-bearing row of every game as its final score
# (assumes rows within a game are in chronological order — TODO confirm).
# .copy() detaches the result from `dropped`, so the `home_win` column added
# afterwards is written to a real frame instead of a slice, which is what
# triggers SettingWithCopyWarning.
final_scores = dropped.groupby('GAME_ID').tail(1).copy()

In [19]:
final_scores.head()

Unnamed: 0.1,Unnamed: 0,EVENTMSGACTIONTYPE,EVENTMSGTYPE,EVENTNUM,GAME_ID,HOMEDESCRIPTION,NEUTRALDESCRIPTION,PCTIMESTRING,PERIOD,PERSON1TYPE,...,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ABBREVIATION,PLAYER3_TEAM_CITY,PLAYER3_TEAM_ID,PLAYER3_TEAM_NICKNAME,SCORE,SCOREMARGIN,VISITORDESCRIPTION,WCTIMESTRING
428,428,0,13,447,20001116,,,0:00,4,0.0,...,0,,,,,,81 - 83,2,,2:34 PM
860,431,0,13,469,20000520,,,0:00,4,0.0,...,0,,,,,,102 - 115,13,,4:16 PM
1294,433,0,13,455,20000629,,,0:00,4,0.0,...,0,,,,,,81 - 91,10,,9:00 AM
1720,425,0,13,444,20000441,,,0:00,4,0.0,...,0,,,,,,67 - 89,22,,4:16 PM
2185,464,0,13,509,20000805,,,0:00,4,0.0,...,0,,,,,,95 - 122,27,,11:38 AM


In [20]:
def home_adv(row):
    """Encode the home team's standing from a row's SCOREMARGIN value.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'SCOREMARGIN'
            entry that is either the literal 'TIE' or something int() accepts.

    Returns:
        -1 when the margin is 'TIE', 0 when it is negative, 1 when it is
        positive, and the sentinel -999 when it parses to exactly zero.

    Raises:
        ValueError: if the margin is neither 'TIE' nor convertible by int().
    """
    margin = row['SCOREMARGIN']
    if margin == 'TIE':
        return -1

    points = int(margin)
    if points == 0:
        # A numeric zero is not expected (ties are spelled 'TIE'); flag it.
        return -999
    return 1 if points > 0 else 0

In [21]:
# Label each game's final row with the outcome code returned by home_adv.
final_scores['home_win'] = final_scores.apply(home_adv, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
final_scores.sort_values(by=['GAME_ID']).head()

Unnamed: 0.1,Unnamed: 0,EVENTMSGACTIONTYPE,EVENTMSGTYPE,EVENTNUM,GAME_ID,HOMEDESCRIPTION,NEUTRALDESCRIPTION,PCTIMESTRING,PERIOD,PERSON1TYPE,...,PLAYER3_NAME,PLAYER3_TEAM_ABBREVIATION,PLAYER3_TEAM_CITY,PLAYER3_TEAM_ID,PLAYER3_TEAM_NICKNAME,SCORE,SCOREMARGIN,VISITORDESCRIPTION,WCTIMESTRING,home_win
418799,428,0,13,467,20000001,,,0:00,4,0.0,...,,,,,,101 - 72,-29,,2:36 PM,0
514468,509,0,13,533,20000002,,,0:00,4,0.0,...,,,,,,86 - 82,-4,,3:06 PM,0
176105,477,0,13,522,20000003,,,0:00,4,0.0,...,,,,,,86 - 97,11,,2:01 PM,1
209334,447,0,13,475,20000004,,,0:00,4,0.0,...,,,,,,106 - 82,-24,,2:02 PM,0
364209,504,0,13,580,20000005,,,0:00,4,0.0,...,,,,,,104 - 95,-9,,14:24 PM,0


In [23]:
# Label every margin-bearing event with the standing code returned by home_adv.
dropped['home_ahead'] = dropped.apply(home_adv, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
dropped.head()

Unnamed: 0.1,Unnamed: 0,EVENTMSGACTIONTYPE,EVENTMSGTYPE,EVENTNUM,GAME_ID,HOMEDESCRIPTION,NEUTRALDESCRIPTION,PCTIMESTRING,PERIOD,PERSON1TYPE,...,PLAYER3_NAME,PLAYER3_TEAM_ABBREVIATION,PLAYER3_TEAM_CITY,PLAYER3_TEAM_ID,PLAYER3_TEAM_NICKNAME,SCORE,SCOREMARGIN,VISITORDESCRIPTION,WCTIMESTRING,home_ahead
2,2,1,1,2,20001116,,,11:45,1,5.0,...,,,,,,2 - 0,-2,Iverson 21' Jump Shot (2 PTS),12:13 PM,0
4,4,1,1,4,20001116,Mason 17' Jump Shot (2 PTS),,11:19,1,4.0,...,,,,,,2 - 2,TIE,,12:13 PM,-1
9,9,5,1,9,20001116,,,10:44,1,5.0,...,,,,,,4 - 2,-2,Lynch Layup (2 PTS) (Snow 1 AST),12:14 PM,0
19,19,42,1,19,20001116,,,9:45,1,5.0,...,,,,,,6 - 2,-4,Iverson Driving Layup (4 PTS),12:15 PM,0
23,23,1,1,24,20001116,,,9:15,1,5.0,...,,,,,,9 - 2,-7,Iverson 24' 3PT Jump Shot (7 PTS) (Mutombo 1 AST),12:17 PM,0


In [25]:
# Per-game outcome lookup: just the game id and its home_win code.
wins_df = final_scores.loc[:, ['GAME_ID', 'home_win']]

In [26]:
wins_df.head()

Unnamed: 0,GAME_ID,home_win
428,20001116,1
860,20000520,1
1294,20000629,1
1720,20000441,1
2185,20000805,1


In [27]:
# Attach each game's outcome to every one of its events.
# validate='many_to_one' makes pandas raise MergeError if `wins_df` ever holds
# more than one row for a GAME_ID, which would otherwise silently duplicate
# event rows in the result.
prepped_data = dropped.merge(
    wins_df,
    on='GAME_ID',
    how='inner',
    validate='many_to_one',
)

Now that we have our data prepped, we can start to analyze, play by play, whether being ahead (and for what percentage of the game) determines whether the game ends in a win or a loss.

In [28]:
len(dropped)

137590

In [29]:
len(prepped_data)

137590

1) Group by `GAME_ID` and calculate the percentage of each game where the home team was ahead (i.e., number of events where `home_ahead` == 1 / total events).

In [30]:
# Row count per game, turned into a two-column frame (renamed in a later cell).
total_events_df = (
    prepped_data['GAME_ID']
    .value_counts()
    .to_frame()
    .reset_index()
)

In [31]:
# Row count per (game, period) pair in long form.
total_events_period_df = (
    prepped_data
    .groupby(["GAME_ID", "PERIOD"])
    .size()
    .reset_index(name="period_events")
)

In [32]:
total_events_period_df.head()

Unnamed: 0,GAME_ID,PERIOD,period_events
0,20000001,1,32
1,20000001,2,26
2,20000001,3,30
3,20000001,4,23
4,20000002,1,28


In [33]:
# Positional rename: assumes the frame has exactly two columns, the game id
# first and its row count second — TODO confirm on the pandas version in use.
total_events_df.columns = ['GAME_ID', 'total_events']

In [34]:
# Per game: sum of the home_ahead flags after discarding negative codes
# (-1 for ties, -999 sentinel), i.e. the number of events with the home team ahead.
total_ahead = (
    prepped_data
    .loc[prepped_data['home_ahead'] > -1]
    .groupby('GAME_ID')
    .agg({'home_ahead': 'sum'})
    .reset_index()
)

In [35]:
# Per (game, period): sum of the home_ahead flags after discarding negative
# codes (-1 for ties, -999 sentinel).
period_ahead = (
    prepped_data
    .loc[prepped_data['home_ahead'] > -1]
    .groupby(['GAME_ID', 'PERIOD'])
    .agg({'home_ahead': 'sum'})
    .reset_index()
)

In [36]:
period_ahead.head()

Unnamed: 0,GAME_ID,PERIOD,home_ahead
0,20000001,1,4
1,20000001,2,0
2,20000001,3,0
3,20000001,4,0
4,20000002,1,22


In [37]:
# Reshape the per-period counts to one row per game with one column per period.
# The to_records() round trip flattens the pivoted index/columns into a plain
# frame whose first column is GAME_ID.
total_events_period_pivot_df = pd.DataFrame(
    total_events_period_df
    .pivot(index="GAME_ID", columns="PERIOD", values="period_events")
    .to_records()
)
# Label every period column as period_<n>_events; GAME_ID keeps its name.
total_events_period_pivot_df.columns = [
    col if col == 'GAME_ID' else "period_{}_events".format(col)
    for col in total_events_period_pivot_df.columns
]
total_events_period_pivot_df.head()

Unnamed: 0,GAME_ID,period_1_events,period_2_events,period_3_events,period_4_events,period_5_events,period_6_events,period_7_events
0,20000001,32.0,26.0,30.0,23.0,,,
1,20000002,28.0,29.0,22.0,28.0,,,
2,20000003,27.0,33.0,22.0,31.0,,,
3,20000004,23.0,36.0,25.0,35.0,,,
4,20000005,31.0,29.0,31.0,28.0,,,


In [39]:
# Reshape the per-period "home ahead" sums to one row per game with one column
# per period. The to_records() round trip flattens the pivoted index/columns
# into a plain frame whose first column is GAME_ID.
period_ahead_pivot_df = pd.DataFrame(
    period_ahead
    .pivot(index="GAME_ID", columns="PERIOD", values="home_ahead")
    .to_records()
)
# Label every period column as period_<n>_ahead; GAME_ID keeps its name.
period_ahead_pivot_df.columns = [
    col if col == 'GAME_ID' else "period_{}_ahead".format(col)
    for col in period_ahead_pivot_df.columns
]
period_ahead_pivot_df.head()

Unnamed: 0,GAME_ID,period_1_ahead,period_2_ahead,period_3_ahead,period_4_ahead,period_5_ahead,period_6_ahead,period_7_ahead
0,20000001,4.0,0.0,0.0,0.0,,,
1,20000002,22.0,27.0,0.0,0.0,,,
2,20000003,24.0,33.0,21.0,31.0,,,
3,20000004,5.0,0.0,1.0,0.0,,,
4,20000005,12.0,0.0,0.0,0.0,,,


In [41]:
# Put each game's total event count next to its "home ahead" sum.
total_calcs = total_events_df.merge(total_ahead, on='GAME_ID', how='inner')
total_calcs.head()

Unnamed: 0,GAME_ID,total_events,home_ahead
0,20001149,167,15
1,20000923,161,78
2,20000936,155,109
3,20000432,155,155
4,20001024,151,30


In [42]:
# Put each game's per-period event counts next to its per-period "home ahead" sums.
period_calcs = total_events_period_pivot_df.merge(
    period_ahead_pivot_df,
    on='GAME_ID',
    how='inner',
)
period_calcs.head()

Unnamed: 0,GAME_ID,period_1_events,period_2_events,period_3_events,period_4_events,period_5_events,period_6_events,period_7_events,period_1_ahead,period_2_ahead,period_3_ahead,period_4_ahead,period_5_ahead,period_6_ahead,period_7_ahead
0,20000001,32.0,26.0,30.0,23.0,,,,4.0,0.0,0.0,0.0,,,
1,20000002,28.0,29.0,22.0,28.0,,,,22.0,27.0,0.0,0.0,,,
2,20000003,27.0,33.0,22.0,31.0,,,,24.0,33.0,21.0,31.0,,,
3,20000004,23.0,36.0,25.0,35.0,,,,5.0,0.0,1.0,0.0,,,
4,20000005,31.0,29.0,31.0,28.0,,,,12.0,0.0,0.0,0.0,,,


In [44]:
def calc_pct(row):
    """Return row['home_ahead'] divided by row['total_events'].

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing numeric
            'home_ahead' and 'total_events' entries.

    Returns:
        The quotient of the two entries.
    """
    ahead_events = row['home_ahead']
    all_events = row['total_events']
    return ahead_events / all_events

In [47]:
# Share of each game's events with the home team ahead, computed row by row.
total_calcs['pct_home_ahead'] = total_calcs.apply(calc_pct, axis=1)

In [48]:
total_calcs.head()

Unnamed: 0,GAME_ID,total_events,home_ahead,pct_home_ahead
0,20001149,167,15,0.08982
1,20000923,161,78,0.484472
2,20000936,155,109,0.703226
3,20000432,155,155,1.0
4,20001024,151,30,0.198675


Now let's join everything together and save to a database

In [49]:
# One row per game: whole-game totals joined with the per-period breakdown.
processed_df = total_calcs.merge(period_calcs, on='GAME_ID', how='inner')

In [50]:
processed_df.head()

Unnamed: 0,GAME_ID,total_events,home_ahead,pct_home_ahead,period_1_events,period_2_events,period_3_events,period_4_events,period_5_events,period_6_events,period_7_events,period_1_ahead,period_2_ahead,period_3_ahead,period_4_ahead,period_5_ahead,period_6_ahead,period_7_ahead
0,20001149,167,15,0.08982,33.0,37.0,43.0,35.0,19.0,,,8.0,0.0,0.0,7.0,0.0,,
1,20000923,161,78,0.484472,28.0,37.0,31.0,23.0,15.0,13.0,14.0,7.0,26.0,25.0,0.0,5.0,2.0,13.0
2,20000936,155,109,0.703226,29.0,40.0,35.0,36.0,15.0,,,3.0,38.0,35.0,32.0,1.0,,
3,20000432,155,155,1.0,32.0,39.0,35.0,49.0,,,,32.0,39.0,35.0,49.0,,,
4,20001024,151,30,0.198675,30.0,34.0,32.0,41.0,14.0,,,10.0,0.0,0.0,10.0,10.0,,
