In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from interval import interval
from src.get_data import get_clean_player_data

# Read in Ids as strings - Numeric deletes first zeros!

In [3]:
df_schedule = pd.read_csv("./data/season_prediction/schedule.csv", dtype={'GAME_ID':str})
df_boxscores = pd.read_csv("./data/season_prediction/boxscores.csv", dtype={'GAME_ID':str})

In [39]:
df_schedule

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,is_home_game
0,0,0,22014,1610612742,DAL,Dallas Mavericks,0021400002,2014-10-28,DAL @ SAS,L,...,33,17,9,3,10,20,100,-1,1,False
1,1,1,22014,1610612759,SAS,San Antonio Spurs,0021400002,2014-10-28,SAS vs. DAL,W,...,38,23,5,3,21,20,101,1,1,True
2,2,2,22014,1610612747,LAL,Los Angeles Lakers,0021400003,2014-10-28,LAL vs. HOU,L,...,36,16,7,3,13,32,90,-18,1,True
3,3,3,22014,1610612745,HOU,Houston Rockets,0021400003,2014-10-28,HOU @ LAL,W,...,47,22,7,3,14,30,108,18,1,False
4,4,4,22014,1610612740,NOP,New Orleans Pelicans,0021400001,2014-10-28,NOP vs. ORL,W,...,62,20,10,17,9,17,101,17,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16573,2155,2155,22020,1610612761,TOR,Toronto Raptors,0022001079,2021-05-16,TOR vs. IND,L,...,48,21,8,3,12,12,113,-12,1,True
16574,2156,2156,22020,1610612746,LAC,LA Clippers,0022001074,2021-05-16,LAC @ OKC,L,...,44,17,8,3,3,14,112,-5,1,False
16575,2157,2157,22020,1610612760,OKC,Oklahoma City Thunder,0022001074,2021-05-16,OKC vs. LAC,W,...,54,20,1,12,15,11,117,5,1,True
16576,2158,2158,22020,1610612747,LAL,Los Angeles Lakers,0022001072,2021-05-16,LAL @ NOP,W,...,42,30,14,5,10,16,110,12,1,False


In [4]:
df_boxscores['GAME_ID']

0         0021400002
1         0021400002
2         0021400002
3         0021400002
4         0021400002
             ...    
228157    0022001072
228158    0022001072
228159    0022001072
228160    0022001072
228161    0022001072
Name: GAME_ID, Length: 228162, dtype: object

> Get the pbp data for one game

In [5]:
import re
from nba_api.stats.endpoints import playbyplayv2

game_id = "0022001074"

call = playbyplayv2.PlayByPlayV2(game_id=game_id, start_period=1, end_period=4)
data = pd.concat(call.get_data_frames())
data.drop(data.tail(1).index,inplace=True)

# transform time
data['time_real_tmp'] = data['PCTIMESTRING'].apply(lambda x: re.sub(string=x, repl="", pattern=":"))
data['time_real_tmp'] = np.abs(data['time_real_tmp'].astype(float) * (5 - data['PERIOD'].astype(float)) - 4800)

data['HOME_PTS'] = data[data['EVENTMSGTYPE'] == 1]['SCORE'].apply(lambda x: x.split(' - ')[0])
data['AWAY_PTS'] = data[data['EVENTMSGTYPE'] == 1]['SCORE'].apply(lambda x: x.split(' - ')[1])

# get the starters of the game
starters = df_boxscores[df_boxscores['GAME_ID'] == 22001074]
starters = starters[~starters['START_POSITION'].isna()]

> create stint_marker

In [6]:
# create the stint_marker
bool_subs = data['EVENTMSGTYPE'] == 8
data.loc[bool_subs, 'stint_marker'] = np.arange(start=1, stop=np.sum(bool_subs)+1)

In [7]:
data['time_real_tmp']

1         0.0
2       184.0
3       184.0
4       228.0
5       288.0
        ...  
437    4795.0
438    4795.0
439    4800.0
440    4800.0
441    4800.0
Name: time_real_tmp, Length: 441, dtype: float64

> create the stint variable

In [8]:
# create idx array
inter_id = np.where(~data['stint_marker'].isna())[0] + 1
last_id = data.shape[0]
first_id = 0

# idx
idx = np.where(~data['stint_marker'].isna())[0] + 1

In [9]:
idx

array([  3,  46,  53,  58,  59,  71,  72,  81, 127, 144, 145, 156, 157,
       163, 172, 173, 179, 197, 239, 240, 282, 283, 284, 299, 321, 322,
       351, 372, 373])

In [10]:
list_stint = list()

for i, ids in enumerate(idx):
    # first iteration
    if i == 0:
        tmp = data.iloc[:idx[0]]
    
    # in between
    else:
        tmp = data.iloc[idx[i-1]:idx[i]]
    
    # append to list
    list_stint.append(tmp)

# append the last iteration
list_stint.append(data.iloc[idx[-1]:])

# insert the stint counter
for i, stint in enumerate(list_stint):
    stint['stint'] = i

data = pd.concat(list_stint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stint['stint'] = i


In [11]:
game_stints = data['stint'].unique()

# Filtert stints raus, in denen nicht gescored wurde (nur zur validierung)

In [12]:
# filter scoring events
data_score = data[data['EVENTMSGTYPE'] == 1]

# create data_pts_stint, handle stints where there was no scoring
tmp_pts_stint = data_score.groupby('stint')[['HOME_PTS', 'AWAY_PTS']].max()
tmp_pts_stint['stint_merge'] = tmp_pts_stint.index
tmp2_pts_stint = pd.DataFrame(data={'HOME_PTS':None, 'AWAY_PTS':None, 'stint':game_stints})
tmp_merge = pd.merge(tmp2_pts_stint, tmp_pts_stint, how='left', left_on='stint', right_on='stint_merge', suffixes=("_drop", None))
data_pts_stint = tmp_merge.drop(['HOME_PTS_drop', 'AWAY_PTS_drop', 'stint_merge'], axis=1)
data_pts_stint = data_pts_stint[~data_pts_stint['stint'].isna()]

In [13]:
data_pts_stint

Unnamed: 0,stint,HOME_PTS,AWAY_PTS
0,0,,
1,1,8.0,9.0
2,2,10.0,23.0
3,3,13.0,23.0
4,4,,
5,5,20.0,30.0
6,6,,
7,7,24.0,33.0
8,8,38.0,41.0
9,9,39.0,43.0


> get substitutions!

In [14]:
subs = data.loc[bool_subs, ['PLAYER1_ID', 'PLAYER2_ID', 'stint', 'stint_marker']] # all substitutions!
subs

Unnamed: 0,PLAYER1_ID,PLAYER2_ID,stint,stint_marker
3,1627826.0,201586.0,0,1.0
46,201586.0,1630187.0,1,2.0
53,202704.0,1630206.0,2,3.0
58,1630197.0,1629658.0,3,4.0
59,1629676.0,1629718.0,4,5.0
71,201976.0,1629611.0,5,6.0
72,1630177.0,1630197.0,6,7.0
81,1628379.0,1627812.0,7,8.0
127,1630197.0,1630221.0,8,9.0
144,1629658.0,1630197.0,9,10.0


In [15]:
game = df_boxscores[df_boxscores['GAME_ID'] == game_id] # get the games boxscore

In [16]:
game

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,...,E_PACE,PACE,PACE_PER40,POSS,PIE,TEAM_NAME,E_TM_TOV_PCT,Unnamed: 0.1.1,player_game_index,len_game_id
228103,30960,30960,22001074,1610612746,LAC,LA,1628379.0,Luke Kennard,Luke,F,...,103.97,103.49,86.24,65,0.084,,,30960.0,30960.0,22001074
228104,30961,30961,22001074,1610612746,LAC,LA,202335.0,Patrick Patterson,Patrick,F,...,101.36,100.33,83.61,88,0.067,,,30961.0,30961.0,22001074
228105,30962,30962,22001074,1610612746,LAC,LA,1627826.0,Ivica Zubac,Ivica,C,...,0.0,240.0,200.0,0,1.0,,,30962.0,30962.0,22001074
228106,30963,30963,22001074,1610612746,LAC,LA,201976.0,Patrick Beverley,Patrick,G,...,104.67,108.13,90.11,43,0.104,,,30963.0,30963.0,22001074
228107,30964,30964,22001074,1610612746,LAC,LA,202704.0,Reggie Jackson,Reggie,G,...,101.36,104.85,87.38,15,0.085,,,30964.0,30964.0,22001074
228108,30965,30965,22001074,1610612746,LAC,LA,201586.0,Serge Ibaka,Serge,,...,100.52,98.47,82.06,39,0.152,,,30965.0,30965.0,22001074
228109,30966,30966,22001074,1610612746,LAC,LA,1630187.0,Daniel Oturu,Daniel,,...,100.29,98.84,82.37,77,0.032,,,30966.0,30966.0,22001074
228110,30967,30967,22001074,1610612746,LAC,LA,1630206.0,Jay Scrubb,Jay,,...,99.01,97.68,81.4,74,0.058,,,30967.0,30967.0,22001074
228111,30968,30968,22001074,1610612746,LAC,LA,1629611.0,Terance Mann,Terance,,...,101.43,101.31,84.42,48,0.199,,,30968.0,30968.0,22001074
228112,30969,30969,22001074,1610612746,LAC,LA,1627812.0,Yogi Ferrell,Yogi,,...,96.86,95.46,79.55,54,0.123,,,30969.0,30969.0,22001074


> get starters

In [17]:
# get the starters of the game
starter = game[~game['START_POSITION'].isna()]
starter

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,...,E_PACE,PACE,PACE_PER40,POSS,PIE,TEAM_NAME,E_TM_TOV_PCT,Unnamed: 0.1.1,player_game_index,len_game_id
228103,30960,30960,22001074,1610612746,LAC,LA,1628379.0,Luke Kennard,Luke,F,...,103.97,103.49,86.24,65,0.084,,,30960.0,30960.0,22001074
228104,30961,30961,22001074,1610612746,LAC,LA,202335.0,Patrick Patterson,Patrick,F,...,101.36,100.33,83.61,88,0.067,,,30961.0,30961.0,22001074
228105,30962,30962,22001074,1610612746,LAC,LA,1627826.0,Ivica Zubac,Ivica,C,...,0.0,240.0,200.0,0,1.0,,,30962.0,30962.0,22001074
228106,30963,30963,22001074,1610612746,LAC,LA,201976.0,Patrick Beverley,Patrick,G,...,104.67,108.13,90.11,43,0.104,,,30963.0,30963.0,22001074
228107,30964,30964,22001074,1610612746,LAC,LA,202704.0,Reggie Jackson,Reggie,G,...,101.36,104.85,87.38,15,0.085,,,30964.0,30964.0,22001074
228118,30975,30975,22001074,1610612760,OKC,Oklahoma City,1630221.0,Josh Hall,Josh,F,...,101.87,101.69,84.74,91,0.08,,,30975.0,30975.0,22001074
228119,30976,30976,22001074,1610612760,OKC,Oklahoma City,1629676.0,Isaiah Roby,Isaiah,F,...,104.74,106.96,89.13,27,0.19,,,30976.0,30976.0,22001074
228120,30977,30977,22001074,1610612760,OKC,Oklahoma City,1629650.0,Moses Brown,Moses,C,...,102.95,102.72,85.6,81,0.237,,,30977.0,30977.0,22001074
228121,30978,30978,22001074,1610612760,OKC,Oklahoma City,1630197.0,Aleksej Pokusevski,Aleksej,G,...,100.04,99.71,83.1,85,0.122,,,30978.0,30978.0,22001074
228122,30979,30979,22001074,1610612760,OKC,Oklahoma City,1630177.0,Theo Maledon,Theo,G,...,105.98,109.8,91.5,31,0.054,,,30979.0,30979.0,22001074


In [18]:
# get home team bool and id of the games home team
df_schedule['is_home_game'] = df_schedule['MATCHUP'].apply(lambda x: x.find('@') == -1)
tmp_ht = df_schedule[df_schedule['GAME_ID'] == game_id]
home_team_id = tmp_ht[tmp_ht['is_home_game']]['TEAM_ID'].values[0]

# get the ids of the home teams players
home_team_players = game[game['TEAM_ID'] == home_team_id]['PLAYER_ID'].values

In [19]:
home_team_players

array([1630221., 1629676., 1629650., 1630197., 1630177., 1629658.,
       1629718., 1628396., 1630466., 1629647., 1629660., 1629004.,
       1629026.,      nan])

In [20]:
starters = starter['PLAYER_ID'].values

In [21]:
def stint_lineup(starter_array, subs):
    
    starters = None
    starters = starter_array.copy()
    
    store = list()
    store.append(starters)
    
    out_player = subs['PLAYER1_ID'].tolist()
    in_player = subs['PLAYER2_ID'].tolist()
    
    for i, _ in enumerate(out_player):
        
        if i == 0:
            new_lineup = starters
        
        current_lineup = new_lineup
        new_lineup = np.where(current_lineup == out_player[i], in_player[i], current_lineup)
        
        store.append(new_lineup)
        
    mat = np.stack(store, axis=0)
    _, counts = np.unique(mat, axis=1, return_counts=True)
    
    assert np.all(counts == 1), "Same player multiple times on the court"
    assert mat.shape[1] == 10, "There are not 10 players on the court"
    
    return(mat)

In [22]:
mat = stint_lineup(starter_array=starters, subs=subs)
mat.shape

(30, 10)

In [23]:
# create column-names and dataframe
colnames_lineup = ['HOME_' + str(i) for i in np.arange(start=1, stop=6)]
colnames_lineup.extend(['AWAY_' + str(i) for i in np.arange(start=1, stop=6)])
game_lineups = pd.DataFrame(data=mat, columns=colnames_lineup)

# store stints in vector
stint = np.arange(game_lineups.shape[0])

In [24]:
game_lineups

Unnamed: 0,HOME_1,HOME_2,HOME_3,HOME_4,HOME_5,AWAY_1,AWAY_2,AWAY_3,AWAY_4,AWAY_5
0,1628379.0,202335.0,1627826.0,201976.0,202704.0,1630221.0,1629676.0,1629650.0,1630197.0,1630177.0
1,1628379.0,202335.0,201586.0,201976.0,202704.0,1630221.0,1629676.0,1629650.0,1630197.0,1630177.0
2,1628379.0,202335.0,1630187.0,201976.0,202704.0,1630221.0,1629676.0,1629650.0,1630197.0,1630177.0
3,1628379.0,202335.0,1630187.0,201976.0,1630206.0,1630221.0,1629676.0,1629650.0,1630197.0,1630177.0
4,1628379.0,202335.0,1630187.0,201976.0,1630206.0,1630221.0,1629676.0,1629650.0,1629658.0,1630177.0
5,1628379.0,202335.0,1630187.0,201976.0,1630206.0,1630221.0,1629718.0,1629650.0,1629658.0,1630177.0
6,1628379.0,202335.0,1630187.0,1629611.0,1630206.0,1630221.0,1629718.0,1629650.0,1629658.0,1630177.0
7,1628379.0,202335.0,1630187.0,1629611.0,1630206.0,1630221.0,1629718.0,1629650.0,1629658.0,1630197.0
8,1627812.0,202335.0,1630187.0,1629611.0,1630206.0,1630221.0,1629718.0,1629650.0,1629658.0,1630197.0
9,1627812.0,202335.0,1630187.0,1629611.0,1630206.0,1630221.0,1629718.0,1629650.0,1629658.0,1630221.0


In [25]:
# save as long format
game_lineups_long = pd.melt(game_lineups)
game_lineups_long['value'] = game_lineups_long['value'].astype(str) 

# transform to numpy matrix
game_lineups = game_lineups.to_numpy()
game_lineups_shape = game_lineups.shape

# get unique players used in the game
player_used = np.unique(game_lineups_long['value'])
n_player_used = player_used.shape[0]

# create vector indicating home team players
cond_ht_players = np.isin(player_used.astype(float), home_team_players)

In [26]:
player_used.shape

(18,)

In [27]:
home_team_players.shape[0]

14

In [28]:
game_lineups

array([[1628379.,  202335., 1627826.,  201976.,  202704., 1630221.,
        1629676., 1629650., 1630197., 1630177.],
       [1628379.,  202335.,  201586.,  201976.,  202704., 1630221.,
        1629676., 1629650., 1630197., 1630177.],
       [1628379.,  202335., 1630187.,  201976.,  202704., 1630221.,
        1629676., 1629650., 1630197., 1630177.],
       [1628379.,  202335., 1630187.,  201976., 1630206., 1630221.,
        1629676., 1629650., 1630197., 1630177.],
       [1628379.,  202335., 1630187.,  201976., 1630206., 1630221.,
        1629676., 1629650., 1629658., 1630177.],
       [1628379.,  202335., 1630187.,  201976., 1630206., 1630221.,
        1629718., 1629650., 1629658., 1630177.],
       [1628379.,  202335., 1630187., 1629611., 1630206., 1630221.,
        1629718., 1629650., 1629658., 1630177.],
       [1628379.,  202335., 1630187., 1629611., 1630206., 1630221.,
        1629718., 1629650., 1629658., 1630197.],
       [1627812.,  202335., 1630187., 1629611., 1630206., 163022

In [29]:
# init ohe matrix
ohe = np.zeros((game_lineups_shape[0], n_player_used))

# loop over the players used and create dummy variable for each one per stint
for i, player in enumerate(player_used):
    
    # home player gets a 1
    if cond_ht_players[i]:
        ohe[:,i] = np.sum((game_lineups == float(player)), axis=1)
    
    # away player gets a -1
    else:
        ohe[:,i] = np.sum((game_lineups == float(player)), axis=1) * (-1)
    
assert np.all(np.abs(ohe).sum(axis=1) == 10), "In some stint, there are not 10 players on the court"
# assert np.all(np.abs(ohe).max() == 1), "Players have been counted multiple times"

In [30]:
# dirty hotfix - delete later
ohe[ohe > 1] = 1
ohe[ohe < -1] = -1

In [31]:
ohe

array([[ 0., -1., -1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
         0.,  0., -1., -1., -1.],
       [ 0.,  0., -1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
         0., -1., -1., -1., -1.],
       [ 0.,  0., -1.,  0.,  1.,  0.,  1.,  0.,  1., -1.,  1.,  0.,  1.,
         0.,  0., -1., -1., -1.],
       [ 0.,  0., -1.,  0.,  1.,  0.,  1.,  0.,  1., -1.,  1., -1.,  1.,
         0.,  0., -1., -1.,  0.],
       [ 0.,  0., -1.,  0.,  1.,  1.,  1.,  0.,  1., -1.,  0., -1.,  1.,
         0.,  0., -1., -1.,  0.],
       [ 0.,  0., -1.,  0.,  1.,  1.,  0.,  1.,  1., -1.,  0., -1.,  1.,
         0.,  0., -1., -1.,  0.],
       [ 0.,  0., -1., -1.,  1.,  1.,  0.,  1.,  1., -1.,  0., -1.,  1.,
         0.,  0.,  0., -1.,  0.],
       [ 0.,  0., -1., -1.,  1.,  1.,  0.,  1.,  0., -1.,  1., -1.,  1.,
         0.,  0.,  0., -1.,  0.],
       [-1.,  0.,  0., -1.,  1.,  1.,  0.,  1.,  0., -1.,  1., -1.,  1.,
         0.,  0.,  0., -1.,  0.],
       [-1.,  0.,  0., -1.,  1.,  1.,

In [32]:
# transform to dataframe
data_player_stint = pd.DataFrame(ohe, columns=player_used)

# add additional data
data_player_stint['stint'] = data_player_stint.index
data_player_stint['GAME_ID'] = game_id

In [33]:
data_player_stint

Unnamed: 0,1627812.0,1627826.0,1628379.0,1629611.0,1629650.0,1629658.0,1629676.0,1629718.0,1630177.0,1630187.0,1630197.0,1630206.0,1630221.0,1630466.0,201586.0,201976.0,202335.0,202704.0,stint,GAME_ID
0,0.0,-1.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,0,22001074
1,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,1,22001074
2,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.0,1.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,2,22001074
3,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.0,1.0,-1.0,1.0,0.0,0.0,-1.0,-1.0,0.0,3,22001074
4,0.0,0.0,-1.0,0.0,1.0,1.0,1.0,0.0,1.0,-1.0,0.0,-1.0,1.0,0.0,0.0,-1.0,-1.0,0.0,4,22001074
5,0.0,0.0,-1.0,0.0,1.0,1.0,0.0,1.0,1.0,-1.0,0.0,-1.0,1.0,0.0,0.0,-1.0,-1.0,0.0,5,22001074
6,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0,1.0,1.0,-1.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,6,22001074
7,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,7,22001074
8,-1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,8,22001074
9,-1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,9,22001074


In [34]:
# impute missing values
data_pts = data_pts_stint.fillna(method='ffill')

data_pts[data_pts['HOME_PTS'].isna()] = 0
data_pts[data_pts['AWAY_PTS'].isna()] = 0

data_pts['HOME_PTS'] = data_pts['HOME_PTS'].astype(float)
data_pts['AWAY_PTS'] = data_pts['AWAY_PTS'].astype(float)

# plus minus
data_pts['HOME_PLUS_MINUS'] = data_pts['HOME_PTS'] - data_pts['AWAY_PTS']
data_pts['AWAY_PLUS_MINUS'] = data_pts['AWAY_PTS'] - data_pts['HOME_PTS']

# delta of plus-minus
data_pts['HOME_PM_DIFF'] = data_pts['HOME_PLUS_MINUS'].diff()
data_pts['AWAY_PM_DIFF'] = data_pts['AWAY_PLUS_MINUS'].diff()
data_pts[data_pts['HOME_PM_DIFF'].isna()] = 0
data_pts[data_pts['AWAY_PM_DIFF'].isna()] = 0

In [35]:
data_pts

Unnamed: 0,stint,HOME_PTS,AWAY_PTS,HOME_PLUS_MINUS,AWAY_PLUS_MINUS,HOME_PM_DIFF,AWAY_PM_DIFF
0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,8.0,9.0,-1.0,1.0,-1.0,1.0
2,2,10.0,23.0,-13.0,13.0,-12.0,12.0
3,3,13.0,23.0,-10.0,10.0,3.0,-3.0
4,4,13.0,23.0,-10.0,10.0,0.0,0.0
5,5,20.0,30.0,-10.0,10.0,0.0,0.0
6,6,20.0,30.0,-10.0,10.0,0.0,0.0
7,7,24.0,33.0,-9.0,9.0,1.0,-1.0
8,8,38.0,41.0,-3.0,3.0,6.0,-6.0
9,9,39.0,43.0,-4.0,4.0,-1.0,1.0


In [36]:
data_stint = pd.merge(data_player_stint, data_pts)
data_stint

Unnamed: 0,1627812.0,1627826.0,1628379.0,1629611.0,1629650.0,1629658.0,1629676.0,1629718.0,1630177.0,1630187.0,...,202335.0,202704.0,stint,GAME_ID,HOME_PTS,AWAY_PTS,HOME_PLUS_MINUS,AWAY_PLUS_MINUS,HOME_PM_DIFF,AWAY_PM_DIFF
0,0.0,-1.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-1.0,-1.0,0,22001074,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,-1.0,-1.0,1,22001074,8.0,9.0,-1.0,1.0,-1.0,1.0
2,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.0,...,-1.0,-1.0,2,22001074,10.0,23.0,-13.0,13.0,-12.0,12.0
3,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,-1.0,...,-1.0,0.0,3,22001074,13.0,23.0,-10.0,10.0,3.0,-3.0
4,0.0,0.0,-1.0,0.0,1.0,1.0,1.0,0.0,1.0,-1.0,...,-1.0,0.0,4,22001074,13.0,23.0,-10.0,10.0,0.0,0.0
5,0.0,0.0,-1.0,0.0,1.0,1.0,0.0,1.0,1.0,-1.0,...,-1.0,0.0,5,22001074,20.0,30.0,-10.0,10.0,0.0,0.0
6,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0,1.0,1.0,-1.0,...,-1.0,0.0,6,22001074,20.0,30.0,-10.0,10.0,0.0,0.0
7,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,...,-1.0,0.0,7,22001074,24.0,33.0,-9.0,9.0,1.0,-1.0
8,-1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,...,-1.0,0.0,8,22001074,38.0,41.0,-3.0,3.0,6.0,-6.0
9,-1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,0.0,-1.0,...,-1.0,0.0,9,22001074,39.0,43.0,-4.0,4.0,-1.0,1.0


Real Plus Minus Rating as an example:
- Regress the Real Adjusted Plus Minus unto traditional statistics such as PTS, REB ...


In [37]:
import re
def extract_substitution(data, col, i):
    
    # get player who is subbed in
    in_player = (data[col].values[i].split("SUB: "))[1].split(" FOR")[0]
    
    # get player who is subbed out
    out_player = re.sub(string=data[col].values[i], repl="", pattern=f'SUB: {in_player} FOR ')
    
    return [in_player, out_player]

In [38]:
extract_substitution(data=data_subs, col='HOMEDESCRIPTION', i = 3)

NameError: name 'data_subs' is not defined

In [None]:
def find_between(s, start, end):
    return (s.split(start))[1].split(end)[0]

In [None]:
import re
in_player = find_between(data_subs['HOMEDESCRIPTION'].values[3], start="SUB: ", end=" FOR")
re.sub(string=data_subs['HOMEDESCRIPTION'].values[3], repl="", pattern=f'SUB: {in_player} FOR ')

In [None]:
data_subs = data[data['EVENTMSGTYPE'] == 8]

In [None]:
data_subs