In [511]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [512]:
df_schedule = pd.read_csv("./data/season_prediction/schedule.csv", dtype={'GAME_ID': str})
df_boxscores = pd.read_csv("./data/season_prediction/boxscores.csv", dtype={'GAME_ID': str})

In [513]:
from nba_api.stats.endpoints import playbyplayv2
import time
from tqdm import tqdm
def load_data_pbp(season_ids, limit=None):
    
    # load the data
    df_schedule = pd.read_csv("./data/season_prediction/schedule.csv", dtype={'GAME_ID': str})
    df_boxscores = pd.read_csv("./data/season_prediction/boxscores.csv", dtype={'GAME_ID': str})
    
    print(f'Get data for the season_ids: {season_ids}')
    
    #game_ids = ["0022001074", "0021900001"]
    game_ids = df_schedule[df_schedule['SEASON_ID'].isin(season_ids)]['GAME_ID'].unique()
    
    if limit is not None:
        game_ids = game_ids[0:limit]
    
    list_data = list()

    for game in tqdm(game_ids):
        
        try:
            # pbp call, append
            call = playbyplayv2.PlayByPlayV2(game_id=game, start_period=1, end_period=4)
            data_load = pd.concat(call.get_data_frames())
            list_data.append(data_load)
        
        except:
            print(f"Skipped the ID: {game}")
            next
        
        # sleep
        time.sleep(0.75)

    data_load = pd.concat(list_data)
    data_load = data_load[~data_load['GAME_ID'].isna()]
    
    return data_load

In [514]:
data_load = load_data_pbp(season_ids=[22020, 22019, 22018])

  0%|          | 0/3369 [00:00<?, ?it/s]

Get data for the season_ids: [22020, 22019, 22018]


 43%|████▎     | 1436/3369 [1:01:04<55:11,  1.71s/it]  

Skipped the ID: 0021900204


 90%|████████▉ | 3029/3369 [1:54:31<09:01,  1.59s/it]    

Skipped the ID: 0022000764


 91%|█████████ | 3065/3369 [1:56:12<11:44,  2.32s/it]

Skipped the ID: 0022000795


 92%|█████████▏| 3108/3369 [1:57:52<07:25,  1.71s/it]

Skipped the ID: 0022000837


 93%|█████████▎| 3124/3369 [1:58:49<07:35,  1.86s/it]

Skipped the ID: 0022000849


 93%|█████████▎| 3135/3369 [1:59:35<06:40,  1.71s/it]

Skipped the ID: 0022000861


 94%|█████████▍| 3165/3369 [2:00:53<05:41,  1.67s/it]

Skipped the ID: 0022000895


100%|██████████| 3369/3369 [2:07:00<00:00,  2.26s/it]


In [515]:
data = data_load

In [10]:
df_schedule['SEASON_ID'].unique()

array([22014, 22015, 22016, 22017, 22018, 22019, 22020])

In [11]:
data

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,PLAYER2_TEAM_NICKNAME,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,VIDEO_AVAILABLE_FLAG
0,0021900002,2.0,12.0,0.0,1.0,10:43 PM,12:00,,Start of 1st Period (10:43 PM EST),,...,,,0.0,0.0,,,,,,1
1,0021900002,4.0,10.0,0.0,1.0,10:43 PM,12:00,Jump Ball Zubac vs. McGee: Tip to James,,,...,Lakers,LAL,5.0,2544.0,LeBron James,1.610613e+09,Los Angeles,Lakers,LAL,1
2,0021900002,7.0,1.0,5.0,1.0,10:43 PM,11:47,,,James 2' Layup (2 PTS) (Davis 1 AST),...,Lakers,LAL,0.0,0.0,,,,,,1
3,0021900002,9.0,2.0,79.0,1.0,10:43 PM,11:40,MISS Shamet 27' 3PT Pullup Jump Shot,,,...,,,0.0,0.0,,,,,,1
4,0021900002,10.0,4.0,0.0,1.0,10:43 PM,11:38,,,Davis REBOUND (Off:0 Def:1),...,,,0.0,0.0,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,0021900499,650.0,3.0,11.0,4.0,7:20 PM,0:35,,,Motley Free Throw 1 of 2 (1 PTS),...,,,0.0,0.0,,,,,,1
453,0021900499,651.0,3.0,12.0,4.0,7:20 PM,0:35,,,Motley Free Throw 2 of 2 (2 PTS),...,,,0.0,0.0,,,,,,1
454,0021900499,652.0,5.0,45.0,4.0,7:20 PM,0:25,Giles III Out of Bounds - Bad Pass Turnover Tu...,,,...,,,1.0,0.0,,,,,,1
455,0021900499,653.0,5.0,11.0,4.0,7:20 PM,0:01,,,Clippers Turnover: Shot Clock (T#12),...,,,1.0,0.0,,,,,,0


In [516]:
def preprocessing_stint_data(data):
    
    # transform time, get game_time_left in seconds
    data['game_time_s'] = 60*data['PCTIMESTRING'].apply(lambda x: str(x).split(':')).apply(lambda x: x[0]).astype(float) + data['PCTIMESTRING'].apply(lambda x: str(x).split(':')).apply(lambda x: x[-1]).astype(float)
    data['game_time_left'] = (5 - data['PERIOD'])*data['game_time_s']
    data = data[~data['GAME_ID'].isna()]
    data = data[data['NEUTRALDESCRIPTION'].isna()]

    bool_ft = (data['EVENTMSGTYPE'] == 3).to_numpy() # Free-throws
    bool_fgm = (data['EVENTMSGTYPE'] == 1).to_numpy() # Field goals made
    bool_sub = (data['EVENTMSGTYPE'] == 8).to_numpy() # substitutions
    bool_tech = ((data['EVENTMSGTYPE'] == 6) & (data['EVENTMSGACTIONTYPE'].isin([10, 11, 16, 18, 25]))).to_numpy() # technical fouls
    bool_eject_tech = ((data['EVENTMSGTYPE'] == 11) & (data['EVENTMSGACTIONTYPE'] == 1)).to_numpy() # ejct 2nd technical fouls
    bool_ingame_plays = ~(bool_sub | bool_tech | bool_eject_tech)
    bool_away = (data['HOMEDESCRIPTION'].isna()).to_numpy()
    bool_home = (data['VISITORDESCRIPTION'].isna()).to_numpy()

    data.loc[bool_away, 'TEAM_LOCATION'] = "AWAY"
    data.loc[bool_home, 'TEAM_LOCATION'] = "HOME"
    
    return data, bool_ingame_plays

In [517]:
data, bool_ingame_plays = preprocessing_stint_data(data)

In [518]:
id_vars=['GAME_ID','EVENTNUM', 'PERIOD', 'TEAM_LOCATION', 'PLAYER1_TEAM_ID']

In [519]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
def create_stint(data):

    # create the stint_marker
    bool_subs = data['EVENTMSGTYPE'] == 8
    data.loc[bool_subs, 'stint_marker'] = np.arange(start=1, stop=np.sum(bool_subs) + 1)

    # create idx array
    inter_id = np.where(~data['stint_marker'].isna())[0] + 1
    last_id = data.shape[0]
    first_id = 0

    # idx
    idx = np.where(~data['stint_marker'].isna())[0] + 1

    list_stint = list()

    for i, ids in tqdm(enumerate(idx)):

        # first iteration
        if i == 0:
            tmp = data.iloc[:idx[0]]

        # in between
        else:
            tmp = data.iloc[idx[i - 1]:idx[i]]

        # append to list
        list_stint.append(tmp)

    # append the last iteration
    stint = data.iloc[idx[-1]:]
    stint['stint'] = i
    list_stint.append(stint)
    print(i)
    # insert the stint counter
    #for i, stint in enumerate(list_stint):
    #    stint['stint'] = i

    return pd.concat(list_stint)

In [520]:
def get_score(data):
    
    bool_score = ~data['SCORE'].isna()
    
    data.loc[bool_score, 'HOME_PTS'] = data[bool_score]['SCORE'].apply(lambda x: x.split(' - ')[0]).astype(int)
    data.loc[bool_score, 'AWAY_PTS'] = data[bool_score]['SCORE'].apply(lambda x: x.split(' - ')[1]).astype(int)
    
    data['HOME_PM'] = data['HOME_PTS'] - data['AWAY_PTS']
    data['AWAY_PM'] = data['AWAY_PTS'] - data['HOME_PTS']
    
    return data

In [521]:
def estimate_possessions(data, bool_ingame_plays):
    # get the estimated possessions - potential to be more accurate! Change forumula here
    tmp = np.floor((data[bool_ingame_plays].groupby(['GAME_ID', 'TEAM_LOCATION', 'stint']).count()/2))['EVENTNUM'].reset_index()
    tmp = tmp.rename(columns={'EVENTNUM':'EST_POSSESSIONS'})
    
    # merge to dataframe
    tmp2 = pd.merge(data, tmp, on=['GAME_ID', 'TEAM_LOCATION', 'stint'], how='left')
    bool_poss_na = tmp2['EST_POSSESSIONS'].isna()
    
    # set "empty stints" to zero
    tmp2.loc[bool_poss_na, 'EST_POSSESSIONS'] = None
    
    return tmp2

In [522]:
def estimate_pm_100(data):
    # formula for Plus Minus per 100 possessions
    data['HOME_PM_100'] = np.floor(data['HOME_PM'] * (100 / data['EST_POSSESSIONS']))
    data['AWAY_PM_100'] = np.floor(data['AWAY_PM'] * (100 / data['EST_POSSESSIONS']))
    
    # replace inf with nan
    data.loc[np.isinf(data['HOME_PM_100']), 'HOME_PM_100'] = np.nan
    data.loc[np.isinf(data['AWAY_PM_100']), 'AWAY_PM_100'] = np.nan
    
    return data

In [523]:
import time
from tqdm import tqdm
from nba_api.stats.endpoints import boxscoreadvancedv2
def get_roster_and_starters(data):
    
    list_starters = list()
    list_roster = list()
    
    unique_games = data['GAME_ID'].unique()
    
    for game in tqdm(unique_games):

            try:
                call_boxscore = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game)
                game = pd.concat(call_boxscore.get_data_frames())
            
            except:
                print(f'ID {game} got skipped')
                next
                
            # get home or away
            away_team_id = game['TEAM_ID'].unique()[0] # maybe replace by table that has TEAM_LOCATION for all games
            game['TEAM_LOCATION'] = np.where(game['TEAM_ID'] == away_team_id, 'AWAY', 'HOME')
            
            tmp2 = game[~game['START_POSITION'].isna()][['START_POSITION', 'PLAYER_ID', 'GAME_ID', 'TEAM_ID', 'TEAM_LOCATION']]
            tmp2['STARTER'] = np.where((tmp2['START_POSITION'] == ""), False, True)
            tmp2 = tmp2.drop_duplicates()  # just in case
            
            list_starters.append(tmp2[tmp2['STARTER']])
            list_roster.append(tmp2)
            
            time.sleep(0.75)
    
    return pd.concat(list_starters), pd.concat(list_roster)

In [524]:
def get_all_subs(data):
    bool_sub = data['EVENTMSGTYPE'] == 8
    subs = data.loc[bool_sub, ['GAME_ID', 'stint', 'game_time_left', 'TEAM_LOCATION', 'PLAYER1_ID', 'PLAYER2_ID']]
    subs = subs.rename(columns={'PLAYER1_ID':'PLAYER_OUT_ID', 'PLAYER2_ID':'PLAYER_IN_ID'})
    return subs.sort_values(['GAME_ID', 'stint'])

In [525]:
def get_starting_lineup(starters):
    
    tmp = starters.groupby(['GAME_ID', 'TEAM_ID'])['PLAYER_ID'].unique().reset_index()
    starting_lineup = tmp.explode('PLAYER_ID')
    starting_lineup['stint'] = 0
    
    return starting_lineup

In [526]:
from tqdm import tqdm
def get_on_court(data, starters, sub):
    
    # init
    game_ids = data['GAME_ID'].unique()
    list_on_court = list()
    list_game_id = list()
    list_stint = list()
    
    # loop over all games
    for game_id in tqdm(game_ids):
        
        # form subsets
        game = data[data['GAME_ID'] == game_id]
        starting_lineup = starters[starters['GAME_ID'] == game_id]
        subs = sub[sub['GAME_ID'] == game_id]
        
        # prepare stints for the loop
        a = game['stint_marker'].unique()
        tmp = a[~np.isnan(a)]
        last_stint = np.max(tmp)
        stints = tmp - 1

        # get rosters for the game for both teams
        starters_home = starting_lineup[starting_lineup['TEAM_LOCATION'] == "HOME"]['PLAYER_ID'].unique()
        starters_away = starting_lineup[starting_lineup['TEAM_LOCATION'] == "AWAY"]['PLAYER_ID'].unique()
        
        # get the substitutions
        players_in, players_out = subs['PLAYER_IN_ID'].values, subs['PLAYER_OUT_ID'].values
        
        # more init
        on_court = np.zeros((stints.shape[0]+1, 10))
        lineup = np.append(starters_home, starters_away)
        
        # store game id, so lengths match
        list_game_id.append(game_id)

        for i, stint in enumerate(stints):
            
            j = i+1
            
            # for first iteration store starting lineup and first sub
            if stint == 0:
                on_court[i, :] = lineup
                lineup[lineup == players_out[i]] = players_in[i] # sub player out
                on_court[j, :] = lineup

            else:
                lineup[lineup == players_out[i]] = players_in[i]
                on_court[j, :] = lineup
            
            # store and keep track
            list_game_id.append(game_id)
            list_stint.append(stint)
        
        # store on court formations
        list_on_court.append(on_court)
        list_stint.append(last_stint)
        
        # numpy format
        a_on_court = np.concatenate(list_on_court)
        a_game_id = np.asarray(list_game_id)
        a_stint = np.asarray(list_stint)
        
        # create colnames
        col_names = ['stint']
        col_names.extend([f'HOME_{i}' for i in np.arange(start=1, stop=6)])
        col_names.extend([f'AWAY_{i}' for i in np.arange(start=1, stop=6)])
        col_names.extend(['GAME_ID'])
        
        # transform to dataframe
        df = pd.DataFrame(data=np.concatenate((a_stint.reshape(-1, 1), a_on_court, 
                                               a_game_id.reshape(-1, 1)), axis=1), columns=col_names)
        
        # adjust datatype
        df['stint'] = df['stint'].astype(float)
        
    return df

In [527]:
def merge_stint_pts(data, court_data, col_score):
    
    # store cols for merging and selecting
    col_merge = ['GAME_ID', 'stint']
    col_select = ['GAME_ID', 'stint', 'game_time_left']
    col_select.extend(col_score)

    # form subset
    data_subset = data[col_select]
    
    # merge
    df = pd.merge(court_data, data_subset, how='left', on=col_merge)
    
    # drop duplicates
    df = df.drop_duplicates()
    
    # impute values for missing scores
    tmp = df.groupby('GAME_ID')[col_score].ffill()
    df[col_score] = tmp
    
    # fill NA with zero - these are mostly before there was a score?
    df[col_score] = df[col_score].fillna(value=0)
    
    return df

In [528]:
def stints_to_dummy(data_stints):
    
    # colnames, create dummy out of player columns
    col_names = [f'HOME_{i}' for i in np.arange(start=1, stop=6)]
    col_names.extend([f'AWAY_{i}' for i in np.arange(start=1, stop=6)])
    data_dummy = pd.get_dummies(data_stints, prefix_sep='-', columns=col_names)
    
    # get series objects to track the players and their "position"
    series_location_player = pd.Series(data_dummy.columns[8:].values)
    series_position_player = series_location_player.apply(lambda x: x.split('-')[0])
    series_id_player = series_location_player.apply(lambda x: x.split('-')[1])
    series_id_player = series_id_player.apply(lambda x: x.replace('.0', ''))
    series_id_player = series_id_player.astype(int)
    
    return data_dummy, series_position_player, series_id_player

In [529]:
def estimate_model(data_dummy, ids_start, col_y, model):
    
    X = data_dummy.iloc[:, ids_start:].values
    y = data_dummy[col_y].values
    
    print("Starting model fitting...")
    
    model.fit(X, y)
    
    #print(f'Model alpha: {model.alpha_}')
    #print(f'Model -MSE {model.best_score_}')
    
    return model, X, y

In [530]:
def show_scores_player(coef, series_id_player, filter_na=False):
    
    # load player data
    player_data = pd.read_csv("players_data.csv")
    player_data['PLAYER_ID'] = player_data['id'].astype(int)
    
    # data array
    a = np.concatenate((series_id_player.values.reshape(-1,1), 
                coef.reshape(-1,1)), axis=1)
    
    # create dataframe and merge
    df_tmp = pd.DataFrame(a, columns=['PLAYER_ID', 'SCORE'])
    df_result = pd.merge(df_tmp, player_data, how='left')
    
    if filter_na:
        df_result = df_result[~df_result['player_names'].isna()]
    
    return df_result.sort_values('SCORE', ascending=False)

In [531]:
data.columns

Index(['GAME_ID', 'EVENTNUM', 'EVENTMSGTYPE', 'EVENTMSGACTIONTYPE', 'PERIOD',
       'WCTIMESTRING', 'PCTIMESTRING', 'HOMEDESCRIPTION', 'NEUTRALDESCRIPTION',
       'VISITORDESCRIPTION', 'SCORE', 'SCOREMARGIN', 'PERSON1TYPE',
       'PLAYER1_ID', 'PLAYER1_NAME', 'PLAYER1_TEAM_ID', 'PLAYER1_TEAM_CITY',
       'PLAYER1_TEAM_NICKNAME', 'PLAYER1_TEAM_ABBREVIATION', 'PERSON2TYPE',
       'PLAYER2_ID', 'PLAYER2_NAME', 'PLAYER2_TEAM_ID', 'PLAYER2_TEAM_CITY',
       'PLAYER2_TEAM_NICKNAME', 'PLAYER2_TEAM_ABBREVIATION', 'PERSON3TYPE',
       'PLAYER3_ID', 'PLAYER3_NAME', 'PLAYER3_TEAM_ID', 'PLAYER3_TEAM_CITY',
       'PLAYER3_TEAM_NICKNAME', 'PLAYER3_TEAM_ABBREVIATION',
       'VIDEO_AVAILABLE_FLAG', 'game_time_s', 'game_time_left',
       'TEAM_LOCATION'],
      dtype='object')

In [532]:
data.to_csv("pbp_safe_data.csv")

In [534]:
data.shape

(1536737, 37)

In [29]:
data = pd.read_csv("pbp_safe_data.csv", dtype={'GAME_ID':str})

In [535]:
bool_sub = (data['EVENTMSGTYPE'] == 8).to_numpy() # substitutions
bool_tech = ((data['EVENTMSGTYPE'] == 6) & (data['EVENTMSGACTIONTYPE'].isin([10, 11, 16, 18, 25]))).to_numpy() # technical fouls
bool_eject_tech = ((data['EVENTMSGTYPE'] == 11) & (data['EVENTMSGACTIONTYPE'] == 1)).to_numpy() # ejct 2nd technical fouls
bool_ingame_plays = ~(bool_sub | bool_tech | bool_eject_tech)

In [536]:
data = data.drop(['WCTIMESTRING', 'PCTIMESTRING', 'NEUTRALDESCRIPTION',
                  'PLAYER3_TEAM_NICKNAME', 'PLAYER3_TEAM_ABBREVIATION', 'PLAYER3_TEAM_CITY',
                 'PLAYER2_TEAM_NICKNAME', 'PLAYER2_TEAM_ABBREVIATION', 'PLAYER2_TEAM_CITY',
                 'PLAYER1_TEAM_NICKNAME', 'PLAYER1_TEAM_ABBREVIATION', 'PLAYER1_TEAM_CITY',
                 'VIDEO_AVAILABLE_FLAG', 'SCOREMARGIN'], axis=1)

In [537]:
data

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,HOMEDESCRIPTION,VISITORDESCRIPTION,SCORE,PERSON1TYPE,PLAYER1_ID,...,PLAYER2_ID,PLAYER2_NAME,PLAYER2_TEAM_ID,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,game_time_s,game_time_left,TEAM_LOCATION
1,0021800002,4.0,10.0,0.0,1.0,Jump Ball Jones vs. Adams: Tip to Green,,,4.0,1627745.0,...,203500.0,Steven Adams,1.610613e+09,4.0,203110.0,Draymond Green,1.610613e+09,720.0,2880.0,HOME
2,0021800002,7.0,5.0,1.0,1.0,Thompson Bad Pass Turnover (P1.T1),Adams STEAL (1 STL),,4.0,202691.0,...,203500.0,Steven Adams,1.610613e+09,0.0,0.0,,,708.0,2832.0,
3,0021800002,9.0,5.0,45.0,1.0,,George Out of Bounds - Bad Pass Turnover Turno...,,5.0,202331.0,...,0.0,,,1.0,0.0,,,703.0,2812.0,AWAY
4,0021800002,10.0,1.0,80.0,1.0,Curry 24' 3PT Step Back Jump Shot (3 PTS) (Dur...,,0 - 3,4.0,201939.0,...,201142.0,Kevin Durant,1.610613e+09,0.0,0.0,,,691.0,2764.0,HOME
5,0021800002,12.0,2.0,78.0,1.0,,MISS Adams 12' Floating Jump Shot,,5.0,203500.0,...,0.0,,,0.0,0.0,,,673.0,2692.0,AWAY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,0022001072,632.0,3.0,12.0,4.0,,Horton-Tucker Free Throw 2 of 2 (6 PTS),110 - 98,5.0,1629659.0,...,0.0,,,0.0,0.0,,,52.0,52.0,AWAY
438,0022001072,633.0,5.0,45.0,4.0,Marshall Out of Bounds - Bad Pass Turnover Tur...,,,4.0,1630230.0,...,0.0,,,1.0,0.0,,,35.0,35.0,HOME
439,0022001072,634.0,2.0,1.0,4.0,,MISS Dudley 26' 3PT Jump Shot,,5.0,201162.0,...,0.0,,,0.0,0.0,,,28.0,28.0,AWAY
440,0022001072,635.0,4.0,0.0,4.0,Alexander-Walker REBOUND (Off:1 Def:3),,,4.0,1629638.0,...,0.0,,,0.0,0.0,,,23.0,23.0,HOME


In [33]:
test = data.groupby(['GAME_ID']).apply(lambda x: create_stint(x))

46it [00:00, 29541.87it/s]
46it [00:00, 33700.96it/s]
51it [00:00, 40352.67it/s]
66it [00:00, 38506.62it/s]
61it [00:00, 40910.22it/s]


45
45
50
65
60


52it [00:00, 36588.46it/s]
48it [00:00, 40001.31it/s]
59it [00:00, 37443.48it/s]
65it [00:00, 40703.16it/s]
52it [00:00, 38466.28it/s]
0it [00:00, ?it/s]

51
47
58
64
51


44it [00:00, 34315.61it/s]
62it [00:00, 37422.20it/s]
52it [00:00, 37819.28it/s]
53it [00:00, 34459.48it/s]
58it [00:00, 35780.21it/s]
55it [00:00, 40764.57it/s]
0it [00:00, ?it/s]

43
61
51
52
57
54


46it [00:00, 33167.95it/s]
45it [00:00, 38401.56it/s]
57it [00:00, 35101.35it/s]
52it [00:00, 37682.07it/s]
52it [00:00, 35372.01it/s]
45it [00:00, 37315.87it/s]


45
44
56
51
51
44


48it [00:00, 36229.37it/s]
56it [00:00, 37835.22it/s]
47it [00:00, 38027.06it/s]
54it [00:00, 37160.36it/s]
68it [00:00, 39601.87it/s]
51it [00:00, 35084.39it/s]


47
55
46
53
67
50


69it [00:00, 37376.60it/s]
58it [00:00, 36249.39it/s]
45it [00:00, 35686.08it/s]
42it [00:00, 38395.98it/s]
49it [00:00, 38828.81it/s]
0it [00:00, ?it/s]

68
57
44
41
48


45it [00:00, 37345.41it/s]
43it [00:00, 39778.36it/s]
56it [00:00, 34999.41it/s]
38it [00:00, 29411.99it/s]
63it [00:00, 37976.60it/s]
42it [00:00, 32234.36it/s]


44
42
55
37
62
41


45it [00:00, 34619.16it/s]
51it [00:00, 33867.88it/s]
51it [00:00, 34018.69it/s]
49it [00:00, 32994.20it/s]
76it [00:00, 28699.66it/s]


44
50
50
48
75


44it [00:00, 35092.10it/s]
51it [00:00, 36115.06it/s]
62it [00:00, 37192.05it/s]
50it [00:00, 39651.20it/s]
45it [00:00, 35160.89it/s]
59it [00:00, 38684.37it/s]
52it [00:00, 32017.59it/s]

43
50
61
49
44
58



54it [00:00, 35645.64it/s]
53it [00:00, 35976.39it/s]
55it [00:00, 36657.67it/s]
47it [00:00, 33912.32it/s]
57it [00:00, 39392.87it/s]


51
53
52
54
46
56


46it [00:00, 34275.71it/s]
56it [00:00, 37235.42it/s]
58it [00:00, 40707.77it/s]
48it [00:00, 38553.54it/s]
39it [00:00, 38263.83it/s]
48it [00:00, 39835.10it/s]
0it [00:00, ?it/s]

45
55
57
47
38
47


46it [00:00, 39593.27it/s]
55it [00:00, 40001.17it/s]
40it [00:00, 38541.73it/s]
44it [00:00, 34156.83it/s]
55it [00:00, 35649.32it/s]
50it [00:00, 33447.40it/s]


45
54
39
43
54
49


56it [00:00, 32017.59it/s]
49it [00:00, 33020.71it/s]
46it [00:00, 32997.77it/s]
51it [00:00, 38843.20it/s]
58it [00:00, 37617.08it/s]
0it [00:00, ?it/s]

55
48
45
50
57


48it [00:00, 34117.37it/s]
44it [00:00, 37162.58it/s]
54it [00:00, 37399.67it/s]
51it [00:00, 38871.43it/s]
46it [00:00, 38750.35it/s]
44it [00:00, 39324.39it/s]
57it [00:00, 40944.57it/s]


47
43
53
50
45
43
56


42it [00:00, 32257.97it/s]
56it [00:00, 35848.75it/s]
56it [00:00, 36455.23it/s]
60it [00:00, 36851.40it/s]
52it [00:00, 35349.08it/s]
55it [00:00, 40649.64it/s]
0it [00:00, ?it/s]

41
55
55
59
51
54


39it [00:00, 30951.34it/s]
41it [00:00, 38488.47it/s]
45it [00:00, 39510.92it/s]
57it [00:00, 39779.59it/s]
48it [00:00, 39607.83it/s]
46it [00:00, 39351.01it/s]
44it [00:00, 39249.12it/s]
0it [00:00, ?it/s]

38
40
44
56
47
45
43


53it [00:00, 36228.51it/s]
60it [00:00, 39838.25it/s]
55it [00:00, 39890.49it/s]
47it [00:00, 38457.33it/s]
41it [00:00, 1357.43it/s]


52
59
54
46
40


45it [00:00, 34511.55it/s]
61it [00:00, 36660.34it/s]
47it [00:00, 35796.67it/s]
48it [00:00, 36981.37it/s]
48it [00:00, 32768.00it/s]
48it [00:00, 35899.89it/s]
0it [00:00, ?it/s]

44
60
46
47
47
47


43it [00:00, 38186.55it/s]
64it [00:00, 40530.80it/s]
41it [00:00, 38180.83it/s]
46it [00:00, 36042.96it/s]
55it [00:00, 38868.87it/s]
51it [00:00, 39968.14it/s]


42
63
40
45
54
50


54it [00:00, 38876.14it/s]
55it [00:00, 40028.93it/s]
53it [00:00, 39254.48it/s]
59it [00:00, 39727.71it/s]
52it [00:00, 40151.66it/s]
49it [00:00, 39645.23it/s]
0it [00:00, ?it/s]

53
54
52
58
51
48


43it [00:00, 37488.06it/s]
55it [00:00, 40168.33it/s]
42it [00:00, 39060.04it/s]
48it [00:00, 40370.28it/s]
57it [00:00, 40391.17it/s]
45it [00:00, 31515.06it/s]
57it [00:00, 40228.05it/s]
0it [00:00, ?it/s]

42
54
41
47
56
44
56


50it [00:00, 36740.57it/s]
64it [00:00, 39452.59it/s]
39it [00:00, 38344.55it/s]
48it [00:00, 39275.57it/s]
62it [00:00, 39046.07it/s]
52it [00:00, 39662.45it/s]


49
63
38
47
61
51


47it [00:00, 38248.41it/s]
44it [00:00, 37670.83it/s]
46it [00:00, 39246.95it/s]
55it [00:00, 40175.33it/s]
43it [00:00, 39267.38it/s]
52it [00:00, 39691.32it/s]
53it [00:00, 39731.57it/s]


46
43
45
54
42
51
52


55it [00:00, 39911.20it/s]
47it [00:00, 38248.41it/s]
48it [00:00, 38465.15it/s]
43it [00:00, 39165.05it/s]
48it [00:00, 38956.38it/s]
52it [00:00, 39604.83it/s]
49it [00:00, 39683.51it/s]


54
46
47
42
47
51
48


48it [00:00, 38739.00it/s]
53it [00:00, 39316.96it/s]
35it [00:00, 37024.12it/s]
50it [00:00, 39718.79it/s]
55it [00:00, 40154.35it/s]
44it [00:00, 38762.73it/s]
51it [00:00, 39752.74it/s]
0it [00:00, ?it/s]

47
52
34
49
54
43
50


57it [00:00, 39282.83it/s]
41it [00:00, 38003.64it/s]
49it [00:00, 39814.20it/s]
44it [00:00, 39082.88it/s]
56it [00:00, 40820.48it/s]
54it [00:00, 39499.90it/s]
47it [00:00, 39696.39it/s]


56
40
48
43
55
53
46


42it [00:00, 38080.58it/s]
55it [00:00, 39973.44it/s]
43it [00:00, 38397.93it/s]
34it [00:00, 37488.52it/s]
34it [00:00, 37696.63it/s]
68it [00:00, 40674.94it/s]
55it [00:00, 38816.54it/s]

41
54
42
33
33
67



46it [00:00, 38657.18it/s]
63it [00:00, 40746.52it/s]
46it [00:00, 38618.49it/s]
44it [00:00, 38256.50it/s]
55it [00:00, 40231.38it/s]
46it [00:00, 38812.71it/s]
0it [00:00, ?it/s]

54
45
62
45
43
54
45


53it [00:00, 39526.69it/s]
48it [00:00, 40168.91it/s]
49it [00:00, 38458.25it/s]
47it [00:00, 39371.34it/s]
40it [00:00, 39098.62it/s]
52it [00:00, 39185.02it/s]


52
47
48
46
39
51


58it [00:00, 40223.15it/s]
56it [00:00, 39911.81it/s]
32it [00:00, 36571.59it/s]
54it [00:00, 40322.67it/s]
35it [00:00, 37962.41it/s]
39it [00:00, 39161.56it/s]
57it [00:00, 40140.25it/s]


57
55
31
53
34
38
56


50it [00:00, 39516.71it/s]
38it [00:00, 37966.54it/s]
43it [00:00, 39335.89it/s]
39it [00:00, 38425.62it/s]
41it [00:00, 37745.05it/s]
48it [00:00, 39024.34it/s]
0it [00:00, ?it/s]

49
37
42
38
40
47


48it [00:00, 39275.57it/s]
51it [00:00, 39568.91it/s]
37it [00:00, 37676.44it/s]
55it [00:00, 34059.75it/s]
47it [00:00, 38615.53it/s]
54it [00:00, 39589.65it/s]
53it [00:00, 39888.41it/s]


47
50
36
54
46
53
52


54it [00:00, 39131.38it/s]
58it [00:00, 39511.07it/s]
59it [00:00, 40801.97it/s]
44it [00:00, 36186.15it/s]
39it [00:00, 35911.71it/s]
50it [00:00, 39309.32it/s]
51it [00:00, 39620.21it/s]


53
57
58
43
38
49
50


40it [00:00, 36671.51it/s]
73it [00:00, 39826.25it/s]
43it [00:00, 38291.95it/s]
44it [00:00, 38844.32it/s]
44it [00:00, 39082.88it/s]
38it [00:00, 37326.36it/s]
51it [00:00, 38899.71it/s]

39
72
42
43
43
37



52it [00:00, 38602.44it/s]
44it [00:00, 40041.09it/s]
61it [00:00, 40585.75it/s]
49it [00:00, 38792.17it/s]
48it [00:00, 38933.78it/s]
51it [00:00, 39539.65it/s]
0it [00:00, ?it/s]

50
51
43
60
48
47
50


47it [00:00, 38367.51it/s]
51it [00:00, 39723.21it/s]
60it [00:00, 39731.33it/s]
49it [00:00, 39364.28it/s]
47it [00:00, 39387.07it/s]
35it [00:00, 37844.97it/s]
44it [00:00, 39679.50it/s]
46it [00:00, 38587.60it/s]


46
50
59
48
46
34
43
45


49it [00:00, 38858.18it/s]
47it [00:00, 38517.45it/s]
44it [00:00, 38455.80it/s]
50it [00:00, 39598.79it/s]
51it [00:00, 38935.11it/s]
51it [00:00, 40057.96it/s]
46it [00:00, 38812.71it/s]

48
46
43
49
50
50



43it [00:00, 38537.41it/s]
48it [00:00, 38836.15it/s]
60it [00:00, 40349.24it/s]
49it [00:00, 39386.91it/s]
54it [00:00, 39362.60it/s]
0it [00:00, ?it/s]

45
42
47
59
48
53


54it [00:00, 38902.85it/s]
38it [00:00, 38267.36it/s]
36it [00:00, 37976.60it/s]
49it [00:00, 38762.90it/s]
43it [00:00, 39165.05it/s]
47it [00:00, 38683.73it/s]
50it [00:00, 39748.90it/s]
58it [00:00, 39568.91it/s]
0it [00:00, ?it/s]

53
37
35
48
42
46
49
57


40it [00:00, 39144.23it/s]
56it [00:00, 39823.84it/s]
41it [00:00, 38635.47it/s]
50it [00:00, 39847.08it/s]
59it [00:00, 39785.20it/s]
50it [00:00, 39272.51it/s]
51it [00:00, 38991.89it/s]

39
55
40
49
58
49



53it [00:00, 39646.53it/s]
32it [00:00, 38646.05it/s]
54it [00:00, 40293.97it/s]
48it [00:00, 39183.84it/s]
42it [00:00, 38930.56it/s]
55it [00:00, 39059.72it/s]
0it [00:00, ?it/s]

50
52
31
53
47
41
54


45it [00:00, 38653.22it/s]
40it [00:00, 38234.31it/s]
40it [00:00, 38069.47it/s]
57it [00:00, 39641.08it/s]
63it [00:00, 39777.38it/s]
51it [00:00, 39084.51it/s]
0it [00:00, ?it/s]

44
39
39
56
62
50


37it [00:00, 37603.40it/s]
43it [00:00, 39708.29it/s]
54it [00:00, 40115.55it/s]
45it [00:00, 39370.81it/s]
52it [00:00, 39326.33it/s]
44it [00:00, 39257.47it/s]
42it [00:00, 37729.87it/s]
49it [00:00, 39744.90it/s]


36
42
53
44
51
43
41
48


59it [00:00, 39594.23it/s]
47it [00:00, 38874.44it/s]
48it [00:00, 39468.06it/s]
44it [00:00, 38335.97it/s]
45it [00:00, 38868.14it/s]
51it [00:00, 39664.29it/s]
44it [00:00, 38232.73it/s]
46it [00:00, 38084.88it/s]


58
46
47
43
44
50
43
45


33it [00:00, 37941.89it/s]
38it [00:00, 37152.34it/s]
45it [00:00, 38425.02it/s]
55it [00:00, 40287.59it/s]
53it [00:00, 39081.95it/s]
47it [00:00, 38813.21it/s]
43it [00:00, 38324.49it/s]
0it [00:00, ?it/s]

32
37
44
54
52
46
42


51it [00:00, 38008.09it/s]
46it [00:00, 39552.68it/s]
43it [00:00, 37707.52it/s]
45it [00:00, 39962.67it/s]
52it [00:00, 39756.44it/s]
43it [00:00, 38422.47it/s]
43it [00:00, 38978.84it/s]


50
45
42
44
51
42
42


47it [00:00, 38434.84it/s]
50it [00:00, 39598.79it/s]
52it [00:00, 39633.62it/s]
55it [00:00, 39399.95it/s]
46it [00:00, 38595.32it/s]
41it [00:00, 38722.46it/s]
37it [00:00, 38186.33it/s]
0it [00:00, ?it/s]

46
49
51
54
45
40
36


42it [00:00, 37465.07it/s]
38it [00:00, 37768.61it/s]
35it [00:00, 36921.69it/s]
47it [00:00, 39363.48it/s]
44it [00:00, 37670.83it/s]
44it [00:00, 38770.88it/s]
55it [00:00, 40001.17it/s]
51it [00:00, 39908.49it/s]


41
37
34
46
43
43
54
50


45it [00:00, 38464.17it/s]
51it [00:00, 38321.30it/s]
39it [00:00, 37578.19it/s]
48it [00:00, 33063.98it/s]
48it [00:00, 36671.51it/s]
57it [00:00, 40282.28it/s]
55it [00:00, 1856.86it/s]


44
50
38
47
47
56
54


36it [00:00, 37374.99it/s]
41it [00:00, 39614.48it/s]
38it [00:00, 37555.03it/s]
38it [00:00, 37555.03it/s]
49it [00:00, 39485.28it/s]
46it [00:00, 38495.21it/s]
53it [00:00, 39881.25it/s]
49it [00:00, 40258.75it/s]

35
40
37
37
48
45
52



50it [00:00, 39250.46it/s]
36it [00:00, 36028.38it/s]
52it [00:00, 39368.92it/s]
66it [00:00, 37647.77it/s]
57it [00:00, 40418.48it/s]
52it [00:00, 39333.42it/s]


48
49
35
51
65
56
51


47it [00:00, 39418.57it/s]
40it [00:00, 38728.57it/s]
39it [00:00, 38006.01it/s]
36it [00:00, 37255.11it/s]
49it [00:00, 40258.75it/s]
54it [00:00, 39063.89it/s]
48it [00:00, 39031.91it/s]
57it [00:00, 39064.60it/s]


46
39
38
35
48
53
47
56


52it [00:00, 39418.73it/s]
52it [00:00, 39691.32it/s]
54it [00:00, 1782.59it/s]
47it [00:00, 39434.34it/s]
65it [00:00, 39437.26it/s]
0it [00:00, ?it/s]

51
51
53
46
64


49it [00:00, 38828.81it/s]
47it [00:00, 38866.78it/s]
44it [00:00, 39041.54it/s]
43it [00:00, 22337.76it/s]
41it [00:00, 36807.89it/s]
57it [00:00, 39859.17it/s]
51it [00:00, 39077.37it/s]
0it [00:00, ?it/s]

48
46
43
42
40
56
50


55it [00:00, 38621.58it/s]
58it [00:00, 39841.08it/s]
53it [00:00, 39731.57it/s]
52it [00:00, 39633.62it/s]
50it [00:00, 39346.19it/s]
59it [00:00, 39676.76it/s]
0it [00:00, ?it/s]

54
57
52
51
49
58


40it [00:00, 37888.93it/s]
40it [00:00, 33527.61it/s]
58it [00:00, 33820.33it/s]
40it [00:00, 36496.01it/s]
56it [00:00, 36896.17it/s]
53it [00:00, 39498.60it/s]


39
39
57
39
55
52


55it [00:00, 1858.56it/s]
50it [00:00, 39206.43it/s]
48it [00:00, 38347.92it/s]
56it [00:00, 39776.63it/s]
49it [00:00, 38523.13it/s]
51it [00:00, 39627.55it/s]
0it [00:00, ?it/s]

54
49
47
55
48
50


56it [00:00, 40143.74it/s]
48it [00:00, 34910.11it/s]
51it [00:00, 35362.79it/s]
46it [00:00, 37427.35it/s]
50it [00:00, 37117.73it/s]
52it [00:00, 37539.38it/s]


55
47
50
45
49
51


53it [00:00, 36252.14it/s]
40it [00:00, 38199.49it/s]
45it [00:00, 39239.85it/s]
62it [00:00, 40524.68it/s]
53it [00:00, 40032.07it/s]
58it [00:00, 40216.50it/s]


52
39
44
61
52
57


56it [00:00, 39140.31it/s]
56it [00:00, 39803.60it/s]
50it [00:00, 38550.59it/s]
42it [00:00, 38245.93it/s]
52it [00:00, 39938.44it/s]
0it [00:00, ?it/s]

55
55
49
41
51


45it [00:00, 38860.14it/s]
48it [00:00, 35791.39it/s]
45it [00:00, 39036.96it/s]
56it [00:00, 41027.25it/s]
41it [00:00, 38028.85it/s]
46it [00:00, 34178.56it/s]
57it [00:00, 33493.32it/s]
0it [00:00, ?it/s]

44
47
44
55
40
45
56


52it [00:00, 35521.79it/s]
41it [00:00, 36449.02it/s]
47it [00:00, 37837.29it/s]
47it [00:00, 33767.09it/s]
38it [00:00, 36092.29it/s]
41it [00:00, 34573.07it/s]
43it [00:00, 39310.17it/s]
0it [00:00, ?it/s]

51
40
46
46
37
40
42


72it [00:00, 37076.72it/s]
49it [00:00, 38165.44it/s]
42it [00:00, 32181.36it/s]
44it [00:00, 34728.90it/s]
53it [00:00, 34042.59it/s]
45it [00:00, 34907.28it/s]
42it [00:00, 30921.67it/s]
0it [00:00, ?it/s]

71
48
41
43
52
44
41


47it [00:00, 37096.78it/s]
44it [00:00, 34430.85it/s]
55it [00:00, 35737.68it/s]
48it [00:00, 35715.20it/s]
55it [00:00, 39535.00it/s]
37it [00:00, 36739.88it/s]


46
43
54
47
54
36


44it [00:00, 31862.81it/s]
46it [00:00, 35381.99it/s]
66it [00:00, 38841.60it/s]
43it [00:00, 34366.44it/s]
48it [00:00, 39153.36it/s]
61it [00:00, 37747.50it/s]


43
45
65
42
47
60


59it [00:00, 35979.05it/s]
59it [00:00, 34384.32it/s]
44it [00:00, 38592.51it/s]
47it [00:00, 38307.87it/s]
47it [00:00, 37649.41it/s]
39it [00:00, 37543.69it/s]
40it [00:00, 37837.65it/s]
0it [00:00, ?it/s]

58
58
43
46
46
38
39


35it [00:00, 30602.59it/s]
57it [00:00, 34443.93it/s]
59it [00:00, 35740.03it/s]
53it [00:00, 35291.02it/s]
49it [00:00, 39131.93it/s]
49it [00:00, 39079.84it/s]


34
56
58
52
48
48


52it [00:00, 39390.25it/s]
54it [00:00, 39239.85it/s]
36it [00:00, 37117.73it/s]
47it [00:00, 39230.31it/s]
54it [00:00, 39791.36it/s]
44it [00:00, 38106.42it/s]
45it [00:00, 38331.37it/s]
49it [00:00, 39005.67it/s]


51
53
35
46
53
43
44
48


43it [00:00, 37739.08it/s]
40it [00:00, 34127.78it/s]
54it [00:00, 39679.82it/s]
50it [00:00, 39801.71it/s]
50it [00:00, 39125.97it/s]
49it [00:00, 39326.62it/s]
56it [00:00, 40027.44it/s]


42
39
53
49
49
48
55


55it [00:00, 39366.33it/s]
38it [00:00, 38341.00it/s]
41it [00:00, 37928.20it/s]
40it [00:00, 37625.51it/s]
42it [00:00, 38530.35it/s]
49it [00:00, 39013.08it/s]
55it [00:00, 39911.20it/s]
42it [00:00, 38742.20it/s]


54
37
40
39
41
48
54
41


34it [00:00, 36556.35it/s]
48it [00:00, 39506.79it/s]
51it [00:00, 38786.85it/s]
46it [00:00, 38137.57it/s]
47it [00:00, 39592.75it/s]
51it [00:00, 39627.55it/s]
49it [00:00, 38458.25it/s]
0it [00:00, ?it/s]

33
47
50
45
46
50
48


37it [00:00, 37082.26it/s]
44it [00:00, 39679.50it/s]
58it [00:00, 39485.41it/s]
49it [00:00, 39584.15it/s]
47it [00:00, 38056.43it/s]
41it [00:00, 37214.12it/s]
52it [00:00, 31536.12it/s]
0it [00:00, ?it/s]

36
43
57
48
46
40
51


59it [00:00, 31649.05it/s]
49it [00:00, 35686.91it/s]
60it [00:00, 36012.91it/s]
40it [00:00, 35268.48it/s]
54it [00:00, 32925.19it/s]
52it [00:00, 36187.79it/s]


58
48
59
39
53
51


52it [00:00, 38371.54it/s]
40it [00:00, 30744.39it/s]
47it [00:00, 35770.69it/s]
51it [00:00, 34624.39it/s]
59it [00:00, 40521.36it/s]
42it [00:00, 35739.66it/s]


51
39
46
50
58
41


62it [00:00, 39365.25it/s]
60it [00:00, 36851.40it/s]
42it [00:00, 35147.80it/s]
44it [00:00, 33664.61it/s]
52it [00:00, 35713.74it/s]
45it [00:00, 36026.66it/s]
50it [00:00, 38843.34it/s]
0it [00:00, ?it/s]

61
59
41
43
51
44
49


49it [00:00, 34605.30it/s]
61it [00:00, 35426.83it/s]
40it [00:00, 34814.73it/s]
55it [00:00, 38731.82it/s]
48it [00:00, 35345.26it/s]
0it [00:00, ?it/s]

48
60
39
54
47


60it [00:00, 34663.67it/s]
45it [00:00, 35740.14it/s]
62it [00:00, 33697.92it/s]
50it [00:00, 30881.34it/s]
51it [00:00, 31498.97it/s]
0it [00:00, ?it/s]

59
44
61
49
50


52it [00:00, 29922.32it/s]
40it [00:00, 31823.25it/s]
48it [00:00, 34094.26it/s]
49it [00:00, 36459.27it/s]
54it [00:00, 32240.91it/s]


51
39
47
48
53


In [34]:
test = get_score(test)
test = estimate_possessions(data=test, bool_ingame_plays=bool_ingame_plays)
test = estimate_pm_100(data=test)

In [37]:
starters, roster = get_roster_and_starters(test)

100%|██████████| 500/500 [20:10<00:00,  2.42s/it]


In [39]:
starters.to_csv("starters_safe.csv")

In [40]:
starting_lineups = get_starting_lineup(starters)

In [41]:
subs = get_all_subs(data=test)

In [42]:
court = get_on_court(data=test, starters=starters, sub=subs)

100%|██████████| 500/500 [00:49<00:00, 10.14it/s]


In [79]:
col_score = ['HOME_PTS', 'AWAY_PTS', 'HOME_PM', 'HOME_PM_100', 'EST_POSSESSIONS']
data_stints = merge_stint_pts(data=test, court_data=court, col_score=col_score)

In [65]:
data_stints

Unnamed: 0,stint,HOME_1,HOME_2,HOME_3,HOME_4,HOME_5,AWAY_1,AWAY_2,AWAY_3,AWAY_4,AWAY_5,GAME_ID,game_time_left,HOME_PTS,AWAY_PTS,HOME_PM,HOME_PM_100,EST_POSSESSIONS
0,0.0,202695.0,202335.0,1627826.0,1629013.0,201976.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
1,1.0,202695.0,202335.0,1626149.0,1629013.0,201976.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
2,2.0,202695.0,202335.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
3,3.0,202695.0,203210.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
4,4.0,203090.0,203210.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31900,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,35.0,104.0,87.0,17.0,850.0,2.0
31901,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,35.0,105.0,87.0,18.0,900.0,2.0
31902,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,25.0,105.0,87.0,18.0,900.0,3.0
31903,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,1.0,105.0,87.0,18.0,900.0,2.0


In [156]:
def stints_to_dummy_unique(data_stints, col_scores):
    col_base = ['stint', 'GAME_ID', 'game_time_left']
    col_names = [f'HOME_{i}' for i in np.arange(start=1, stop=6)]
    col_names.extend([f'AWAY_{i}' for i in np.arange(start=1, stop=6)])
    
    data_slice = data_stints.drop(col_names, axis=1)
    
    # colnames, create dummy out of player columns
    col_names = [f'HOME_{i}' for i in np.arange(start=1, stop=6)]
    col_names.extend([f'AWAY_{i}' for i in np.arange(start=1, stop=6)])
    tmp = pd.melt(data_stints, id_vars=['stint', 'GAME_ID'], value_vars=col_names)
    tmp_dummy = pd.get_dummies(tmp, columns=['value'], prefix="", prefix_sep="")
    
    data_dummy = pd.merge(data_slice, tmp_dummy, on=['GAME_ID', 'stint'])
    
    data_dummy = data_dummy.drop('variable', axis=1)
    
    # get series objects to track the players and their "position"
    series_location_player = pd.Series(data_dummy.columns[8:].values)
    series_id_player = series_location_player.apply(lambda x: x.replace('.0', ''))
    series_id_player = series_id_player.astype(int)
    
    return data_dummy, series_id_player

In [157]:
data_stints['GAME_ID'].unique().shape

(500,)

In [363]:
data_stints

Unnamed: 0,stint,HOME_1,HOME_2,HOME_3,HOME_4,HOME_5,AWAY_1,AWAY_2,AWAY_3,AWAY_4,AWAY_5,GAME_ID,game_time_left,HOME_PTS,AWAY_PTS,HOME_PM,HOME_PM_100,EST_POSSESSIONS
0,0.0,202695.0,202335.0,1627826.0,1629013.0,201976.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
1,1.0,202695.0,202335.0,1626149.0,1629013.0,201976.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
2,2.0,202695.0,202335.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
3,3.0,202695.0,203210.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
4,4.0,203090.0,203210.0,1626149.0,1629013.0,101150.0,203076.0,2544.0,201580.0,201980.0,202340.0,0021900002,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31900,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,35.0,104.0,87.0,17.0,850.0,2.0
31901,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,35.0,105.0,87.0,18.0,900.0,2.0
31902,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,25.0,105.0,87.0,18.0,900.0,3.0
31903,47.0,1627812.0,1627812.0,202357.0,1628385.0,1629713.0,1629599.0,203585.0,1628405.0,1627826.0,1629010.0,0021900499,1.0,105.0,87.0,18.0,900.0,2.0


In [451]:
def stints_to_dummy_unique_pm(data_stints):
    
    home_names = ['HOME_1', 'HOME_2', 'HOME_3', 'HOME_4', 'HOME_5']
    away_names = ['AWAY_1', 'AWAY_2', 'AWAY_3', 'AWAY_4', 'AWAY_5']

    data_home = data_stints.drop(away_names, axis=1)
    data_away = data_stints.drop(home_names, axis=1)

    home_long = pd.melt(data_home, id_vars=['stint', 'GAME_ID', 'HOME_PM_100', 'HOME_PM', 'HOME_PTS', 'AWAY_PTS'], 
                        value_vars=home_names)
    dummy_home = pd.get_dummies(home_long, columns=['value'], prefix="", prefix_sep="")

    away_long = pd.melt(data_away, id_vars=['stint', 'GAME_ID', 'HOME_PM_100', 'HOME_PM', 'HOME_PTS', 'AWAY_PTS'],
                        value_vars=away_names)
    dummy_away = pd.get_dummies(away_long, columns=['value'], prefix="", prefix_sep="")
    dummy_away.iloc[:, 3:] =  dummy_away.iloc[:, 3:] * (-1)

    dummy_pm = pd.concat([dummy_home, dummy_away], join='inner')
    dummy_pm = dummy_pm.drop('variable', axis=1)
    
    # get series objects to track the players and their "position"
    series_location_player = pd.Series(dummy_pm.columns[6:].values)
    series_id_player = series_location_player.apply(lambda x: x.replace('.0', ''))
    series_id_player = series_id_player.astype(int)

    return dummy_pm, series_id_player

In [461]:
dummy_pm, player_ids = stints_to_dummy_unique_pm(data_stints=data_stints)

In [455]:
player_ids.unique().shape

(448,)

In [456]:
dummy_pm.shape

(312660, 454)

In [159]:
data_dummy.shape[1] - 8

473

**How to deal with empty stints? Leave them in or out? Do an ffill? How to deal with NA Home PTS?**

In [462]:
dummy_pm.to_csv("dummy_safe_long.csv") 
player_ids.to_csv("player_ids_model.csv")

In [463]:
data_dummy = pd.read_csv("dummy_safe_long.csv", dtype={'GAME_ID':str})

In [457]:
data_dummy.shape

(312660, 455)

In [238]:
from sklearn.linear_model import RidgeCV
model = RidgeCV(alphas=[1e-3, 1e-1, 1e1, 1e3, 1e4], normalize=True, cv=3)

In [498]:
from sklearn.linear_model import Ridge
model = Ridge(alpha=1.5e3) #1e-3

In [499]:
data_dummy

Unnamed: 0.1,Unnamed: 0,stint,GAME_ID,HOME_PM_100,HOME_PM,HOME_PTS,AWAY_PTS,101107.0,101108.0,101133.0,...,204025.0,204038.0,204060.0,204456.0,2199.0,2544.0,2546.0,2594.0,2730.0,2772.0
0,0,0.0,0021900002,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,0021900002,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2.0,0021900002,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3.0,0021900002,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4.0,0021900002,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312655,156325,47.0,0021900499,850.0,-17.0,-104.0,-87.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312656,156326,47.0,0021900499,900.0,-18.0,-105.0,-87.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312657,156327,47.0,0021900499,900.0,-18.0,-105.0,-87.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312658,156328,47.0,0021900499,900.0,-18.0,-105.0,-87.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [500]:
model, X, y = estimate_model(data_dummy=data_dummy, ids_start=7, col_y="HOME_PM", model=model)

Starting model fitting...


In [501]:
# model.best_score_, model.alpha_

In [502]:
model.coef_.shape

(448,)

In [503]:
#coef = model.coef_[:-1]
#coef = coef[:-1]

In [504]:
coef = model.coef_

In [505]:
player_ids.shape

(448,)

In [429]:
import pickle
pickle.dump(model, open("model_ridge_no_cv", 'wb'))

Merge daten müssen zur Saison passen

In [506]:
scores = show_scores_player(coef=coef, series_id_player=player_ids, filter_na=False)

In [507]:
scores['PLAYER_ID'] = scores['PLAYER_ID'].astype(int).astype(str)

In [291]:
full_player_ratingdata import get_clean_player_data
import time
from tqdm import tqdm
list_player = list()

for player in tqdm(scores['PLAYER_ID']):
    list_player.append(get_clean_player_data(player_id=player))
    time.sleep(0.75)
    
data_player = pd.concat(list_player)

100%|██████████| 473/473 [17:58<00:00,  2.28s/it]


In [508]:
last_season = data_player.groupby('PLAYER_ID').nth(-2).reset_index()
last_season['PLAYER_ID'] = last_season['PLAYER_ID'].astype(str)

In [509]:
full_player_rating = pd.merge(last_season[last_season['MIN'] > 12], scores, on='PLAYER_ID')
full_player_rating.sort_values('SCORE', ascending=False).head(30)
#full_player_rating[~full_player_rating['id'].isna()].sort_values('SCORE', ascending=False).head(30)

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FG_PCT,FGA,FG3_PCT,...,STL,BLK,TOV,PF,PTS,SCORE,id,player_names,position,team
305,1629002,2020-21,1610613000.0,24.0,36,6,13.6,0.508,5.0,0.351,...,0.4,0.5,0.8,1.6,6.3,0.552511,1629002.0,Chimezie Metu,F,SAC
34,201577,2020-21,1610613000.0,33.0,71,9,19.1,0.633,6.0,0.278,...,0.2,0.6,1.1,1.5,9.0,0.469818,201577.0,Robin Lopez,C,WAS
74,202689,2020-21,1610613000.0,31.0,43,43,31.8,0.42,15.7,0.36,...,1.1,0.3,2.0,1.4,19.3,0.406425,202689.0,Kemba Walker,G,BOS
202,1626224,2020-21,1610613000.0,26.0,59,26,25.6,0.374,9.9,0.306,...,0.9,0.2,1.4,2.0,10.4,0.370301,1626224.0,Cedi Osman,F,CLE
230,1627814,2020-21,1610613000.0,28.0,57,1,18.9,0.467,4.8,0.397,...,0.7,0.1,0.5,1.6,6.5,0.350379,1627814.0,Damion Lee,G,GSW
49,201952,2020-21,1610613000.0,33.0,34,5,18.1,0.415,5.5,0.464,...,0.8,0.2,1.1,1.3,6.9,0.306606,,,,
210,1627745,2020-21,1610613000.0,26.0,17,4,20.0,0.657,4.1,0.25,...,0.5,1.0,0.9,2.1,6.9,0.286884,1627745.0,Damian Jones,C,SAC
43,201935,2020-21,1610613000.0,31.0,36,35,36.6,0.471,16.6,0.366,...,1.3,0.8,4.0,2.4,24.6,0.279097,201935.0,James Harden,G,BKN
222,1627774,2020-21,1610613000.0,27.0,45,11,13.9,0.495,4.1,0.295,...,0.6,0.4,0.6,1.0,5.1,0.277446,1627774.0,Jake Layman,F,MIN
131,203503,2020-21,1610613000.0,29.0,47,23,21.1,0.515,3.6,0.569,...,0.3,0.2,0.4,1.6,5.3,0.274455,203503.0,Tony Snell,G,ATL


In [None]:
201142, 201935

In [444]:
full_player_rating[full_player_rating['PLAYER_ID'] == "201566"]

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FG_PCT,FGA,FG3_PCT,...,STL,BLK,TOV,PF,PTS,SCORE,id,player_names,position,team
28,201566,2020-21,1610613000.0,32.0,65,65,36.4,0.439,19.0,0.315,...,1.4,0.4,4.8,2.9,22.2,-1.732368,201566.0,Russell Westbrook,G,WAS


In [294]:
full_player_rating.to_csv("full_player_rating.csv")