In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [546]:
import pandas as pd
import numpy as np
import re
import time
from nba_api.stats.endpoints import playbyplayv2

def stint_lineup(starter_array, subs):
    """
    Build the on-court lineup for every stint of a game.

    The first row of the result is the starting lineup. Every following row is
    the previous row with one substitution applied: the slot holding the
    outgoing player is overwritten with the incoming player.

    :param starter_array: 1-d array-like with the ids of the ten starters (not modified)
    :param subs: DataFrame of substitutions in game order with the columns
        'PLAYER1_ID' (player leaving the court) and 'PLAYER2_ID' (player entering)
    :return: array of shape (number of substitutions + 1, 10), one lineup per row
    :raises AssertionError: if any stint lists the same player twice or the
        lineups do not have exactly ten slots
    """
    lineups = [np.asarray(starter_array)]

    # apply the substitutions one after another, always on the latest lineup
    for out_id, in_id in zip(subs['PLAYER1_ID'].tolist(), subs['PLAYER2_ID'].tolist()):
        previous = lineups[-1]
        lineups.append(np.where(previous == out_id, in_id, previous))

    mat = np.stack(lineups, axis=0)

    # A player may hold only one slot per stint. The check has to be done row by
    # row: comparing whole columns only detects two slots that carry the same
    # player during the entire game. After sorting a row, equal neighbours mean
    # a duplicate.
    ordered = np.sort(mat, axis=1)
    assert np.all(ordered[:, 1:] != ordered[:, :-1]), "Same player multiple times on the court"
    assert mat.shape[1] == 10, "There are not 10 players on the court"

    return mat

def get_stint_pts_data(game_id: str):
    """
    Build one row per stint of a game: who was on the court and how the score moved.

    A stint is the span of play that ends with a substitution event
    (EVENTMSGTYPE == 8); the rows after the last substitution form the final
    stint. The function reads the schedule and boxscore CSV files, downloads
    the play-by-play of periods 1-4 from the NBA stats API and then waits half
    a second before continuing.

    :param game_id: game id as a string, e.g. "0021900002"
    :return: DataFrame with one column per player used in the game
        (1 = on court for the home team, -1 = on court for the away team,
        0 = not on court), followed by 'stint', 'GAME_ID', 'HOME_PTS',
        'AWAY_PTS', 'HOME_PLUS_MINUS', 'AWAY_PLUS_MINUS', 'HOME_PM_DIFF' and
        'AWAY_PM_DIFF'. The *_PTS columns hold the running score at the end of
        the stint, the *_PM_DIFF columns the change of the plus-minus during
        the stint.
    :raises AssertionError: if the lineups do not have ten slots or two slots
        carry the same player for the whole game
    :raises IndexError: if the schedule has no home-team row for the game
    """
    # load the data
    df_schedule = pd.read_csv("./data/season_prediction/schedule.csv", dtype={'GAME_ID': str})
    df_boxscores = pd.read_csv("./data/season_prediction/boxscores.csv", dtype={'GAME_ID': str})

    # pbp call
    call = playbyplayv2.PlayByPlayV2(game_id=game_id, start_period=1, end_period=4)
    data = pd.concat(call.get_data_frames())
    # NOTE(review): concat keeps every frame's own index labels, so dropping by the
    # label of the last row also removes any other row carrying that label - confirm
    # this is intended.
    data = data.drop(data.tail(1).index)

    # sleep
    time.sleep(0.5)

    # Running score at every scoring event. The text is converted to numbers
    # immediately: a maximum over strings is lexicographic ('9' > '13').
    # NOTE(review): confirm which side of 'SCORE' belongs to the home team; if the
    # API reports "visitor - home", HOME_PTS and AWAY_PTS are swapped here.
    # NOTE(review): only EVENTMSGTYPE == 1 rows count as scoring events; points logged
    # under other event types are only picked up at the next such row - confirm.
    is_score = data['EVENTMSGTYPE'] == 1
    score_text = data.loc[is_score, 'SCORE']
    data['HOME_PTS'] = pd.to_numeric(score_text.apply(lambda s: s.split(' - ')[0]))
    data['AWAY_PTS'] = pd.to_numeric(score_text.apply(lambda s: s.split(' - ')[1]))

    # Number the stints. A substitution row closes the stint it belongs to, so the
    # stint of a row equals the number of substitutions strictly before it.
    bool_subs = data['EVENTMSGTYPE'] == 8
    data['stint'] = bool_subs.cumsum().shift(1, fill_value=0).to_numpy()
    game_stints = data['stint'].unique()

    # score at the end of every stint; stints without a scoring event stay NaN for now
    data_pts = (
        data[is_score]
        .groupby('stint')[['HOME_PTS', 'AWAY_PTS']]
        .max()
        .reindex(game_stints)
        .rename_axis('stint')
        .reset_index()
    )

    # Impute missing scores: carry the last known score forward, 0 before the first
    # scoring event. Only the score columns are touched so the 'stint' key survives.
    score_cols = ['HOME_PTS', 'AWAY_PTS']
    data_pts[score_cols] = data_pts[score_cols].ffill().fillna(0).astype(float)

    # plus minus
    data_pts['HOME_PLUS_MINUS'] = data_pts['HOME_PTS'] - data_pts['AWAY_PTS']
    data_pts['AWAY_PLUS_MINUS'] = data_pts['AWAY_PTS'] - data_pts['HOME_PTS']

    # Delta of plus-minus per stint. The game starts at 0 - 0, so the delta of the
    # first stint is its plus-minus itself (diff() leaves NaN there).
    data_pts['HOME_PM_DIFF'] = data_pts['HOME_PLUS_MINUS'].diff().fillna(data_pts['HOME_PLUS_MINUS'])
    data_pts['AWAY_PM_DIFF'] = data_pts['AWAY_PLUS_MINUS'].diff().fillna(data_pts['AWAY_PLUS_MINUS'])

    subs = data.loc[bool_subs, ['PLAYER1_ID', 'PLAYER2_ID']]  # all substitutions!
    game = df_boxscores[df_boxscores['GAME_ID'] == game_id]  # get the games boxscore
    starters = game[~game['START_POSITION'].isna()]['PLAYER_ID'].values  # starters of the game

    # id of the games home team: its MATCHUP text carries no '@'
    game_schedule = df_schedule[df_schedule['GAME_ID'] == game_id]
    is_home_game = game_schedule['MATCHUP'].apply(lambda matchup: '@' not in matchup).astype(bool)
    home_team_id = game_schedule[is_home_game]['TEAM_ID'].values[0]

    # get the ids of the home teams players
    home_team_players = game[game['TEAM_ID'] == home_team_id]['PLAYER_ID'].values

    # lineup per stint: start with the starters, then apply one substitution per row
    lineups = [starters]
    for out_id, in_id in zip(subs['PLAYER1_ID'].tolist(), subs['PLAYER2_ID'].tolist()):
        previous = lineups[-1]
        lineups.append(np.where(previous == out_id, in_id, previous))

    mat = np.stack(lineups, axis=0)

    # NOTE: this only detects two slots that carry the same player for the whole
    # game. A player listed twice in a single stint passes and is clipped below.
    _, counts = np.unique(mat, axis=1, return_counts=True)

    assert np.all(counts == 1), "Same player multiple times on the court"
    assert mat.shape[1] == 10, "There are not 10 players on the court"

    # unique players used in the game; the string form of the id is the column name
    player_used = np.unique(pd.Series(mat.ravel()).astype(str))
    player_values = player_used.astype(float)

    # vector indicating home team players
    cond_ht_players = np.isin(player_values, home_team_players)

    # count per stint (rows) and player (columns) how often the player is listed,
    # then sign it: home player gets a 1, away player gets a -1.
    # Integer signs keep the zeros positive (0 * -1.0 would give -0.0).
    on_court = (mat[:, :, np.newaxis] == player_values).sum(axis=1)
    ohe = (on_court * np.where(cond_ht_players, 1, -1)).astype(float)

    assert np.all(np.abs(ohe).sum(axis=1) == 10), "In some stint, there are not 10 players on the court"

    # dirty hotfix - delete later: players counted more than once are capped at one
    ohe = np.clip(ohe, -1, 1)

    # transform to dataframe
    data_player_stint = pd.DataFrame(ohe, columns=player_used)

    # add additional data
    data_player_stint['stint'] = data_player_stint.index
    data_player_stint['GAME_ID'] = game_id

    # 'stint' is the only column both frames share
    data_stint = pd.merge(data_player_stint, data_pts, on='stint')

    return data_stint


In [547]:
df_schedule = pd.read_csv("./data/season_prediction/schedule.csv", dtype={'GAME_ID': str})
df_boxscore = pd.read_csv("./data/season_prediction/boxscores.csv", dtype={'GAME_ID': str})

In [548]:
all_player_ids = df_boxscore['PLAYER_ID'].unique()

In [549]:
all_player_ids.shape

(1128,)

In [550]:
season_game_ids = df_schedule[df_schedule['SEASON_ID'] == 22019]['GAME_ID'].unique()
season_game_ids

array(['0021900002', '0021900001', '0021900008', ..., '0021901315',
       '0021901316', '0021901318'], dtype=object)

In [551]:
games = df_boxscore[df_boxscore['GAME_ID'].isin(season_game_ids)]

In [552]:
games

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE,TEAM_NAME,E_TM_TOV_PCT,Unnamed: 0.1.1,player_game_index,len_game_id
168487,0,0,0021900002,1610612747,LAL,Los Angeles,203076.0,Anthony Davis,Anthony,F,,37:21,103.3,106.4,105.7,102.6,-2.4,3.8,0.227,1.67,14.3,0.081,0.154,0.118,8.6,0.381,0.460,0.345,0.347,100.26,100.23,83.53,78,0.126,,,,,21900002
168488,1,1,0021900002,1610612747,LAL,Los Angeles,2544.0,LeBron James,LeBron,F,,36:00,95.5,94.7,110.6,105.3,-15.0,-10.5,0.444,1.60,23.5,0.026,0.273,0.139,14.7,0.395,0.434,0.302,0.316,98.48,101.33,84.44,76,0.135,,,,,21900002
168489,2,2,0021900002,1610612747,LAL,Los Angeles,201580.0,JaVale McGee,JaVale,C,,17:21,93.1,97.1,96.6,91.9,-3.5,5.3,0.000,0.00,0.0,0.059,0.067,0.063,25.0,0.667,0.667,0.103,0.104,99.23,99.62,83.01,35,0.079,,,,,21900002
168490,3,3,0021900002,1610612747,LAL,Los Angeles,201980.0,Danny Green,Danny,G,,32:19,107.8,110.3,104.7,101.5,3.0,8.8,0.000,0.00,0.0,0.031,0.194,0.111,0.0,0.964,0.970,0.182,0.188,99.89,100.24,83.54,68,0.267,,,,,21900002
168491,4,4,0021900002,1610612747,LAL,Los Angeles,202340.0,Avery Bradley,Avery,G,,24:02,91.1,93.9,99.4,94.0,-8.4,-0.1,0.000,0.00,0.0,0.000,0.136,0.067,22.2,0.571,0.571,0.170,0.171,97.66,98.86,82.39,49,0.028,,,,,21900002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197138,28651,21,0021901318,1610612761,TOR,Toronto,1626259.0,Malcolm Miller,Malcolm,,,25:37,116.8,120.4,110.9,112.7,5.9,7.6,0.143,0.00,37.5,0.037,0.000,0.019,0.0,1.000,1.000,0.079,0.081,104.52,102.12,85.10,54,0.085,,,,,21901318
197139,28652,22,0021901318,1610612761,TOR,Toronto,1629608.0,Dewan Hernandez,Dewan,,,12:42,113.5,110.7,104.3,110.7,9.2,0.0,0.100,1.00,12.5,0.188,0.286,0.233,12.5,0.333,0.333,0.212,0.223,107.79,105.83,88.19,28,0.100,,,,,21901318
197140,28653,23,0021901318,1610612761,TOR,Toronto,1629052.0,Oshae Brissett,Oshae,,DND - Injury/Illness,,,0.0,,0.0,,0.0,,,,0.000,0.000,0.000,,,,0.000,0.000,,,,0,,,,,,21901318
197141,28654,0,0021901318,1610612761,TOR,Toronto,,,,,,240:00,114.5,117.0,104.7,109.0,9.8,8.0,0.600,1.42,19.0,0.340,0.755,0.553,19.0,0.600,0.608,1.000,0.198,103.14,100.00,83.33,100,0.507,Raptors,18.598,,,21901318


In [553]:
games.groupby(['GAME_ID'])['POSS'].sum()

GAME_ID
0021900001    1374
0021900002    1181
0021900003    1268
0021900004    1256
0021900005    1211
              ... 
0021901314    1219
0021901315    1248
0021901316    1232
0021901317    1378
0021901318    1207
Name: POSS, Length: 1059, dtype: int64

In [57]:
get_stint_pts_data(game_id="0021400002") # 0022001074

Unnamed: 0,101145.0,1495.0,1717.0,1938.0,201148.0,201158.0,201980.0,202329.0,202709.0,202718.0,202962.0,203109.0,203382.0,2199.0,2210.0,2225.0,2564.0,2588.0,2734.0,2749.0,stint,GAME_ID,HOME_PTS,AWAY_PTS,HOME_PLUS_MINUS,AWAY_PLUS_MINUS,HOME_PM_DIFF,AWAY_PM_DIFF
0,-1.0,1.0,-1.0,0.0,0.0,1.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,-1.0,0,21400002,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.0,1.0,-1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,-1.0,1,21400002,6.0,7.0,-1.0,1.0,0.0,0.0
2,-1.0,1.0,-1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.0,2,21400002,6.0,7.0,-1.0,1.0,0.0,0.0
3,-1.0,1.0,-1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,-1.0,3,21400002,9.0,7.0,2.0,-2.0,3.0,-3.0
4,-1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,-1.0,4,21400002,9.0,7.0,2.0,-2.0,0.0,0.0
5,-1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,1.0,1.0,0.0,-1.0,-1.0,5,21400002,13.0,9.0,4.0,-4.0,2.0,-2.0
6,-1.0,1.0,0.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,-1.0,6,21400002,13.0,9.0,4.0,-4.0,0.0,0.0
7,-1.0,0.0,0.0,1.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.0,-1.0,7,21400002,18.0,19.0,-1.0,1.0,-5.0,5.0
8,-1.0,0.0,-1.0,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.0,-1.0,8,21400002,18.0,19.0,-1.0,1.0,0.0,0.0
9,0.0,0.0,-1.0,1.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.0,-1.0,9,21400002,18.0,19.0,-1.0,1.0,0.0,0.0


In [554]:
season_game_ids = df_schedule[df_schedule['SEASON_ID'] == 22019]['GAME_ID'].unique()

In [557]:
import warnings
from tqdm import tqdm
# NOTE: this silences every warning for the rest of the kernel session
warnings.filterwarnings("ignore")

list_stints = list()
failed_games = dict()  # game id -> repr of the exception that stopped it

for game_id in tqdm(season_game_ids[0:30]):

    try:
        list_stints.append(get_stint_pts_data(game_id=game_id))

    # skip games that cannot be processed, but remember them; `Exception` lets
    # KeyboardInterrupt through so the loop can still be stopped by hand
    except Exception as err:
        failed_games[game_id] = repr(err)

if failed_games:
    print(f"{len(failed_games)} game(s) skipped: {failed_games}")

if not list_stints:
    raise RuntimeError("No game could be processed, nothing to save")

# split every game into its score columns and its player columns
# (both slices keep 'stint' and 'GAME_ID' as merge keys)
list_score = [stint.loc[:, 'stint':'AWAY_PM_DIFF'] for stint in list_stints]
list_player_stint = [stint.loc[:, :'GAME_ID'] for stint in list_stints]

df_tmp_score = pd.concat(list_score)
df_tmp_player = pd.concat(list_player_stint)

# save dataframes
df_tmp_score.to_csv('./data/season_prediction/tmp_score.csv')
df_tmp_player.to_csv('./data/season_prediction/tmp_player.csv')

# merge together in two steps
data_stint_season = pd.DataFrame(data={'GAME_ID':season_game_ids})
tmp = pd.merge(data_stint_season, df_tmp_score, how='left')
df = pd.merge(tmp, df_tmp_player)

df.to_csv('./data/season_prediction/stints_player_season.csv')

100%|██████████| 30/30 [01:01<00:00,  2.04s/it]


In [236]:
df

Unnamed: 0,GAME_ID,stint,HOME_PTS,AWAY_PTS,HOME_PLUS_MINUS,AWAY_PLUS_MINUS,HOME_PM_DIFF,AWAY_PM_DIFF,101150.0,1626149.0,1626188.0,1627826.0,1629013.0,201162.0,201580.0,201976.0,201980.0,202335.0,202340.0,202695.0,203076.0,203090.0,203210.0,203484.0,203584.0,2544.0,2730.0,1626143.0,1626181.0,1627742.0,1627783.0,1627832.0,1628366.0,1628384.0,1628402.0,1628404.0,1629026.0,1629056.0,1629638.0,1629740.0,...,1627854.0,1628372.0,1628401.0,1628995.0,1629019.0,1629628.0,200746.0,200752.0,201158.0,201942.0,201959.0,201961.0,201988.0,202694.0,203901.0,203944.0,1626158.0,1626162.0,1626163.0,1626164.0,1627741.0,1627767.0,1627812.0,1628368.0,1628963.0,1628969.0,1628975.0,1629028.0,1629117.0,1629607.0,201937.0,202357.0,202709.0,203084.0,203382.0,203473.0,203967.0,203992.0,204020.0,2772.0
0,0021900002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0021900002,1.0,9.0,2.0,7.0,-7.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0021900002,2.0,13.0,6.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0021900002,3.0,15.0,8.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,1.0,-1.0,0.0,1.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0021900002,4.0,15.0,8.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,1.0,1.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,0021900012,48.0,89.0,119.0,-30.0,30.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,1.0,0.0,0.0,-1.0,0.0,0.0
259,0021900012,49.0,89.0,119.0,-30.0,30.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,1.0,0.0,0.0,-1.0,0.0,0.0
260,0021900012,50.0,89.0,119.0,-30.0,30.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,1.0,0.0,0.0,-1.0,0.0,0.0
261,0021900012,51.0,89.0,119.0,-30.0,30.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,1.0,0.0,0.0,-1.0,0.0,0.0


In [560]:
#data = pd.read_csv("./data/season_prediction/stints_player_season_2019.csv", dtype={'GAME_ID': str})
data = pd.read_csv("./data/season_prediction/stints_player_season.csv", dtype={'GAME_ID': str})

In [561]:
data

Unnamed: 0.1,Unnamed: 0,GAME_ID,stint,HOME_PTS,AWAY_PTS,HOME_PLUS_MINUS,AWAY_PLUS_MINUS,HOME_PM_DIFF,AWAY_PM_DIFF,101150.0,1626149.0,1626188.0,1627826.0,1629013.0,201162.0,201580.0,201976.0,201980.0,202335.0,202340.0,202695.0,203076.0,203090.0,203210.0,203484.0,203584.0,2544.0,2730.0,1626143.0,1626181.0,1627742.0,1627783.0,1627832.0,1628366.0,1628384.0,1628402.0,1628404.0,1629026.0,1629056.0,1629638.0,...,203953.0,204025.0,1627777.0,1628396.0,1628430.0,1628961.0,1629668.0,1629714.0,1629742.0,1629011.0,1626169.0,1629684.0,1628403.0,1629018.0,1629642.0,1629713.0,1626195.0,1628395.0,1628997.0,1628998.0,1629667.0,202687.0,1627846.0,1629126.0,1629022.0,202738.0,1626178.0,1627756.0,1627885.0,1629655.0,1629744.0,1628373.0,1628422.0,1629605.0,1629738.0,1629750.0,1629059.0,1629611.0,1629661.0,203463.0
0,0,0021900002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,0021900002,1.0,9.0,2.0,7.0,-7.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,0021900002,2.0,13.0,6.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,1.0,-1.0,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,0021900002,3.0,15.0,8.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,1.0,-1.0,0.0,1.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,0021900002,4.0,15.0,8.0,7.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,1.0,1.0,0.0,0.0,-1.0,0.0,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1601,1601,0021900032,45.0,108.0,110.0,-2.0,2.0,2.0,-2.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,-1.0,,0.0,0.0,0.0,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
1602,1602,0021900032,46.0,108.0,112.0,-4.0,4.0,-2.0,2.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,-1.0,,0.0,0.0,0.0,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
1603,1603,0021900032,47.0,108.0,114.0,-6.0,6.0,-2.0,2.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,-1.0,,0.0,0.0,0.0,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
1604,1604,0021900032,48.0,108.0,114.0,-6.0,6.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,-1.0,,,-1.0,,0.0,0.0,0.0,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0


In [562]:
# every column to the right of 'AWAY_PM_DIFF' is a player indicator
player_frame = data.loc[:, 'AWAY_PM_DIFF':].drop(columns=['AWAY_PM_DIFF'])
player_ids_model = player_frame.columns.values

# features: player indicators per stint; target: home plus-minus change per stint
X = player_frame.to_numpy()
y = data.loc[:, 'HOME_PM_DIFF'].to_numpy()

# a missing indicator means the player did not appear in that game
X[np.isnan(X)] = 0

In [523]:
#mask = np.abs(X).sum(axis=0) > 2e1

In [524]:
#mask = y != 0
#X = X[:, mask]
#y = y[mask]

In [563]:
np.unique(X)

array([-1.,  0.,  1.])

In [564]:
X.shape, y.shape

((1606, 350), (1606,))

In [565]:
y

array([ 0.,  0.,  0., ..., -2.,  0.,  5.])

In [566]:
from sklearn.linear_model import RidgeCV

# candidate regularisation strengths for the built-in cross-validation
# NOTE(review): the recorded run selects 20.0, the largest candidate - consider
# whether the grid should extend further.
ridge_alphas = [1e-6, 1e-5, 1e-4, 1e-3, 1e-1, 1, 5, 1e1, 2e1]

# NOTE(review): `normalize` is no longer accepted by recent scikit-learn releases -
# confirm the version this notebook is pinned to before re-running.
model = RidgeCV(alphas=ridge_alphas, normalize=True)
model.fit(X, y)

print(model.alpha_)
print(model.best_score_)

20.0
-5.761131890630868


In [567]:
player_score = model.coef_

In [568]:
# store ids and scores as column vectors
player_ids_model = player_ids_model.reshape(-1,1)
player_score = player_score.reshape(-1,1)

# Create the dataframe column by column. Stacking the string ids and the float
# coefficients into one array first would turn SCORE into an object column.
player_ratings = pd.DataFrame({
    # the ids are column names such as '203076.0'; strip only the trailing '.0'
    'PLAYER_ID': pd.Series(player_ids_model.ravel()).astype(str).str.replace(r"\.0$", "", regex=True),
    'SCORE': player_score.ravel().astype(float),
})


In [569]:
# player master data; PLAYER_ID is the id as text so it can be matched with the ratings
player_data = pd.read_csv("players_data.csv").assign(
    PLAYER_ID=lambda frame: frame['id'].astype(str)
)

In [570]:
player_data

Unnamed: 0,id,player_names,position,team,PLAYER_ID
0,201939,Stephen Curry,G,GSW,201939
1,201935,James Harden,G,BKN,201935
2,202322,John Wall,G,HOU,202322
3,201566,Russell Westbrook,G,WAS,201566
4,2544,LeBron James,F,LAL,2544
...,...,...,...,...,...
412,1629719,Devontae Cacok,F,LAL,1629719
413,1630238,Malik Fitts,F,LAC,1630238
414,1630235,Trent Forrest,G,UTA,1630235
415,1630216,Cassius Winston,G,WAS,1630216


In [571]:
player_ratings

Unnamed: 0,PLAYER_ID,SCORE
0,101150,-0.007781
1,1626149,-0.005808
2,1626188,-0.00277
3,1627826,0.005267
4,1629013,0.001449
...,...,...
345,1629750,0.02761
346,1629059,-0.016806
347,1629611,-0.000051
348,1629661,0.002045


In [572]:
# attach the ratings to the player master data (matched on the shared columns)
# and drop the numeric id, which duplicates PLAYER_ID
df = player_data.merge(player_ratings).drop(columns=['id'])

In [573]:
df

Unnamed: 0,player_names,position,team,PLAYER_ID,SCORE
0,Stephen Curry,G,GSW,201939,0.008844
1,James Harden,G,BKN,201935,-0.008882
2,Russell Westbrook,G,WAS,201566,0.012482
3,LeBron James,F,LAL,2544,0.00024
4,Kawhi Leonard,F,LAC,202695,-0.0204
...,...,...,...,...,...
264,Matt Thomas,G,UTA,1629744,-0.054329
265,Tacko Fall,C,BOS,1629605,0.014201
266,Chris Chiozza,G,BKN,1629185,0.002051
267,Garrison Mathews,G,WAS,1629726,-0.01562


In [574]:
df.sort_values(['SCORE'], ascending=False)

Unnamed: 0,player_names,position,team,PLAYER_ID,SCORE
233,Nassir Little,F,POR,1629642,0.180507
149,DJ Augustin,G,HOU,201571,0.142114
73,Gary Trent Jr,G,TOR,1629018,0.116929
255,Daniel Gafford,F,WAS,1629655,0.088274
241,Abdel Nader,F,PHX,1627846,0.073747
...,...,...,...,...,...
226,Hassan Whiteside,C,SAC,202355,-0.035521
22,Joel Embiid,C,PHI,203954,-0.035628
264,Matt Thomas,G,UTA,1629744,-0.054329
244,Shake Milton,G,PHI,1629003,-0.055822
