In [70]:
import pandas as pd
import os
import json
import re
from tqdm import tqdm

In [71]:
directory = 'plays'
plays_agg = []
for i in range(29):
    file_name = f'plays_{i}.csv'
    path = os.path.join(directory, file_name)
    try:
        plays = pd.read_csv(path)
        plays['Game'] = i
    except FileNotFoundError:
        print(f'file {i} skipped')
        continue
    plays_agg.append(plays)

In [72]:
directory = 'team_player_dict'
dict_agg = {}
duplicate_players = []
for i in range(29):
    file_name = f'dict_{i}.json'
    path = os.path.join(directory, file_name)
    try:
        with open(path, 'r') as f:
            diction = json.load(f)
        for player, team in diction.items():
            if player in dict_agg:
                duplicate_players.append(player)
            else:
                dict_agg[player] = team
    except FileNotFoundError:
        print(f'file {i} skipped')
        continue

In [86]:
dict_agg['Butler III']

'MIA'

In [74]:
comb_plays = pd.concat(plays_agg)
comb_plays = comb_plays.reset_index(drop = True)

In [75]:
actionTypes = comb_plays['actionType'].unique()
actionTypes

array(['period', 'Jump Ball', 'Made Shot', 'Missed Shot', 'Rebound',
       'Timeout', 'Turnover', nan, 'Foul', 'Free Throw', 'Substitution',
       'Instant Replay', 'Violation', 'Ejection'], dtype=object)

In [76]:
types = {}
for action in actionTypes:
    subTypes = comb_plays[comb_plays['actionType'] == action]['subType'].unique()
    types[action] = subTypes

In [77]:
comb_plays[comb_plays['actionType'] == 'Violation']['subType'].unique()

array(['Defensive Goaltending', 'Kicked Ball', 'Delay Of Game', 'Lane'],
      dtype=object)

In [78]:
comb_plays[(comb_plays['actionType'] == 'Ejection') & (comb_plays['subType'] == 'Other')]['Game']

7594     14
11339    21
Name: Game, dtype: int64

In [79]:
# comb_plays[comb_plays['actionType'] == 'Jump Ball'].iloc[:450]

In [80]:
problematic_indices = []
key_errors = []
for idx, row in tqdm(comb_plays.iterrows()):
    if row['actionType'] == 'Made Shot':
        text = re.match(r'(^\w+)', row['description'])
        player = text.group(1)
        try:
            dictionary_team = dict_agg[player]
        except KeyError:
            key_errors.append((player, row['description']))
            continue
        possession = row['possession']
        if (dictionary_team != possession and dictionary_team != 0) and player not in duplicate_players:
            problematic_indices.append((idx, f'Shot made by {player} ({dictionary_team})', f'Possession written as {possession}'))

print(len(problematic_indices), ' / ', len(comb_plays))
print(len(key_errors), ' / ', len(comb_plays))

14996it [00:00, 19298.33it/s]

2  /  14996
213  /  14996





In [81]:
problematic_indices[:30]

[(10091, 'Shot made by Butler (WAS)', 'Possession written as MIA'),
 (10598, 'Shot made by Butler (WAS)', 'Possession written as MIA')]

In [82]:
start = 1105
comb_plays.iloc[start:start + 30]

Unnamed: 0.1,Unnamed: 0,actionNumber,teamId,scoreHome,scoreAway,description,actionType,subType,time,location,newPossession,possession,possessionCount,Game
1105,301,405,1610612750,64,53,Conley Free Throw 3 of 3 (5 PTS),Free Throw,Free Throw 3 of 3,17410.0,v,False,MIN,114,2
1106,302,406,1610612750,64,53,Conley S.FOUL (P1.T3) (N.Buchert),Foul,Shooting,17520.0,v,True,LAL,115,2
1107,303,409,1610612747,65,53,Davis Free Throw 1 of 2 (16 PTS),Free Throw,Free Throw 1 of 2,17520.0,h,False,LAL,115,2
1108,304,410,1610612747,66,53,Davis Free Throw 2 of 2 (17 PTS),Free Throw,Free Throw 2 of 2,17520.0,h,False,LAL,115,2
1109,305,411,1610612750,66,53,MISS Conley 26' 3PT Pullup Jump Shot,Missed Shot,Pullup Jump shot,17630.0,v,True,MIN,116,2
1110,306,412,0,66,53,LAKERS Rebound,Rebound,Unknown,17650.0,h,True,LAL,117,2
1111,307,414,1610612747,66,53,MISS Davis 2' Driving Layup,Missed Shot,Driving Layup Shot,17740.0,h,False,LAL,117,2
1112,308,415,1610612750,66,53,DiVincenzo REBOUND (Off:1 Def:1),Rebound,Unknown,17760.0,v,True,MIN,118,2
1113,309,416,1610612747,66,53,Christie S.FOUL (P3.T4) (N.Buchert),Foul,Shooting,17830.0,h,False,MIN,118,2
1114,310,418,1610612750,66,53,MISS Randle Free Throw 1 of 2,Free Throw,Free Throw 1 of 2,17830.0,v,False,MIN,118,2
