# Testing Validity of Encoded Box Scores

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw box scores, each row carrying up to four candidate game-ID columns.
box_scores_csv = '../Resources/fully_encoded_box_scores_raw.csv'
encoded_box_scores = pd.read_csv(box_scores_csv)

## General Stats

In [21]:
# Per-row count of missing values, taken across every column of the raw frame.
num_of_nulls = encoded_box_scores.isna().sum(axis='columns')

# Attach the count as a trailing column, then label all twelve columns explicitly.
encoded_scores_with_null_count = pd.concat(
    [encoded_box_scores, num_of_nulls],
    axis=1,
    join='inner',
)
encoded_scores_with_null_count.columns = [
    'slug', 'name', 'Team', 'Location', 'Opponent', 'Outcome', 'date',
    'game_id1', 'game_id2', 'game_id3', 'game_id4',
    'null_count',
]
encoded_scores_with_null_count.describe()

Unnamed: 0,game_id1,game_id2,game_id3,game_id4,null_count
count,30910.0,38723.0,83543.0,82281.0,202219.0
mean,5170.520479,5012.632931,5194.079193,5199.943037,2.835634
std,2708.591261,2692.202322,2745.606345,2751.018307,0.370609
min,456.0,453.0,457.0,457.0,2.0
25%,2754.0,2643.0,2842.0,2844.0,3.0
50%,5140.0,4972.0,5215.0,5231.0,3.0
75%,7604.0,7340.0,7525.0,7527.0,3.0
max,9944.0,9947.0,9959.0,9959.0,3.0


This indicates that each row has either one or two game IDs encoded: `null_count` is always 3 (one ID present) or 2 (two IDs present).

In [81]:
# Three nulls -> one game ID present; two nulls -> two game IDs present.
has_one_id = encoded_scores_with_null_count['null_count'] == 3
has_two_ids = encoded_scores_with_null_count['null_count'] == 2

# Independent copies so later edits never touch the combined frame.
single_id = encoded_scores_with_null_count.loc[has_one_id].copy()
double_id = encoded_scores_with_null_count.loc[has_two_ids].copy()

In [64]:
# Report how many box-score rows fall into each bucket.
print(
    f'Number of scores with 1 encoded game ID : {len(single_id)}',
    f'Number of scores with 2 encoded game IDs : {len(double_id)}',
    sep='\n',
)

Number of scores with 1 encoded game ID : 168981
Number of scores with 2 encoded game IDs : 33238


# Getting one final ID

In [65]:
# Preview the first rows of the single-ID subset before collapsing the ID columns.
single_id.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,game_id1,game_id2,game_id3,game_id4,null_count
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2013-01-01,,,,458.0,3
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2013-01-01,,,461.0,,3
3,smithjr01,J.R. Smith,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2013-01-01,,,,458.0,3
4,turneev01,Evan Turner,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2013-01-01,,,461.0,,3
5,batumni01,Nicolas Batum,PORTLAND_TRAIL_BLAZERS,AWAY,NEW_YORK_KNICKS,WIN,2013-01-01,,,458.0,,3


In [66]:
# Replace every missing value with 0 so the ID columns can be summed.
single_id = single_id.fillna(0)

In [67]:
# Missing IDs were filled with 0 above, so the row-wise sum equals the one real ID.
single_id['final_id'] = single_id[['game_id1', 'game_id2', 'game_id3', 'game_id4']].sum(axis=1)

In [68]:
# Store the collapsed ID as an integer rather than a float.
single_id = single_id.astype({'final_id': 'int'})

In [69]:
# Sanity check: count of missing values remaining in each column.
single_id.isna().sum()

slug          0
name          0
Team          0
Location      0
Opponent      0
Outcome       0
date          0
game_id1      0
game_id2      0
game_id3      0
game_id4      0
null_count    0
final_id      0
dtype: int64

In [70]:
# The four raw ID columns and the helper count are redundant once final_id exists.
redundant_columns = ['game_id1', 'game_id2', 'game_id3', 'game_id4', 'null_count']
single_id = single_id.drop(columns=redundant_columns)
single_id.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,final_id
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2013-01-01,458
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2013-01-01,461
3,smithjr01,J.R. Smith,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2013-01-01,458
4,turneev01,Evan Turner,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2013-01-01,461
5,batumni01,Nicolas Batum,PORTLAND_TRAIL_BLAZERS,AWAY,NEW_YORK_KNICKS,WIN,2013-01-01,458


In [104]:
# Save the cleaned single-ID rows to the Resources folder.
single_id.to_csv('../Resources/encoded_single.csv')

In [71]:
# Draw a 50-row spot-check sample and write it to disk.
# A fixed seed keeps the saved sample identical across kernel restarts, so anyone
# checking rows by hand is looking at the same file the notebook produces.
single_id_sample = single_id.sample(50, random_state=42)
single_id_sample.to_csv('../Resources/sample_single_id.csv')

In [82]:
# Normalise each game-ID column to float and then to its string form, so that
# missing values become the literal text 'nan'.
for id_column in ('game_id1', 'game_id2', 'game_id3', 'game_id4'):
    double_id[id_column] = double_id[id_column].astype('float').astype('str')


In [84]:
# Concatenate the four stringified ID columns, left to right, into one text column.
id_strings = [double_id[col] for col in ('game_id1', 'game_id2', 'game_id3', 'game_id4')]
combo_text = id_strings[0]
for piece in id_strings[1:]:
    combo_text = combo_text + piece
double_id['combo'] = combo_text
double_id.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,game_id1,game_id2,game_id3,game_id4,null_count,combo
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,,453.0,459.0,,2,nan453.0459.0nan
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,,453.0,459.0,,2,nan453.0459.0nan
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,,453.0,459.0,,2,nan453.0459.0nan
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,,453.0,459.0,,2,nan453.0459.0nan
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,,453.0,459.0,,2,nan453.0459.0nan


In [85]:
# The raw ID columns and the helper count are no longer needed now that combo
# holds their concatenation.
superseded_columns = ['game_id1', 'game_id2', 'game_id3', 'game_id4', 'null_count']
double_id = double_id.drop(columns=superseded_columns)
double_id.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,combo
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,nan453.0459.0nan
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,nan453.0459.0nan
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,nan453.0459.0nan
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,nan453.0459.0nan
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,nan453.0459.0nan


In [91]:
# Remove the 'nan' placeholders left by the missing IDs; only the two real IDs remain.
double_id['combo'] = double_id['combo'].str.replace('nan', '')
double_id.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,combo
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0


In [97]:
# Split at the first '.' only: the left part is the first ID, the right part still
# holds the remainder of the text (second ID plus decimal residue).
two_ids = double_id['combo'].str.split('.', n=1, expand=True)
two_ids.head()

Unnamed: 0,0,1
1,453,459.0
17,453,459.0
22,453,459.0
31,453,459.0
59,453,459.0


In [98]:
# Give the two positional columns produced by the split meaningful names.
two_ids = two_ids.rename(columns={0: 'first_id', 1: 'second_id'})
two_ids.head(2)

Unnamed: 0,first_id,second_id
1,453,459.0
17,453,459.0


In [99]:
# After the split, second_id looks like "0459.0": a leading "0" left over from the
# first ID's ".0" suffix, followed by the second ID and its own trailing ".0".
# Strip those pieces by pattern rather than by position: the regex only matches
# values still in that raw shape, so re-running this cell leaves cleaned IDs
# untouched instead of chopping real digits off them.
two_ids['second_id'] = two_ids['second_id'].str.replace(r'^0(\d+)\.0$', r'\1', regex=True)
two_ids.sample(5)

Unnamed: 0,first_id,second_id
175687,8707,8717
110420,5648,5658
28286,1787,1796
83882,4422,4427
70615,3804,3806


In [101]:
# Put the parsed ID pair back beside the box-score rows, aligned on the row index.
revised_double = pd.concat(
    (double_id, two_ids),
    axis=1,
    join='inner',
)
revised_double.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,combo,first_id,second_id
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0,453,459
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0,453,459
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0,453,459
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0,453,459
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453.0459.0,453,459


In [103]:
# The combined text column has been parsed into first_id / second_id; discard it.
revised_double = revised_double.drop(columns='combo')
revised_double.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,first_id,second_id
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459


In [105]:
# Save the double-ID rows, now carrying first_id and second_id, to the Resources folder.
revised_double.to_csv('../Resources/encoded_double.csv')

# Comparing the Encoded Schedule

In [107]:
# Schedule table that maps each game_id to its dates and its away/home teams.
schedule_csv = '../Resources/encoded_schedule.csv'
encoded_schedule = pd.read_csv(schedule_csv)
encoded_schedule.head()

Unnamed: 0.1,Unnamed: 0,date1,date2,away,home,away_team_score,home_team_score,game_number,game_id
0,0,2012-10-30,2012-10-29,WASHINGTON_WIZARDS,CLEVELAND_CAVALIERS,84.0,94.0,0,0
1,1,2012-10-31,2012-10-30,BOSTON_CELTICS,MIAMI_HEAT,107.0,120.0,1,1
2,2,2012-10-31,2012-10-30,DALLAS_MAVERICKS,LOS_ANGELES_LAKERS,99.0,91.0,2,2
3,3,2012-10-31,2012-10-30,INDIANA_PACERS,TORONTO_RAPTORS,90.0,88.0,3,3
4,4,2012-10-31,2012-10-30,DENVER_NUGGETS,PHILADELPHIA_76ERS,75.0,84.0,4,4


In [108]:
# Keep only the columns needed to look games up; scores and leftover index columns go.
schedule_columns = ['date1', 'date2', 'away', 'home', 'game_id']
encoded_schedule = encoded_schedule.loc[:, schedule_columns]

In [122]:
# Spot-check a single schedule row by its game_id.
encoded_schedule[encoded_schedule.game_id == 6654]

Unnamed: 0,date1,date2,away,home,game_id
6654,2017-10-29,2017-10-28,SAN_ANTONIO_SPURS,INDIANA_PACERS,6654


In [125]:
# Preview the trimmed schedule table.
encoded_schedule.head()

Unnamed: 0,date1,date2,away,home,game_id
0,2012-10-30,2012-10-29,WASHINGTON_WIZARDS,CLEVELAND_CAVALIERS,0
1,2012-10-31,2012-10-30,BOSTON_CELTICS,MIAMI_HEAT,1
2,2012-10-31,2012-10-30,DALLAS_MAVERICKS,LOS_ANGELES_LAKERS,2
3,2012-10-31,2012-10-30,INDIANA_PACERS,TORONTO_RAPTORS,3
4,2012-10-31,2012-10-30,DENVER_NUGGETS,PHILADELPHIA_76ERS,4


In [128]:
# Schedule lookup from the away side's point of view, keyed by first_id:
# the away team becomes Team and the home team becomes Opponent.
away_encoder_1 = encoded_schedule.rename(
    columns={'away': 'Team', 'home': 'Opponent', 'game_id': 'first_id'}
)

In [129]:
# Same away-side lookup, but keyed by second_id so it can be joined on the
# second candidate ID.
away_encoder_2 = encoded_schedule.rename(
    columns={'away': 'Team', 'home': 'Opponent', 'game_id': 'second_id'}
)

In [135]:
# Split the double-ID rows by venue so each side can be checked against the schedule.
is_away = revised_double['Location'].eq('AWAY')
is_home = revised_double['Location'].eq('HOME')
revised_double_away = revised_double.loc[is_away].copy()
revised_double_home = revised_double.loc[is_home].copy()
revised_double_away.head()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,first_id,second_id
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
17,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
22,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
31,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459
59,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459


In [123]:
# Summary statistics for the home-game rows that carry two candidate IDs.
revised_double_home.describe()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,first_id,second_id
count,11244,11244,11244,11244,11244,11244,11244,11244,11244
unique,956,952,32,1,32,2,564,1036,1051
top,walkeke02,Ersan İlyasova,MILWAUKEE_BUCKS,HOME,MILWAUKEE_BUCKS,WIN,2013-11-23,9568,7579
freq,50,50,566,11244,482,6524,74,25,23


In [124]:
# Summary statistics for the away-game rows that carry two candidate IDs.
revised_double_away.describe()

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,date,first_id,second_id
count,21994,21994,21994,21994,21994,21994,21994,21994,21994
unique,1013,1010,32,1,32,2,840,1921,2067
top,barneha02,Harrison Barnes,BROOKLYN_NETS,AWAY,UTAH_JAZZ,LOSS,2013-12-28,7381,957
freq,74,74,909,21994,976,13876,110,26,24


## Verifying Away Games

In [137]:
# Cast both candidate IDs from text to integers before merging on them.
revised_double_away = revised_double_away.astype({'first_id': 'int', 'second_id': 'int'})

In [158]:
# Total number of away rows that need to be accounted for.
away_entries = revised_double_away.shape[0]

In [132]:
# Quick look at the first_id lookup table before merging.
away_encoder_1.head(2)

Unnamed: 0,date1,date2,Team,Opponent,first_id
0,2012-10-30,2012-10-29,WASHINGTON_WIZARDS,CLEVELAND_CAVALIERS,0
1,2012-10-31,2012-10-30,BOSTON_CELTICS,MIAMI_HEAT,1


In [138]:
# Look up the scheduled game for each row's first candidate ID; the left join keeps
# every away row even when no schedule entry is found.
away_part_1 = revised_double_away.merge(away_encoder_1, how='left', on='first_id')
away_part_1.head()

Unnamed: 0,slug,name,Team_x,Location,Opponent_x,Outcome,date,first_id,second_id,date1,date2,Team_y,Opponent_y
0,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-01,2012-12-31,ATLANTA_HAWKS,HOUSTON_ROCKETS
1,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-01,2012-12-31,ATLANTA_HAWKS,HOUSTON_ROCKETS
2,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-01,2012-12-31,ATLANTA_HAWKS,HOUSTON_ROCKETS
3,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-01,2012-12-31,ATLANTA_HAWKS,HOUSTON_ROCKETS
4,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-01,2012-12-31,ATLANTA_HAWKS,HOUSTON_ROCKETS


In [146]:
# Keep the rows where the game found via first_id has the same team and opponent
# as the box score.
same_team = away_part_1['Team_x'] == away_part_1['Team_y']
same_opponent = away_part_1['Opponent_x'] == away_part_1['Opponent_y']
away_part_1_encoded = away_part_1.loc[same_team & same_opponent].copy()

In [161]:
# Number of away rows confirmed by their first candidate ID.
away_entries_1 = away_part_1_encoded.shape[0]

In [151]:
# Repeat the schedule lookup, this time using each row's second candidate ID.
away_part_2 = revised_double_away.merge(away_encoder_2, how='left', on='second_id')
away_part_2.head()

Unnamed: 0,slug,name,Team_x,Location,Opponent_x,Outcome,date,first_id,second_id,date1,date2,Team_y,Opponent_y
0,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS
1,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS
2,horfoal01,Al Horford,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS
3,korveky01,Kyle Korver,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS
4,jenkijo01,John Jenkins,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS


In [154]:
# Keep the rows where the game found via second_id has the same team and opponent
# as the box score.
team_agrees = away_part_2['Team_x'] == away_part_2['Team_y']
opponent_agrees = away_part_2['Opponent_x'] == away_part_2['Opponent_y']
away_part_2_encoded = away_part_2.loc[team_agrees & opponent_agrees].copy()

In [155]:
# Peek at rows whose second candidate ID matched the schedule.
away_part_2_encoded.head(2)

Unnamed: 0,slug,name,Team_x,Location,Opponent_x,Outcome,date,first_id,second_id,date1,date2,Team_y,Opponent_y
0,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS
1,pachuza01,Zaza Pachulia,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2013-01-01,453,459,2013-01-02,2013-01-01,ATLANTA_HAWKS,NEW_ORLEANS_HORNETS


In [162]:
# Number of away rows confirmed by their second candidate ID.
away_entries_2 = away_part_2_encoded.shape[0]

In [171]:
# Verify that every away row was confirmed by exactly one of its two candidate IDs.
#
# Comparing the totals alone is not sufficient: a row that matches the schedule under
# BOTH IDs is counted twice and can hide a row that matched under neither. So also
# require that no row label appears in both matched sets.
# NOTE(review): this assumes both left merges kept one output row per away row (i.e.
# game_id is unique in the schedule), so equal index labels mean the same box-score
# row -- confirm against the schedule data.
matched_both = away_part_1_encoded.index.intersection(away_part_2_encoded.index)

if away_entries == away_entries_1 + away_entries_2 and matched_both.empty:
    print(f'{away_entries} = {away_entries_1} + {away_entries_2}')
    print( "No Data Loss")
else:
    print("Keep Scrubbing")
    

21994 = 2733 + 19261
No Data Loss


## Verifying Home Games