# Cleaning and Encoding Game ID

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import sys
sys.path.append('../Scripts')

### Import Scripts

In [2]:
import box_score_functions as bfx
import scheduling_functions as sfx


### Import Files

In [3]:
# Raw inputs, read from CSV files under ../Resources (paths are relative to
# the notebook's working directory):
#   all_box_scores - box-score rows from the 2013-2020 file
#   all_games      - the game schedule file ("13-current")
all_box_scores = pd.read_csv('../Resources/2013-2020.csv')
all_games = pd.read_csv('../Resources/game_schedule_13-current.csv')

### Cleaning up the box scores

In [4]:
# Run the raw box scores through the project's cleaning helper from
# box_score_functions; its implementation is not shown in this notebook.
cleaned_box_scores = bfx.get_cleaned_box_scores(all_box_scores)

In [5]:
# Preview the first two cleaned rows as a quick sanity check.
cleaned_box_scores.head(2)

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,game_score,date
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2472.0,14.0,24.0,5.0,...,14.0,2.0,5.0,4.0,1.0,0.0,2.0,3.0,36.5,2013-01-01
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2234.0,11.0,20.0,1.0,...,0.0,2.0,11.0,7.0,3.0,4.0,1.0,2.0,27.0,2013-01-01


In [6]:
# Project helper from scheduling_functions. Judging by the recorded output it
# reports the row count, the object's type and the per-column dtypes.
# NOTE(review): behaviour inferred from the output only - confirm in the helper.
sfx.what_is_this(cleaned_box_scores)

length : 200897
<class 'pandas.core.frame.DataFrame'>
slug                                  object
name                                  object
Team                                  object
Location                              object
Opponent                              object
Outcome                               object
seconds_played                       float64
made_field_goals                     float64
attempted_field_goals                float64
made_three_point_field_goals         float64
attempted_three_point_field_goals    float64
made_free_throws                     float64
attempted_free_throws                float64
offensive_rebounds                   float64
defensive_rebounds                   float64
assists                              float64
steals                               float64
blocks                               float64
turnovers                            float64
personal_fouls                       float64
game_score                           float64
d

### Cleaning up the Schedules

In [7]:
# Build the schedule table used as the game_id lookup. The preview in the next
# cell shows date1/date2, away/home, both team scores, game_number and game_id
# columns. NOTE(review): how game_id is derived lives in scheduling_functions.
encoded_schedule = sfx.get_encoded_schedule(all_games)

In [8]:
# Preview the first rows of the encoded schedule (default: five rows).
encoded_schedule.head()

Unnamed: 0,date1,date2,away,home,away_team_score,home_team_score,game_number,game_id
0,2012-10-30,2012-10-29,WASHINGTON_WIZARDS,CLEVELAND_CAVALIERS,84.0,94.0,0,0
1,2012-10-31,2012-10-30,BOSTON_CELTICS,MIAMI_HEAT,107.0,120.0,1,1
2,2012-10-31,2012-10-30,DALLAS_MAVERICKS,LOS_ANGELES_LAKERS,99.0,91.0,2,2
3,2012-10-31,2012-10-30,INDIANA_PACERS,TORONTO_RAPTORS,90.0,88.0,3,3
4,2012-10-31,2012-10-30,DENVER_NUGGETS,PHILADELPHIA_76ERS,75.0,84.0,4,4


### Left Join Game IDs

In [9]:
# Make first encoder
encoder1 = encoded_schedule[['date1','home','game_id']].copy()
merge1_cols = ['date', 'Team','game_id1']
encoder1.columns = merge1_cols
encoder1.date = encoder1.date.astype('str')
encoder1.Team = encoder1.Team.astype('str')
encoder1.date = encoder1.date.str.strip()
encoder1.Team = encoder1.Team.str.strip()
encoder1.head(2)

Unnamed: 0,date,Team,game_id1
0,2012-10-30,CLEVELAND_CAVALIERS,0
1,2012-10-31,MIAMI_HEAT,1


In [10]:
# Make second encoder
encoder2 = encoded_schedule[['date1','away','game_id']].copy()
merge2_cols = ['date','Team','game_id2']
encoder2.columns = merge2_cols
encoder2.date = encoder2.date.astype('str')
encoder2.Team = encoder2.Team.astype('str')
encoder2.date = encoder2.date.str.strip()
encoder2.Team = encoder2.Team.str.strip()
encoder2.head(2)

Unnamed: 0,date,Team,game_id2
0,2012-10-30,WASHINGTON_WIZARDS,0
1,2012-10-31,BOSTON_CELTICS,1


In [11]:
# Third encoder: away team keyed on date2 -> game_id3.
# (In the schedule preview, date2 is one day earlier than date1.)
merge3_cols = ['date', 'Team', 'game_id3']
encoder3 = encoded_schedule.loc[:, ['date2', 'away', 'game_id']].copy()
encoder3.columns = merge3_cols
# Coerce both join keys to strings and trim surrounding whitespace so they
# compare equal to the box-score keys during the merge.
encoder3['date'] = encoder3['date'].astype('str').str.strip()
encoder3['Team'] = encoder3['Team'].astype('str').str.strip()
encoder3.head(2)


Unnamed: 0,date,Team,game_id3
0,2012-10-29,WASHINGTON_WIZARDS,0
1,2012-10-30,BOSTON_CELTICS,1


In [12]:
# Fourth encoder: home team keyed on date2 -> game_id4.
# (In the schedule preview, date2 is one day earlier than date1.)
merge4_cols = ['date', 'Team', 'game_id4']
encoder4 = encoded_schedule.loc[:, ['date2', 'home', 'game_id']].copy()
encoder4.columns = merge4_cols
# Coerce both join keys to strings and trim surrounding whitespace so they
# compare equal to the box-score keys during the merge.
encoder4['date'] = encoder4['date'].astype('str').str.strip()
encoder4['Team'] = encoder4['Team'].astype('str').str.strip()
encoder4.head(2)

Unnamed: 0,date,Team,game_id4
0,2012-10-29,CLEVELAND_CAVALIERS,0
1,2012-10-30,MIAMI_HEAT,1


In [13]:
# Normalise the box-score join keys the same way as the encoders: cast to
# string, then trim surrounding whitespace. This overwrites the columns on
# cleaned_box_scores in place.
cleaned_box_scores['date'] = cleaned_box_scores['date'].astype('str').str.strip()
cleaned_box_scores['Team'] = cleaned_box_scores['Team'].astype('str').str.strip()

In [14]:
# Join 1 of 4: attach game_id1 (home team on date1). A left join keeps every
# box-score row; rows without a match get NaN in game_id1.
quarter_encoded_box_scores = cleaned_box_scores.merge(
    encoder1,
    how='left',
    on=['date', 'Team'],
)
quarter_encoded_box_scores.head(3)

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,game_score,date,game_id1
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2472.0,14.0,24.0,5.0,...,2.0,5.0,4.0,1.0,0.0,2.0,3.0,36.5,2013-01-01,
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2234.0,11.0,20.0,1.0,...,2.0,11.0,7.0,3.0,4.0,1.0,2.0,27.0,2013-01-01,
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2352.0,10.0,19.0,2.0,...,0.0,2.0,10.0,2.0,0.0,1.0,1.0,24.9,2013-01-01,


In [15]:
# Join 2 of 4: attach game_id2 (away team on date1). A left join keeps every
# row of the previous frame; rows without a match get NaN in game_id2.
half_encoded_box_scores = quarter_encoded_box_scores.merge(
    encoder2,
    how='left',
    on=['date', 'Team'],
)
half_encoded_box_scores.head(3)

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,defensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,game_score,date,game_id1,game_id2
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2472.0,14.0,24.0,5.0,...,5.0,4.0,1.0,0.0,2.0,3.0,36.5,2013-01-01,,
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2234.0,11.0,20.0,1.0,...,11.0,7.0,3.0,4.0,1.0,2.0,27.0,2013-01-01,,453.0
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2352.0,10.0,19.0,2.0,...,2.0,10.0,2.0,0.0,1.0,1.0,24.9,2013-01-01,,


In [16]:
# Join 3 of 4: attach game_id3 (away team on date2). A left join keeps every
# row of the previous frame; rows without a match get NaN in game_id3.
three_q_encoded_box_scores = half_encoded_box_scores.merge(
    encoder3,
    how='left',
    on=['date', 'Team'],
)
three_q_encoded_box_scores.head(3)

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,assists,steals,blocks,turnovers,personal_fouls,game_score,date,game_id1,game_id2,game_id3
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2472.0,14.0,24.0,5.0,...,4.0,1.0,0.0,2.0,3.0,36.5,2013-01-01,,,
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2234.0,11.0,20.0,1.0,...,7.0,3.0,4.0,1.0,2.0,27.0,2013-01-01,,453.0,459.0
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2352.0,10.0,19.0,2.0,...,10.0,2.0,0.0,1.0,1.0,24.9,2013-01-01,,,461.0


In [17]:
# Join 4 of 4: attach game_id4 (home team on date2). A left join keeps every
# row of the previous frame; rows without a match get NaN in game_id4.
fully_encoded_box_scores = three_q_encoded_box_scores.merge(
    encoder4,
    how='left',
    on=['date', 'Team'],
)
fully_encoded_box_scores.head(3)

Unnamed: 0,slug,name,Team,Location,Opponent,Outcome,seconds_played,made_field_goals,attempted_field_goals,made_three_point_field_goals,...,steals,blocks,turnovers,personal_fouls,game_score,date,game_id1,game_id2,game_id3,game_id4
0,anthoca01,Carmelo Anthony,NEW_YORK_KNICKS,HOME,PORTLAND_TRAIL_BLAZERS,LOSS,2472.0,14.0,24.0,5.0,...,1.0,0.0,2.0,3.0,36.5,2013-01-01,,,,458.0
1,smithjo03,Josh Smith,ATLANTA_HAWKS,AWAY,NEW_ORLEANS_HORNETS,WIN,2234.0,11.0,20.0,1.0,...,3.0,4.0,1.0,2.0,27.0,2013-01-01,,453.0,459.0,
2,holidjr01,Jrue Holiday,PHILADELPHIA_76ERS,AWAY,LOS_ANGELES_LAKERS,WIN,2352.0,10.0,19.0,2.0,...,2.0,0.0,1.0,1.0,24.9,2013-01-01,,,461.0,


In [25]:
# Keep only the four candidate game_id columns, as an independent copy, so
# their null pattern can be inspected separately from the box-score stats.
game_id_matrix = fully_encoded_box_scores.loc[
    :, ['game_id1', 'game_id2', 'game_id3', 'game_id4']
].copy()

In [26]:
# Row count after the four left joins; this is the baseline for the null check.
# NOTE(review): the recorded output (202219) is larger than the 200897 rows
# reported for cleaned_box_scores earlier. A left join only adds rows when the
# right-hand side has repeated (date, Team) keys, so check the encoders and
# the schedule for duplicates.
len(game_id_matrix)

202219

In [28]:
# Drop every ROW whose four game_id columns are all NaN, i.e. box scores that
# matched none of the encoders. The earlier axis=1 version dropped all-NaN
# COLUMNS instead, so the row count could never change and the check proved
# nothing. If the length below equals len(game_id_matrix), every box score
# received at least one game_id; any shortfall is the number of unmatched rows.
checking_for_null = game_id_matrix.dropna(axis=0, how='all')
len(checking_for_null)

202219

## Goal: each box score is assigned at least one game_id (verify that no row has all four game_id columns null)