# Source

#### basketball-reference.com
https://www.basketball-reference.com/teams/HOU/2017_games.html

- Saved the regular-season and playoff game logs for the 2010 - 2017 seasons as CSV files (one file per season)

In [8]:
# Dependencies used by the cells below: pandas/numpy for the data work,
# glob/os for locating the CSV exports.
import pandas as pd
import numpy as np
import glob, os
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas' SettingWithCopyWarning — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load every regular-season CSV export and stack them into a single frame.
path = 'data/NBA/reg'
# glob.glob returns paths in arbitrary, OS-dependent order; sorting makes the
# row order of df_reg reproducible across machines and re-runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError('No CSV files found in %r' % path)

df_from_each_file = (pd.read_csv(f) for f in all_files)
df_reg   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
# Load every playoff CSV export and stack them into a single frame.
path = 'data/NBA/plo'
# glob.glob returns paths in arbitrary, OS-dependent order; sorting makes the
# row order of df_plo reproducible across machines and re-runs.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the directory is missing or empty.
if not all_files:
    raise FileNotFoundError('No CSV files found in %r' % path)

df_from_each_file = (pd.read_csv(f) for f in all_files)
df_plo   = pd.concat(df_from_each_file, ignore_index=True)

# Regular Season

In [5]:
# Mark every row of the regular-season frame with the constant label 'reg'.
df_reg = df_reg.assign(season='reg')

In [6]:
# Keep only the columns needed for the results table.
# .copy() makes hou_rockets an independent frame instead of a slice of df_reg,
# so columns can be added to it afterwards without pandas'
# SettingWithCopyWarning (chained-assignment ambiguity).
hou_rockets = df_reg[['Date','Unnamed: 2','Unnamed: 5','Opponent','Unnamed: 7','Tm','Opp','season']].copy()

## Create col with team name

In [9]:
# Add a constant 'team' column: every row gets the same team name.
hou_rockets = hou_rockets.assign(team='Houston Rockets')

In [10]:
# Put the columns in reading order: date/time, team and its score,
# venue marker, opponent and its score, result, season label.
# .copy() detaches the reordered frame from the one it was sliced from, so
# later column assignments (e.g. recoding the venue column) are unambiguous
# and do not raise pandas' SettingWithCopyWarning.
hou_rockets = hou_rockets[['Date','Unnamed: 2', 'team','Tm','Unnamed: 5','Opponent','Opp','Unnamed: 7','season']].copy()

## Rename cols

In [12]:
# Replace the raw export headers with readable column names.
column_names = {
    'Unnamed: 2': 'Time',
    'team': 'Team',
    'Tm': 'Team_score',
    'Unnamed: 5': 'Location',
    'Opp': 'Opponent_score',
    'Unnamed: 7': 'Result',
}
hou_rockets = hou_rockets.rename(columns=column_names)

In [14]:
# Preview the first five rows after renaming.
hou_rockets.head(5)

Unnamed: 0,Date,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
0,"Tue, Oct 27, 2009",7:00p ET,Houston Rockets,87,@,Portland Trail Blazers,96,L,reg
1,"Wed, Oct 28, 2009",7:30p ET,Houston Rockets,108,@,Golden State Warriors,107,W,reg
2,"Sat, Oct 31, 2009",7:30p ET,Houston Rockets,111,,Portland Trail Blazers,107,W,reg
3,"Mon, Nov 2, 2009",7:00p ET,Houston Rockets,113,@,Utah Jazz,96,W,reg
4,"Wed, Nov 4, 2009",7:30p ET,Houston Rockets,102,,Los Angeles Lakers,103,L,reg


## Check for missing values

In [15]:
# Number of missing values in each column.
hou_rockets.isnull().sum()

Date                0
Time                0
Team                0
Team_score          0
Location          320
Opponent            0
Opponent_score      0
Result              0
season              0
dtype: int64

## Location Column

- change @ to away, nan to home

In [16]:
# Frequency of each venue marker, counting missing values as their own group.
hou_rockets['Location'].value_counts(dropna=False)

@      320
NaN    320
Name: Location, dtype: int64

In [18]:
# Recode the venue marker: '@' becomes 'away', a missing value becomes 'home'.
# replace/fillna only touch the raw markers, so re-running this cell is a
# no-op; np.where(col == '@', 'away', 'home') would turn every 'away' into
# 'home' on a second run. Any unexpected marker also stays visible instead of
# silently being labelled 'home'.
hou_rockets['Location'] = (
    hou_rockets['Location']
    .replace({'@': 'away'})
    .fillna('home')
)

In [19]:
# Re-check the venue distribution after recoding (missing values included).
hou_rockets['Location'].value_counts(dropna=False)

away    320
home    320
Name: Location, dtype: int64

## Date col

In [20]:
# Preview the first five rows before converting the Date column.
hou_rockets.head(5)

Unnamed: 0,Date,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
0,"Tue, Oct 27, 2009",7:00p ET,Houston Rockets,87,away,Portland Trail Blazers,96,L,reg
1,"Wed, Oct 28, 2009",7:30p ET,Houston Rockets,108,away,Golden State Warriors,107,W,reg
2,"Sat, Oct 31, 2009",7:30p ET,Houston Rockets,111,home,Portland Trail Blazers,107,W,reg
3,"Mon, Nov 2, 2009",7:00p ET,Houston Rockets,113,away,Utah Jazz,96,W,reg
4,"Wed, Nov 4, 2009",7:30p ET,Houston Rockets,102,home,Los Angeles Lakers,103,L,reg


In [21]:
# Parse the text dates into timestamps, then use them as a sorted index.
reg_dates = pd.to_datetime(hou_rockets['Date'])
hou_rockets = (
    hou_rockets
    .assign(Date=reg_dates)
    .set_index('Date')
    .sort_index()
)

In [22]:
# Preview the first five rows of the date-indexed frame.
hou_rockets.head(5)

Unnamed: 0_level_0,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-10-27,7:00p ET,Houston Rockets,87,away,Portland Trail Blazers,96,L,reg
2009-10-28,7:30p ET,Houston Rockets,108,away,Golden State Warriors,107,W,reg
2009-10-31,7:30p ET,Houston Rockets,111,home,Portland Trail Blazers,107,W,reg
2009-11-02,7:00p ET,Houston Rockets,113,away,Utah Jazz,96,W,reg
2009-11-04,7:30p ET,Houston Rockets,102,home,Los Angeles Lakers,103,L,reg


In [27]:
# Confirm no column has missing values left.
hou_rockets.isnull().sum()

Time              0
Team              0
Team_score        0
Location          0
Opponent          0
Opponent_score    0
Result            0
season            0
dtype: int64

## save

In [28]:
# Write the cleaned regular-season table to disk (the Date index is included).
# Create the output directory first so the save also works when
# data/clean does not exist yet; to_csv does not create directories.
os.makedirs('data/clean', exist_ok=True)
hou_rockets.to_csv('data/clean/hou_rockets_reg.csv')

# Playoffs

In [29]:
# Preview the first five rows of the raw playoff data.
df_plo.head(5)

Unnamed: 0,G,Date,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Opponent,Unnamed: 7,Unnamed: 8,Tm,Opp,W,L,Streak,Notes
0,1,"Sun, Apr 21, 2013",9:30p ET,,Box Score,@,Oklahoma City Thunder,L,,91,120,0,1,L 1,
1,2,"Wed, Apr 24, 2013",7:00p ET,,Box Score,@,Oklahoma City Thunder,L,,102,105,0,2,L 2,
2,3,"Sat, Apr 27, 2013",9:30p ET,,Box Score,,Oklahoma City Thunder,L,,101,104,0,3,L 3,
3,4,"Mon, Apr 29, 2013",9:30p ET,,Box Score,,Oklahoma City Thunder,W,,105,103,1,3,W 1,
4,5,"Wed, May 1, 2013",9:30p ET,,Box Score,@,Oklahoma City Thunder,W,,107,100,2,3,W 2,


## Create col for plo

In [30]:
# Mark every row of the playoff frame with the constant label 'plo'.
df_plo = df_plo.assign(season='plo')

In [31]:
# Keep only the columns needed for the results table.
# .copy() makes hou_rockets_plo an independent frame instead of a slice of
# df_plo, so columns can be added to it afterwards without pandas'
# SettingWithCopyWarning (chained-assignment ambiguity).
hou_rockets_plo = df_plo[['Date','Unnamed: 2','Unnamed: 5','Opponent','Unnamed: 7','Tm','Opp','season']].copy()

In [32]:
# Add a constant 'team' column: every row gets the same team name.
hou_rockets_plo = hou_rockets_plo.assign(team='Houston Rockets')

In [33]:
# Put the columns in reading order: date/time, team and its score,
# venue marker, opponent and its score, result, season label.
# .copy() detaches the reordered frame from the one it was sliced from, so
# later column assignments (e.g. recoding the venue column) are unambiguous
# and do not raise pandas' SettingWithCopyWarning.
hou_rockets_plo = hou_rockets_plo[['Date','Unnamed: 2', 'team','Tm','Unnamed: 5','Opponent','Opp','Unnamed: 7','season']].copy()

## rename col

In [34]:
# Replace the raw export headers with readable column names.
plo_column_names = {
    'Unnamed: 2': 'Time',
    'team': 'Team',
    'Tm': 'Team_score',
    'Unnamed: 5': 'Location',
    'Opp': 'Opponent_score',
    'Unnamed: 7': 'Result',
}
hou_rockets_plo = hou_rockets_plo.rename(columns=plo_column_names)

In [35]:
# Preview the first five playoff rows after renaming.
hou_rockets_plo.head(5)

Unnamed: 0,Date,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
0,"Sun, Apr 21, 2013",9:30p ET,Houston Rockets,91,@,Oklahoma City Thunder,120,L,plo
1,"Wed, Apr 24, 2013",7:00p ET,Houston Rockets,102,@,Oklahoma City Thunder,105,L,plo
2,"Sat, Apr 27, 2013",9:30p ET,Houston Rockets,101,,Oklahoma City Thunder,104,L,plo
3,"Mon, Apr 29, 2013",9:30p ET,Houston Rockets,105,,Oklahoma City Thunder,103,W,plo
4,"Wed, May 1, 2013",9:30p ET,Houston Rockets,107,@,Oklahoma City Thunder,100,W,plo


## Check missing values

In [37]:
# Number of missing values in each playoff column.
hou_rockets_plo.isnull().sum()

Date               0
Time               0
Team               0
Team_score         0
Location          23
Opponent           0
Opponent_score     0
Result             0
season             0
dtype: int64

## Location Column

- change @ to away, nan to home

In [39]:
# Frequency of each venue marker, counting missing values as their own group.
hou_rockets_plo['Location'].value_counts(dropna=False)

NaN    23
@      22
Name: Location, dtype: int64

In [41]:
# Recode the venue marker: '@' becomes 'away', a missing value becomes 'home'.
# replace/fillna only touch the raw markers, so re-running this cell is a
# no-op; np.where(col == '@', 'away', 'home') would turn every 'away' into
# 'home' on a second run. Any unexpected marker also stays visible instead of
# silently being labelled 'home'.
hou_rockets_plo['Location'] = (
    hou_rockets_plo['Location']
    .replace({'@': 'away'})
    .fillna('home')
)

In [42]:
# Re-check the venue distribution after recoding (missing values included).
hou_rockets_plo['Location'].value_counts(dropna=False)

home    23
away    22
Name: Location, dtype: int64

## Date col

In [43]:
# Preview the first five playoff rows before converting the Date column.
hou_rockets_plo.head(5)

Unnamed: 0,Date,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
0,"Sun, Apr 21, 2013",9:30p ET,Houston Rockets,91,away,Oklahoma City Thunder,120,L,plo
1,"Wed, Apr 24, 2013",7:00p ET,Houston Rockets,102,away,Oklahoma City Thunder,105,L,plo
2,"Sat, Apr 27, 2013",9:30p ET,Houston Rockets,101,home,Oklahoma City Thunder,104,L,plo
3,"Mon, Apr 29, 2013",9:30p ET,Houston Rockets,105,home,Oklahoma City Thunder,103,W,plo
4,"Wed, May 1, 2013",9:30p ET,Houston Rockets,107,away,Oklahoma City Thunder,100,W,plo


In [44]:
# Parse the text dates into timestamps, use them as a sorted index,
# then preview the first five rows.
plo_dates = pd.to_datetime(hou_rockets_plo['Date'])
hou_rockets_plo = (
    hou_rockets_plo
    .assign(Date=plo_dates)
    .set_index('Date')
    .sort_index()
)
hou_rockets_plo.head(5)

Unnamed: 0_level_0,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-04-21,9:30p ET,Houston Rockets,91,away,Oklahoma City Thunder,120,L,plo
2013-04-24,7:00p ET,Houston Rockets,102,away,Oklahoma City Thunder,105,L,plo
2013-04-27,9:30p ET,Houston Rockets,101,home,Oklahoma City Thunder,104,L,plo
2013-04-29,9:30p ET,Houston Rockets,105,home,Oklahoma City Thunder,103,W,plo
2013-05-01,9:30p ET,Houston Rockets,107,away,Oklahoma City Thunder,100,W,plo


In [45]:
# Confirm no playoff column has missing values left.
# This check belongs to the playoff section, so it must inspect
# hou_rockets_plo; it previously re-checked the regular-season frame.
hou_rockets_plo.isnull().sum()

Time              0
Team              0
Team_score        0
Location          0
Opponent          0
Opponent_score    0
Result            0
season            0
dtype: int64

## Save

In [46]:
# Write the cleaned playoff table to disk (the Date index is included).
# Create the output directory first so the save also works when
# data/clean does not exist yet; to_csv does not create directories.
os.makedirs('data/clean', exist_ok=True)
hou_rockets_plo.to_csv('data/clean/hou_rockets_plo.csv')

## Merge both dfs

In [47]:
# Stack the regular-season rows on top of the playoff rows; each frame keeps
# its own Date index values and internal order.
frames = [hou_rockets, hou_rockets_plo]
rockets = pd.concat(objs=frames, axis=0)

In [48]:
# Show only the first ten rows; a bare `rockets` expression renders the whole
# combined frame, which bloats the notebook without adding information.
rockets.head(10)

Unnamed: 0_level_0,Time,Team,Team_score,Location,Opponent,Opponent_score,Result,season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-10-27,7:00p ET,Houston Rockets,87,away,Portland Trail Blazers,96,L,reg
2009-10-28,7:30p ET,Houston Rockets,108,away,Golden State Warriors,107,W,reg
2009-10-31,7:30p ET,Houston Rockets,111,home,Portland Trail Blazers,107,W,reg
2009-11-02,7:00p ET,Houston Rockets,113,away,Utah Jazz,96,W,reg
2009-11-04,7:30p ET,Houston Rockets,102,home,Los Angeles Lakers,103,L,reg
2009-11-06,7:30p ET,Houston Rockets,105,home,Oklahoma City Thunder,94,W,reg
2009-11-10,7:30p ET,Houston Rockets,103,away,Dallas Mavericks,121,L,reg
2009-11-11,7:30p ET,Houston Rockets,104,home,Memphis Grizzlies,79,W,reg
2009-11-13,7:00p ET,Houston Rockets,100,away,Sacramento Kings,109,L,reg
2009-11-15,6:30p ET,Houston Rockets,101,away,Los Angeles Lakers,91,W,reg


## Only home games

In [49]:
# Keep only the rows whose venue is 'home'.
home_mask = rockets['Location'] == 'home'
rockets = rockets.loc[home_mask]

## select cols

In [51]:
# Drop Time and Result, keeping the columns wanted in the final file.
final_columns = [
    'Team',
    'Team_score',
    'Location',
    'Opponent',
    'Opponent_score',
    'season',
]
rockets = rockets.loc[:, final_columns]

In [52]:
# Preview the first five rows of the final home-games table.
rockets.head(5)

Unnamed: 0_level_0,Team,Team_score,Location,Opponent,Opponent_score,season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-10-31,Houston Rockets,111,home,Portland Trail Blazers,107,reg
2009-11-04,Houston Rockets,102,home,Los Angeles Lakers,103,reg
2009-11-06,Houston Rockets,105,home,Oklahoma City Thunder,94,reg
2009-11-11,Houston Rockets,104,home,Memphis Grizzlies,79,reg
2009-11-17,Houston Rockets,105,home,Phoenix Suns,111,reg


In [53]:
# Write the final home-games table to disk (the Date index is included).
# Create the output directory first so the save also works when
# data/clean does not exist yet; to_csv does not create directories.
os.makedirs('data/clean', exist_ok=True)
rockets.to_csv('data/clean/rockets_clean.csv')