## Source

####  baseball-reference.com

https://www.baseball-reference.com/teams/HOU/2018-schedule-scores.shtml

- saved regular seasons from 2008 - 2017 to csv

In [1]:
import pandas as pd
import numpy as np
import glob, os

import warnings
warnings.filterwarnings('ignore')

In [2]:
ls data/MLB/plo

[0m[01;32mmlb_h_15_plo.csv[0m*  [01;32mmlb_h_17_plo.csv[0m*


In [3]:
path = 'data/MLB/plo'

In [4]:
all_files = glob.glob(os.path.join(path, "*.csv")) 

df_from_each_file = (pd.read_csv(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [5]:
df.head()

Unnamed: 0,Unnamed: 7,Unnamed: 8,Unnamed: 9,date,location,score_1,score_2,team_1,team_2,time
0,,,,"Tuesday, October 6, 2015",Yankee Stadium III,3,0,Houston,New York,8:10
1,,,,"Thursday, October 8, 2015",Kauffman Stadium,5,2,Houston,Kansas City,6:37
2,,,,"Friday, October 9, 2015",Kauffman Stadium,4,5,Houston,Kansas City,2:47
3,,,,"Sunday, October 11, 2015",Minute Maid Park,2,4,Kansas City,Houston,3:10
4,,,,"Monday, October 12, 2015",Minute Maid Park,9,6,Kansas City,Houston,12:07


## slice neede cols

In [6]:
mlb = df[['date','time','location','team_1','score_1','team_2','score_2']]
mlb.head()

Unnamed: 0,date,time,location,team_1,score_1,team_2,score_2
0,"Tuesday, October 6, 2015",8:10,Yankee Stadium III,Houston,3,New York,0
1,"Thursday, October 8, 2015",6:37,Kauffman Stadium,Houston,5,Kansas City,2
2,"Friday, October 9, 2015",2:47,Kauffman Stadium,Houston,4,Kansas City,5
3,"Sunday, October 11, 2015",3:10,Minute Maid Park,Kansas City,2,Houston,4
4,"Monday, October 12, 2015",12:07,Minute Maid Park,Kansas City,9,Houston,6


## Convert date to datetime 

In [7]:
mlb['date'] = pd.to_datetime(mlb['date'])

mlb = mlb.set_index('date').sort_index(ascending=True)

In [8]:
mlb.head()

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-06,8:10,Yankee Stadium III,Houston,3,New York,0
2015-10-08,6:37,Kauffman Stadium,Houston,5,Kansas City,2
2015-10-09,2:47,Kauffman Stadium,Houston,4,Kansas City,5
2015-10-11,3:10,Minute Maid Park,Kansas City,2,Houston,4
2015-10-12,12:07,Minute Maid Park,Kansas City,9,Houston,6


In [11]:
mlb.location.value_counts()

Minute Maid Park      10
Yankee Stadium III     4
Dodger Stadium         4
Kauffman Stadium       3
Fenway Park            2
 Minute Maid Park      1
Name: location, dtype: int64

In [13]:
mlb['location'] = mlb.location.str.strip()

In [14]:
mlb.location.value_counts()

Minute Maid Park      11
Yankee Stadium III     4
Dodger Stadium         4
Kauffman Stadium       3
Fenway Park            2
Name: location, dtype: int64

In [15]:
mlb[mlb.team_1 == 'Houston']

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-06,8:10,Yankee Stadium III,Houston,3,New York,0
2015-10-08,6:37,Kauffman Stadium,Houston,5,Kansas City,2
2015-10-09,2:47,Kauffman Stadium,Houston,4,Kansas City,5
2015-10-14,7:08,Kauffman Stadium,Houston,2,Kansas City,7
2017-10-08,2:30,Fenway Park,Houston,3,Boston,10
2017-10-16,8:00,Yankee Stadium III,Houston,1,New York,8
2017-10-17,5:00,Yankee Stadium III,Houston,4,New York,6
2017-10-18,5:00,Yankee Stadium III,Houston,0,New York,5
2017-10-24,8:00,Dodger Stadium,Houston,1,Los Angeles,3
2017-10-25,8:00,Dodger Stadium,Houston,7,Los Angeles,6


In [16]:
mlb[mlb.team_2 == 'Houston']

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-11,3:10,Minute Maid Park,Kansas City,2,Houston,4
2015-10-12,12:07,Minute Maid Park,Kansas City,9,Houston,6
2017-10-05,4pm,Minute Maid Park,Boston,2,Houston,8
2017-10-06,2pm,Minute Maid Park,Boston,2,Houston,8
2017-10-13,8:00,Minute Maid Park,New York,1,Houston,2
2017-10-14,4:00,Minute Maid Park,New York,1,Houston,2
2017-10-20,8:00,Minute Maid Park,New York,1,Houston,7
2017-10-21,8:00,Minute Maid Park,New York,0,Houston,4
2017-10-27,8:00,Minute Maid Park,Los Angeles,3,Houston,5
2017-10-28,8:00,Minute Maid Park,Los Angeles,6,Houston,2


## Crete new col for home_away

In [17]:
mlb['home_away'] = np.where(mlb.location == 'Minute Maid Park' , 'HOME', 'AWAY') 

In [18]:
mlb.head()

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2,home_away
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-10-06,8:10,Yankee Stadium III,Houston,3,New York,0,AWAY
2015-10-08,6:37,Kauffman Stadium,Houston,5,Kansas City,2,AWAY
2015-10-09,2:47,Kauffman Stadium,Houston,4,Kansas City,5,AWAY
2015-10-11,3:10,Minute Maid Park,Kansas City,2,Houston,4,HOME
2015-10-12,12:07,Minute Maid Park,Kansas City,9,Houston,6,HOME


## Create column for seaason 

In [19]:
mlb['season'] = 'plo'
mlb.head()

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2,home_away,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-10-06,8:10,Yankee Stadium III,Houston,3,New York,0,AWAY,plo
2015-10-08,6:37,Kauffman Stadium,Houston,5,Kansas City,2,AWAY,plo
2015-10-09,2:47,Kauffman Stadium,Houston,4,Kansas City,5,AWAY,plo
2015-10-11,3:10,Minute Maid Park,Kansas City,2,Houston,4,HOME,plo
2015-10-12,12:07,Minute Maid Park,Kansas City,9,Houston,6,HOME,plo


## Select columns

In [20]:
mlb.head()

Unnamed: 0_level_0,time,location,team_1,score_1,team_2,score_2,home_away,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-10-06,8:10,Yankee Stadium III,Houston,3,New York,0,AWAY,plo
2015-10-08,6:37,Kauffman Stadium,Houston,5,Kansas City,2,AWAY,plo
2015-10-09,2:47,Kauffman Stadium,Houston,4,Kansas City,5,AWAY,plo
2015-10-11,3:10,Minute Maid Park,Kansas City,2,Houston,4,HOME,plo
2015-10-12,12:07,Minute Maid Park,Kansas City,9,Houston,6,HOME,plo


In [21]:
mlb = mlb[['team_2','score_2','home_away','team_1','score_1','season']]


In [22]:
mlb.rename(columns = {'team_2':'team','score_2':'team_score','team_1':'opposing','score_1':'opp_score'}, inplace=True)

## select only home games

In [25]:
hou = mlb.team == 'Houston'
home = mlb.home_away =='HOME'

In [26]:
mlb[hou]

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-11,Houston,4,HOME,Kansas City,2,plo
2015-10-12,Houston,6,HOME,Kansas City,9,plo
2017-10-05,Houston,8,HOME,Boston,2,plo
2017-10-06,Houston,8,HOME,Boston,2,plo
2017-10-13,Houston,2,HOME,New York,1,plo
2017-10-14,Houston,2,HOME,New York,1,plo
2017-10-20,Houston,7,HOME,New York,1,plo
2017-10-21,Houston,4,HOME,New York,0,plo
2017-10-27,Houston,5,HOME,Los Angeles,3,plo
2017-10-28,Houston,2,HOME,Los Angeles,6,plo


In [29]:
mlb = mlb[home]

## change team name to HOU

In [32]:
mlb.team = 'HOU'

In [33]:
mlb

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-11,HOU,4,HOME,Kansas City,2,plo
2015-10-12,HOU,6,HOME,Kansas City,9,plo
2017-10-05,HOU,8,HOME,Boston,2,plo
2017-10-06,HOU,8,HOME,Boston,2,plo
2017-10-13,HOU,2,HOME,New York,1,plo
2017-10-14,HOU,2,HOME,New York,1,plo
2017-10-20,HOU,7,HOME,New York,1,plo
2017-10-21,HOU,4,HOME,New York,0,plo
2017-10-27,HOU,5,HOME,Los Angeles,3,plo
2017-10-28,HOU,2,HOME,Los Angeles,6,plo


In [34]:
mlb.to_csv('data/clean/astros_clean_plo.csv')

## Merge reg season and plo

In [42]:
path = 'data/clean/astros_clean_reg.csv'

In [45]:
reg = pd.read_csv(path,index_col='date')

In [47]:
reg.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-04-05,HOU,2,HOME,SFG,5,L,reg
2010-04-06,HOU,0,HOME,SFG,3,L,reg
2010-04-07,HOU,4,HOME,SFG,10,L,reg
2010-04-09,HOU,0,HOME,PHI,8,L,reg
2010-04-10,HOU,6,HOME,PHI,9,L,reg


In [48]:
mlb.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-11,HOU,4,HOME,Kansas City,2,plo
2015-10-12,HOU,6,HOME,Kansas City,9,plo
2017-10-05,HOU,8,HOME,Boston,2,plo
2017-10-06,HOU,8,HOME,Boston,2,plo
2017-10-13,HOU,2,HOME,New York,1,plo


In [49]:
frames = [reg,mlb]
astros = pd.concat(frames)

In [50]:
astros

Unnamed: 0_level_0,home_away,opp_score,opposing,season,team,team_score,win_lost
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-04-05,HOME,5,SFG,reg,HOU,2,L
2010-04-06,HOME,3,SFG,reg,HOU,0,L
2010-04-07,HOME,10,SFG,reg,HOU,4,L
2010-04-09,HOME,8,PHI,reg,HOU,0,L
2010-04-10,HOME,9,PHI,reg,HOU,6,L
2010-04-11,HOME,2,PHI,reg,HOU,1,L
2010-04-20,HOME,5,FLA,reg,HOU,7,W
2010-04-21,HOME,4,FLA,reg,HOU,5,W
2010-04-22,HOME,5,FLA,reg,HOU,1,L
2010-04-23,HOME,3,PIT,reg,HOU,4,W


In [51]:
astros.info()

<class 'pandas.core.frame.DataFrame'>
Index: 659 entries, 2010-04-05 to 2017-10-29 00:00:00
Data columns (total 7 columns):
home_away     659 non-null object
opp_score     659 non-null int64
opposing      659 non-null object
season        659 non-null object
team          659 non-null object
team_score    659 non-null int64
win_lost      648 non-null object
dtypes: int64(2), object(5)
memory usage: 41.2+ KB


In [53]:
astros.apply(lambda x: sum(x.isnull()))

home_away      0
opp_score      0
opposing       0
season         0
team           0
team_score     0
win_lost      11
dtype: int64

In [56]:
astros.to_csv('data/clean/astros_clean.csv')

In [58]:
ls data/clean

[0m[01;32mastros_clean.csv[0m*      [01;32mastros_clean_reg.csv[0m*  [01;32mUH_clean.csv[0m*
[01;32mastros_clean_plo.csv[0m*  [01;32mRice_clean.csv[0m*
