In [1]:
import pandas as pd
import numpy as np
import glob, os
import re

## Source

####  baseball-reference.com

https://www.baseball-reference.com/teams/HOU/2018-schedule-scores.shtml

- saved regular seasons from 2008 - 2017 to csv

In [3]:
ls sports_data_raw/MLB/reg

[0m[01;32mmlb_h10.csv[0m*  [01;32mmlb_h12.csv[0m*  [01;32mmlb_h14.csv[0m*  [01;32mmlb_h16.csv[0m*
[01;32mmlb_h11.csv[0m*  [01;32mmlb_h13.csv[0m*  [01;32mmlb_h15.csv[0m*  [01;32mmlb_h17.csv[0m*


## get all data

In [4]:
path = 'sports_data_raw/MLB/reg'
all_files = glob.glob(os.path.join(path, "*.csv")) 

df_from_each_file = (pd.read_csv(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 22 columns):
Gm#                1296 non-null int64
Year               1296 non-null int64
Date               1296 non-null object
Unnamed: 3         1296 non-null object
Tm                 1296 non-null object
Unnamed: 5         648 non-null object
Opp                1296 non-null object
W/L                1296 non-null object
R                  1296 non-null int64
RA                 1296 non-null int64
Inn                107 non-null float64
W-L                1296 non-null object
Rank               1296 non-null int64
GB                 1296 non-null object
Win                1296 non-null object
Loss               1296 non-null object
Save               648 non-null object
Time               1296 non-null object
D/N                1296 non-null object
Attendance         1295 non-null float64
Streak             1296 non-null object
Orig. Scheduled    1 non-null object
dtypes: float64(2), in

In [7]:
df.head()

Unnamed: 0,Gm#,Year,Date,Unnamed: 3,Tm,Unnamed: 5,Opp,W/L,R,RA,...,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled
0,1,2010,Monday Apr 5,boxscore,HOU,,SFG,L,2,5,...,3,1.0,Lincecum,Oswalt,Wilson,2:34,N,43836.0,-,
1,2,2010,Tuesday Apr 6,boxscore,HOU,,SFG,L,0,3,...,6,1.5,Zito,Rodriguez,Wilson,2:38,N,24237.0,--,
2,3,2010,Wednesday Apr 7,boxscore,HOU,,SFG,L,4,10,...,6,2.5,Affeldt,Gervacio,,3:01,D,21599.0,---,
3,4,2010,Friday Apr 9,boxscore,HOU,,PHI,L,0,8,...,6,3.0,Happ,Norris,,3:17,N,27288.0,----,
4,5,2010,Saturday Apr 10,boxscore,HOU,,PHI,L,6,9,...,6,4.0,Moyer,Lyon,,3:04,N,35138.0,-----,


## Create slice with only needed columns

In [8]:
mlb = df[['Date','Year','Tm','R','Unnamed: 5','Opp','RA','W/L', 'Attendance']]

In [9]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance
0,Monday Apr 5,2010,HOU,2,,SFG,5,L,43836.0
1,Tuesday Apr 6,2010,HOU,0,,SFG,3,L,24237.0
2,Wednesday Apr 7,2010,HOU,4,,SFG,10,L,21599.0
3,Friday Apr 9,2010,HOU,0,,PHI,8,L,27288.0
4,Saturday Apr 10,2010,HOU,6,,PHI,9,L,35138.0


In [10]:
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 9 columns):
Date          1296 non-null object
Year          1296 non-null int64
Tm            1296 non-null object
R             1296 non-null int64
Unnamed: 5    648 non-null object
Opp           1296 non-null object
RA            1296 non-null int64
W/L           1296 non-null object
Attendance    1295 non-null float64
dtypes: float64(1), int64(3), object(5)
memory usage: 91.2+ KB


## convert year column to string

In [11]:
mlb['Year'] = mlb['Year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 9 columns):
Date          1296 non-null object
Year          1296 non-null object
Tm            1296 non-null object
R             1296 non-null int64
Unnamed: 5    648 non-null object
Opp           1296 non-null object
RA            1296 non-null int64
W/L           1296 non-null object
Attendance    1295 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 91.2+ KB


## Create new column with full date

In [13]:
mlb['full_date'] = mlb['Date']+' '+ mlb['Year']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance,full_date
0,Monday Apr 5,2010,HOU,2,,SFG,5,L,43836.0,Monday Apr 5 2010
1,Tuesday Apr 6,2010,HOU,0,,SFG,3,L,24237.0,Tuesday Apr 6 2010
2,Wednesday Apr 7,2010,HOU,4,,SFG,10,L,21599.0,Wednesday Apr 7 2010
3,Friday Apr 9,2010,HOU,0,,PHI,8,L,27288.0,Friday Apr 9 2010
4,Saturday Apr 10,2010,HOU,6,,PHI,9,L,35138.0,Saturday Apr 10 2010


## Remove parenthesis from dates from double  games

In [15]:
mlb['full_date'] = mlb['full_date'].str.replace(r"\(.*\)"," ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
mlb.loc[[1590]]

KeyError: 'None of [[1590]] are in the [index]'

## convert full_date to datetime

In [17]:
mlb['full_date'] = pd.to_datetime(mlb['full_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance,full_date
0,Monday Apr 5,2010,HOU,2,,SFG,5,L,43836.0,2010-04-05
1,Tuesday Apr 6,2010,HOU,0,,SFG,3,L,24237.0,2010-04-06
2,Wednesday Apr 7,2010,HOU,4,,SFG,10,L,21599.0,2010-04-07
3,Friday Apr 9,2010,HOU,0,,PHI,8,L,27288.0,2010-04-09
4,Saturday Apr 10,2010,HOU,6,,PHI,9,L,35138.0,2010-04-10


## Select columns


In [19]:
astros = mlb[['full_date','Tm','R','Unnamed: 5','Opp','RA','W/L','Attendance']]

In [20]:
astros.head()

Unnamed: 0,full_date,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance
0,2010-04-05,HOU,2,,SFG,5,L,43836.0
1,2010-04-06,HOU,0,,SFG,3,L,24237.0
2,2010-04-07,HOU,4,,SFG,10,L,21599.0
3,2010-04-09,HOU,0,,PHI,8,L,27288.0
4,2010-04-10,HOU,6,,PHI,9,L,35138.0


## Rename columns

In [21]:
astros.rename(columns={'full_date': 'date',
                       'Tm': 'team',
                       'R':'team_score',
                       'Unnamed: 5':'home_away',
                      'Opp': 'opposing',
                      'RA':'opp_score',
                      'W/L':'win_lost',
                      'Attendance':'attendance'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [22]:
astros.head()

Unnamed: 0,date,team,team_score,home_away,opposing,opp_score,win_lost,attendance
0,2010-04-05,HOU,2,,SFG,5,L,43836.0
1,2010-04-06,HOU,0,,SFG,3,L,24237.0
2,2010-04-07,HOU,4,,SFG,10,L,21599.0
3,2010-04-09,HOU,0,,PHI,8,L,27288.0
4,2010-04-10,HOU,6,,PHI,9,L,35138.0


## Reindex dataframe by date

- reindex the dataframe and sort the dates

In [23]:
astros = astros.set_index('date').sort_index(ascending=True)

In [24]:
astros.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,attendance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-04-05,HOU,2,,SFG,5,L,43836.0
2010-04-06,HOU,0,,SFG,3,L,24237.0
2010-04-07,HOU,4,,SFG,10,L,21599.0
2010-04-09,HOU,0,,PHI,8,L,27288.0
2010-04-10,HOU,6,,PHI,9,L,35138.0


In [25]:
print(astros[['home_away', 'opposing']].head())

           home_away opposing
date                         
2010-04-05       NaN      SFG
2010-04-06       NaN      SFG
2010-04-07       NaN      SFG
2010-04-09       NaN      PHI
2010-04-10       NaN      PHI


In [26]:
astros['home_away'].unique()

array([nan, '@'], dtype=object)

In [27]:
astros['home_away'].value_counts(dropna=False)

@      648
NaN    648
Name: home_away, dtype: int64

## Change column
- change home_away values to  home or away

In [28]:
astros['home_away'] = np.where(astros['home_away'] =='@', 'AWAY', 'HOME')

In [29]:
astros[['home_away', 'opposing']].head()

Unnamed: 0_level_0,home_away,opposing
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-04-05,HOME,SFG
2010-04-06,HOME,SFG
2010-04-07,HOME,SFG
2010-04-09,HOME,PHI
2010-04-10,HOME,PHI


In [30]:
astros.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,attendance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-04-05,HOU,2,HOME,SFG,5,L,43836.0
2010-04-06,HOU,0,HOME,SFG,3,L,24237.0
2010-04-07,HOU,4,HOME,SFG,10,L,21599.0
2010-04-09,HOU,0,HOME,PHI,8,L,27288.0
2010-04-10,HOU,6,HOME,PHI,9,L,35138.0


In [31]:
astros.win_lost.unique()

array(['L', 'W', 'W-wo', 'L-wo', 'W &H'], dtype=object)

## Find and replace
 - replace 'W-wo' : 'W'
 - replace  'L-wo': 'L'
 - replace 'W &H' : 'W

In [32]:
astros['win_lost'] = astros['win_lost'].str.replace('L-wo','L')
astros['win_lost'] = astros['win_lost'].str.replace('W-wo','W')
astros['win_lost'] = astros['win_lost'].str.replace('W &H','W')

In [33]:
astros.win_lost.unique()

array(['L', 'W'], dtype=object)

In [34]:
astros.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1296 entries, 2010-04-05 to 2017-10-01
Data columns (total 7 columns):
team          1296 non-null object
team_score    1296 non-null int64
home_away     1296 non-null object
opposing      1296 non-null object
opp_score     1296 non-null int64
win_lost      1296 non-null object
attendance    1295 non-null float64
dtypes: float64(1), int64(2), object(4)
memory usage: 81.0+ KB


In [35]:
astros['season'] = 'reg'

In [36]:
astros.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,attendance,season
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-04-05,HOU,2,HOME,SFG,5,L,43836.0,reg
2010-04-06,HOU,0,HOME,SFG,3,L,24237.0,reg
2010-04-07,HOU,4,HOME,SFG,10,L,21599.0,reg
2010-04-09,HOU,0,HOME,PHI,8,L,27288.0,reg
2010-04-10,HOU,6,HOME,PHI,9,L,35138.0,reg


## Save clean data to  to csv

In [37]:
astros.to_csv('sports_data_clean/astros_10_17_reg_clean.csv')

In [38]:
!ls sports_data_clean/

astros_08_17_clean.csv	     hou_rockets_10_17_reg_plo_clean.csv
astros_10_17_clean.csv	     nfl_08_17_clean.csv
astros_10_17_reg_clean.csv   nfl10clean.csv
dynamo_08_17_clean.csv	     nfl10yearsfull.csv
dynamo_10_17_clean.csv	     nfl17years_clean.csv
dynamo_10_17_plo.csv	     nfl17years_PST_clean.csv
dynamo_10_17__reg_clean.csv  rice_08_17_clean.csv
hou_home_10_17_clean.csv     tsu_08_17_clean.csv
hou_rockets_08_17_clean.csv  uh_08_17_clean.csv


In [None]:
# finish

In [None]:
astros.info()