In [1]:
import pandas as pd
import numpy as np
import glob, os
import re

## Source

####  baseball-reference.com

https://www.baseball-reference.com/teams/HOU/2018-schedule-scores.shtml

- saved regular seasons from 2008 - 2017 to csv

In [2]:
ls sports_data_raw/MLB

[0m[01;32mmlb_h08.csv[0m*  [01;32mmlb_h10.csv[0m*  [01;32mmlb_h12.csv[0m*  [01;32mmlb_h14.csv[0m*  [01;32mmlb_h16.csv[0m*
[01;32mmlb_h09.csv[0m*  [01;32mmlb_h11.csv[0m*  [01;32mmlb_h13.csv[0m*  [01;32mmlb_h15.csv[0m*  [01;32mmlb_h17.csv[0m*


## get all data

In [3]:
path = 'sports_data_raw/MLB'
all_files = glob.glob(os.path.join(path, "*.csv")) 

df_from_each_file = (pd.read_csv(f) for f in all_files)
df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1619 entries, 0 to 1618
Data columns (total 22 columns):
Gm#                1619 non-null int64
Year               1619 non-null int64
Date               1619 non-null object
Unnamed: 3         1619 non-null object
Tm                 1619 non-null object
Unnamed: 5         810 non-null object
Opp                1619 non-null object
W/L                1619 non-null object
R                  1619 non-null int64
RA                 1619 non-null int64
Inn                126 non-null float64
W-L                1619 non-null object
Rank               1619 non-null int64
GB                 1619 non-null object
Win                1619 non-null object
Loss               1619 non-null object
Save               811 non-null object
Time               1619 non-null object
D/N                1619 non-null object
Attendance         1618 non-null object
Streak             1619 non-null object
Orig. Scheduled    1 non-null object
dtypes: float64(1), int

In [5]:
df.head()

Unnamed: 0,Gm#,Year,Date,Unnamed: 3,Tm,Unnamed: 5,Opp,W/L,R,RA,...,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled
0,1,2008,"Monday, Mar 31",boxscore,HOU,@,SDP,L,0,4,...,4,1.0,Peavy,Oswalt,,2:53,N,44965,-,
1,2,2008,"Tuesday, Apr 1",boxscore,HOU,@,SDP,L,1,2,...,6,1.5,Young,Backe,Hoffman,2:49,N,20825,--,
2,3,2008,"Wednesday, Apr 2",boxscore,HOU,@,SDP,W,9,6,...,5,1.5,Valverde,Hoffman,,2:53,N,18714,+,
3,4,2008,"Thursday, Apr 3",boxscore,HOU,@,SDP,L,2,3,...,6,1.5,Gonzalez,Villarreal,Hoffman,2:29,D,24432,-,
4,5,2008,"Friday, Apr 4",boxscore,HOU,@,CHC,W,4,3,...,5,1.5,Wright,Lieber,Valverde,2:26,D,37812,+,


## Create slice with only needed columns

In [6]:
mlb = df[['Date','Year','Tm','R','Unnamed: 5','Opp','RA','W/L', 'Attendance']]

In [7]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance
0,"Monday, Mar 31",2008,HOU,0,@,SDP,4,L,44965
1,"Tuesday, Apr 1",2008,HOU,1,@,SDP,2,L,20825
2,"Wednesday, Apr 2",2008,HOU,9,@,SDP,6,W,18714
3,"Thursday, Apr 3",2008,HOU,2,@,SDP,3,L,24432
4,"Friday, Apr 4",2008,HOU,4,@,CHC,3,W,37812


In [8]:
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1619 entries, 0 to 1618
Data columns (total 9 columns):
Date          1619 non-null object
Year          1619 non-null int64
Tm            1619 non-null object
R             1619 non-null int64
Unnamed: 5    810 non-null object
Opp           1619 non-null object
RA            1619 non-null int64
W/L           1619 non-null object
Attendance    1618 non-null object
dtypes: int64(3), object(6)
memory usage: 113.9+ KB


## convert year column to string

In [9]:
mlb['Year'] = mlb['Year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1619 entries, 0 to 1618
Data columns (total 9 columns):
Date          1619 non-null object
Year          1619 non-null object
Tm            1619 non-null object
R             1619 non-null int64
Unnamed: 5    810 non-null object
Opp           1619 non-null object
RA            1619 non-null int64
W/L           1619 non-null object
Attendance    1618 non-null object
dtypes: int64(2), object(7)
memory usage: 113.9+ KB


## Create new column with full date

In [11]:
mlb['full_date'] = mlb['Date']+' '+ mlb['Year']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance,full_date
0,"Monday, Mar 31",2008,HOU,0,@,SDP,4,L,44965,"Monday, Mar 31 2008"
1,"Tuesday, Apr 1",2008,HOU,1,@,SDP,2,L,20825,"Tuesday, Apr 1 2008"
2,"Wednesday, Apr 2",2008,HOU,9,@,SDP,6,W,18714,"Wednesday, Apr 2 2008"
3,"Thursday, Apr 3",2008,HOU,2,@,SDP,3,L,24432,"Thursday, Apr 3 2008"
4,"Friday, Apr 4",2008,HOU,4,@,CHC,3,W,37812,"Friday, Apr 4 2008"


## Remove parenthesis from dates from double  games

In [13]:
mlb['full_date'] = mlb['full_date'].str.replace(r"\(.*\)"," ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
mlb.loc[[1590]]

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance,full_date
1590,Saturday Sep 2 (1),2017,HOU,12,,NYM,8,W,30319,Saturday Sep 2 2017


## convert full_date to datetime

In [15]:
mlb['full_date'] = pd.to_datetime(mlb['full_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
mlb.head()

Unnamed: 0,Date,Year,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance,full_date
0,"Monday, Mar 31",2008,HOU,0,@,SDP,4,L,44965,2008-03-31
1,"Tuesday, Apr 1",2008,HOU,1,@,SDP,2,L,20825,2008-04-01
2,"Wednesday, Apr 2",2008,HOU,9,@,SDP,6,W,18714,2008-04-02
3,"Thursday, Apr 3",2008,HOU,2,@,SDP,3,L,24432,2008-04-03
4,"Friday, Apr 4",2008,HOU,4,@,CHC,3,W,37812,2008-04-04


## Select columns


In [17]:
astros = mlb[['full_date','Tm','R','Unnamed: 5','Opp','RA','W/L','Attendance']]

In [18]:
astros.head()

Unnamed: 0,full_date,Tm,R,Unnamed: 5,Opp,RA,W/L,Attendance
0,2008-03-31,HOU,0,@,SDP,4,L,44965
1,2008-04-01,HOU,1,@,SDP,2,L,20825
2,2008-04-02,HOU,9,@,SDP,6,W,18714
3,2008-04-03,HOU,2,@,SDP,3,L,24432
4,2008-04-04,HOU,4,@,CHC,3,W,37812


## Rename columns

In [19]:
astros.rename(columns={'full_date': 'date',
                       'Tm': 'team',
                       'R':'team_score',
                       'Unnamed: 5':'home_away',
                      'Opp': 'opposing',
                      'RA':'opp_score',
                      'W/L':'win_lost',
                      'Attendance':'attendance'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [20]:
astros.head()

Unnamed: 0,date,team,team_score,home_away,opposing,opp_score,win_lost,attendance
0,2008-03-31,HOU,0,@,SDP,4,L,44965
1,2008-04-01,HOU,1,@,SDP,2,L,20825
2,2008-04-02,HOU,9,@,SDP,6,W,18714
3,2008-04-03,HOU,2,@,SDP,3,L,24432
4,2008-04-04,HOU,4,@,CHC,3,W,37812


## Reindex dataframe by date

- reindex the dataframe and sort the dates

In [21]:
astros = astros.set_index('date').sort_index(ascending=True)

In [22]:
astros.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,attendance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-03-31,HOU,0,@,SDP,4,L,44965
2008-04-01,HOU,1,@,SDP,2,L,20825
2008-04-02,HOU,9,@,SDP,6,W,18714
2008-04-03,HOU,2,@,SDP,3,L,24432
2008-04-04,HOU,4,@,CHC,3,W,37812


## Change column
- change home_away values to  home or away

In [23]:
astros['home_away'] = np.where(astros['home_away'] =='@', 'AWAY', 'HOME')

In [24]:
astros.head()

Unnamed: 0_level_0,team,team_score,home_away,opposing,opp_score,win_lost,attendance
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-03-31,HOU,0,AWAY,SDP,4,L,44965
2008-04-01,HOU,1,AWAY,SDP,2,L,20825
2008-04-02,HOU,9,AWAY,SDP,6,W,18714
2008-04-03,HOU,2,AWAY,SDP,3,L,24432
2008-04-04,HOU,4,AWAY,CHC,3,W,37812


In [25]:
astros.win_lost.unique()

array(['L', 'W', 'W-wo', 'L-wo', 'W &H'], dtype=object)

## Find and replace
 - replace 'W-wo' : 'W'
 - replace  'L-wo': 'L'
 - replace 'W &H' : 'W

In [26]:
astros['win_lost'] = astros['win_lost'].str.replace('L-wo','L')
astros['win_lost'] = astros['win_lost'].str.replace('W-wo','W')
astros['win_lost'] = astros['win_lost'].str.replace('W &H','W')

In [27]:
astros.win_lost.unique()

array(['L', 'W'], dtype=object)

In [28]:
astros.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1619 entries, 2008-03-31 to 2017-10-01
Data columns (total 7 columns):
team          1619 non-null object
team_score    1619 non-null int64
home_away     1619 non-null object
opposing      1619 non-null object
opp_score     1619 non-null int64
win_lost      1619 non-null object
attendance    1618 non-null object
dtypes: int64(2), object(5)
memory usage: 101.2+ KB


## Save clean data to  to csv

In [29]:
astros.to_csv('sports_data_clean/astros_08_17_clean.csv')

In [32]:
!ls sports_data_clean/

astros_08_17_clean.csv


In [31]:
# finish