## Combines 2019 and 2015-2018 Game Data

Requirements - 2 CSV files (read from `../../raw-data/`, relative to the working directory):
1. 2019_games.csv
2. games.csv

Output: A combined CSV (`all_games.csv`) written to the current working directory

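Notes: For the 2019 season we do not have umpire, start time, weather, wind, elapsed time, or attendance data (those columns are null for all 2408 games in the 2019 file)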

In [1]:
import pandas as pd
import numpy as np

In [2]:
# locations of the two raw inputs: the 2015-2018 games file and the 2019 games file
path, path_2019 = '../../raw-data/games.csv', '../../raw-data/2019_games.csv'

In [3]:
# load both raw tables: g from `path`, g_2019 from `path_2019`
g, g_2019 = (pd.read_csv(csv_path) for csv_path in (path, path_2019))

In [4]:
# columns that exist in g but are absent from g_2019
only_in_g = np.setdiff1d(g.columns, g_2019.columns)
print(only_in_g)

['delay']


In [5]:
# number of missing values in each column of g
g.isna().sum()

attendance          0
away_final_score    0
away_team           0
date                0
elapsed_time        0
g_id                0
home_final_score    0
home_team           0
start_time          0
umpire_1B           0
umpire_2B           3
umpire_3B           0
umpire_HP           0
venue_name          0
weather             0
wind                0
delay               0
dtype: int64

In [6]:
# number of missing values in each column of g_2019 (some columns are missing entirely - oh well)
g_2019.isna().sum()

g_id                   0
home_team              0
away_team              0
home_final_score       0
away_final_score       0
date                   0
umpire_HP           2408
umpire_1B           2408
umpire_2B           2408
umpire_3B           2408
start_time          2408
venue_name             0
weather             2408
wind                2408
elapsed_time        2408
attendance          2408
dtype: int64

In [7]:
# dropping the delay col, which g has and g_2019 does not
g.drop(["delay"], axis=1, inplace=True)

# making sure columns line up correctly: setxor1d lists column names that
# appear in only ONE of the two frames (setdiff1d would only report columns
# missing from g_2019 and silently ignore extras in g_2019), so an empty
# result means both frames have exactly the same column names
print(np.setxor1d(g.columns, g_2019.columns))

[]


In [8]:
# stack the g_2019 rows under the existing g rows and renumber the index
# from 0; pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0
g = pd.concat([g, g_2019], ignore_index=True)

In [9]:
# preview the first five rows of the combined frame
g.head(n=5)

Unnamed: 0,attendance,away_final_score,away_team,date,elapsed_time,g_id,home_final_score,home_team,start_time,umpire_1B,umpire_2B,umpire_3B,umpire_HP,venue_name,weather,wind
0,35055.0,3.0,sln,2015-04-05,184.0,201500001.0,0.0,chn,7:17 PM,Mark Wegner,Marty Foster,Mike Muchlinski,Mike Winters,Wrigley Field,"44 degrees, clear","7 mph, In from CF"
1,45909.0,1.0,ana,2015-04-06,153.0,201500002.0,4.0,sea,1:12 PM,Ron Kulpa,Brian Knight,Vic Carapazza,Larry Vanover,Safeco Field,"54 degrees, cloudy","1 mph, Varies"
2,36969.0,2.0,atl,2015-04-06,156.0,201500003.0,1.0,mia,4:22 PM,Laz Diaz,Chris Guccione,Cory Blaser,Jeff Nelson,Marlins Park,"80 degrees, partly cloudy","16 mph, In from CF"
3,31042.0,6.0,bal,2015-04-06,181.0,201500004.0,2.0,tba,3:12 PM,Ed Hickox,Paul Nauert,Mike Estabrook,Dana DeMuth,Tropicana Field,"72 degrees, dome","0 mph, None"
4,45549.0,8.0,bos,2015-04-06,181.0,201500005.0,0.0,phi,3:08 PM,Phil Cuzzi,Tony Randazzo,Will Little,Gerry Davis,Citizens Bank Park,"71 degrees, partly cloudy","11 mph, Out to RF"


In [10]:
# remove the elapsed_time and start_time columns in one call
g.drop(columns=["elapsed_time", "start_time"], inplace=True)

# store g_id as a 64-bit integer
g["g_id"] = g["g_id"].astype("int64")

In [11]:
# how many games do we have from 2015-2019
# shape is (rows, columns); the row count equals the game count assuming
# one row per game - TODO confirm g_id is unique
g.shape

(12126, 14)

In [12]:
# preview the first five rows after dropping columns and casting g_id
g.head(n=5)

Unnamed: 0,attendance,away_final_score,away_team,date,g_id,home_final_score,home_team,umpire_1B,umpire_2B,umpire_3B,umpire_HP,venue_name,weather,wind
0,35055.0,3.0,sln,2015-04-05,201500001,0.0,chn,Mark Wegner,Marty Foster,Mike Muchlinski,Mike Winters,Wrigley Field,"44 degrees, clear","7 mph, In from CF"
1,45909.0,1.0,ana,2015-04-06,201500002,4.0,sea,Ron Kulpa,Brian Knight,Vic Carapazza,Larry Vanover,Safeco Field,"54 degrees, cloudy","1 mph, Varies"
2,36969.0,2.0,atl,2015-04-06,201500003,1.0,mia,Laz Diaz,Chris Guccione,Cory Blaser,Jeff Nelson,Marlins Park,"80 degrees, partly cloudy","16 mph, In from CF"
3,31042.0,6.0,bal,2015-04-06,201500004,2.0,tba,Ed Hickox,Paul Nauert,Mike Estabrook,Dana DeMuth,Tropicana Field,"72 degrees, dome","0 mph, None"
4,45549.0,8.0,bos,2015-04-06,201500005,0.0,phi,Phil Cuzzi,Tony Randazzo,Will Little,Gerry Davis,Citizens Bank Park,"71 degrees, partly cloudy","11 mph, Out to RF"


In [13]:
# write the combined frame to the current working directory, leaving out the row index
output_path = './all_games.csv'
g.to_csv(output_path, index=False)

In [14]:
# read the exported file back in so the written result can be inspected
written_path = './all_games.csv'
testdf = pd.read_csv(written_path)

In [15]:
# preview the first five rows of the re-loaded file
testdf.head(n=5)

Unnamed: 0,attendance,away_final_score,away_team,date,g_id,home_final_score,home_team,umpire_1B,umpire_2B,umpire_3B,umpire_HP,venue_name,weather,wind
0,35055.0,3.0,sln,2015-04-05,201500001,0.0,chn,Mark Wegner,Marty Foster,Mike Muchlinski,Mike Winters,Wrigley Field,"44 degrees, clear","7 mph, In from CF"
1,45909.0,1.0,ana,2015-04-06,201500002,4.0,sea,Ron Kulpa,Brian Knight,Vic Carapazza,Larry Vanover,Safeco Field,"54 degrees, cloudy","1 mph, Varies"
2,36969.0,2.0,atl,2015-04-06,201500003,1.0,mia,Laz Diaz,Chris Guccione,Cory Blaser,Jeff Nelson,Marlins Park,"80 degrees, partly cloudy","16 mph, In from CF"
3,31042.0,6.0,bal,2015-04-06,201500004,2.0,tba,Ed Hickox,Paul Nauert,Mike Estabrook,Dana DeMuth,Tropicana Field,"72 degrees, dome","0 mph, None"
4,45549.0,8.0,bos,2015-04-06,201500005,0.0,phi,Phil Cuzzi,Tony Randazzo,Will Little,Gerry Davis,Citizens Bank Park,"71 degrees, partly cloudy","11 mph, Out to RF"


In [16]:
# number of missing values in each column of the re-loaded file
testdf.isna().sum()

attendance          2408
away_final_score       0
away_team              0
date                   0
g_id                   0
home_final_score       0
home_team              0
umpire_1B           2408
umpire_2B           2411
umpire_3B           2408
umpire_HP           2408
venue_name             0
weather             2408
wind                2408
dtype: int64