## Combines 2019 and 2015-2018 AtBat Data

Requirements - 2 CSV Files (in the same working directory):
1. 2019_atbats.csv
2. atbats.csv

Output: A master CSV in the same directory

In [1]:
import pandas as pd
import numpy as np

In [2]:
path = '../../raw-data/atbats.csv'
path_2019 = '../../raw-data/2019_atbats.csv'

In [3]:
ab = pd.read_csv(path)
ab_2019 = pd.read_csv(path_2019)

### Checking for any missing values

In [4]:
ab.isnull().sum()

ab_id         0
batter_id     0
event         0
g_id          0
inning        0
o             0
p_score       0
p_throws      0
pitcher_id    0
stand         0
top           0
dtype: int64

In [5]:
ab_2019.isnull().sum()

inning        0
top           0
ab_id         0
g_id          0
p_score       0
batter_id     0
pitcher_id    0
stand         0
p_throws      0
event         0
o             0
dtype: int64

In [6]:
ab.info() == ab_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740389 entries, 0 to 740388
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ab_id       740389 non-null  int64 
 1   batter_id   740389 non-null  int64 
 2   event       740389 non-null  object
 3   g_id        740389 non-null  int64 
 4   inning      740389 non-null  int64 
 5   o           740389 non-null  int64 
 6   p_score     740389 non-null  int64 
 7   p_throws    740389 non-null  object
 8   pitcher_id  740389 non-null  int64 
 9   stand       740389 non-null  object
 10  top         740389 non-null  bool  
dtypes: bool(1), int64(7), object(3)
memory usage: 57.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185245 entries, 0 to 185244
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   inning      185245 non-null  float64
 1   top         185245 non-null  float64
 2   ab_id       

True

In [7]:
# checking to make sure we have same/similar categorical values
print(np.setdiff1d(ab['event'].value_counts().keys(), ab_2019['event'].value_counts().keys()))

['Sac Fly DP' 'Sacrifice Bunt DP' 'Strikeout - DP']


In [8]:
print(ab.shape)
print(ab_2019.shape)

(740389, 11)
(185245, 11)


In [9]:
ab = ab.append(ab_2019, ignore_index=True)

In [10]:
ab["o"].value_counts() #reasonable dist

1    308153
2    297494
3    216105
0    103882
Name: o, dtype: int64

In [11]:
ab.head()

Unnamed: 0,ab_id,batter_id,event,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
0,2015000000.0,572761,Groundout,201500001.0,1.0,1,0.0,L,452657,L,1.0
1,2015000000.0,518792,Double,201500001.0,1.0,1,0.0,L,452657,L,1.0
2,2015000000.0,407812,Single,201500001.0,1.0,1,0.0,L,452657,R,1.0
3,2015000000.0,425509,Strikeout,201500001.0,1.0,2,0.0,L,452657,R,1.0
4,2015000000.0,571431,Strikeout,201500001.0,1.0,3,0.0,L,452657,L,1.0


In [12]:
#updating ab_id to int
ab['ab_id'] = ab['ab_id'].astype(np.int64)

#updating g_id to int
ab['g_id'] = ab['g_id'].astype(np.int64)

#updating inning to int
ab['inning'] = ab['inning'].astype(np.int64)

#dropping top, p_score col
ab.drop(['top', 'p_score'], axis=1, inplace=True)

In [13]:
print(ab.shape)
print(ab.dtypes)

(925634, 9)
ab_id          int64
batter_id      int64
event         object
g_id           int64
inning         int64
o              int64
p_throws      object
pitcher_id     int64
stand         object
dtype: object


In [14]:
ab.to_csv('./all_atbats.csv', index=False)

In [15]:
testdf = pd.read_csv('./all_atbats.csv')

In [16]:
testdf.head()

Unnamed: 0,ab_id,batter_id,event,g_id,inning,o,p_throws,pitcher_id,stand
0,2015000001,572761,Groundout,201500001,1,1,L,452657,L
1,2015000002,518792,Double,201500001,1,1,L,452657,L
2,2015000003,407812,Single,201500001,1,1,L,452657,R
3,2015000004,425509,Strikeout,201500001,1,2,L,452657,R
4,2015000005,571431,Strikeout,201500001,1,3,L,452657,L
