# Build Dataset

## Ideas

- Stadium
- Weather
- Historical
    - Last x games
    - Last x days
    - Career
    - Career vs pitcher (hitter)
- Team stats
    - Opp hitting
    - Bullpen pitching
    
Game Directory

- Home Team
- Away Team
- Starting Pitcher Stats
    - general prefermance
    - against hitter
- Team Stats
    - Defensive Stats??
    - W/L??
    - Team Batting Stats??
- Month

## Batter Exploration

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
batter_raw = pd.read_csv(r'..\Data_Pull\data_files\reduced_data\batter_raw_red.csv', index_col='Unnamed: 0', parse_dates = ['Date'])

In [4]:
len(batter_raw)

166885

In [50]:
def gen_avg(df, stat, time_period, agg_type):
    
    """
    Function to average previous x amount of lines.
    Requires dataframe to be sorted by time and player.
    
    Args
    df - dataframe to loop over
    stat - column to average
    time_period - number of rows to average
    agg_type - how to aggregate over time (sum/mean/count)
    """

    ## Create new column
    if time_period < 100000:
        new_col_name = '{0}_{1}'.format(stat, str(time_period))
    else:
        new_col_name = '{0}_lifetime'.format(stat)
    
    ## Aggregate based on specified input
    if agg_type in ['Sum', 'sum']:
        df[new_col_name]=df.groupby('pid')[stat].apply(lambda x : x.shift().rolling(time_period, min_periods = 1).sum())
    
    elif agg_type in ['Mean', 'mean']:
        df[new_col_name]=df.groupby('pid')[stat].apply(lambda x : x.shift().rolling(time_period, min_periods = 1).mean())
        
    elif agg_type in ['Sum', 'sum']:
        df[new_col_name]=df.groupby('pid')[stat].apply(lambda x : x.shift().rolling(time_period, min_periods = 1).count())
    
    else:
        print('Error: Enter Valid Aggregation Type')

In [28]:
batter_raw.sort_values(['pid', 'Date'], inplace = True)
batter_raw.reset_index(inplace = True, drop = True)

In [9]:
batter_raw.head(5)

Unnamed: 0,Date,Team,Opp,BO,Pos,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,pid,AB_7
0,2017-04-05,MIL,COL,9,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0,
1,2017-04-06,MIL,COL,9,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0,0.0
2,2017-04-07,MIL,CHC,9,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0,0.0
3,2017-04-11,MIL,@TOR,0,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0,0.0
4,2017-04-12,MIL,@TOR,0,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0,0.0


In [8]:
gen_avg(batter_raw, 'AB', 7, 'sum')

## General Info Parse

In [5]:
gen_inf = pd.read_csv(r'../Data_Pull/data_files/general_info.csv', index_col='Unnamed: 0')

In [6]:
gen_inf.head()

Unnamed: 0,Age,Bats/Throws,Birthdate,Height/Weight,Name,POS,pid
0,Age: 41,Bats/Throws: B/R,"1/16/1978 (41 y, 4 m, 20 d)","5' 11"" / 165",Alfredo Amezaga,OF,1.0
1,Age: 44,Bats/Throws: R/R,"1/20/1975 (44 y, 4 m, 16 d)","5' 7"" / 177",David Eckstein,SS,10.0
2,Age: 48,Bats/Throws: R/R,"5/11/1971 (48 y, 25 d)","6' 2"" / 220",Kerry Ligtenberg,P,100.0
3,Age: 50,Bats/Throws: R/R,"10/22/1968 (50 y, 7 m, 14 d)","6' 0"" / 200",Keith Osik,C,1001.0
4,Age: 31,Bats/Throws: R/R,"6/18/1987 (31 y, 11 m, 18 d)","6' 5"" / 225",Taylor Thompson,P,10019.0


In [12]:
len(gen_inf)

5070

In [7]:
## Parse columns
gen_inf.loc[:, 'Age'] = gen_inf.loc[:, 'Age'].apply(lambda x: int(x.split(':')[1].strip()))
gen_inf.loc[:, 'Bats'] = gen_inf.loc[:, 'Bats/Throws'].apply(lambda x: x.split('/')[1].split(':')[1].strip())
gen_inf.loc[:, 'Throws'] = gen_inf.loc[:, 'Bats/Throws'].apply(lambda x: x.split('/')[2].strip())
gen_inf.loc[:, 'Birthdate'] = gen_inf.loc[:, 'Birthdate'].apply(lambda x: pd.to_datetime(x.split('(')[0].strip()))
gen_inf.loc[:, 'Height'] = gen_inf.loc[:, 'Height/Weight'].apply(lambda x: (int(x.split("'")[0].strip()) * 12) + int(x.split('"')[0].split("'")[1].strip()))
gen_inf.loc[:, 'Weight'] = gen_inf.loc[:, 'Height/Weight'].apply(lambda x: int(x.split('/')[1].strip()))

## Drop not needed cols
gen_inf.drop(columns = {'Bats/Throws', 'Height/Weight'}, inplace = True)

In [34]:
gen_inf.to_csv(r'../Data_Pull/data_files/general_info_clean.csv')

In [14]:
gen_inf.head()

Unnamed: 0,Age,Birthdate,Name,POS,pid,Bats,Throws,Height,Weight
0,41,1978-01-16,Alfredo Amezaga,OF,1.0,B,R,71,165
1,44,1975-01-20,David Eckstein,SS,10.0,R,R,67,177
2,48,1971-05-11,Kerry Ligtenberg,P,100.0,R,R,74,220
3,50,1968-10-22,Keith Osik,C,1001.0,R,R,72,200
4,31,1987-06-18,Taylor Thompson,P,10019.0,R,R,77,225


## 2 Game Days Issue

In [140]:
batter_raw = pd.read_csv(r'..\Data_Pull\data_files\reduced_data\batter_raw_red.csv', index_col='Unnamed: 0', parse_dates = ['Date'])
pitching_raw = pd.read_csv(r'..\Data_Pull\data_files\reduced_data\pitching_raw_red.csv', index_col='Unnamed: 0', parse_dates = ['Date'])

In [141]:
pitching_raw.sort_values(['pid', 'Date'], inplace = True)
pitching_raw.reset_index(inplace = True, drop = True)

In [142]:
batter_raw = batter_raw.loc[batter_raw['Date'] >= pd.to_datetime('2000-1-1')]
pitching_raw = pitching_raw.loc[pitching_raw['Date'] >= pd.to_datetime('2000-1-1')]

In [143]:
batter_raw.head(3)

Unnamed: 0,Date,Team,Opp,BO,Pos,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,pid
0,2017-08-13,KCR,@CHW,0,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0
1,2017-08-10,KCR,@STL,9,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0
2,2017-08-08,KCR,STL,0,P,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,18.0


In [144]:
pitching_raw.head(3)

Unnamed: 0,Date,Team,Opp,GS,W,L,SV,HLD,IP,TBF,H,R,ER,HR,BB,SO,K/9,BB/9,HR/9,BABIP,LOB%,GB%,HR/FB,ERA,FIP,xFIP,GSv2,pid
0,2017-04-05,MIL,COL,0,0,0,1,0.0,0.1,1,0,0,0,0,0,1,27.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.84,-2.84,,18.0
1,2017-04-06,MIL,COL,0,0,1,0,0.0,1.0,4,1,1,1,1,0,1,9.0,0.0,9.0,0.0,1.0,0.667,1.0,9.0,14.16,2.94,,18.0
2,2017-04-07,MIL,CHC,0,0,0,0,0.0,0.1,1,0,0,0,0,0,1,27.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.84,-2.84,,18.0


In [145]:
batter_raw.loc[:, 'Home/Away'] = batter_raw.loc[:, 'Opp'].apply(lambda x: 'Away' if '@' in x else 'Home')
pitching_raw.loc[:, 'Home/Away'] = pitching_raw.loc[:, 'Opp'].apply(lambda x: 'Away' if '@' in x else 'Home')

batter_raw.loc[:, 'Opp'] = batter_raw.loc[:, 'Opp'].apply(lambda x: x.split('@')[1] if '@' in x else x)
pitching_raw.loc[:, 'Opp'] = pitching_raw.loc[:, 'Opp'].apply(lambda x: x.split('@')[1] if '@' in x else x)

In [146]:
## Function to make sure matchup can be join key (sort alphabetically)
def sort_teams(a, b):
    
    sort_list = [a, b]
    
    sort_list = sorted(sort_list)
    
    name = '_'.join(sort_list)
    
    return name

batter_raw.loc[:, 'Teams'] = batter_raw.loc[:, ['Team', 'Opp']].apply(lambda x: sort_teams(*x), axis = 1)
pitching_raw.loc[:, 'Teams'] = pitching_raw.loc[:, ['Team', 'Opp']].apply(lambda x: sort_teams(*x), axis = 1)

In [215]:
two_game_issue = pitching_raw.loc[pitching_raw['GS'] >= 1].groupby(['Date', 'Teams'])['GS'].count()[pitching_raw[pitching_raw['GS'] >= 1].groupby(['Date', 'Teams'])['GS'].count() > 1].reset_index()
two_game_issue = two_game_issue.loc[two_game_issue['GS'] > 2]

## Generate Historical Stats

In [174]:
adv_df = pd.read_csv(r'..\Data_Pull\data_files\reduced_data\batter_advanced_raw_red.csv', index_col='Unnamed: 0', parse_dates = ['Date'])

In [175]:
adv_df.sort_values(['pid', 'Date'], inplace = True)
adv_df.reset_index(inplace = True, drop = True)

In [176]:
bat_all = pd.merge(batter_raw, adv_df, how = 'outer', on = ['pid', 'Date'])

In [177]:
bat_all = bat_all[~pd.isnull(bat_all['Team_x'])]

In [178]:
for col in [x for x in batter_raw.columns if batter_raw[x].dtype == np.int64]:
    for days in [1, 7, 30, 1000000]: #1, 3, 7, 15, 30, 1000000
        if col == 'BO':
            gen_avg(batter_raw, col, days, 'mean')
        else:
            gen_avg(batter_raw, col, days, 'sum')
        print('{0} {1} Complete'.format(col, str(days)))

BO 1 Complete
BO 7 Complete
BO 30 Complete
BO 1000000 Complete
G 1 Complete
G 7 Complete
G 30 Complete
G 1000000 Complete
AB 1 Complete
AB 7 Complete
AB 30 Complete
AB 1000000 Complete
PA 1 Complete
PA 7 Complete
PA 30 Complete
PA 1000000 Complete
H 1 Complete
H 7 Complete
H 30 Complete
H 1000000 Complete
1B 1 Complete
1B 7 Complete
1B 30 Complete
1B 1000000 Complete
2B 1 Complete
2B 7 Complete
2B 30 Complete
2B 1000000 Complete
3B 1 Complete
3B 7 Complete
3B 30 Complete
3B 1000000 Complete
HR 1 Complete
HR 7 Complete
HR 30 Complete
HR 1000000 Complete
R 1 Complete
R 7 Complete
R 30 Complete
R 1000000 Complete
RBI 1 Complete
RBI 7 Complete
RBI 30 Complete
RBI 1000000 Complete
BB 1 Complete
BB 7 Complete
BB 30 Complete
BB 1000000 Complete
IBB 1 Complete
IBB 7 Complete
IBB 30 Complete
IBB 1000000 Complete
SO 1 Complete
SO 7 Complete
SO 30 Complete
SO 1000000 Complete
HBP 1 Complete
HBP 7 Complete
HBP 30 Complete
HBP 1000000 Complete
SF 1 Complete
SF 7 Complete
SF 30 Complete
SF 1000000 C

In [148]:
starting_pitcher_df = pitching_raw.loc[pitching_raw['GS'] == 1]

In [149]:
for col in starting_pitcher_df.columns[3:16]:
    for days in [1, 7, 30, 1000000]: #1, 3, 7, 15, 30, 1000000
        gen_avg(starting_pitcher_df, col, days, 'sum')
        print('{0} {1} Complete'.format(col, str(days)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


GS 1 Complete
GS 7 Complete
GS 30 Complete
GS 1000000 Complete
W 1 Complete
W 7 Complete
W 30 Complete
W 1000000 Complete
L 1 Complete
L 7 Complete
L 30 Complete
L 1000000 Complete
SV 1 Complete
SV 7 Complete
SV 30 Complete
SV 1000000 Complete
HLD 1 Complete
HLD 7 Complete
HLD 30 Complete
HLD 1000000 Complete
IP 1 Complete
IP 7 Complete
IP 30 Complete
IP 1000000 Complete
TBF 1 Complete
TBF 7 Complete
TBF 30 Complete
TBF 1000000 Complete
H 1 Complete
H 7 Complete
H 30 Complete
H 1000000 Complete
R 1 Complete
R 7 Complete
R 30 Complete
R 1000000 Complete
ER 1 Complete
ER 7 Complete
ER 30 Complete
ER 1000000 Complete
HR 1 Complete
HR 7 Complete
HR 30 Complete
HR 1000000 Complete
BB 1 Complete
BB 7 Complete
BB 30 Complete
BB 1000000 Complete
SO 1 Complete
SO 7 Complete
SO 30 Complete
SO 1000000 Complete


## Current ML Progress

In [150]:
def qs_gen(IP, ER):
    
    if IP >= 6 and ER <= 3:
        return 1
    else:
        return 0

In [151]:
## Add QS
starting_pitcher_df.loc[:, 'QS'] = starting_pitcher_df.loc[:, ['IP', 'ER']].apply(lambda x: qs_gen(*x), axis = 1)

In [152]:
for days in [1, 7, 30, 1000000]: #1, 3, 7, 15, 30, 1000000
    gen_avg(starting_pitcher_df, 'QS', days, 'sum')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [43]:
batter_raw.to_csv(r'..\Data_Pull\data_files\Simple_ML_files\bat_ml_one.csv')
starting_pitcher_df.to_csv(r'..\Data_Pull\data_files\Simple_ML_files\pitch_ml_one.csv')

## Combine

In [153]:
starting_pitcher_df.columns = [str(col) + '_pitch' for col in starting_pitcher_df.columns]

In [193]:
batter_raw.drop_duplicates(inplace = True)
starting_pitcher_df.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [225]:
batter_raw = batter_raw.loc[~((batter_raw['Teams'].isin(two_game_issue['Teams'].values)) & (batter_raw['Date'].isin(two_game_issue['Date'].values)))]

In [226]:
combine_df = pd.merge(batter_raw, starting_pitcher_df, how = 'left', left_on = ['Date', 'Teams', 'Opp'], right_on = ['Date_pitch', 'Teams_pitch', 'Team_pitch'])

In [186]:
bat_cols = [col for col in combine_df.columns if 'pitch' not in col]
pitch_cols = [col for col in combine_df.columns if 'pitch' in col]

In [230]:
combine_df.to_csv(r'..\Data_Pull\data_files\Simple_ML_files\combined_ml.csv')