In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from mlb.Data import Data

import re


In [None]:
df = pd.read_csv('data/games.csv')

In [None]:
df

In [None]:
df.columns

In [None]:
def normalize_columns(cols):
    """Return snake_case column names with a home/away suffix moved to the front.

    Each name is lower-cased, the separators ' - ', '-', ' ' and '/' are
    replaced by '_' (in that order), and a trailing ``_home`` / ``_away`` is
    rotated into a leading ``home_`` / ``away_`` prefix.

    Parameters
    ----------
    cols : sequence of str exposing ``.copy()`` (e.g. ``list`` or ``pd.Index``)

    Returns
    -------
    list of str
        One normalized name per input name, in the same order.
    """
    # Order matters: ' - ' has to be collapsed before its parts ('-' and ' ').
    separators = (' - ', '-', ' ', '/')
    side_suffix = re.compile(r'^(.+)_(home|away)$')

    def _normalize(name):
        cleaned = name.lower()
        for sep in separators:
            cleaned = cleaned.replace(sep, '_')
        return side_suffix.sub(r'\2_\1', cleaned)

    return [_normalize(name) for name in cols.copy()]

In [None]:
df.columns = normalize_columns(df.columns)
df.columns

In [None]:
df.dtypes

In [None]:
def clean_games(df, max_missing=100):
    """Clean the raw games table and return a new frame indexed by ``game``.

    The input frame is not modified. Processing order:

    1. Index by ``game``, parse ``date`` into a ``datetime`` column, split it
       into ``date`` / ``time``, sort chronologically and move ``date``,
       ``home``, ``away`` to the front.
    2. Remove repeated rows (the first copy is kept) and turn the
       ``'(null-null)'`` placeholder into NaN.
    3. Walk every column present at that point and apply the rule matching its
       name (records -> wins/losses/saves, pitcher stat lines -> outs/er/k/bb,
       duration -> minutes, odds -> numeric line + favourite, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``game``, ``date``, ``home`` and ``away`` columns; all
        other columns are optional and only processed when present.
    max_missing : int, default 100
        A column with fewer than this many missing values has the affected
        rows removed; columns with more missing values keep their rows.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If a record / stat value that must become an integer is missing or
        does not have the expected text layout.
    """
    df = df.set_index('game')
    df['datetime'] = pd.to_datetime(df['date'])
    df['date'] = df.datetime.dt.date
    df['time'] = df.datetime.dt.time
    df = df.sort_values('datetime')

    lead_cols = ['date', 'home', 'away']
    df = df[lead_cols + [c for c in df.columns if c not in lead_cols]]

    # Boolean mask instead of drop-by-label: repeated rows share the same
    # `game` index label, so dropping by label would also delete the first copy.
    df = df[~df.duplicated()]
    df = df.replace('(null-null)', np.nan)

    # Snapshot of the column names: the loop adds and removes columns as it goes.
    for col in list(df.columns):
        missing = df[col].isna()
        if 0 < missing.sum() < max_missing:
            df = df[~missing].copy()
        if col == 'save_pitcher_record':
            # Games without a save have no record; treat that as zero saves.
            df[col] = df[col].fillna('(0)')

        if 'record' in col:
            if 'pitcher' in col:
                # Strip the surrounding parentheses, e.g. "(3-1)" -> "3-1".
                df[col] = df[col].str.replace(r'\((\d+-?\d*)\)', r'\1', regex=True)

                if 'save' in col:
                    df[col.replace('record', 'saves')] = df[col].str.replace(r'\((\d+)\)', r'\1', regex=True).astype(int)
                    df = df.drop(col, axis=1)
                    continue

            record_regex = r'(\d+)-{1}(\d+)\s*\w*'
            df[col.replace('record', 'wins')] = df[col].str.replace(record_regex, r'\1', regex=True).astype(int)
            df[col.replace('record', 'losses')] = df[col].str.replace(record_regex, r'\2', regex=True).astype(int)
            df = df.drop(col, axis=1)
        if 'postseason' in col:
            df['regular_season'] = df[col].isna()
            df[col] = df[col].fillna('Regular Season')

        if col == 'stadium':
            df[col] = df[col].str.replace(r'^\s*(.+(\s\w+)*){1}(.*\s*)*', r'\1', regex=True)
        if col == 'location':
            df[col] = df[col].str.replace(r'^(.+(\s\w+)*)\s*\d*',r'\1', regex=True)
        if col in ['attendance', 'capacity']:
            df[col] = df[col].str.replace(',','')
            df[col] = pd.to_numeric(df[col])
            df[col] = df[col].fillna(int(df[col].mean()))
        if col == 'duration':
            # "H:MM..." -> total minutes, without touching any real
            # 'hours' / 'minutes' columns the frame might have.
            parts = df[col].str.split(':')
            hours = parts.apply(lambda x: int(x[0]))
            minutes = parts.apply(lambda x: int(x[1][:2]))
            df[col] = minutes + 60 * hours

        if 'stats' in col:
            if 'save' in col:
                df[col] = df[col].fillna('0.0 IP, 0 ER, 0 K, 0 BB')
            # (\d+) for whole innings so two-digit values such as "10.0 IP" parse.
            stat_regex = r'(\d+)\.(\d) IP, (\d+) ER, (\d+) K, (\d+) BB'
            stats = df[col].str.extract(stat_regex).astype(int)
            # The digit after the dot counts extra outs, not tenths of an inning.
            df[col.replace('stats', 'outs')] = stats[0] * 3 + stats[1]
            df[col.replace('stats', 'er')] = stats[2]
            df[col.replace('stats', 'k')] = stats[3]
            df[col.replace('stats', 'bb')] = stats[4]
            df = df.drop(col, axis=1)
        if 'extra' in col:
            df[col] = df[col].fillna(False)
        if col == 'odds':
            temp = df[col].str.extract(r'Line:\s(\w+)(?:\s-(\d+))?')
            temp.columns = ['fav','odds']
            # An EVEN line carries no number in the text; it is recorded as 100.
            temp.loc[temp['fav'] == 'EVEN', 'odds'] = '100'
            df[col] = temp['odds'].astype(float)
            df['fav'] = temp['fav']
        if col == 'o_u':
            # Keep an optional decimal part so a total like 8.5 is not cut to 8.
            df[col] = df[col].str.extract(r'\w+/\w+:\s(\d+(?:\.\d+)?)', expand=False).astype(float)

    return df

df_clean = clean_games(df)

In [None]:
df_clean

In [None]:
df_clean.dtypes

In [None]:
df_clean.describe()

In [None]:
df_clean.select_dtypes(include=[object]).describe()

In [None]:
def get_schedule(df, team):
    """Return every game of ``team`` in chronological order, from the team's point of view.

    Home and away games are selected separately, tagged with a ``venue``
    column ('HOME' / 'AWAY'), and their columns are renamed so that the
    selected team's side reads ``team...`` and the other side reads
    ``opp...`` regardless of where the game was played. Columns ending in
    ``wins`` / ``losses`` are forward-filled and any remaining gaps set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``home``, ``away`` and ``datetime`` columns; column labels must
        be strings. The frame itself is not modified.
    team : value compared against the ``home`` / ``away`` columns.

    Returns
    -------
    pandas.DataFrame
    """
    r_home = r'(?<!home)(home)'
    r_away = r'(?<!away)(away)'

    def _relabel(name, own_side, other_side):
        # Same three substitutions, in the same order, for every column label.
        name = re.sub(own_side, 'team', name)
        name = re.sub(other_side, 'opp', name)
        # Insert the separator when 'team'/'opp' is glued to the following text.
        return re.sub(r'(team|opp)(?!(_|$))', r'\1_', name)

    # .assign builds a new frame, so no column is written onto a filtered slice.
    home_games = (df[df.home == team]
                  .assign(venue='HOME')
                  .rename(columns=lambda c: _relabel(c, r_home, r_away)))
    away_games = (df[df.away == team]
                  .assign(venue='AWAY')
                  .rename(columns=lambda c: _relabel(c, r_away, r_home)))

    schedule = pd.concat([home_games, away_games]).sort_values('datetime')

    record_cols = schedule.filter(regex=r'(wins|losses)$').columns
    if len(record_cols):
        # .ffill() replaces fillna(method='ffill'), which newer pandas no longer accepts.
        schedule[record_cols] = schedule[record_cols].ffill().fillna(0)
    return schedule

In [None]:
get_schedule(df_clean, 'BOS')

In [None]:
def get_avg(df, team, hl = 3, window = 3, type = 'exp'):
    """Return moving averages of a team's per-game columns.

    The team's schedule comes from ``get_schedule``; only columns whose name
    contains ``team`` or ``opp`` are kept, minus the ``opp`` label column and
    the opponent's wins/losses columns. The remaining columns are averaged:

    * ``type='exp'``  - exponentially weighted mean with half-life ``hl``;
      columns get the suffix ``_exp_avg_{hl}``.
    * ``type='roll'`` - rolling mean over ``window`` rows with
      ``closed='left'`` and ``min_periods=1``; suffix ``_roll_avg_{window}``.
    * any other value - the columns are returned without averaging.

    Returns a DataFrame whose first column is ``team`` followed by the
    (averaged) columns.
    """
    schedule = get_schedule(df, team)
    features = schedule.filter(regex = r'(datetime)|(venue)|(.*(opp|team).*)')
    features = features.drop(['datetime', 'opp', 'venue'], axis=1)

    # Set the label column aside so it is not part of the averaging.
    team_col = features.pop('team')

    opp_record_cols = features.filter(regex = r'^opp.*(wins|losses)$').columns
    features = features.drop(opp_record_cols, axis = 1)

    if type == 'exp':
        suffix = f'_exp_avg_{hl}'
        features = features.ewm(halflife = hl).mean().dropna()
        features = features.rename(lambda name: name + suffix, axis = 1)
    elif type == 'roll':
        suffix = f'_roll_avg_{window}'
        features = features.rolling(closed = 'left', window = window, min_periods = 1).mean().dropna()
        features = features.rename(lambda name: name + suffix, axis = 1)

    return pd.concat([team_col, features], axis = 1)


In [None]:
get_avg(df_clean, 'BOS', hl = 3, type = 'exp')

In [None]:
def process_df(df, hls = [3,10], windows = [10]):
    """Append per-team moving-average features to the games frame.

    For each venue ('home', 'away'), each half-life in ``hls`` and each window
    in ``windows``, the averages from ``get_avg`` are computed for every team
    found in ``df.home``, prefixed with the venue, matched back to the games
    where that team played at that venue, and finally concatenated
    column-wise next to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned games frame (needs ``home`` and ``away`` columns plus
        whatever ``get_avg`` requires).
    hls : iterable of numbers
        Half-lives for the exponentially weighted averages.
    windows : iterable of int
        Window sizes for the rolling averages.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra group of columns per (venue, average) pair.
    """
    teams = df.home.unique()

    def _venue_features(venue, **avg_kwargs):
        # One feature block: averages of every team, aligned to the games
        # where that team appears in the `venue` column.
        parts = []
        for team in teams:
            d = (get_avg(df = df, team = team, **avg_kwargs)
                 .rename(columns = {'team': f'{venue}'})
                 .set_index(f'{venue}', append = True)
                 .rename(lambda x: f'{venue}_'+x, axis = 1))
            d = (df.set_index(venue, append = True)
                 .join(d, how = 'inner')
                 .reset_index(level = f'{venue}')
                 .drop(df.columns, axis = 1))
            parts.append(d)
        # A single concat instead of growing a frame inside the loop.
        return pd.concat(parts) if parts else pd.DataFrame()

    avgs = {}
    for venue in ['home', 'away']:
        for hl in hls:
            s = f'{venue}_exp_{hl}'
            print(s)
            avgs[s] = _venue_features(venue, hl = hl, type = 'exp')
        for w in windows:
            # Keyed by the window size; keying by the leftover `hl` made
            # several windows overwrite each other and failed for empty `hls`.
            s = f'{venue}_roll_{w}'
            print(s)
            avgs[s] = _venue_features(venue, window = w, type = 'roll')
    return pd.concat([df] + list(avgs.values()), axis = 1)



In [None]:
process_df(df_clean)

In [None]:
hitters = pd.read_csv('data/hittersByGame.csv')

In [None]:
hitters.columns = normalize_columns(hitters.columns)

In [None]:
hitters = hitters.drop('h_ab', axis = 1)

In [None]:
hitters = hitters.set_index('game')

In [None]:
hitters = hitters[hitters.position != 'TEAM']

In [None]:
hitters

In [None]:
# Load the per-game pitcher lines and derive strike counts, decision flags and outs.
pitchers = pd.read_csv('data/pitchersByGame.csv')
pitchers.columns = normalize_columns(pitchers.columns)
pitchers = pitchers.set_index(['game', 'pitcher_id'])
pitchers = pitchers[pitchers.pitchers != 'TEAM']
# pc_st is matched as "<x>-<y>" where either side may be '--'; the second part is kept as `st`.
pitchers = pd.concat([pitchers.pc_st.str.extract(r'(?:\d+|--)-(\d+|--)').rename(columns = {0:'st'}), pitchers], axis = 1)
pitchers.st = pitchers.st.replace('--',np.nan)
# Flag column -> prefix the `extra` text must start with (str.match anchors at the start).
extra = {'loss': r'\(L',
         'win': r'\(W',
         'hold': r'\(H',
         'save': r'\(S'}
for k, v in extra.items():
    # na=False yields a real boolean column even where `extra` is missing.
    pitchers[k] = pitchers.extra.str.match(v, na=False)

# (\d+) for whole innings: with a single \d, "10.0" would be read as "0.0".
d = pitchers.ip.astype(str).str.extract(r'(\d+)\.(\d)').astype(int)
pitchers['outs'] = d[0] * 3 + d[1]
# Display only: the result is not assigned, so `pitchers` keeps pc_st / ip / extra.
pitchers.drop(['pc_st', 'ip', 'extra'], axis = 1)

In [None]:
pitches = pd.read_csv('data/pitches.csv')
pitches.columns = normalize_columns(pitches.columns)

In [None]:
pitches = pitches.set_index(['game', 'event_id', 'num'])

In [None]:
# Derive one flag per base from the text form of play_bases.
# presumably play_bases lists the occupied base numbers as digits — TODO confirm against the data.
pb = pitches.play_bases.astype(str)
# str.match anchors at the start of the string, so r'\d*1' is True when a '1'
# is reachable after zero or more leading digits (likewise for 2 and 3).
pitches['runner_on_1'] = pb.str.match(r'\d*1')
pitches['runner_on_2'] = pb.str.match(r'\d*2')
pitches['runner_on_3'] = pb.str.match(r'\d*3')

In [None]:
# '--' is replaced by NaN so the speed column can be cast to float.
pitches.mph = pitches.mph.replace('--', np.nan).astype(float)

In [None]:
# Pull the two pixel offsets out of play_field text shaped like "top: <n>px; right: <n>px;"
# and append them as float columns play_field_top / play_field_right.
# NOTE: running this cell again appends the same two columns a second time.
pitches = pd.concat([pitches, pitches.play_field.str.extract(r'top: (\d+\.*\d*)px; right: (\d+\.*\d*)px;').astype(float).rename(columns = {0:'play_field_top', 1:'play_field_right'})],axis = 1)

In [None]:
pitches

In [None]:
# Same extraction as for play_field, applied to play_hitzone:
# "top: <n>px; right: <n>px;" -> float columns play_hitzone_top / play_hitzone_right.
# NOTE: running this cell again appends the same two columns a second time.
pitches = pd.concat([pitches, pitches.play_hitzone.str.extract(r'top: (\d+\.*\d*)px; right: (\d+\.*\d*)px;').astype(float).rename(columns = {0:'play_hitzone_top', 1:'play_hitzone_right'})],axis = 1)

In [None]:
# Split the inning text "<word> <digits><more text>" into inning_topbottom (the word)
# and inning_num (the digits). inning_num is kept as extracted text; no numeric cast happens here.
# NOTE: running this cell again appends the same two columns a second time.
pitches = pd.concat([pitches, pitches.inning.str.extract(r'(\w+) (\d+).+').rename(columns = {1:'inning_num', 0:'inning_topbottom'})], axis = 1)

In [None]:
pitches.type.unique()
# Pitch-type groups used by the category regexes below (a type can belong to several groups):
# Fastball: fastball, four-seam fb, two-seam fb, cutter, sinker
# Offspeed: slider, changeup, curve, splitter, knuckleball, screwball, forkball, knuckle curve, eephus pitch, slow curve
# Intentional ball: intentional ball, pitch out
# Breaking ball: slider, curve, splitter, knuckleball, screwball, forkball, knuckle curve, slow curve
# Knuckle balls: knuckleball, knuckle curve, forkball

In [None]:
# Treat the '--' and 'Unknown' placeholders as missing pitch types.
pitches.type = pitches.type.replace({'--': np.nan, 'Unknown': np.nan})

In [None]:
# Regex alternations for each pitch group. The groups overlap on purpose:
# every alternative of `breaking` and `knuckle` is also listed in `offspeed`.
fastball = r'Fastball|.*FB|Cutter|Sinker'
offspeed = r'Slider|Changeup|.*Curve|Splitter|Knuckle.*|Screwball|Forkball|Eephus.*'
intentional = r'Intentional Ball|Pitch Out'
breaking = r'Slider|.*Curve|Splitter|Knuckle.*|Screwball|Forkball'
knuckle = r'Knuckle.*|Forkball'
# Display-only sanity check: the result is not assigned. Any type name that is
# still listed after the three replacements is not covered by those patterns.
pitches.type.replace(fastball,'fast', regex = True).replace(offspeed,'offspeed', regex = True).replace(intentional,'intentional', regex = True).unique()

In [None]:
# One flag column per pitch group; str.match anchors each pattern at the start of the type name.
# NOTE(review): rows whose type is missing may get NaN rather than False here,
# depending on the pandas version — confirm before using these as booleans.
pitches['fastball'] = pitches.type.str.match(fastball)
pitches['offspeed'] = pitches.type.str.match(offspeed)
pitches['intentional'] = pitches.type.str.match(intentional)
pitches['breaking'] = pitches.type.str.match(breaking)
pitches['knuckle'] = pitches.type.str.match(knuckle)

In [None]:
pitches

In [None]:
pitches.pitch.value_counts()
#Ball: Ball, Intentional Ball
#Strike: Strike Looking, Strike Swinging, 'Strikeout Batter Safe, Passed Ball|Error'
#Foul: Foul Ball, Foul Out, Error On A Dropped Foul Ball
#Hit:(Bunt) Single, (Ground Rule|Bunt) Double, Triple, (I.T.P.) Home Run
#Out: