In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from mlb.Data import Data

In [None]:
# Load the raw games table from the CSV file into a DataFrame.
df = pd.read_csv('data/games.csv')

In [None]:
# Display the raw frame exactly as it was read from disk.
df

In [None]:
# Inspect the raw column labels before they are normalized.
df.columns

In [None]:
import re
def normalize_columns(cols):
    """Return snake_case versions of the given column labels.

    Each label is lower-cased, the separators ' - ', '-', ' ' and '/' are
    replaced by '_' (in that order), and a trailing '_home' / '_away' is moved
    to the front of the label (e.g. 'score_home' -> 'home_score').

    Parameters
    ----------
    cols : sequence of str
        Column labels; the object must provide a ``copy()`` method
        (e.g. a list or a pandas Index).

    Returns
    -------
    list of str
        The normalized labels, in the same order as the input.
    """
    # Order matters: ' - ' has to be collapsed before the bare '-' and ' '.
    separators = (' - ', '-', ' ', '/')

    def _normalize(label):
        cleaned = label.lower()
        for separator in separators:
            cleaned = cleaned.replace(separator, '_')
        # Turn a '<name>_home' / '<name>_away' suffix into a prefix.
        return re.sub(r'^(.+)_(home|away)$', r'\2_\1', cleaned)

    return [_normalize(label) for label in cols.copy()]

In [None]:
# Replace the frame's column labels with their normalized form (this rebinds
# df.columns on the existing frame), then display the new labels.
df.columns = normalize_columns(df.columns)
df.columns

In [None]:
# Check which dtype each column currently has.
df.dtypes

In [None]:
def clean_games(df, na_drop_threshold=100):
    """Clean the games table and unpack its text-encoded columns.

    Expects snake_case column labels (e.g. 'save_pitcher_record', 'o_u').
    Only 'game', 'date', 'home' and 'away' are required; every other column
    is processed only if it is present.

    Steps, in order:

    1. Index by 'game', parse 'date' into 'datetime' and split it into
       'date' / 'time', sort by 'datetime', move 'date', 'home', 'away'
       to the front.
    2. Drop fully duplicated rows (keeping the first copy) and turn the
       '(null-null)' placeholder into NaN.
    3. For every column: drop the rows where it is missing if fewer than
       ``na_drop_threshold`` rows are affected, then apply the
       column-specific parsing (records -> wins/losses/saves, pitcher stat
       lines -> outs/er/k/bb, duration -> minutes, odds -> odds/fav, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Games table. It is not modified; a new frame is returned.
    na_drop_threshold : int, default 100
        A column with at least one but fewer than this many missing values
        has the affected rows removed from the table.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, indexed by 'game' and sorted by 'datetime'.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If a record / stat string cannot be converted to an integer.
    """
    df = df.set_index('game')
    df['datetime'] = pd.to_datetime(df['date'])
    df['date'] = df.datetime.dt.date
    df['time'] = df.datetime.dt.time
    df = df.sort_values('datetime')

    # Move the identifying columns to the front, keeping the others in order.
    lead_cols = ['date', 'home', 'away']
    df = df[lead_cols + [c for c in df.columns if c not in lead_cols]]

    # Keep the first copy of each duplicated row. Dropping by index label
    # instead would also delete the original whenever the copies share a
    # 'game' id.
    df = df.drop_duplicates()
    df = df.replace('(null-null)', np.nan)

    record_regex = r'(\d+)-{1}(\d+)\s*\w*'
    # NOTE(review): innings are matched as a single digit; a 10+ inning
    # outing would not be parsed cleanly. Confirm none occur in the data.
    stat_regex = r'(\d)\.(\d) IP, (\d+) ER, (\d+) K, (\d+) BB'

    # df.columns is evaluated once, so columns created below are not revisited.
    for col in df.columns:
        # Sparse gaps: remove the affected rows (by position, so a repeated
        # index label cannot take extra rows with it).
        missing = df[col].isna()
        if 0 < missing.sum() < na_drop_threshold:
            df = df.loc[~missing].copy()
        if col == 'save_pitcher_record':
            # No save recorded for the game -> zero saves.
            df[col] = df[col].fillna('(0)')

        if 'record' in col:
            if 'pitcher' in col:
                # Pitcher records are wrapped in parentheses: '(3-2)' -> '3-2'.
                df[col] = df[col].str.replace(r'\((\d+-?\d*)\)', r'\1', regex=True)

                if 'save' in col:
                    # A save record is a single count, not a win-loss pair.
                    df[col.replace('record', 'saves')] = df[col].str.replace(r'\((\d+)\)', r'\1', regex=True).astype(int)
                    df = df.drop(col, axis=1)
                    continue

            df[col.replace('record', 'wins')] = df[col].str.replace(record_regex, r'\1', regex=True).astype(int)
            df[col.replace('record', 'losses')] = df[col].str.replace(record_regex, r'\2', regex=True).astype(int)
            df = df.drop(col, axis=1)
        if 'postseason' in col:
            # A missing postseason label marks a regular-season game.
            df['regular_season'] = df[col].isna()
            df[col] = df[col].fillna('Regular Season')

        if col == 'stadium':
            df[col] = df[col].str.replace(r'^\s*(.+(\s\w+)*){1}(.*\s*)*', r'\1', regex=True)
        if col == 'location':
            df[col] = df[col].str.replace(r'^(.+(\s\w+)*)\s*\d*', r'\1', regex=True)
        if col in ['attendance', 'capacity']:
            # Strip thousands separators, then fill gaps with the column mean.
            df[col] = df[col].str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(df[col])
            df[col] = df[col].fillna(int(df[col].mean()))
        if col == 'duration':
            # 'H:MM...' -> total minutes.
            parts = df[col].str.split(':')
            hours = parts.apply(lambda x: int(x[0]))
            minutes = parts.apply(lambda x: int(x[1][:2]))
            df[col] = minutes + 60 * hours

        if 'stats' in col:
            if 'save' in col:
                df[col] = df[col].fillna('0.0 IP, 0 ER, 0 K, 0 BB')
            raw_stats = df[col]
            # 'X.Y IP' is X full innings plus Y outs -> total outs recorded.
            full = raw_stats.str.replace(stat_regex, r'\1', regex=True).astype(int)
            partial = raw_stats.str.replace(stat_regex, r'\2', regex=True).astype(int)
            df[col.replace('stats', 'outs')] = full * 3 + partial
            df[col.replace('stats', 'er')] = raw_stats.str.replace(stat_regex, r'\3', regex=True).astype(int)
            df[col.replace('stats', 'k')] = raw_stats.str.replace(stat_regex, r'\4', regex=True).astype(int)
            df[col.replace('stats', 'bb')] = raw_stats.str.replace(stat_regex, r'\5', regex=True).astype(int)
            df = df.drop(col, axis=1)
        if 'extra' in col:
            df[col] = df[col].fillna(False)
        if col == 'odds':
            # 'Line: BOS -150' -> fav='BOS', odds=150.0; 'Line: EVEN' -> 100.0.
            temp = df[col].str.extract(r'Line:\s(\w+)(?:\s-(\d+))?')
            temp.columns = ['fav', 'odds']
            temp.loc[temp['fav'] == 'EVEN', 'odds'] = '100'
            df[col] = temp['odds'].astype(float)
            df['fav'] = temp['fav']
        if col == 'o_u':
            # Keep an optional decimal part so a line such as 8.5 is not
            # truncated to 8.
            df[col] = df[col].str.extract(r'\w+/\w+:\s(\d+(?:\.\d+)?)', expand=False).astype(float)

    return df

# Build the cleaned games table from the frame whose columns were normalized above.
df_clean = clean_games(df)

In [None]:
# Display the cleaned table.
df_clean

In [None]:
# Check the dtype of each column after cleaning.
df_clean.dtypes

In [None]:
# Summary statistics for the cleaned table.
df_clean.describe()

In [None]:
    # Summary of the object-dtype columns only.
    df_clean.select_dtypes(include=[object]).describe()

In [None]:
# Numeric view of the cleaned table: keep float / int / tz-aware datetime
# columns, discard any column whose label ends in 'id', then remove the
# columns listed in the final drop.
df_num = (
    df_clean
    .select_dtypes(include = [float, int, 'datetime64[ns, UTC]'])
    .filter(regex = r'.+(?<!id)$')
    .drop(['odds','o_u', 'datetime', 'attendance', 'capacity'], axis=1)
)

In [None]:
# Check which columns and dtypes ended up in the numeric view.
df_num.dtypes

In [None]:
def get_schedule(df, team):
    """Return every game played by ``team``, seen from that team's side.

    Games where the team is at home and games where it is away are relabelled
    so that 'home' / 'away' in the column labels become 'team' / 'opp'
    (whichever side ``team`` was on becomes 'team'). A 'venue' column
    ('HOME' or 'AWAY') records the side. The result is sorted by 'datetime'.

    Columns whose label ends in 'wins' or 'losses' are forward-filled and any
    remaining gaps are set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Games table with at least 'home', 'away' and 'datetime' columns and
        string column labels. It is not modified.
    team : str
        Value to look for in the 'home' and 'away' columns.

    Returns
    -------
    pandas.DataFrame
        One row per game involving ``team``, in chronological order.
    """
    def _from_team_side(games, own, other, venue):
        """Tag ``games`` with ``venue`` and relabel own/other as team/opp."""
        own_pattern = rf'(?<!{own})({own})'
        other_pattern = rf'(?<!{other})({other})'

        def relabel(label):
            label = re.sub(own_pattern, 'team', label)
            label = re.sub(other_pattern, 'opp', label)
            # Ensure 'team' / 'opp' is followed by '_' when it prefixes more text.
            return re.sub(r'(team|opp)(?!(_|$))', r'\1_', label)

        # assign() builds a new frame, so the filtered slice is never written to.
        return games.assign(venue=venue).rename(columns=relabel)

    home_games = _from_team_side(df[df['home'] == team], 'home', 'away', 'HOME')
    away_games = _from_team_side(df[df['away'] == team], 'away', 'home', 'AWAY')

    schedule = pd.concat([home_games, away_games]).sort_values('datetime')

    tally_cols = schedule.filter(regex = r'(wins|losses)$').columns
    if len(tally_cols):
        # .ffill() replaces the deprecated fillna(method='ffill').
        schedule[tally_cols] = schedule[tally_cols].ffill().fillna(0)
    return schedule

In [None]:
# Example: the schedule of the team labelled 'BOS' in the cleaned table.
get_schedule(df_clean, 'BOS')

In [None]:
def get_rolling_averages(df, team, window = 3):
    """Rolling means of a team's per-game columns over its previous games.

    The team's schedule is reduced to the columns mentioning 'team' or 'opp'
    (minus the 'team', 'opp', 'datetime' and 'venue' columns and the
    opponent's wins/losses columns). Each remaining column is averaged over a
    left-closed window, so the current game is excluded from its own average.
    Rows without a complete window are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned games table, passed on to ``get_schedule``.
    team : str
        Team whose schedule is averaged.
    window : int, default 3
        Number of preceding games in each average.

    Returns
    -------
    pandas.DataFrame
        The averaged columns, each suffixed with ``_avg_<window>``.
    """
    team_games = get_schedule(df, team)
    features = team_games.filter(regex = r'(datetime)|(venue)|(.*(opp|team).*)')
    features = features.drop(columns = ['team', 'opp', 'datetime', 'venue'])

    # The opponent's running win/loss tallies are not averaged.
    opp_tallies = features.filter(regex = r'^opp.*(wins|losses)$').columns
    features = features.drop(columns = opp_tallies)

    averaged = features.rolling(window = window, closed = 'left').mean().dropna()
    return averaged.rename(columns = lambda label: label + f'_avg_{window}')

In [None]:
# Example: rolling averages for 'BOS' with the default window.
get_rolling_averages(df_clean, 'BOS')

In [None]:
def get_exp_avg(df, team, hl = 3):
    """Exponentially weighted means of a team's per-game columns.

    The team's schedule is reduced to the columns mentioning 'team' or 'opp'
    (minus the 'team', 'opp', 'datetime' and 'venue' columns and the
    opponent's wins/losses columns). An exponentially weighted mean with
    half-life ``hl`` is taken down each remaining column. The mean at each
    row includes that row's own values.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned games table, passed on to ``get_schedule``.
    team : str
        Team whose schedule is averaged.
    hl : float, default 3
        Half-life of the exponential weights, in games.

    Returns
    -------
    pandas.DataFrame
        The averaged columns, each suffixed with ``_exp_<hl>``.
    """
    team_games = get_schedule(df, team)
    features = team_games.filter(regex = r'(datetime)|(venue)|(.*(opp|team).*)')
    features = features.drop(columns = ['team', 'opp', 'datetime', 'venue'])

    # The opponent's running win/loss tallies are not averaged.
    opp_tallies = features.filter(regex = r'^opp.*(wins|losses)$').columns
    features = features.drop(columns = opp_tallies)

    weighted = features.ewm(halflife = hl).mean().dropna()
    return weighted.rename(columns = lambda label: label + f'_exp_{hl}')


In [None]:
# Example: exponentially weighted averages for 'BOS' with a half-life of 3 games.
get_exp_avg(df_clean, 'BOS', hl = 3)