In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

import pandas as pd
import numpy as np
from string import Template
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
def get_batting_data(years):
    """Scrape the league-wide standard batting page for each requested season.

    A Chrome session is started through the chromedriver binary at
    ``/Applications/chromedriver``; each season's page is loaded and the last
    HTML table that pandas finds on the page is kept.

    Parameters
    ----------
    years : iterable
        Seasons to download; each value is substituted into the page URL.

    Returns
    -------
    list of pandas.DataFrame
        One raw (uncleaned) table per entry of ``years``, in the same order.
    """
    # Local import: wrapping the page source in a file-like object is the
    # form of input pd.read_html accepts across pandas versions.
    from io import StringIO

    chromedriver = "/Applications/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    url_base = Template('https://www.baseball-reference.com/leagues/MLB/$year-standard-batting.shtml')
    df_list = []
    try:
        for year in years:
            driver.get(url_base.substitute(year=year))
            tables = pd.read_html(StringIO(driver.page_source))
            # The table of interest is the last one parsed from the page.
            df_list.append(tables[-1])
            # Pause between requests so the site is not hit back-to-back.
            time.sleep(1)
    finally:
        # Always shut the browser down, even if a page fails to load or parse.
        driver.quit()
    return df_list

def get_value_data():
    """Scrape the 2018 league-wide value-batting page.

    A Chrome session is started through the chromedriver binary at
    ``/Applications/chromedriver`` and the last HTML table that pandas finds
    on the page is returned.

    Returns
    -------
    pandas.DataFrame
        The raw (uncleaned) table.
    """
    # Local import: wrapping the page source in a file-like object is the
    # form of input pd.read_html accepts across pandas versions.
    from io import StringIO

    chromedriver = "/Applications/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get('https://www.baseball-reference.com/leagues/MLB/2018-value-batting.shtml')
        tables = pd.read_html(StringIO(driver.page_source))
    finally:
        # Always shut the browser down, even if the page fails to load or parse.
        driver.quit()
    # The table of interest is the last one parsed from the page.
    return tables[-1]

def pickle_3yr_data():
    """Download the 2015-2017 batting tables and the value table, then cache them.

    The two scraping results are stored together as a ``(batting_tables,
    value_table)`` tuple in ``batting_value_pickle.pkl`` in the working
    directory.
    """
    seasons = [2015, 2016, 2017]
    # Batting tables are fetched first, then the value table.
    payload = (get_batting_data(seasons), get_value_data())
    with open("batting_value_pickle.pkl", "wb") as out_file:
        pickle.dump(payload, out_file)
    
def clean_salary_data():
    """Load the cached value table and reduce it to a clean name/salary frame.

    Reads ``batting_value_pickle.pkl`` from the working directory and uses the
    second element of the stored tuple. After dropping the unused columns the
    three remaining ones are relabelled ``Name``, ``salary`` and ``position``.
    Rows with missing values and repeated header rows are removed, the salary
    text is converted to an integer, and only rows whose position string does
    not contain ``'1'`` and whose salary exceeds 1,000,000 are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``salary`` (int); the original row index is kept.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open a
    # cache file that this notebook produced itself.
    with open("batting_value_pickle.pkl", "rb") as f:
        pickle_object = pickle.load(f)
    df_value = pickle_object[1]
    df_value.columns = [x.strip() for x in df_value.columns]
    drop_columns = ['Rk', 'Age', 'Tm', 'G', 'PA', 'Rbat', 'Rbaser', 'Rdp', 'Rfield', 'Rpos', 'RAA', 'WAA', 'Rrep',
                'RAR', 'WAR', 'waaWL%', '162WL%', 'oWAR','dWAR', 'oRAR','Acquired']
    df_salary = df_value.drop(drop_columns, axis=1)
    df_salary.columns = ['Name', 'salary', 'position']
    df_salary = df_salary.dropna(axis=0, how='any')
    # The scraped table repeats its header row inside the body; drop those
    # rows. The explicit copy makes the column assignment below act on an
    # independent frame instead of a slice.
    df_salary = df_salary[df_salary['Name'] != 'Name'].copy()
    # '$' is a regex metacharacter, so both replacements are forced to be
    # literal rather than relying on pandas' default for the `regex` flag.
    df_salary['salary'] = (
        df_salary['salary']
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype(int)
    )
    # Presumably position code '1' marks pitchers, who are excluded from a
    # batting model -- confirm against the data source's position key.
    df_salary = df_salary[~df_salary['position'].str.contains('1')]
    df_salary = df_salary[df_salary['salary'] > 1000000]
    df_salary = df_salary.drop('position', axis=1)
    return df_salary

def clean_batting_data(df):
    """Return a cleaned copy of one season's raw batting table.

    Column labels are stripped of surrounding whitespace, rows containing any
    missing value are dropped, repeated header rows (``Name == 'Name'``) are
    removed, and only the first row per player name is kept. The result is
    limited to ``Name`` plus the numeric stat columns, which are cast to float.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; after stripping, its labels must include ``Name`` and every
        stat column listed below.

    Returns
    -------
    pandas.DataFrame
        ``Name`` followed by the float-typed stat columns.
    """
    float_columns = ['Age', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+',
       'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB']
    # Work on a copy so the caller's frame keeps its labels and rows.
    cleaned = df.copy()
    cleaned.columns = [x.strip() for x in cleaned.columns]
    # Missing values are checked across every column of the raw table, not
    # just the ones that are kept afterwards.
    cleaned = cleaned.dropna(axis=0, how='any')
    cleaned = cleaned[cleaned['Name'] != 'Name']
    cleaned = cleaned.drop_duplicates(subset='Name', keep='first')
    cleaned = cleaned[['Name'] + float_columns]
    # astype with a mapping returns a new frame, avoiding assignment into a slice.
    return cleaned.astype({column: 'float' for column in float_columns})

def combine_data():
    """Join cleaned salaries with the three cleaned batting seasons.

    The batting tables come from the first element of the tuple stored in
    ``batting_value_pickle.pkl``. Each season is left-joined onto the salary
    frame by ``Name``: overlapping columns from the first two seasons receive
    the ``_2015`` / ``_2016`` suffixes, while the third season's columns keep
    their plain names. Rows with any missing value after the joins are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per player that has a salary and all three seasons of data.
    """
    with open("batting_value_pickle.pkl", "rb") as pickle_file:
        season_tables = pickle.load(pickle_file)[0]
    # Clean the three seasons in order before touching the salary data.
    batting_2015, batting_2016, batting_2017 = (
        clean_batting_data(season_tables[position]) for position in range(3)
    )
    salaries = clean_salary_data()
    merged = (
        pd.merge(salaries, batting_2015, how = 'left', on='Name')
        .merge(batting_2016, on='Name', how='left', suffixes=("_2015", "_2016"))
        .merge(batting_2017, on='Name', how='left')
    )
    return merged.dropna(axis=0, how='any')

def calculate_average_features(df_combined):
    """Build per-player three-season averages from the combined frame.

    For every stat the ``<stat>_2015``, ``<stat>_2016`` and plain ``<stat>``
    columns are summed and divided by 3.0. Counting stats are truncated to
    integers; BA, OBP, SLG, OPS and OPS+ stay as floats. ``avg_age`` is not an
    average: it is the ``Age_2016`` column cast to int.

    Parameters
    ----------
    df_combined : pandas.DataFrame
        Frame with ``Name``, ``salary``, ``Age_2016`` and the three columns
        for each stat.

    Returns
    -------
    pandas.DataFrame
        ``Name``, ``salary``, ``avg_age``, ``avg_games`` and one ``avg_<stat>``
        column per remaining stat.
    """
    def three_season_mean(stat):
        # Same left-to-right addition order for every stat.
        total = df_combined[stat + '_2015'] + df_combined[stat + '_2016'] + df_combined[stat]
        return total / 3.0

    # The groups are listed in output-column order.
    leading_counts = ['PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO']
    rate_stats = ['BA', 'OBP', 'SLG', 'OPS', 'OPS+']
    trailing_counts = ['TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB']

    averages = df_combined.loc[:, ['Name', 'salary']]
    averages['avg_age'] = df_combined.loc[:, 'Age_2016'].astype('int')
    # Games is the only stat whose output label differs from its source column.
    averages['avg_games'] = three_season_mean('G').astype('int')
    for stat in leading_counts:
        averages['avg_' + stat] = three_season_mean(stat).astype('int')
    for stat in rate_stats:
        averages['avg_' + stat] = three_season_mean(stat)
    for stat in trailing_counts:
        averages['avg_' + stat] = three_season_mean(stat).astype('int')
    return averages

def simple_features(df_avg_features):
    """Select the salary column plus a reduced set of averaged stats.

    Parameters
    ----------
    df_avg_features : pandas.DataFrame
        Frame containing ``salary`` and the ``avg_*`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        ``salary`` followed by the ten selected ``avg_*`` columns.
    """
    kept_stats = ('R', 'H', 'RBI', 'BB', 'SO', 'GDP', 'HBP', 'SH', 'SF', 'IBB')
    selected = ['salary'] + ['avg_' + stat for stat in kept_stats]
    return df_avg_features[selected]

def linear_regression_model(df):
    """Fit a linear regression of log-salary on standardized features.

    Every column except ``salary`` is used as a feature. The rows are split
    70/30 into train and test sets with a fixed ``random_state`` of 42, the
    salary target is log-transformed, and the features are standardized with
    a scaler fitted on the training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``salary`` column and the feature columns.

    Returns
    -------
    tuple
        ``(model, X_train, X_test, y_train, y_test)`` where the X values are
        the scaled features and the y values are log-salaries.
    """
    feature_names = [name for name in df.columns if name != 'salary']
    features = df[feature_names]
    target = df['salary']
    X_train, X_test, y_train, y_test = tts(features, target, test_size=0.3, random_state=42)

    # The model is trained and evaluated on the log of the salary.
    y_train, y_test = np.log(y_train), np.log(y_test)

    # The scaler sees only training rows; the test rows are merely transformed.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression().fit(X_train, y_train)
    return model, X_train, X_test, y_train, y_test

def mse(model, X, y):
    """Return the mean squared error of ``model``'s predictions on ``X``.

    The error is computed directly between ``y`` and ``model.predict(X)``,
    with no transformation of either.
    """
    predictions = model.predict(X)
    return mean_squared_error(y, predictions)

def mae(model, X, y):
    """Return the mean absolute error after exponentiating targets and predictions.

    Both ``y`` and ``model.predict(X)`` are passed through ``np.exp`` before
    the error is computed, so the function assumes they are on a log scale.
    """
    actual = np.exp(y)
    predicted = np.exp(model.predict(X))
    return mean_absolute_error(actual, predicted)

In [3]:
# Scrape the 2015-2017 batting tables and the 2018 value table, then cache
# them in batting_value_pickle.pkl. This launches Chrome and hits the network,
# so it only needs to be re-run when the cache file has to be rebuilt.
pickle_3yr_data()

In [4]:
# Build the modelling frame from the cached tables, fit the regression and
# report the errors. The MSE values are on the log-salary scale; the MAE is
# computed after exponentiating back.
df_combined = combine_data()
df_average_features = calculate_average_features(df_combined)
df_simple = simple_features(df_average_features)
model, X_train, X_test, y_train, y_test = linear_regression_model(df_simple)
mse_test = mse(model, X_test, y_test)
mse_train = mse(model, X_train, y_train)
# Store the result under its own name: assigning it to `mae` would replace the
# mae() function with a float and make this cell fail when run a second time.
mae_test = mae(model, X_test, y_test)
print("train mean squared error = ", mse_train, "test mean squared error = ", mse_test, "mean_absolute_error = ", mae_test )

train mean squared error =  0.38674348867827474 test mean squared error =  0.37127053929629716 mean_absolute_error =  4145517.9500637283
