In [1]:
# import libraries 
import pandas as pd
import numpy as np
import math
from scipy.stats import kurtosis, skew

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline

# this will allow us to visualize the pipeline (may not be available in learn-env)
# display='diagram' asks scikit-learn to render estimators/pipelines as diagrams in notebook output
from sklearn import set_config
set_config(display= 'diagram')

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, \
RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, \
AdaBoostClassifier, GradientBoostingClassifier

from pickle import dump, load

# show every column when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)

# Seed NumPy's global (legacy) random number generator so anything that draws
# from np.random is reproducible across runs. Note that assigning to an
# attribute such as `np.random_state` does not seed anything.
np.random.seed(42)

In [2]:
# read the basic batting table (first CSV column becomes the index) and
# drop the 'Season' and 'Name' columns in a single chained expression
batting_basic = (
    pd.read_csv('Data/batting_basic', index_col=0)
    .drop(columns=['Season', 'Name'])
)
batting_basic.head()

Unnamed: 0,POS,Team,Age,Salary,G,AB,PA,AVG,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,OBP,SLG,OPS
0,OF,CHW,27.0,255000.0,80.0,215.0,242.0,0.274,59.0,40.0,15.0,1.0,3.0,31.0,29.0,21.0,1.0,38.0,2.0,1.0,2.0,2.0,2.0,1.0,0.343,0.395,0.738
1,2B,NYM,31.0,500000.0,79.0,157.0,173.0,0.217,34.0,20.0,7.0,1.0,6.0,22.0,12.0,14.0,2.0,51.0,1.0,1.0,0.0,2.0,1.0,1.0,0.283,0.389,0.672
5,OF,MIA,24.0,327000.0,111.0,255.0,281.0,0.212,54.0,35.0,12.0,2.0,5.0,39.0,24.0,18.0,2.0,78.0,3.0,1.0,4.0,2.0,6.0,5.0,0.271,0.333,0.604
6,OF,MIA,25.0,327000.0,35.0,76.0,80.0,0.197,15.0,10.0,3.0,0.0,2.0,16.0,5.0,2.0,0.0,22.0,2.0,0.0,0.0,1.0,7.0,1.0,0.238,0.316,0.553
7,OF,HOU,26.0,327000.0,34.0,55.0,60.0,0.309,17.0,10.0,5.0,0.0,2.0,10.0,5.0,1.0,0.0,23.0,2.0,1.0,1.0,0.0,5.0,2.0,0.339,0.509,0.848


## Train-Test Split

In [3]:
# the target is 'Salary'; every other column of batting_basic is a feature
y = batting_basic['Salary']
X = batting_basic.drop(columns='Salary')
# seeded split using train_test_split's default train/test proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# shapes of the four splits, one per line: features (train, test) then targets (train, test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(8568, 26)
(2856, 26)
(8568,)
(2856,)


## Preprocessing

In [5]:
# load the transformer
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('pkl/bb_ct.pkl', 'rb') as pkl_file:
    bb_ct = load(pkl_file)

In [6]:
# fit the loaded transformer on the training features and keep the
# preprocessed result (pp_bb_X_train) as a DataFrame for inspection
pp_bb_X_train = pd.DataFrame(data=bb_ct.fit_transform(X_train))
pp_bb_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.615324,-1.277737,-1.320185,-1.278757,-3.256567,-1.31574,-1.280789,-1.271027,-0.729205,-0.94182,-1.214658,-1.213309,-0.991055,-0.532858,-0.847974,-0.571759,-0.969914,2.033278,-1.163921,-0.602031,-0.721621,-2.163013,-2.929161,-2.762856
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.101455,-0.639005,-0.457531,-0.488064,-0.088735,-0.494367,-0.198899,-0.764249,-0.729205,-0.94182,-0.690033,-0.790427,-0.573329,-0.270018,-0.671409,-0.291008,-0.545728,-0.330878,-0.621142,0.788319,0.813442,-0.254618,-1.096442,-0.814301
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.372258,0.308081,0.008623,-0.026033,-0.607107,-0.257081,-0.115677,-0.173007,-0.729205,-0.639171,-0.296564,-0.367546,-0.406239,-0.532858,-0.368725,-0.571759,0.302643,1.357805,0.464415,-0.34924,0.429676,-0.820668,-0.907502,-0.913908
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.572267,-0.110399,-0.0396,-0.064138,0.410439,-0.019795,-0.087936,-0.341933,-0.729205,0.773187,-0.06704,0.83604,-0.322694,-0.270018,-0.620962,0.831998,0.302643,-0.668614,0.645342,-0.602031,-0.337856,0.262912,0.972451,0.735828
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.858389,1.233142,1.17133,1.236218,0.160852,0.9476,0.605583,1.178402,-0.252744,1.681132,1.178945,0.998686,1.389981,-0.270018,1.043799,1.955003,1.151015,-0.668614,1.188121,-0.475635,-0.721621,0.521678,0.783511,0.723377


In [7]:
# sanity check: the row count should match X_train; the column count is whatever bb_ct outputs
pp_bb_X_train.shape

(8568, 61)

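Now let's apply the fitted transformer to `X_test`. Note that we only call `transform` here (not `fit_transform`), so the preprocessing is learned from the training data alone.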

In [8]:
# apply the transformer (transform only, no refit) to the test features and
# keep the preprocessed result (pp_bb_X_test) as a DataFrame for inspection
pp_bb_X_test = pd.DataFrame(data=bb_ct.transform(X_test))
pp_bb_X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.615324,-1.475964,-1.288037,-1.269231,-1.394266,-1.224476,-1.225308,-1.186564,-0.729205,-0.740054,-1.050713,-0.953074,-0.90751,-0.532858,-1.074987,-0.010256,-0.969914,-0.668614,-0.982995,-0.475635,-0.721621,-0.060544,-0.605198,-0.4221
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.344521,-1.101535,-0.795092,-0.802436,-0.107934,-0.768158,-0.809196,-0.848712,-0.252744,-0.23564,-0.657244,-0.595251,-0.573329,-0.532858,-0.141712,-0.85251,-0.969914,-0.330878,-0.982995,0.029947,0.04591,-0.012026,0.528442,0.343627
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.329201,1.51947,1.841093,1.745882,1.831164,2.517337,2.935806,2.191958,2.129559,0.167891,1.605204,2.234801,0.805165,1.30702,0.438432,-0.291008,2.847758,-0.668614,1.911826,0.156342,0.04591,1.168591,0.859087,1.015972
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.615324,1.541495,1.514249,1.822093,-0.319123,0.984106,0.355916,1.516254,1.653098,1.98378,1.539625,1.551685,3.687472,1.044181,2.834678,2.235754,1.575201,-0.668614,0.645342,-0.34924,0.04591,0.909826,0.670147,0.798082
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.858389,0.21998,0.07292,0.116864,0.94801,0.235743,0.13399,0.502697,0.700177,0.067008,-0.034251,0.413158,0.220349,0.255661,-0.091265,1.112749,1.151015,-0.330878,-0.25929,-0.475635,-0.721621,1.006863,0.934663,1.003521


In [9]:
# sanity check: the row count should match X_test; the column count should equal pp_bb_X_train's
pp_bb_X_test.shape

(2856, 61)

**Log Transforming Salary**

In [10]:
# natural-log transform of the salary target for both splits
log_y_train, log_y_test = np.log(y_train), np.log(y_test)

## Modeling

In [17]:
# NOTE(review): wildcard import placed mid-notebook; `model_results` used below is
# presumably defined in the local `functions` module. Consider importing the needed
# names explicitly in the top imports cell so their origin is clear.
from functions import *

In [18]:
# Test of the model_results function
# Baseline estimator: a LinearRegression with default settings (not yet fitted here).
lr_model = LinearRegression()
# model_results is not defined in this notebook; presumably it comes from `functions`.
# It is given the preprocessed features and the log-transformed targets for both splits.
# NOTE(review): the printed RMSE values are in the millions although the targets are
# log-scaled; presumably model_results converts predictions back to dollars. Confirm.
model_results(lr_model, pp_bb_X_train, log_y_train, pp_bb_X_test, log_y_test)

             LinearRegression() RESULTS
LinearRegression() Training R2:               0.5206
LinearRegression() Test R2:                   0.4888
-------------------------------------------------------
LinearRegression() Cross Validation R2:       0.5133
LinearRegression() Cross Validation RMSE:    3854964
-------------------------------------------------------
LinearRegression() Training RMSE:            4790610
LinearRegression() Model Test RMSE:          4438849


In [None]:
# NOTE(review): leftover scratch cell with no execution count; `test` is not defined in the
# visible notebook (unless the wildcard import from `functions` supplies it), so this would
# likely raise NameError on Restart & Run All. Remove before sharing.
test