In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline
sns.set_style('white')
import warnings
# Suppress annoying harmless error.
warnings.simplefilter('ignore')

# Description of Hitters Dataset #

### Major League Baseball Data from the 1986 and 1987 seasons. ###

#### Format ####

* A data frame with 322 observations of major league players on 20 variables.

#### Description of Variables ####

* AtBat: Number of times at bat in 1986

* Hits: Number of hits in 1986

* HmRun: Number of home runs in 1986

* Runs: Number of runs in 1986

* RBI: Number of runs batted in in 1986

* Walks: Number of walks in 1986

* Years: Number of years in the major leagues

* CAtBat: Number of times at bat during his career

* CHits: Number of hits during his career

* CHmRun: Number of home runs during his career

* CRuns: Number of runs during his career

* CRBI: Number of runs batted in during his career

* CWalks: Number of walks during his career

* League: A factor with levels A and N indicating player’s league at the end of 1986

* Division: A factor with levels E and W indicating player’s division at the end of 1986

* PutOuts: Number of put outs in 1986

* Assists: Number of assists in 1986

* Errors: Number of errors in 1986

* Salary: 1987 annual salary on opening day in thousands of dollars

* NewLeague: A factor with levels A and N indicating player’s league at the beginning of 1987


#### Source ####

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
This is part of the data that was used in the 1988 ASA Graphics Section Poster Session. The salary
data were originally from Sports Illustrated, April 20, 1987. The 1986 and career statistics were
obtained from The 1987 Baseball Encyclopedia Update published by Collier Books, Macmillan
Publishing Company, New York.

In [2]:
# Load the Hitters data. The leading column is discarded with a positional
# slice (columns 1 onward), and then every *row* that contains a missing
# value is removed (dropna works row-wise here, not column-wise).
hitters_url = 'https://vincentarelbundock.github.io/Rdatasets/csv/ISLR/Hitters.csv'
df = (
    pd.read_csv(hitters_url)
    .iloc[:, 1:]
    .dropna(axis=0, how='any')
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [4]:
# Recode the three two-level factor columns as 0/1 integer dummies.
#   League:    N -> 1, A -> 0
#   Division:  W -> 1, E -> 0
#   NewLeague: A -> 1, N -> 0   (note: opposite orientation to League)
#
# A column is only recoded while it still holds the original letter codes.
# Without this guard, re-running the cell would compare the already-numeric
# values against 'N'/'W'/'A' and silently overwrite every entry with 0.
dummy_levels = {'League': 'N', 'Division': 'W', 'NewLeague': 'A'}
for column, level_coded_as_one in dummy_levels.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = np.where(df[column] == level_coded_as_one, 1, 0)

In [5]:
# Look at the first five rows again to check the League, Division and
# NewLeague columns after recoding.
df.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,1,1,632,43,10,475.0,0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,0,1,880,82,14,480.0,1
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,1,0,200,11,3,500.0,0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,1,0,805,40,4,91.5,0
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,0,1,282,421,25,750.0,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
count,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0,263.0
mean,403.642586,107.828897,11.619772,54.745247,51.486692,41.114068,7.311787,2657.543726,722.186312,69.239544,361.220532,330.418251,260.26616,0.471483,0.509506,290.711027,118.760456,8.593156,535.925882,0.536122
std,147.307209,45.125326,8.757108,25.539816,25.882714,21.718056,4.793616,2286.582929,648.199644,82.197581,331.198571,323.367668,264.055868,0.500138,0.500863,279.934575,145.080577,6.606574,451.118681,0.499644
min,19.0,1.0,0.0,0.0,0.0,0.0,1.0,19.0,4.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,67.5,0.0
25%,282.5,71.5,5.0,33.5,30.0,23.0,4.0,842.5,212.0,15.0,105.5,95.0,71.0,0.0,0.0,113.5,8.0,3.0,190.0,0.0
50%,413.0,103.0,9.0,52.0,47.0,37.0,6.0,1931.0,516.0,40.0,250.0,230.0,174.0,0.0,1.0,224.0,45.0,7.0,425.0,1.0
75%,526.0,141.5,18.0,73.0,71.0,57.0,10.0,3890.5,1054.0,92.5,497.5,424.5,328.5,1.0,1.0,322.5,192.0,13.0,750.0,1.0
max,687.0,238.0,40.0,130.0,121.0,105.0,24.0,14053.0,4256.0,548.0,2165.0,1659.0,1566.0,1.0,1.0,1377.0,492.0,32.0,2460.0,1.0


### Models Compared ###

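* We will predict NewLeague (the player's league at the beginning of the 1987 season, coded 1 = American, 0 = National) from all the other columns: the 1986 and career statistics, League, Division, and the 1987 opening-day Salary. Note that League is coded the other way round (1 = National), so a player who stays in the same league has League and NewLeague values that differ.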

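* Three models are compared by their accuracy on a held-out test set; 10-fold CV on the training set is used only to choose C for the regularized models.
    1. Logistic regression with a very large value of C (C = 1e9), i.e. effectively without any regularization.
    2. Logistic regression with l2 regularization (Ridge).
    3. Logistic regression with l1 regularization (Lasso).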


In [30]:
# Predictors are every column except the last; the target is NewLeague.
# 40% of the rows are set aside as the final out-of-sample test set, with a
# fixed seed so the split is the same on every run.
features = df.iloc[:, :-1]
target = df.NewLeague
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=42,
)

In [35]:
# Baseline: logistic regression with C=1e9. C is the inverse of the
# regularization strength, so a value this large makes the penalty negligible.
# The solver is pinned to 'liblinear' so the fit does not depend on the
# installed scikit-learn version's default solver (the default changed to
# 'lbfgs' in 0.22).
lr = LogisticRegression(C=1e9, solver='liblinear')


# Fit the model on the training split only.
fit = lr.fit(X_train, y_train)

# Display the fitted parameters.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X_test)

# Cross-tabulate test-set predictions (rows) against the true labels (columns).
print('\n Confusion matrix: predicted (rows) vs. actual NewLeague (columns)')
print(pd.crosstab(pred_y_sklearn, y_test))

print('-------------------------------------------------------')
print('Results for unregularized Logistic Regression')
print('\n Percentage accuracy Out of Sample')
print("{:.2f}%".format(lr.score(X_test, y_test)*100))

Coefficients
[[-3.44276258e-03  1.61539933e-02 -7.99058965e-02  1.37859144e-02
   2.48511759e-02 -2.34681854e-02  4.42992877e-02  2.63105085e-03
  -7.78271449e-03 -3.27902291e-03 -1.01787959e-02  1.66629930e-03
   6.14109853e-03 -6.62344119e+00 -3.65936282e-01 -2.14620232e-03
  -3.59425870e-03  1.28408352e-02  1.15149129e-03]]
[3.93627336]

 Accuracy by admission status
NewLeague   0   1
row_0            
0          45   4
1           7  50
-------------------------------------------------------
Results for unregularized Logistic Regression

 Percentage accuracy Out of Sample
89.62%


In [38]:
# Candidate values of C (inverse regularization strength) for the l2 penalty.
grid = [.1, 1, 10, 100, 200, 300, 500, 700, 1000, 10000]

# Mean 10-fold cross-validated accuracy on the training split for each C.
# penalty and solver are spelled out so the search does not depend on the
# installed scikit-learn version's defaults.
out = []
for c in grid:
    lrr = LogisticRegression(penalty='l2', C=c, solver='liblinear')
    scores = cross_val_score(lrr, X_train, y_train, cv=10)
    out.append(scores.mean())

# Pick the C with the highest mean CV accuracy; max() keeps the first maximum,
# so ties resolve to the earliest (smallest) C in the grid.
bestc, _ = max(zip(grid, out), key=lambda pair: pair[1])
print("best value of C is = ", bestc)

# Refit on the whole training split with the selected C.
lrr = LogisticRegression(penalty='l2', C=bestc, solver='liblinear')
lrr.fit(X_train, y_train)


print('Results for l2 (ridge) Logistic Regression')
print('Best C from 10-fold CV on train dataset')
print('-------------------------------------------------------')
print('\n Percentage accuracy Out of Sample')
print("{:.2f}%".format(lrr.score(X_test, y_test)*100))

best value of C is =  1
Results for l2 (ridge) Logistic Regression
Best C from 10-fold CV on train dataset
-------------------------------------------------------

 Percentage accuracy Out of Sample
91.51%


In [39]:
# Candidate values of C (inverse regularization strength) for the l1 penalty.
grid = [.1, 1, 10, 100, 200, 300, 500, 700, 1000, 10000]

# Mean 10-fold cross-validated accuracy on the training split for each C.
# The l1 penalty needs a solver that supports it: 'liblinear' does, while the
# 'lbfgs' default of scikit-learn >= 0.22 raises a ValueError for penalty='l1'.
# random_state is fixed so repeated runs give the same fit.
out = []
for c in grid:
    lrl = LogisticRegression(penalty='l1', C=c, solver='liblinear', random_state=42)
    scores = cross_val_score(lrl, X_train, y_train, cv=10)
    out.append(scores.mean())

# Pick the C with the highest mean CV accuracy; max() keeps the first maximum,
# so ties resolve to the earliest (smallest) C in the grid.
bestc, _ = max(zip(grid, out), key=lambda pair: pair[1])
print("best value of C is = ", bestc)

# Refit on the whole training split with the selected C.
lrl = LogisticRegression(penalty='l1', C=bestc, solver='liblinear', random_state=42)
lrl.fit(X_train, y_train)


print('Results for l1 (lasso) Logistic Regression')
print('Best C from 10-fold CV on train dataset')
print('-------------------------------------------------------')
print('\n Percentage accuracy Out of Sample')
print("{:.2f}%".format(lrl.score(X_test, y_test)*100))

best value of C is =  1
Results for l1 (lasso) Logistic Regression
Best C from 10-fold CV on train dataset
-------------------------------------------------------

 Percentage accuracy Out of Sample
90.57%


### Results ###

* Overall, out-of-sample accuracies are very good for all three methods tried.

* A grid of regularization parameters is used for l2 and l1 regularization.

* Logistic Regression with C=1e9 (effectively no regularization; this is not the scikit-learn default of C=1) does quite well out-of-sample.

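* Imposing regularization appears to help modestly (89.62% vs. 91.51% accuracy) on this dataset. With 106 test observations this gap corresponds to only two additional correct predictions (95 vs. 97), so the difference should not be over-interpreted.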

* Best out-of-sample results are found by using l2 regularized (Ridge) Logistic regression.

* l1 regularized Logistic Regression is also quite good at 90.57% test set accuracy.