In [24]:
#packages
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Part 0: Data Cleaning

In [41]:
#import data set
hitters = pd.read_csv("/Users/conniechou/Library/CloudStorage/OneDrive-Personal/Connie/Homework/GSB544_Bodwin/Labs/Hitters.csv")
hitters

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


In [26]:
hitters.dtypes

AtBat          int64
Hits           int64
HmRun          int64
Runs           int64
RBI            int64
Walks          int64
Years          int64
CAtBat         int64
CHits          int64
CHmRun         int64
CRuns          int64
CRBI           int64
CWalks         int64
League        object
Division      object
PutOuts        int64
Assists        int64
Errors         int64
Salary       float64
NewLeague     object
dtype: object

All variable types look ok. Salary (dependent variable) is in float. League, Division, and NewLeague are categorical variables, so they are object type. Everything else is numerical integers.

In [27]:
#drop na values
hitters_df = hitters.dropna()
hitters_df

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,379,311,138,N,E,325,9,3,700.0,N
318,492,136,5,76,50,94,12,5511,1511,39,897,451,875,A,E,313,381,20,875.0,A
319,475,126,3,61,43,52,6,1700,433,7,217,93,146,A,W,37,113,7,385.0,A
320,573,144,9,85,60,78,8,3198,857,97,470,420,332,A,E,1314,131,12,960.0,A


# Part 1: Different Model Specs

## Regression without regularization

In [28]:
#defining our X and y in global environment because we want to use these for the entire lab
X = hitters_df.drop(["Salary"], axis = 1)
y = hitters_df["Salary"]

In [29]:
#function of the general steps for different modeling types
def model(regression_label, type_of_regression):
    
    """
    transforms columns and outputs a pipeline of the desired kind of regression model
  
    Parameter
    ---------
    regression_label : str
    A string represeting the label of the modeling type
    
    type_of_regression: sklearn function
    A specific function for the modeling type 

    Returns
    -------
    arrays 
    returns the pipeline of the specified type of model
    """
    
    ct = ColumnTransformer([
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown = "ignore"),
    make_column_selector(dtype_include=object)), #selecting all columns that are categorical
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number)) #standardizing numerical variables
    ], remainder = "passthrough" #keep everything else
    )
    
    lr_pipeline = Pipeline(
      [("preprocessing", ct),
      (regression_label, type_of_regression)]
    ).set_output(transform = "pandas")
    
    return lr_pipeline


In [30]:
#fitting pipeline onto ENTIRE dataset
pipeline = model(regression_label = "linear_regression", type_of_regression = LinearRegression())
fitted_pipeline = pipeline.fit(X, y)

In [31]:
#coefficients for linear model
coeff = fitted_pipeline.named_steps["linear_regression"].coef_
coeff

array([ -31.29971152,   31.29971152,   58.42462282,  -58.42462282,
         12.38116255,  -12.38116255, -291.0945557 ,  337.83047948,
         37.85383676,  -60.57247861,  -26.99498379,  135.07389695,
        -16.69335888, -391.03865466,   86.68761664,  -14.18172332,
        480.74713477,  260.68988581, -213.89225864,   78.76129639,
         53.73248973,  -22.16086217])

In [32]:
#cross validation mse for linear model
cross_val_scores = cross_val_score(pipeline, X, y, cv = 5, scoring = 'neg_mean_squared_error')
-cross_val_scores

array([ 76408.91409134, 129937.76968228,  79976.48830477, 204818.01839674,
       114540.36111572])

Interpreting coefficients:

Interpreting cross validation scores: 

## Ridge Regression

In [33]:
#creating ridge regression pipeline

pipeline_r = model(regression_label = "ridge_reg", type_of_regression = Ridge())

In [34]:
#creating dictionary with different lambda values for tuning
lambdas_list = [0.001, 0.01, 0.1, 1, 10]
lambdas = {'ridge_reg__alpha': lambdas_list}

#running girdsearchcv to tune to different lambda values
gscv = GridSearchCV(pipeline_r, lambdas, cv = 5, scoring='neg_mean_squared_error')

In [35]:
#fitting gridsearchcv to entire dataset
gscv_fitted = gscv.fit(X, y)

#creating dataframe for mse values and lambda values
ridge_df = pd.DataFrame(gscv_fitted.cv_results_)
ridge_df = ridge_df[["param_ridge_reg__alpha", "mean_test_score"]] #get the columns i want
ridge_df["mean_test_score"] = ridge_df["mean_test_score"].abs() #absolute value of mean square errors
ridge_df.sort_values(by = "mean_test_score", ascending = True) #sort mse values, want smallest one

Unnamed: 0,param_ridge_reg__alpha,mean_test_score
3,1.0,119144.432677
4,10.0,119348.984776
2,0.1,120343.621067
1,0.01,121022.903286
0,0.001,121124.458592


The model with the lambda value that equals 1 is the best model, since it has the lowest MSE value.

In [40]:
#fit ridge regression model on entire data set using this lambda = 1
pipeline_r_1 = model(regression_label = "ridge_reg", type_of_regression = Ridge(alpha = 1))
fitted_pipeline_r = pipeline_r_1.fit(X, y)

#coefficients for fitted ridge pipeline
coeff_r = fitted_pipeline_r.named_steps["ridge_reg"].coef_
coeff_r

array([ -30.43885531,   30.43885531,   60.01559493,  -60.01559493,
         13.11128155,  -13.11128155, -270.6864407 ,  296.64505003,
         18.10059158,  -29.33940613,   -9.11329453,  124.40717273,
        -38.66774782, -225.40654798,  126.65960655,   39.07092364,
        320.41216891,  160.38678418, -184.4236106 ,   78.62365619,
         47.46259711,  -23.72419031])

In [None]:
#mse for fitted_pipeline_r??????????????


## Lasso Regression

In [42]:
#creating ridge regression pipeline

pipeline_lasso = model(regression_label = "lasso_reg", type_of_regression = Lasso())

In [44]:
lambdas_list = [0.001, 0.01, 0.1, 1, 10]
lambdas = {'lasso_reg__alpha': lambdas_list}

#running girdsearchcv to tune to different lambda values
gscv_l = GridSearchCV(pipeline_lasso, lambdas, cv = 5, scoring='neg_mean_squared_error')

In [46]:
#fitting gridsearchcv to entire dataset
gscv_fitted_l = gscv_l.fit(X, y)

#creating dataframe for mse values and lambda values
lasso_df = pd.DataFrame(gscv_fitted_l.cv_results_)
lasso_df = lasso_df[["param_lasso_reg__alpha", "mean_test_score"]] #get the columns i want
lasso_df["mean_test_score"] = ridge_df["mean_test_score"].abs() #absolute value of mean square errors
lasso_df.sort_values(by = "mean_test_score", ascending = True) #sort mse values, want smallest one

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,param_lasso_reg__alpha,mean_test_score
3,1.0,119144.432677
4,10.0,119348.984776
2,0.1,120343.621067
1,0.01,121022.903286
0,0.001,121124.458592


Model with lambda = 1 is the best model, since it has the lowest MSE.

In [47]:
#fit lasso regression model on entire data set using this lambda = 1
pipeline_l_1 = model(regression_label = "lasso_reg", type_of_regression = Lasso(alpha = 1))
fitted_pipeline_l = pipeline_l_1.fit(X, y)

#coefficients for fitted ridge pipeline
coeff_l = fitted_pipeline_l.named_steps["lasso_reg"].coef_
coeff_l

array([-3.58260721e+01,  9.97464147e-14,  1.14412951e+02, -2.07892950e-11,
        0.00000000e+00, -0.00000000e+00, -2.82370957e+02,  3.04359509e+02,
        1.11270220e+01, -2.49665071e+01, -0.00000000e+00,  1.20695275e+02,
       -3.49481481e+01, -1.62639794e+02,  0.00000000e+00,  1.42259932e+01,
        3.75565519e+02,  1.92610892e+02, -1.89644642e+02,  7.87603658e+01,
        4.19966795e+01, -1.84793784e+01])

In [None]:
#MSE??????????????

## Elastic Net

In [48]:
pipeline_en = model(regression_label = "elastic_net", type_of_regression = ElasticNet())

In [49]:
#lambda AND alpha lists

lambdas_list = [0.001, 0.01, 0.1, 1, 10]
alpha_list = [0.001, 0.01, 0.1, 1, 10]
param = {'elastic_net__alpha': lambdas_list,
          "elastic_net__l1_ratio" : alpha_list}

#running girdsearchcv to tune to different lambda values
gscv_en = GridSearchCV(pipeline_en, param, cv = 5, scoring='neg_mean_squared_error')
gscv_fitted_en = gscv_en.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [53]:
#creating dataframe for mse values and lambda values
en_df = pd.DataFrame(gscv_fitted_en.cv_results_)

en_df = en_df[["param_elastic_net__alpha", "param_elastic_net__l1_ratio", "mean_test_score"]] #get the columns i want
en_df["mean_test_score"] = en_df["mean_test_score"].abs() #absolute value of mean square errors
en_df.sort_values(by = "mean_test_score", ascending = True).head() #sort mse values, want smallest one

Unnamed: 0,param_elastic_net__alpha,param_elastic_net__l1_ratio,mean_test_score
5,0.01,0.001,118958.09879
6,0.01,0.01,118959.303084
7,0.01,0.1,118973.789477
12,0.1,0.1,119745.301065
18,1.0,1.0,119761.587407


Best model with elastic net is when the lambda value is 0.01 and the alpha value is 0.001. It has the lowest MSE.

In [55]:
pipeline_en = model(regression_label = "elastic_net", type_of_regression = ElasticNet(alpha = 0.01, l1_ratio = 0.001))
fitted_pipeline_en = pipeline_en.fit(X, y)

#coefficients for fitted ridge pipeline
coeff_en = fitted_pipeline_en.named_steps["elastic_net"].coef_
coeff_en

  model = cd_fast.enet_coordinate_descent(


array([ -29.05785518,   29.05785595,   60.81261747,  -60.81261663,
         12.39660091,  -12.39660087, -233.34388128,  249.99503586,
          5.38062416,   -6.9773345 ,    1.89754395,  111.88494513,
        -49.50593632, -122.2410746 ,  123.66236751,   55.63591034,
        226.92489624,  122.96379112, -156.53102835,   77.97700305,
         41.46248505,  -24.75284625])

In [56]:
#dataframe of all coefficients
coeff_df = pd.DataFrame({"linear": coeff,
                         "ridge": coeff_r,
                         "lasso": coeff_l,
                         "elastic_net": coeff_en})
coeff_df

Unnamed: 0,linear,ridge,lasso,elastic_net
0,-31.299712,-30.438855,-35.82607,-29.057855
1,31.299712,30.438855,9.974641e-14,29.057856
2,58.424623,60.015595,114.413,60.812617
3,-58.424623,-60.015595,-2.078929e-11,-60.812617
4,12.381163,13.111282,0.0,12.396601
5,-12.381163,-13.111282,-0.0,-12.396601
6,-291.094556,-270.686441,-282.371,-233.343881
7,337.830479,296.64505,304.3595,249.995036
8,37.853837,18.100592,11.12702,5.380624
9,-60.572479,-29.339406,-24.96651,-6.977334
