In [36]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from plotnine import *

Data Cleaning

In [37]:
# Location of the Hitters data set on the author's machine
HITTERS_CSV = "/Users/avanti/OneDrive/MASTERS/GSB544-Computing and Machine Learning/Labs/Lab Data/Hitters.csv"

# Read the data and discard every row that has at least one missing value
df = pd.read_csv(HITTERS_CSV).dropna()

In [38]:
# ChatGPT helped annotate and edit code

def tune_regression(X, y, model_type="linear", alpha_values=[0.001, 0.01, 0.1, 1, 10], l1_ratio_values=[0.0, 0.25, 0.5, 0.75, 1.0], cv=5):
    """
    Tune a preprocessing + regression pipeline with GridSearchCV and report its coefficients.

    The pipeline one-hot encodes object-dtype columns, standardizes numeric columns,
    passes any remaining columns through unchanged, and then fits the requested
    linear model. Candidates are scored with negative mean squared error; the
    printed scores are sign-flipped so they read as plain MSE.

    Parameters:
        X (pd.DataFrame): The input features. Columns are selected by dtype, so a
            DataFrame is expected by the preprocessing step.
        y (pd.Series or np.array): The target variable for regression.
        model_type (str): One of 'linear', 'ridge', 'lasso', or 'elasticnet'.
        alpha_values (list): Candidate regularization strengths. Used for 'ridge',
            'lasso' and 'elasticnet'; ignored for 'linear'.
        l1_ratio_values (list): Candidate l1 ratios. Used for 'elasticnet' only.
        cv (int): Number of cross-validation folds for grid search.

    Returns:
        coefs_df (pd.DataFrame): One row per preprocessed feature, with columns
            "Feature Name" and "Coefficients", taken from the best pipeline
            fitted on all of X and y.

    Raises:
        ValueError: If model_type is not one of the supported names.

    Side effects:
        Prints the cross-validated MSE of every candidate, the best alpha and
        l1 ratio (when they were tuned), and the best cross-validated MSE.
    """
    # Map each supported model name to (estimator class, hyperparameter grid).
    # The default lists in the signature are only read here, never mutated.
    alpha_grid = {"regression__alpha": alpha_values}
    model_specs = {
        "linear": (LinearRegression, {}),  # nothing to tune: a single candidate is evaluated
        "ridge": (Ridge, alpha_grid),
        "lasso": (Lasso, alpha_grid),
        "elasticnet": (ElasticNet, {"regression__alpha": alpha_values,
                                    "regression__l1_ratio": l1_ratio_values}),
    }
    if model_type not in model_specs:
        raise ValueError("Unsupported model_type. Choose from 'lasso', 'ridge', 'elasticnet', or 'linear'.")
    model_class, param_grid = model_specs[model_type]

    # Preprocessing:
    # - Categorical (object dtype) features are one-hot encoded
    # - Numerical features are standardized (scaled)
    ct = ColumnTransformer([
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ], remainder="passthrough")

    # Preprocessing lives inside the pipeline so it is re-fitted on each CV training fold
    pipeline = Pipeline([
        ("preprocessing", ct),
        ("regression", model_class())
    ])

    # Grid search over the chosen grid, scored by negative MSE
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # With the default refit=True, best_estimator_ has already been re-fitted on
    # all of X and y using the best hyperparameters, so no second fit is needed.
    best_model = grid_search.best_estimator_

    # Pair each preprocessed feature name with its fitted coefficient
    coefs_df = pd.DataFrame({
        "Feature Name": best_model.named_steps['preprocessing'].get_feature_names_out(),
        "Coefficients": best_model.named_steps['regression'].coef_
    })

    # Scores are stored as negative MSE; flip the sign for reporting
    print("Cross-validated MSE scores:", -grid_search.cv_results_['mean_test_score'])

    # Report tuned hyperparameters only for models that actually have them
    best_alpha = grid_search.best_params_.get("regression__alpha", None)
    best_l1_ratio = grid_search.best_params_.get("regression__l1_ratio", None)
    if best_alpha is not None:
        print(f"Best alpha: {best_alpha}")
    if best_l1_ratio is not None:
        print(f"Best l1 ratio: {best_l1_ratio}")
    print(f"Best cross-validated MSE score: {-grid_search.best_score_}")

    # Return the DataFrame of coefficients for further analysis
    return coefs_df


Part A: Regression without Regularization
1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary linear regression
2. Fit this pipeline to the full dataset, and interpret a few of the most important coefficients.
3. Use cross-validation to estimate the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [39]:
# Separate the Salary target from the predictor columns
y = df["Salary"]
X = df.drop(columns=["Salary"])

# Hold out 20% of the rows with a fixed seed (not used by the call below)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Ordinary linear regression on the full data; the returned coefficient table is the cell output
tune_regression(X=X, y=y, model_type="linear")

Cross-validated MSE scores: [121136.31031817]
Best cross-validated MSE score: 121136.31031816892
Cross-validated MSE scores: [121136.31031817]
Best cross-validated MSE score: 121136.31031816892


Unnamed: 0,Feature Name,Coefficients
0,dummify__League_A,-31.299712
1,dummify__League_N,31.299712
2,dummify__Division_E,58.424623
3,dummify__Division_W,-58.424623
4,dummify__NewLeague_A,12.381163
5,dummify__NewLeague_N,-12.381163
6,standardize__AtBat,-291.094556
7,standardize__Hits,337.830479
8,standardize__HmRun,37.853837
9,standardize__Runs,-60.572479


Part B: Ridge Regression
1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary ridge regression
2. Use cross-validation to tune the lambda hyperparameter.
3. Fit the pipeline with your chosen lambda to the full dataset, and interpret a few of the most important coefficients.
4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [40]:
# Ridge: tune alpha over the function's default grid; the returned coefficient table is the cell output
tune_regression(X=X, y=y, model_type="ridge")

Cross-validated MSE scores: [121124.45859214 121022.90328584 120343.62106698 119144.43267692
 119348.9847757 ]
Best alpha: 1
Best cross-validated MSE score: 119144.43267691587
Cross-validated MSE scores: [121124.45859214 121022.90328584 120343.62106698 119144.43267692
 119348.9847757 ]
Best alpha: 1
Best cross-validated MSE score: 119144.43267691587


Unnamed: 0,Feature Name,Coefficients
0,dummify__League_A,-30.438855
1,dummify__League_N,30.438855
2,dummify__Division_E,60.015595
3,dummify__Division_W,-60.015595
4,dummify__NewLeague_A,13.111282
5,dummify__NewLeague_N,-13.111282
6,standardize__AtBat,-270.686441
7,standardize__Hits,296.64505
8,standardize__HmRun,18.100592
9,standardize__Runs,-29.339406


Part C: Lasso Regression
1. Create a pipeline that includes all the columns as predictors for Salary, and performs ordinary lasso regression
2. Use cross-validation to tune the lambda hyperparameter.
3. Fit the pipeline with your chosen lambda to the full dataset, and interpret a few of the most important coefficients.
4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [41]:
# Lasso: tune alpha over the function's default grid; the returned coefficient table is the cell output
tune_regression(X=X, y=y, model_type="lasso")



Cross-validated MSE scores: [120994.17981481 120964.76468618 120682.25263745 119761.58740741
 121828.14133339]
Best alpha: 1
Best cross-validated MSE score: 119761.58740741298
Cross-validated MSE scores: [120994.17981481 120964.76468618 120682.25263745 119761.58740741
 121828.14133339]
Best alpha: 1
Best cross-validated MSE score: 119761.58740741298




Unnamed: 0,Feature Name,Coefficients
0,dummify__League_A,-35.82607
1,dummify__League_N,6.938881e-15
2,dummify__Division_E,114.413
3,dummify__Division_W,-2.233911e-11
4,dummify__NewLeague_A,0.0
5,dummify__NewLeague_N,-0.0
6,standardize__AtBat,-282.371
7,standardize__Hits,304.3595
8,standardize__HmRun,11.12702
9,standardize__Runs,-24.96651


Part D: Elastic Net
1. Create a pipeline that includes all the columns as predictors for Salary, and performs elastic net regression
2. Use cross-validation to tune the lambda and alpha hyperparameters.
3. Fit the pipeline with your chosen hyperparameters to the full dataset, and interpret a few of the most important coefficients.
4. Report the MSE you would expect if you used this pipeline to predict 1989 salaries.

In [42]:
# Elastic net: tune alpha and l1_ratio jointly over the function's default grids; the returned coefficient table is the cell output
tune_regression(X=X, y=y, model_type="elasticnet")















Cross-validated MSE scores: [119911.32888951 120077.76864311 120296.21077347 120590.44872106
 120994.17981481 118957.96789175 119009.79955442 119123.79308625
 119404.65600998 120964.76468618 119805.47261377 119636.17022676
 119381.37557129 119036.41354271 120682.25263745 122029.76100625
 121374.33374621 120775.68540067 120356.59599777 119761.58740741
 150034.24612426 144021.73108229 136766.42534889 128407.50504979
 121828.14133339]
Best alpha: 0.01
Best l1 ratio: 0.0
Best cross-validated MSE score: 118957.9678917496
Cross-validated MSE scores: [119911.32888951 120077.76864311 120296.21077347 120590.44872106
 120994.17981481 118957.96789175 119009.79955442 119123.79308625
 119404.65600998 120964.76468618 119805.47261377 119636.17022676
 119381.37557129 119036.41354271 120682.25263745 122029.76100625
 121374.33374621 120775.68540067 120356.59599777 119761.58740741
 150034.24612426 144021.73108229 136766.42534889 128407.50504979
 121828.14133339]
Best alpha: 0.01
Best l1 ratio: 0.0
Best c



Unnamed: 0,Feature Name,Coefficients
0,dummify__League_A,-29.055921
1,dummify__League_N,29.055922
2,dummify__Division_E,60.813166
3,dummify__Division_W,-60.813166
4,dummify__NewLeague_A,12.395109
5,dummify__NewLeague_N,-12.395109
6,standardize__AtBat,-233.28853
7,standardize__Hits,249.932752
8,standardize__HmRun,5.366905
9,standardize__Runs,-6.952571


Part Two: Variable Selection
Based on the above results, decide on:
- Which numeric variable is most important.
- Which five numeric variables are most important.
- Which categorical variable is most important

For each of the four model specifications, compare the following possible feature sets:
1. Using only the one best numeric variable.
2. Using only the five best variables.
3. Using the five best numeric variables and their interactions with the one best categorical variable.

Report which combination of features and model performed best, based on the validation metric of MSE.

(Note: lambda and alpha must be re-tuned for each feature set.)