# Lecture 19 – part II

## Prediction with Lasso 

   - 3 sample approach:            
       - train and test sample     
         to do cross-validation    
         or tuning                 
       - hold-out sample to        
         evaluate prediction       
   - Model selection with:         
     - lin.regression with cv      
     - lasso (ridge & elastic net) 
   - Diagnostics and evaluation    
     - which model gives           
       best prediction on hold-out 
     - stability of the prediction 
     - further diagnostics with    
         figures              
                                            
#### Case Study:                                 
  - CH14B Predicting AirBnB apartment prices: selecting a regression model       

####  Dataset:       
    airbnb
---

In [None]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import patchworklib as pw
import patsy
import statsmodels.formula.api as smf
from mizani.formatters import percent_format
from plotnine import *
from skimpy import skim
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

%matplotlib inline
warnings.filterwarnings("ignore")


Import work data

In [None]:
# Read the work file into `data`; the relative path is resolved against the
# current working directory.
data = pd.read_csv("data/airbnb_hackney_work.csv")


In [None]:
# Display skimpy's summary of every column in `data`.
skim(data)

## Setting up models

Define groups of variables: each tuple below holds the names of related variables

In [None]:
# Groups of column names that are later joined into model formulas.
# NOTE(review): the prefixes presumably mean n_ = numeric, f_ = factor,
# d_ = dummy, flag_ = missing-value flag -- confirm against the data prep.

# Basic predictors in levels
basic_lev = (
    "n_accommodates",
    "n_beds",
    "f_property_type",
    "f_room_type",
    "n_days_since",
    "flag_days_since",
)
# Additional basic factor variables
basic_add = ("f_bathroom", "f_cancellation_policy", "f_bed_type")
# Review-related variables
reviews = ("f_number_of_reviews", "n_review_scores_rating", "flag_review_scores_rating")
# Higher-order polynomial terms (columns already present in the data)
poly_lev = ("n_accommodates2", "n_days_since2", "n_days_since3")
# p_host_response_rate is not used because of missing observations
# Every column whose name starts with "d_"
amenities = tuple(list(data.filter(regex="^d_.*")))
                 

Look for interactions


In [None]:
def price_diff_by_variables(df, factor_var, dummy_var, factor_lab, dummy_lab):
    """Bar chart of mean price by `factor_var`, split by `dummy_var`.

    For every (factor_var, dummy_var) group the mean of ``price`` is drawn as
    a dodged bar, with an error bar of mean +/- 1.96 standard errors
    (a normal-approximation 95% interval).

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``price`` and the two grouping columns.
    factor_var : str
        Column shown on the x axis.
    dummy_var : str
        Column used for the fill colour (converted with ``factor(...)``).
    factor_lab : str
        x-axis label.
    dummy_lab : str
        Legend title.

    Returns
    -------
    plotnine.ggplot
        The assembled plot (not drawn or saved here).

    Notes
    -----
    Only two fill colours ("blue", "red") are supplied.
    NOTE(review): a `dummy_var` with more than two levels presumably needs
    more values in the manual scales -- confirm with plotnine.
    """
    # String aliases instead of np.mean / np.std: pandas deprecates passing
    # NumPy callables to groupby.agg, and the aliases keep the current
    # behaviour (group mean and sample standard deviation).
    stats = df.groupby([factor_var, dummy_var]).agg(
        Mean=("price", "mean"), sd=("price", "std"), size=("price", "size")
    )
    stats["se"] = stats["sd"] / stats["size"] ** (1 / 2)
    stats["Mean_l"] = stats["Mean"] - (1.96 * stats["se"])
    stats["Mean_u"] = stats["Mean"] + (1.96 * stats["se"])
    stats = stats.drop(["sd", "size"], axis=1).reset_index()

    # After reset_index the columns are
    # [factor_var, dummy_var, "Mean", "se", "Mean_l", "Mean_u"],
    # so the aesthetics are named explicitly instead of by position.
    plot = (
        ggplot(
            stats,
            aes(
                factor_var,
                "Mean",
                fill="factor(" + dummy_var + ")",
            ),
        )
        + geom_bar(stat="identity", position=position_dodge(width=0.9))
        + geom_errorbar(
            aes(ymin="Mean_l", ymax="Mean_u"),
            position=position_dodge(width=0.9),
            width=0.25,
        )
        + scale_color_manual(name=dummy_lab, values=("blue", "red"))
        + scale_fill_manual(name=dummy_lab, values=("blue", "red"))
        + ylab("Mean Price")
        + xlab(factor_lab)
        + theme_bw()
        + theme(
            panel_grid_major=element_blank(),
            panel_grid_minor=element_blank(),
            panel_border=element_blank(),
            axis_line=element_line(),
            legend_position="top",
            legend_box="vertical",
            legend_text=element_text(size=5),
            legend_title=element_text(size=5, face="bold"),
        )
    )

    return plot

Look up room type interactions

In [None]:
# Mean price by room type, split by the family/kid-friendly dummy
p1 = price_diff_by_variables(
    data, "f_room_type", "d_familykidfriendly", "Room type", "Family kid friendly"
)


# Mean price by room type, split by property type (a factor passed in the
# `dummy_var` position)
p2 = price_diff_by_variables(
    data, "f_room_type", "f_property_type", "Room type", "Property type"
)

Look up cancellation policy interactions


In [None]:
# Mean price by cancellation policy, split by the family/kid-friendly dummy
p3 = price_diff_by_variables(
    data,
    "f_cancellation_policy",
    "d_familykidfriendly",
    "Cancellation policy",
    "Family kid friendly",
)


# Mean price by cancellation policy, split by the TV dummy
p4 = price_diff_by_variables(
    data, "f_cancellation_policy", "d_tv", "Cancellation policy", "TV"
)


Look up property type interactions

In [None]:
# Mean price by property type, split by the cats dummy
p5 = price_diff_by_variables(data, "f_property_type", "d_cats", "Property type", "Cats")


# Mean price by property type, split by the dogs dummy
p6 = price_diff_by_variables(data, "f_property_type", "d_dogs", "Property type", "Dogs")


In [None]:
# Convert each plotnine plot into a patchworklib object so they can be
# combined into one figure.
g1 = pw.load_ggplot(p1, figsize=(3, 3))
g2 = pw.load_ggplot(p2, figsize=(3, 3))
g3 = pw.load_ggplot(p3, figsize=(3, 3))
g4 = pw.load_ggplot(p4, figsize=(3, 3))
g5 = pw.load_ggplot(p5, figsize=(3, 3))
g6 = pw.load_ggplot(p6, figsize=(3, 3))

# `|` places panels side by side and `/` stacks rows: a 3 x 2 grid.
interactions = (g1 | g2) / (g3 | g4 ) / (g5 | g6)

In [None]:
# Render the combined figure.
# NOTE(review): no file name is passed, so presumably nothing is written to
# disk and the figure is only returned for display -- confirm with patchworklib.
interactions.savefig()

### Create the interaction terms

Interactions suggested by the graphs

In [None]:
# In a patsy formula `a*b` expands to `a + b + a:b` (main effects plus interaction).
X1 = ("f_room_type*f_property_type", "f_room_type*d_familykidfriendly")


 Additional interactions of factors and dummies

In [None]:
# Property type interacted with three amenity dummies
X2 = (
    "d_airconditioning*f_property_type",
    "d_cats*f_property_type",
    "d_dogs*f_property_type",
)
# A single formula fragment (a str, not a tuple): four factor variables
# interacted with every column listed in `amenities`
X3 = (
    "(f_property_type + f_room_type + f_cancellation_policy + f_bed_type) * ("
    + "+".join(amenities)
    + ")"
)


### Create model setups

Create the models in levels: models 1-8

In [None]:
# Right-hand sides of the eight nested model formulas ("price" is prepended
# where the models are estimated). Each model extends the previous one, so the
# term tuples are accumulated step by step.
_terms_m2 = basic_lev
_terms_m3 = _terms_m2 + basic_add + reviews
_terms_m4 = _terms_m3 + poly_lev
_terms_m5 = _terms_m4 + X1
_terms_m6 = _terms_m5 + X2
_terms_m7 = _terms_m6 + amenities

model_eq1 = "~ n_accommodates"
model_eq2 = "~" + "+".join(_terms_m2)
model_eq3 = "~" + "+".join(_terms_m3)
model_eq4 = "~" + "+".join(_terms_m4)
model_eq5 = "~" + "+".join(_terms_m5)
model_eq6 = "~" + "+".join(_terms_m6)
model_eq7 = "~" + "+".join(_terms_m7)
# Model 8 = model 7 plus the factor-by-amenity interactions held in X3
model_eq8 = model_eq7 + "+" + X3

model_equations = [
    model_eq1,
    model_eq2,
    model_eq3,
    model_eq4,
    model_eq5,
    model_eq6,
    model_eq7,
    model_eq8,
]

Create a holdout set (20% of observations)

In [None]:
# Holdout size as an absolute number of rows: one fifth of all observations
# (integer division).
smp_size = data.shape[0] // 5

Set the seed of the random number generator: this makes the results reproducible


In [None]:
# Seed NumPy's global random number generator.
np.random.seed(20180123)

Create work and holdout set with sklearn's train_test_split function

In [None]:
from sklearn.model_selection import train_test_split
# `test_size` is an int here, so it is the number of holdout rows. No
# `random_state` is passed, so the shuffle draws from NumPy's global generator.
data_work, data_holdout = train_test_split(data, test_size=smp_size)

### Utilize the Working data set:
  1. estimate measures on the whole working sample (R2,BIC,RMSE)
  2. Do K-fold cross-validation to get a proper test RMSE


In [None]:
# Number of cross-validation folds passed to ols_crossvalidator
n_folds = 5

In [None]:
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import mse, rmse

# Unshuffled K-fold splitter.
# NOTE(review): `k` and `mse` are not referenced again at notebook level in the
# visible source; ols_crossvalidator builds its own local splitter.
k = KFold(n_splits=n_folds, shuffle=False, random_state=None)


In [None]:
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import rmse


def ols_crossvalidator(
    formula: str, data: pd.DataFrame, n_folds=5, average_rmse=True
) -> dict:
    """Fit an OLS formula on `data` and cross-validate its RMSE.

    The model is first estimated on all rows of `data` to obtain R-squared,
    BIC, the number of coefficients and the in-sample RMSE. It is then
    re-estimated on the training part of each of `n_folds` consecutive
    (unshuffled) folds, and the RMSE is computed on both the training and the
    test part of every fold.

    Parameters
    ----------
    formula : str
        Formula estimated by OLS; the text left of ``~`` is used as the name
        of the dependent-variable column.
    data : pd.DataFrame
        Data in wide format.
    n_folds : int, default=5
        Number of folds. Must be at least 2.
    average_rmse : bool, default=True
        If True, return the mean train and test RMSE over the folds;
        otherwise return the per-fold lists.

    Returns
    -------
    dict
        Keys ``"RMSE"``, ``"R-squared"``, ``"BIC"``, ``"Coefficients"``
        (full-sample fit) and ``"Training RMSE"``, ``"Test RMSE"``
        (cross-validation).
    """
    # Dependent variable: whatever stands left of "~"
    target = formula.split("~")[0].strip()

    # Full-sample estimation
    full_fit = smf.ols(formula, data=data).fit()
    full_sample_stats = {
        "RMSE": rmse(full_fit.predict(), data[target]),
        "R-squared": full_fit.rsquared,
        "BIC": full_fit.bic,
        # df_model + 1 rather than params.shape[0]: the two can differ when
        # predictors are collinear
        "Coefficients": full_fit.df_model + 1,
    }

    # Fold-by-fold estimation
    splitter = KFold(n_splits=n_folds, shuffle=False, random_state=None)
    fold_scores = []
    for train_rows, test_rows in splitter.split(data):
        fold_train = data.iloc[train_rows, :]
        fold_test = data.iloc[test_rows, :]
        fold_fit = smf.ols(formula, data=fold_train).fit()
        fold_scores.append(
            (
                rmse(fold_train[target], fold_fit.predict(fold_train)),
                rmse(fold_test[target], fold_fit.predict(fold_test)),
            )
        )

    train_scores = [train_score for train_score, _ in fold_scores]
    test_scores = [test_score for _, test_score in fold_scores]
    if average_rmse:
        train_scores = np.mean(train_scores)
        test_scores = np.mean(test_scores)

    return {
        **full_sample_stats,
        "Training RMSE": train_scores,
        "Test RMSE": test_scores,
    }


def summarize_cv(cvlist, stat="rmse"):
    """Tabulate a per-fold statistic for several cross-validated models.

    Parameters
    ----------
    cvlist : list of dict
        One dict per model; ``cvlist[i][stat]`` must hold that model's
        per-fold values (all models with the same number of folds).
    stat : str, default="rmse"
        Key to read from each dict. The dicts returned by
        `ols_crossvalidator` use ``"Training RMSE"`` and ``"Test RMSE"``, so
        pass one of those explicitly when summarising its output (with
        ``average_rmse=False``).

    Returns
    -------
    pd.DataFrame
        Columns ``"Model 1"``, ``"Model 2"``, ...; rows ``"Fold1"``,
        ``"Fold2"``, ... followed by an ``"Average"`` row holding the column
        means.
    """
    # Iterate over the argument itself, not over a notebook-level variable,
    # so the function works for any list that is passed in.
    result = pd.DataFrame(
        {f"Model {pos}": cv[stat] for pos, cv in enumerate(cvlist, start=1)}
    )
    result["Resample"] = [
        f"Fold{fold}" for fold in range(1, len(cvlist[0][stat]) + 1)
    ]
    result = result.set_index("Resample")
    # Append the across-fold mean of every model as a final row
    return pd.concat([result, result.mean().to_frame("Average").T])

In [None]:
# Cross-validate every model on the work set, keeping the per-fold RMSEs
# (average_rmse=False) so they can be tabulated fold by fold.
# NOTE: the loop variable `model_eq` remains defined at notebook level after
# the loop, holding the last equation.
cv_list = []
for model_eq in model_equations:
    cv_list += [
        ols_crossvalidator("price" + model_eq, data_work, n_folds, average_rmse=False)
    ]

In [None]:
# Table of test RMSEs: one row per model and one column per fold, plus an
# "Average" column; then transposed (folds as rows) and rounded to 2 decimals.
rmse_test_folds = (
    pd.DataFrame(
        [cv["Test RMSE"] for cv in cv_list],
        index=["Model " + str(i + 1) for i in range(len(cv_list))],
        columns=["Fold" + str(i + 1) for i in range(len(cv_list[0]["Test RMSE"]))],
    )
    .assign(Average=lambda x: x.mean(axis=1))
    .T.round(2)
)
rmse_test_folds

Re-estimate the same models with `average_rmse=True`

In [None]:
# Same cross-validation, now storing only the fold-averaged train/test RMSE.
# NOTE: `model_eq` remains defined after the loop (holding the last equation),
# so check later uses before turning this into a comprehension.
cv_list = []
for model_eq in model_equations:
    cv_list += [
        ols_crossvalidator("price" + model_eq, data_work, n_folds, average_rmse=True)
    ]

In [None]:
# One row per model (default integer index 0..7): round to 2 decimals, add a
# model label, cast BIC and the coefficient count to int, and keep only the
# listed columns (the full-sample "RMSE" column is dropped).
compare_model_fits = (
    pd.DataFrame(cv_list)
    .round(2)
    .assign(
        Model=["M" + str(i + 1) for i in range(len(cv_list))],
        BIC=lambda x: x["BIC"].astype(int),
        Coefficients=lambda x: x["Coefficients"].astype(int),
    )
    .filter(["Model", "Coefficients", "R-squared", "BIC", "Training RMSE", "Test RMSE"])
)
compare_model_fits

RMSE training vs test graph

In [None]:
# Line chart of training vs. test RMSE against the number of coefficients
# (treated as a discrete x axis).
# NOTE(review): the y-axis limits (30, 43) are hard-coded; values outside that
# range are presumably dropped from the plot -- confirm for other data.
(
    ggplot(
        compare_model_fits.melt(
            id_vars="Coefficients", value_vars=["Training RMSE", "Test RMSE"]
        ),
        aes(x="factor(Coefficients)", y="value", color="variable", group="variable"),
    )
    + geom_line(size=1, show_legend=True, na_rm=True)
    + scale_color_manual(name=" ", values=("red", "blue"))
    + scale_y_continuous(name="RMSE", limits=(30, 43), breaks=np.arange(30, 44, 2))
    + scale_x_discrete(name="Number of coefficients", expand=(0.01, 0.01))
    + theme_bw()
)

### Lasso - first, we do cross validation by hand for educational purposes

Take model equation 8

In [None]:
# Refer to model 8 by name, so this does not depend on whichever value an
# earlier loop happened to leave in a loop variable.
vars_model_8 = model_eq8

Define range for lambdas – the algo will look only in this set

In [None]:
# Candidate penalty values 0.05, 0.10, ..., 1.00; the stop value 1.01 makes
# sure 1.00 is included.
lambdas = np.arange(0.05, 1.01, 0.05)
print(lambdas)

Create train and test sets to perform the cross validation on

In [None]:
# Build the outcome vector and the design matrix of the chosen model on the
# work set; the column names are saved for labelling coefficients later.
y_work, X_work = patsy.dmatrices("price" + vars_model_8, data_work)
X_work_featnames = X_work.design_info.column_names


# Split the work set into train and test parts. `test_size` is the absolute
# row count `smp_size` (computed from the full data set).
X_train, X_test, y_train, y_test = train_test_split(
    X_work, y_work, test_size=smp_size, random_state=10
)

To cross-validate lambda-s (which is Lasso's hyperparameter) one has to scale the feature matrix.

Note: scale the train and test feature matrices in separate steps, so that the test set never influences the scaling of the training set (no information spillover between sets)

In [None]:
scaler = StandardScaler()
# Learn the scaling (mean and standard deviation) on the training rows only ...
X_train = scaler.fit_transform(X_train)
# ... and apply exactly that scaling to the test rows. Re-fitting on the test
# set would standardise it with different statistics than those the model was
# trained with.
X_test = scaler.transform(X_test)
# Finally re-fit on the whole work set, which is used for the final models.
# After this line `scaler` holds the work-set statistics.
X_work = scaler.fit_transform(X_work)

Cross-validate, now manually in a for loop

In [None]:
# One slot per candidate lambda for each metric
train_r_squared = np.zeros(len(lambdas))
test_r_squared = np.zeros(len(lambdas))
train_rmse = np.zeros(len(lambdas))
test_rmse = np.zeros(len(lambdas))

# Fit a Lasso on the training part for every lambda and record R^2 and RMSE
# on both the training and the test part.
for ind, i in enumerate(lambdas):
    reg = Lasso(alpha=i)
    reg.fit(X_train, y_train)
    train_r_squared[ind] = reg.score(X_train, y_train)
    test_r_squared[ind] = reg.score(X_test, y_test)
    # reshape(1, -1)[0] flattens the outcome to 1-D to match the predictions
    train_rmse[ind] = rmse(reg.predict(X_train), y_train.reshape(1,-1)[0])
    test_rmse[ind] = rmse(reg.predict(X_test), y_test.reshape(1,-1)[0])

Plot the evolution of $R^2$ depending on lambdas in the train and test set. Recall bias-variance trade-off!

In [None]:
# Long-format table: one row per (lambda, data set) with the R^2 value
r_squared_data = pd.DataFrame(
    {
        "$R^2$ Test set": test_r_squared,
        "$R^2$ Training set": train_r_squared,
        "lambda": lambdas,
    }
).melt(id_vars=["lambda"])

# Fix the category order so the training set comes first in the colour scale
r_squared_data["variable"] = (
    r_squared_data["variable"]
    .astype("category")
    .cat.reorder_categories(["$R^2$ Training set", "$R^2$ Test set"])
)


(
    ggplot(r_squared_data, aes(x="lambda", y="value", color="variable"))
    + geom_point()
    + geom_line(size=1, show_legend=False, na_rm=True)
    + scale_color_manual(name="", values=("blue", "red"))
    + scale_y_continuous(name="$R^2$")
    # Raw string: "\l" is not a valid escape sequence in a normal string literal
    + scale_x_continuous(name=r"$\lambda$", limits=(0, 1))
    + theme_bw()
    + theme(subplots_adjust={"wspace": 0.25}, legend_title=element_blank())
)

Plot the evolution of RMSE depending on lambdas in the train and test set.

In [None]:
# Long-format table: one row per (lambda, data set) with the RMSE value.
# The variable name `r_squared_data` is reused here although it now holds RMSEs.
r_squared_data = pd.DataFrame(
    {
        "RMSE Test set": test_rmse,
        "RMSE Training set": train_rmse,
        "lambda": lambdas,
    }
).melt(id_vars=["lambda"])

# Fix the category order so the training set comes first in the colour scale
r_squared_data["variable"] = (
    r_squared_data["variable"]
    .astype("category")
    .cat.reorder_categories(["RMSE Training set", "RMSE Test set"])
)


(
    ggplot(r_squared_data, aes(x="lambda", y="value", color="variable"))
    + geom_point()
    + geom_line(size=1, show_legend=False, na_rm=True)
    + scale_color_manual(name="", values=("blue", "red"))
    + scale_y_continuous(name="RMSE", limits=(31, 38))
    # Raw string: "\l" is not a valid escape sequence in a normal string literal
    + scale_x_continuous(name=r"$\lambda$", limits=(0, 1))
    + theme_bw()
    + theme(subplots_adjust={"wspace": 0.25}, legend_title=element_blank())
)

Extract the lambda with the lowest test RMSE

In [None]:
# Pair every lambda with its test RMSE ...
df_lam = pd.DataFrame(test_rmse, columns=["RMSE test"])
df_lam["lambda"] = lambdas

# ... and show the row with the smallest test RMSE
df_lam.loc[df_lam["RMSE test"].idxmin()]


Re-estimate Lasso model with cross-validated lambda

In [None]:
# The callable passed to .loc selects the row with the lowest test RMSE; its
# "lambda" value becomes the penalty. The model is fit on the whole work set.
lasso_best = Lasso(alpha=df_lam.loc[lambda x: x["RMSE test"].idxmin()]["lambda"])
lasso_best.fit(X_work, y_work)

See the coefficients that are non-zero

In [None]:
# Label the coefficients with the design-matrix column names and keep only
# those that are not exactly zero.
notnull_lasso_coefs = pd.DataFrame(
    lasso_best.coef_, index=X_work_featnames, columns=["coefficient"]
).loc[lambda x: x["coefficient"] != 0]
notnull_lasso_coefs

Calculate the MSE on the test set (a subset of the work set)

In [None]:
from sklearn.metrics import mean_squared_error

# NOTE(review): lasso_best was fit on X_work, from which the X_test rows were
# drawn, so this is not an out-of-sample error.
mean_squared_error(y_test, lasso_best.predict(X_test))


### One can alternatively do cross validation with sklearn's LassoCV

In production, this is much easier, we did all above step by step manually for educational purposes.

In [None]:
from sklearn.linear_model import LassoCV

# 5-fold cross-validation over the same candidate lambdas, on the scaled work set
lasso_cv_fit = LassoCV(alphas=lambdas, cv=5, random_state=42).fit(X_work, y_work)

Now the algorithm chooses a different alpha. This is because it did 5-fold CV for each alpha and chose the one with the best average test error

In [None]:
# Penalty value selected by the cross-validation
lasso_cv_fit.alpha_

In [None]:
# Average test RMSE per lambda: square root of every fold's MSE, then the
# mean across folds. The rows of `mse_path_` follow the order of `alphas_`,
# so the fitted attribute is used as the index instead of a hand-reversed
# copy of `lambdas`.
rmse_lambda_folds = (
    pd.DataFrame(lasso_cv_fit.mse_path_, index=lasso_cv_fit.alphas_)
    .apply(np.sqrt)
    .mean(axis=1)
    .to_frame(name="test RMSE")
    # Raw string: "\l" is not a valid escape sequence in a normal string literal
    .rename_axis(r"$\lambda$")
)
rmse_lambda_folds.round(3)

In [None]:
# Non-zero coefficients of the cross-validated Lasso, rounded to 3 decimals
notnull_lasso_coefs = (
    pd.DataFrame(lasso_cv_fit.coef_, index=X_work_featnames, columns=["coefficient"])
    .loc[lambda x: x["coefficient"] != 0]
    .round(3)
)
notnull_lasso_coefs

In [None]:
# Append the Lasso as a new row with index label 8 (the OLS models occupy
# 0..7). Only three columns are given in the dict.
# NOTE(review): the remaining columns presumably end up as NaN -- confirm.
compare_model_fits.loc[8, :] = {
    "Model": "Lasso",
    "Coefficients": notnull_lasso_coefs.shape[0],
    "Test RMSE": rmse_lambda_folds.loc[lasso_cv_fit.alpha_].round(2).values[0],
}

compare_model_fits

### Elastic net

We can also cross-validate Elastic Net regression which is a regularized regression method that linearly combines the L1 and L2 penalties of the LASSO and Ridge methods.

In [None]:
from sklearn.linear_model import ElasticNetCV

ElasticNetCV's main parameter is `l1_ratio` (default=0.5), a float between 0 and 1 that is passed to ElasticNet and scales between the L1 and L2 penalties. For `l1_ratio` = 0 the penalty is an L2 penalty. For `l1_ratio` = 1 it is an L1 penalty.

Cross validate through a set of `l1_ratio`s

In [None]:
# 5-fold CV over the listed l1_ratio values; no `alphas` are passed, so the
# penalty grid is left to ElasticNetCV's defaults.
elasticnet_cv_fit = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], cv=5, random_state=0).fit(
    X_work, y_work
)

If `l1_ratio_` is 1, the best cross-validated model uses only the L1 penalty, i.e. it is a LASSO model (with `alpha_` as its lambda)

In [None]:
# l1_ratio selected by the cross-validation
elasticnet_cv_fit.l1_ratio_

In [None]:
# Penalty strength selected by the cross-validation
elasticnet_cv_fit.alpha_

## Diagnostics

### 1. Evaluate performance on the hold-out sample


Let us check only Models: 3, 7 and LASSO and ElasticNet


First re-fit OLS models on the whole work data

In [None]:
# Re-estimate models 3 and 7 on the full work set. cov_type="HC0" requests
# heteroskedasticity-robust standard errors.
model3 = smf.ols("price" + model_eq3, data=data_work).fit(cov_type="HC0")
model7 = smf.ols("price" + model_eq7, data=data_work).fit(cov_type="HC0")

Get standardized X matrix on holdout for Lasso, for evaluation


In [None]:
# Design matrix of the Lasso model for the holdout rows.
# NOTE(review): the matrix is built independently of the work-set matrix, so
# its columns presumably match only if every factor level occurs in both
# sets -- confirm.
_, X_holdout = patsy.dmatrices("price" + vars_model_8, data_holdout)
# Apply the scaling already learned on the work set (`scaler` was last fitted
# on X_work). Re-fitting on the holdout would standardise it with its own mean
# and standard deviation, which the fitted models have never seen.
X_holdout = scaler.transform(X_holdout)

In [None]:
# Holdout RMSE of the four selected models, built as a one-row frame and then
# transposed: one row per model, a single "RMSE on holdout" column.
holdout_performances = pd.DataFrame(
    {
        "Model 3": rmse(model3.predict(data_holdout), data_holdout["price"]),
        "Model 7": rmse(model7.predict(data_holdout), data_holdout["price"]),
        "LASSO": rmse(lasso_cv_fit.predict(X_holdout), data_holdout["price"]),
        "ElasticNet": rmse(elasticnet_cv_fit.predict(X_holdout), data_holdout["price"]),
    },
    index=["RMSE on holdout"],
).T.round(2)
holdout_performances

Save the predicted values of model 7 in data_holdout

In [None]:
# Store model 7's holdout predictions as a new column, used by the plot below
data_holdout["predicted_price"] = model7.predict(data_holdout)

Plot predicted price vs. actual price

In [None]:
# Scatter of actual vs. predicted price on the holdout set, with a dashed
# 45-degree reference line. Both axes are hard-coded to the range 0-350.
(
    ggplot(data=data_holdout)
    + geom_point(
        aes(y="price", x="predicted_price"),
        color="blue",
        size=1,
        alpha=0.7,
        show_legend=False,
        na_rm=True,
    )
    + geom_segment(
        aes(x=0, y=0, xend=350, yend=350), size=0.5, linetype="dashed", color="red"
    )
    + coord_equal()  # to get equally lengthed y and x axis
    + scale_x_continuous(
        expand=(0.01, 0.01), limits=(0, 350), breaks=np.arange(0, 351, 50)
    )
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 350), breaks=np.arange(0, 351, 50)
    )
    + labs(y="Price (US dollars)", x="Predicted price  (US dollars)")
    + theme_bw()
)

Redo predicted values at 80% PI

In [None]:
# Holdout predictions of model 7 with interval bounds at alpha=0.2 (80%).
# The obs_ci_* columns are kept, then the point prediction and both bounds
# are averaged within each n_accommodates group.
prediction_agg_by_nacc = (
    model7.get_prediction(data_holdout)
    .summary_frame(alpha=0.2)
    .filter(["mean", "obs_ci_lower", "obs_ci_upper"])
    .rename(columns={"mean": "predicted_price"})
    .assign(n_accommodates=data_holdout["n_accommodates"].values)
    .groupby(by=["n_accommodates"])
    .mean()
    .reset_index()
)

Prediction interval by apartment size

In [None]:
# Bars: average predicted price per apartment size; error bars: the averaged
# 80% interval bounds.
# NOTE(review): no colour aesthetic is mapped, so scale_color_manual
# presumably has no effect here -- confirm.
(
    ggplot(prediction_agg_by_nacc, aes(x="n_accommodates"))
    + geom_bar(aes(y="predicted_price"), stat="identity", fill="blue", alpha=0.7)
    + geom_errorbar(
        aes(ymin="obs_ci_lower", ymax="obs_ci_upper"), color="red", width=0.2
    )
    + scale_y_continuous(name="Predicted price (US dollars)")
    + scale_x_continuous(name="Accomodates (Persons)", breaks=np.arange(1, 8, 1))
    + scale_color_manual(values=("red", "red"))
    + theme_bw()
    + theme(legend_title=element_blank(), legend_position="none")
)

### Two extras, if we have time

#### 1. We may check how the coefficients are changing as lambda changes for LASSO

In [None]:
from sklearn.linear_model import lasso_path

# Compute the Lasso coefficient path on the scaled work set. The third
# return value is discarded.
eps = 5e-3  # the smaller it is the longer is the path
lambdas_lasso, coefs_lasso, _ = lasso_path(X_work, y_work, eps=eps)

In [None]:
from itertools import cycle
import matplotlib.pyplot as plt

# One line per coefficient, plotted against -log10(lambda); the five colours
# are reused cyclically.
plt.figure(1)
colors = cycle(["b", "r", "g", "c", "k"])
neg_log_lambdas_lasso = -np.log10(lambdas_lasso)
# NOTE(review): coefs_lasso[0] presumably selects the paths of the single
# outcome because y_work is 2-D -- confirm the array's shape.
for coef_l, c in zip(coefs_lasso[0], colors):
    l1 = plt.plot(neg_log_lambdas_lasso, coef_l, c=c)

plt.xlabel("-Log(lambda)")
plt.ylabel("coefficients")
plt.title("LASSO Paths")
plt.axis("tight")
plt.show()

#### 2.  Using One-Standard-Error (1SE) rule for selecting a more parsimonious model:

Get the 1SE value

In [None]:
# Standard deviation of the average test RMSE across the candidate lambdas,
# divided by the square root of the number of lambdas.
# NOTE(review): the textbook 1SE rule uses the standard error across folds at
# the best lambda rather than across lambdas -- confirm this is intended.
one_se = rmse_lambda_folds["test RMSE"].std() / np.sqrt(rmse_lambda_folds.shape[0])

Get a decision rule: minimum RMSE + 1SE

One can see that, based on the 1SE rule, we may even choose lambda = 0.5 (but note that lambda can take any value in $(0, \infty)$, not only those on our grid)

In [None]:
# Threshold: the lowest average test RMSE plus one_se (same value in every row)
rmse_lambda_folds["RMSE + 1SE"] = rmse_lambda_folds["test RMSE"].min() + one_se

# Lambdas whose average test RMSE is below the threshold
rmse_lambda_folds.loc[lambda x: x["test RMSE"] < x["RMSE + 1SE"]]

In [None]:
# Largest lambda (strongest penalty) whose average test RMSE is still below
# the minimum-plus-1SE threshold.
lambda_max = (
    rmse_lambda_folds.loc[lambda x: x["test RMSE"] < x["RMSE + 1SE"]]
    # Raw string: "\l" is not a valid escape sequence in a normal string
    # literal; the resulting column name is unchanged.
    .reset_index()[r"$\lambda$"]
    .max()
)

lambda_max

Let's compare:

Run parsimonious LASSO

In [None]:
# Fit with lambda_max as the only candidate penalty, so the cross-validation
# has nothing to choose between.
lasso_parsimonious_fit = LassoCV(alphas=[lambda_max], cv=5, random_state=42).fit(X_work, y_work)

One can get the coefficients as well

In [None]:
# Non-zero coefficients of the parsimonious Lasso (the model fitted with
# lambda_max), rounded to 3 decimals. This overwrites the earlier
# `notnull_lasso_coefs` table.
notnull_lasso_coefs = (
    pd.DataFrame(
        lasso_parsimonious_fit.coef_, index=X_work_featnames, columns=["coefficient"]
    )
    .loc[lambda x: x["coefficient"] != 0]
    .round(3)
)
notnull_lasso_coefs

Finally, test holdout performance

In [None]:
# Add the parsimonious Lasso's holdout RMSE as a new row of the comparison table
holdout_performances.loc["Parsim. LASSO"] = rmse(
    lasso_parsimonious_fit.predict(X_holdout), data_holdout["price"]
)

holdout_performances.round(3)

Actually, we got a worse holdout prediction