#              Lecture 18                   
                                            
## Model selection with cross-validation  

   - Data cleaning & refactoring   
   - Basic feature engineering     
   - Multiple var regression       
   - Model selection with:         
     - cross validation by hand    
     - k-fold splitter from scikit-learn
   - Prediction with best model    
   - Log-transformed outcome model 
     - transformation of           
       log prediction to level              
                                            
#### Case Study:                                 
  - CH13A Predicting used car value with linear regressions   
  - CH14A Predicting used car value: log prices         

####  Dataset:       
    used-cars
---

In [None]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import statsmodels
import statsmodels.formula.api as smf
from mizani.formatters import percent_format
from plotnine import *
from skimpy import skim
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse, rmse

warnings.filterwarnings("ignore")
# turning off scientific notation
pd.set_option("display.float_format", lambda x: "%.3f" % x)

Import data

In [None]:
data = pd.read_csv("https://osf.io/7gvz9/download")

### Sample design

Manage missing

In [None]:
# Categorical columns where a missing value is kept as its own "Missing" category
# instead of dropping the observation.
for column in ["fuel", "condition", "drive", "cylinders", "transmission", "type"]:
    data[column] = data[column].fillna("Missing")


check frequency by fuel type

In [None]:
# Number of listings per fuel type, with each group's share and running share (in %)
freq = data.groupby("fuel").agg(frequency=("type", "size"))
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq


keep gas-fuelled vehicles

In [None]:
data = data.loc[data.fuel == "gas"]


check frequency by vehicle condition

In [None]:
# Number of listings per condition, with each group's share and running share (in %)
freq = data.groupby("condition").agg(frequency=("type", "size"))
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq


drop vehicles in fair and new condition (trucks are dropped in a later step)

In [None]:
data = data.loc[~data.condition.isin(["new", "fair"])]

drop unrealistic values for price and odometer reading

In [None]:
data = data.loc[(data.price >= 500) & (data.price <= 25000) & (data.odometer <= 100)]


drop if price is smaller than 1000 and condition is like new or age is less than 8

In [None]:
data = data.loc[
    ~((data.price < 1000) & ((data.condition == "like new") | (data.age < 8)))
]


check frequency by transmission

In [None]:
# Number of listings per transmission, with each group's share and running share (in %)
freq = data.groupby("transmission").agg(frequency=("type", "size"))
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq


Remove observations with manual transmission

In [None]:
data = data.loc[~(data.transmission == "manual")]


 check frequency by vehicle type

In [None]:
# Number of listings per vehicle type, with each group's share and running share (in %)
freq = data.groupby("type").agg(frequency=("type", "size"))
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq


drop trucks

In [None]:
data = data.loc[~(data.type == "truck")]


drop pricestr

In [None]:
data = data.drop(["pricestr"], axis=1)


### Data generation & descriptives

Condition

In [None]:
data["cond_excellent"] = np.where(data["condition"] == "excellent", 1, 0)
data["cond_good"] = np.where(data["condition"] == "good", 1, 0)
data["cond_likenew"] = np.where(data["condition"] == "like new", 1, 0)

cylinders

In [None]:
data["cylind6"] = np.where(data["cylinders"] == "6 cylinders", 1, 0)
data.cylinders.value_counts().sort_index()

In [None]:
data.cylind6.value_counts()

age: quadratic, cubic

In [None]:
data["agesq"] = data["age"] ** 2
data["agecu"] = data["age"] ** 3

odometer quadratic

In [None]:
data["odometersq"] = data["odometer"] ** 2

#### Frequency tables

Area

In [None]:
# Listing count and average price per area. The string alias "mean" selects pandas'
# own group-wise mean; passing the np.mean callable to agg is deprecated.
data.groupby("area").agg(frequency=("price", "size"), mean=("price", "mean"))


focus only on Chicago

In [None]:
data = data.loc[data.area == "chicago"]


condition

In [None]:
# Listing count and average price per condition. The string alias "mean" selects pandas'
# own group-wise mean; passing the np.mean callable to agg is deprecated.
data.groupby("condition").agg(frequency=("price", "size"), mean=("price", "mean"))


drive

In [None]:
# Listing count and average price per drive type. The string alias "mean" selects pandas'
# own group-wise mean; passing the np.mean callable to agg is deprecated.
data.groupby("drive").agg(frequency=("price", "size"), mean=("price", "mean"))


dealer

In [None]:
# Listing count and average price by the dealer column. The string alias "mean" selects
# pandas' own group-wise mean; passing the np.mean callable to agg is deprecated.
data.groupby("dealer").agg(frequency=("price", "size"), mean=("price", "mean"))


Summary descriptives

In [None]:
data.filter(
    [
        "age",
        "odometer",
        "LE",
        "XLE",
        "SE",
        "cond_likenew",
        "cond_excellent",
        "cond_good",
        "cylind6",
    ]
).describe()

In [None]:
skim(
    data.filter(
        [
            "age",
            "odometer",
            "LE",
            "XLE",
            "SE",
            "cond_likenew",
            "cond_excellent",
            "cond_good",
            "cylind6",
        ]
    )
)

### Histograms

a) price

In [None]:
(
    ggplot(data, aes(x="price"))
    + geom_histogram(
        aes(y="(stat(count))/sum(stat(count))"),
        binwidth=1000,
        boundary=0,
        color="white",
        fill="blue",
        size=0.25,
        alpha=0.8,
        show_legend=False,
        na_rm=True,
    )
    + coord_cartesian(xlim=(0, 20000))
    + labs(x="Price (US dollars)", y="Percent")
    + theme_bw()
    + expand_limits(x=0.01, y=0.01)
    + scale_y_continuous(expand=(0.01, 0.01), labels=percent_format())
    + scale_x_continuous(expand=(0.01, 0.01), breaks=np.arange(0, 20000, 2500))
)


b) log of price (for later usage)

In [None]:
(
    ggplot(data, aes(x="lnprice"))
    + geom_histogram(
        aes(y="(stat(count)) / sum(stat(count))"),
        binwidth=0.2,
        boundary=0,
        color="white",
        fill="blue",
        size=0.25,
        alpha=0.8,
        show_legend=False,
        na_rm=True,
    )
    + coord_cartesian(xlim=(6, 10))
    + labs(x="ln(Price, US dollars)", y="Percent")
    + expand_limits(x=0.01, y=0.01)
    + scale_y_continuous(expand=(0.01, 0.01), labels=percent_format())
    + scale_x_continuous(expand=(0.01, 0.01), breaks=np.arange(6, 10, 1))
    + theme_bw()
)


### Regression analysis

lowess

In [None]:
# Scatter of price against age with a loess smoother on top.
(
    ggplot(data, aes(x="age", y="price"))
    + geom_point(color="blue", size=1, alpha=0.8, show_legend=False, na_rm=True)
    + geom_smooth(method="loess", se=False, colour="red", size=1, span=0.9)
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
    + expand_limits(x=0.01, y=0.01)
    # np.arange excludes its stop value, so the stop is set one past the axis limit
    # to get a tick at the limit itself (20000 and 30).
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 20000), breaks=np.arange(0, 20001, 5000)
    )
    + scale_x_continuous(expand=(0.01, 0.01), limits=(0, 30), breaks=np.arange(0, 31, 5))
)


 Lowess vs. quadratic specification with age

In [None]:
(
    ggplot(data, aes(x="age", y="price"))
    + geom_point(color="blue", size=1, alpha=0.8, show_legend=False, na_rm=True)
    + geom_smooth(method="loess", se=False, colour="red", size=1)
    + geom_smooth(
        method="lm", se=False, colour="black", formula="y ~ x + np.square(x)", size=1
    )
    + labs(x="Age (years)", y="Price (US dollars)")
    + theme_bw()
    + expand_limits(x=0.01, y=0.01)
    + scale_color_manual(
        name="", values=("red", "black"), labels=("Lowess in age", "Quadratic in age")
    )
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 20000), breaks=np.arange(0, 20001, 5000)
    )
    + scale_x_continuous(
        expand=(0.01, 0.01), limits=(0, 30), breaks=np.arange(0, 31, 5)
    )
)

### Running linear regressions using all observations

In [None]:
reg1 = smf.ols("price ~ age + agesq", data=data).fit(cov_type="HC1")
reg2 = smf.ols("price ~ age + agesq + odometer", data=data).fit(cov_type="HC1")
reg3 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE + cond_excellent + cond_good + dealer",
    data=data,
).fit(cov_type="HC1")
reg4 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    data=data,
).fit(cov_type="HC1")
reg5 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE * age + XLE * age + SE * age + cond_likenew * age + cond_excellent * age + cond_good * age + cylind6 * age + odometer * age + dealer * age",
    data=data,
).fit(cov_type="HC1")


In [None]:
def extract_regression_statistics(
    reg: "statsmodels.regression.linear_model.RegressionResultsWrapper",
) -> dict:
    """
    Collect fit statistics (AIC, BIC, RMSE, number of variables) from a fitted regression.

    Parameters
    ----------
    reg
        Fitted statsmodels regression results. The attributes `aic`, `bic`, `ssr`,
        `nobs` and `df_model` are read from it.

    Returns
    -------
    dict
        Keys "AIC", "BIC", "RMSE" and "No. Variables".
    """

    return {
        "AIC": reg.aic,
        "BIC": reg.bic,
        # Root of the *mean* squared residual (SSR / n). `mse_resid` divides by the
        # residual degrees of freedom instead, which is not the RMSE that
        # `statsmodels.tools.eval_measures.rmse` computes elsewhere in this file.
        "RMSE": np.sqrt(reg.ssr / reg.nobs),
        "No. Variables": reg.df_model,
    }

In [None]:
regression_fits = [reg1, reg2, reg3, reg4, reg5]

Extract statistics for each regression

In [None]:
regression_statistics = pd.DataFrame(
    [extract_regression_statistics(reg) for reg in regression_fits]
).round(2)

Create the regression table, with the fit statistics (AIC, BIC, RMSE, number of variables) added as custom lines

In [None]:
# Regression table of the five models; the first four fit-statistic columns are
# appended as extra rows, one value per model.
table = stargazer.Stargazer([reg1, reg2, reg3, reg4, reg5])
for stat_name in regression_statistics.columns[:4]:
    table.add_line(stat_name, regression_statistics[stat_name].tolist())
table

##  Cross-validation for better evaluation of predictive performance

Simple k-fold cross-validation setup:
 1. Estimation method: OLS linear regression (y_hat = b0 + b1*x1 + b2*x2 + ...), fitted with statsmodels
 2. Set the number of folds to use (it cannot exceed the number of observations)

In [None]:
from sklearn.model_selection import KFold

k = KFold(n_splits=4, shuffle=False, random_state=None)


Cross-validate OLS by combining scikit-learn's k-fold splitter with the statsmodels formula API



In [None]:
def cv_reg(
    formula: str, data: pd.DataFrame, kfold: "KFold | int", robustse=None
) -> dict:
    """
    Estimate an OLS regression in a k-fold cross-validation.

    Parameters
    ----------
    formula
        Formula passed to `smf.ols`. The text before "~" must be the name of the
        outcome column in `data` (surrounding whitespace is ignored).
    data
        Observations to split into folds.
    kfold
        Splitter object whose `split(data)` yields (train_index, test_index)
        pairs, e.g. a scikit-learn `KFold`. An integer is accepted as well and is
        turned into `KFold(n_splits=kfold)`.
    robustse
        Optional `cov_type` forwarded to `fit` (e.g. "HC0"); `None` fits with the
        default covariance.

    Returns
    -------
    dict
        "regressions": fitted model per fold,
        "test_predict": predictions on each held-out fold,
        "r2": R-squared of each model on its own training folds,
        "rmse": RMSE of the predictions on each held-out fold.
    """
    if isinstance(kfold, int):
        # Local import keeps the function usable even if KFold was not imported
        # at module level.
        from sklearn.model_selection import KFold

        kfold = KFold(n_splits=kfold)

    # Strip whitespace so that both "price~age" and "price ~ age" resolve to the
    # same column name.
    outcome = formula.split("~")[0].strip()

    regression_list = []
    predicts_on_test = []
    rsquared = []
    rmse_list = []

    # Use the splitter that was passed in, not a global one.
    for train_index, test_index in kfold.split(data):
        # train-test split
        data_train = data.iloc[train_index, :]
        data_test = data.iloc[test_index, :]

        # fit model on the training folds only
        fit_kwargs = {} if robustse is None else {"cov_type": robustse}
        model = smf.ols(formula, data=data_train).fit(**fit_kwargs)

        # predict once on the held-out fold and reuse the result below
        test_prediction = model.predict(data_test)

        regression_list.append(model)
        predicts_on_test.append(test_prediction)
        rsquared.append(model.rsquared)
        rmse_list.append(rmse(data_test[outcome], test_prediction))

    return {
        "regressions": regression_list,
        "test_predict": predicts_on_test,
        "r2": rsquared,
        "rmse": rmse_list,
    }


def summarize_cv(cvlist, stat="rmse"):
    """
    Summarise cross-validated OLS regression results received from `cv_reg`.

    Parameters
    ----------
    cvlist
        Sequence of dictionaries as returned by `cv_reg`, one per model.
    stat
        Key of the per-fold statistic to tabulate (e.g. "rmse" or "r2").

    Returns
    -------
    pd.DataFrame
        One column per model ("Model 1", "Model 2", ...), one row per fold
        ("Fold1", "Fold2", ...) and a final "Average" row with the column means.
    """
    # Iterate over the argument itself, so the function does not depend on a
    # global `cv_list` existing or having the same length as `cvlist`.
    result = pd.DataFrame(
        {f"Model {i}": cv[stat] for i, cv in enumerate(cvlist, start=1)}
    )
    result["Resample"] = [f"Fold{i}" for i in range(1, len(cvlist[0][stat]) + 1)]
    result = result.set_index("Resample")
    # Append the across-fold mean of every model as the last row
    average = pd.DataFrame(result.mean(), columns=["Average"]).T
    return pd.concat([result, average])

Estimate the regressions with cross validation

In [None]:
cv1 = cv_reg("price~age+agesq", data, k, "HC0")
cv2 = cv_reg("price~age+agesq+odometer", data, k, "HC0")
cv3 = cv_reg(
    "price~age+agesq+ odometer + odometersq + LE + cond_excellent + cond_good + dealer",
    data,
    k,
    "HC0",
)
cv4 = cv_reg(
    "price~age+agesq+ odometer + odometersq + LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    data,
    k,
    "HC0",
)
cv5 = cv_reg(
    "price~age+agesq + odometer + odometersq + LE*age + XLE*age + SE*age + cond_likenew*age + cond_excellent*age + cond_good*age + cylind6*age + odometer*age + dealer*age",
    data,
    k,
    "HC0",
)
cv_list = [cv1, cv2, cv3, cv4, cv5]


Display RMSE for each model spec. - cv fold, and average cross validated RMSE

In [None]:
summarize_cv(cv_list, stat="rmse")

Display $R^2$ for each model spec. - cv fold, and average cross validated $R^2$

In [None]:
summarize_cv(cv_list, stat="r2")

 #### Show model complexity with in-sample and out-of-sample RMSE performance

In [None]:
complexity_plotdata = (
    regression_statistics.filter(["No. Variables", "RMSE"])
    .merge(
        summarize_cv(cv_list, stat="rmse").T.filter(["Average"]).reset_index(drop=True),
        left_index=True,
        right_index=True,
    )
    .rename(columns={"RMSE": "in-sample RMSE", "Average": "out-of-sample RMSE"})
    .melt(id_vars=["No. Variables"])
)

In [None]:
# In-sample vs. cross-validated RMSE plotted against the number of explanatory variables
(
    ggplot(
        complexity_plotdata,
        aes(x="No. Variables", y="value", color="variable", fill="variable"),
    )
    + geom_point(size=2)
    + geom_line(size=0.5)
    + scale_color_manual(name=" ", values=("blue", "red"))
    + scale_fill_manual(name=" ", values=("blue", "red"))
    + labs(
        x="Number of explanatory variables",
        y="Averaged RMSE",
        # title typo fixed: "compexity" -> "complexity"
        title="Prediction performance and model complexity",
    )
    + theme_bw()
)

### Prediction

   Compare model1 and model3 to predict our car

 Use only the predictor variables and outcome

In [None]:
data = data.loc[
    :,
    [
        "age",
        "agesq",
        "odometer",
        "odometersq",
        "SE",
        "LE",
        "XLE",
        "cond_likenew",
        "cond_excellent",
        "cond_good",
        "dealer",
        "price",
        "cylind6",
    ],
]


In [None]:
data.dtypes


 Add new observation that we would like to predict

In [None]:
# One-row frame describing the car to predict: the same columns as `data`, with the
# squared terms filled in by hand and the (unknown) price left missing.
new = pd.DataFrame(
    [
        pd.Series(
            {
                "age": 10,
                "agesq": 10**2,
                "odometer": 12,
                "odometersq": 12**2,
                "SE": 0,
                "LE": 1,
                "XLE": 0,
                "cond_likenew": 0,
                "cond_excellent": 1,
                "cond_good": 0,
                "dealer": 0,
                "price": np.nan,
                "cylind6": 0,
            }
        )
    ]
)
new


In [None]:
reg1.resid.describe()


 Predict price with only 2 predictors (Model1)

In [None]:
p1 = reg1.get_prediction(new).summary_frame()
p1


Predict price with the richer set of predictors (Model3)

In [None]:
p3 = reg3.get_prediction(new).summary_frame()
p3


summary of predictions and PI 80% version

In [None]:
p1_80 = reg1.get_prediction(new).summary_frame(alpha=0.2)
p3_80 = reg3.get_prediction(new).summary_frame(alpha=0.2)

 Result summary

In [None]:
pred_summary_level = pd.DataFrame(
    {
        " ": [
            "Predicted",
            "PI_low(95%)",
            "PI_high(95%)",
            "PI_low(80%)",
            "PI_high(80%)",
        ],
        "Model1": p1[["mean", "obs_ci_lower", "obs_ci_upper"]].values.tolist()[0]
        + p1_80[["obs_ci_lower", "obs_ci_upper"]].values.tolist()[0],
        "Model3": p3[["mean", "obs_ci_lower", "obs_ci_upper"]].values.tolist()[0]
        + p3_80[["obs_ci_lower", "obs_ci_upper"]].values.tolist()[0],
    }
).set_index(" ")
pred_summary_level

## Log transformation

Reminder: lnprice

In [None]:
data["lnprice"] = np.log(data["price"])

In [None]:
(
    ggplot(data, aes(x="age", y="lnprice"))
    + geom_point(color="blue", size=1, alpha=0.8, show_legend=False, na_rm=True)
    + geom_smooth(method="loess", color="red", se=False, size=0.8, na_rm=True)
    + scale_x_continuous(expand=(0.01, 0.01), limits=(0, 30), breaks=np.arange(0, 31, 5))
    + scale_y_continuous(expand=(0.01, 0.01), limits=(6, 10), breaks=np.arange(6, 11, 1))
    + labs(x="Age (years)", y="ln(price, US dollars)")
    + theme_bw()
)

### Linear regressions with logs

In [None]:
reg1 = smf.ols("lnprice~age", data=data).fit(cov_type="HC0")
reg2 = smf.ols("lnprice~age+odometer", data=data).fit(cov_type="HC0")
reg3 = smf.ols(
    "lnprice ~ age  + odometer +  LE + cond_excellent + cond_good + dealer", data=data
).fit(cov_type="HC0")
reg4 = smf.ols(
    "lnprice ~ age  + odometer +  LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    data=data,
).fit(cov_type="HC0")
reg5 = smf.ols(
    "lnprice ~ age +  odometer + LE*age + XLE*age + SE*age + cond_likenew*age + cond_excellent*age + cond_good*age + cylind6*age + odometer*age + dealer*age",
    data=data,
).fit(cov_type="HC0")

regression_fits = [reg1, reg2, reg3, reg4, reg5]

regression_statistics = pd.DataFrame(
    [extract_regression_statistics(reg) for reg in regression_fits]
).round(2)

Create the regression table, with the fit statistics (AIC, BIC, RMSE, number of variables) added as custom lines

In [None]:
# Regression table of the five log-price models; the first four fit-statistic columns
# are appended as extra rows, one value per model.
table = stargazer.Stargazer([reg1, reg2, reg3, reg4, reg5])
for stat_name in regression_statistics.columns[:4]:
    table.add_line(stat_name, regression_statistics[stat_name].tolist())
table

### Cross-validation for log models

In [None]:
cv1 = cv_reg("lnprice~age", data, k, "HC0")
cv2 = cv_reg("lnprice~age+odometer", data, k, "HC0")
cv3 = cv_reg(
    "lnprice~ age  + odometer +  LE + cond_excellent + cond_good + dealer",
    data,
    k,
    "HC0",
)
cv4 = cv_reg(
    "lnprice~ age  + odometer +  LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    data,
    k,
    "HC0",
)
cv5 = cv_reg(
    "lnprice~ age +  odometer + LE*age + XLE*age + SE*age + cond_likenew*age + cond_excellent*age + cond_good*age + cylind6*age + odometer*age + dealer*age",
    data,
    k,
    "HC0",
)
cv_list = [cv1, cv2, cv3, cv4, cv5]


In [None]:
summarize_cv(cv_list, stat="rmse")

### Prediction with log models

We do this now only for Model 3

In [None]:
predln_new = reg3.get_prediction(new).summary_frame()
predln_new

In [None]:
predln_new80 = reg3.get_prediction(new).summary_frame(alpha=0.2)
predln_new80

RMSE for reg3

In [None]:
rmse3 = rmse(reg3.fittedvalues, data.lnprice)
rmse3

Get new prediction 

In [None]:
lnp2_new = predln_new["mean"][0]
lnp2_new

Convert log prediction to level prediction

In [None]:
data["lnplev"] = np.exp(reg3.fittedvalues) * np.exp(rmse3**2 / 2)
lnp2_new_lev = np.exp(lnp2_new) * np.exp(rmse3**2 / 2)
lnp2_new_lev

 Check the RMSE to compare with the level model:

Prediction interval (log and level)

In [None]:
lnp2_PIlow = predln_new["obs_ci_lower"][0]
lnp2_PIhigh = predln_new["obs_ci_upper"][0]
lnplev_PIlow = np.exp(lnp2_PIlow) * np.exp(rmse3**2 / 2)
lnplev_PIhigh = np.exp(lnp2_PIhigh) * np.exp(rmse3**2 / 2)

Prediction interval (log and level) - 80%


In [None]:
lnp2_PIlow80 = predln_new80["obs_ci_lower"][0]
lnp2_PIhigh80 = predln_new80["obs_ci_upper"][0]
lnplev_PIlow80 = np.exp(lnp2_PIlow80) * np.exp(rmse3**2 / 2)
lnplev_PIhigh80 = np.exp(lnp2_PIhigh80) * np.exp(rmse3**2 / 2)


In [None]:
pred_summary_log = pd.DataFrame(
    {
        " ": [
            "Predicted",
            "PI_low(95%)",
            "PI_high(95%)",
            "PI_low(80%)",
            "PI_high(80%)",
        ],
        "Model3 prediction in logs": [
            lnp2_new,
            lnp2_PIlow,
            lnp2_PIhigh,
            lnp2_PIlow80,
            lnp2_PIhigh80,
        ],
        "Recalculated to level": [
            lnp2_new_lev,
            lnplev_PIlow,
            lnplev_PIhigh,
            lnplev_PIlow80,
            lnplev_PIhigh80,
        ],
    }
)

In [None]:
pred_summary_log["Original level prediction"] = pred_summary_level["Model3"].values

In [None]:
pred_summary_log