<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/code/01RAD_Ex06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

01RAD Exercise 06

Let's use the same dataset as in the last exercise.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import f,t,norm

import statsmodels.api as sm

import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [None]:
# Download the semicolon-delimited cars dataset from the course repository and preview it
cars_all = pd.read_csv(
    "https://raw.githubusercontent.com/francji1/01RAD/main/data/carsdata2.csv",
    delimiter=";",
)
cars_all.head()

In [None]:
# Show the raw data frame as the cell output
cars_all

In [None]:
# Number of missing values in each column of the raw data
cars_all.isnull().sum()


In [None]:
# Boolean masks for body style and driven axle, read from the 0/1 indicator columns
sedan_condition = cars_all['Sedan'].eq(1)
sport_condition = cars_all['Sports'].eq(1)
suv_condition = cars_all['SUV'].eq(1)
minivan_condition = cars_all['Wagon'].eq(1) | cars_all['Minivan'].eq(1) | cars_all['Pickup'].eq(1)
awd_condition = cars_all['AWD'].eq(1)
rwd_condition = cars_all['RWD'].eq(1)

# Derive the categorical labels and the consumption column, then keep only the modelling columns.
# NOTE: this rebinds `cars_all`; the indicator columns read above are not in the kept list,
# so the cell can only be re-run after the data has been loaded again.
cars_all = (
    cars_all
    .assign(
        # np.select takes the label of the first mask that is True; rows matching none get the default
        car_type=np.select(
            condlist=[sedan_condition, sport_condition, suv_condition, minivan_condition],
            choicelist=['sedan', 'sport', 'suv', 'minivan'],
            default='Unknown',
        ),
        wheel_drive=np.select(
            condlist=[awd_condition, rwd_condition],
            choicelist=['AWD', 'RWD'],
            default='FWD',
        ),
        # Mean of city and highway MPG, inverted and rescaled
        # (presumably litres per 100 km: 1.60934 km per mile, 3.7854 l per gallon - confirm)
        consumption=lambda frame: 100 / (1.60934 * ((frame['CityMPG'] + frame['HwyMPG']) / 2) / 3.7854),
    )
    .astype({'car_type': 'category', 'wheel_drive': 'category'})
    .filter(items=[
        'RetailPrice', 'car_type', 'consumption', 'wheel_drive',
        'DealerCost', 'EngineSize', 'Cyl', 'HP', 'Weight', 'WheelBase', 'Len', 'Width',
    ])
)

cars_all.head()


In [None]:
# Remove the redundant columns, then discard every row that still has a missing value
cars = cars_all.drop(columns=['Cyl', 'DealerCost']).dropna()

# Per-column count of missing values left in `cars`
cars.isna().sum()


Show how to work with formulas with and without one-hot encoded variables.

In [None]:
# One-hot encoding for categorical variables (the first level of each variable is dropped)
categorical_columns = ['car_type', 'wheel_drive']
cars_data_encoded = pd.get_dummies(cars, columns=categorical_columns, drop_first=True)

# Building the full model with all predictors and their second-order interactions
predictors = cars_data_encoded.columns.drop('Weight')


def _source_variable(column):
    """Return the categorical variable a dummy column was generated from, or None."""
    for variable in categorical_columns:
        # get_dummies names its output columns '<variable>_<level>'
        if column.startswith(variable + '_'):
            return variable
    return None


# Two dummies of the same categorical variable are never both 1 in one row, so their
# product is identically zero. Those interactions are left out here. The previous
# "- type_sport:type_suv" removal never matched a term, because the dummy columns
# are called 'car_type_sport' / 'car_type_suv'.
interaction_terms = [
    '{}:{}'.format(a, b)
    for a, b in combinations(predictors, 2)
    if _source_variable(a) is None or _source_variable(a) != _source_variable(b)
]
formula_full = 'Weight ~ ' + ' + '.join(list(predictors) + interaction_terms)
# does not work: formula_full = 'Weight ~ (.)^2' ; spelling the terms out (or using *) works
formula_full

In [None]:
# Estimate the full interaction model by ordinary least squares
full_model = smf.ols(formula_full, data=cars_data_encoded).fit()

# Keep the information criteria and the summary table for later comparison
full_model_aic, full_model_bic = full_model.aic, full_model.bic
full_model_summary = full_model.summary()

print(full_model_summary)

In [None]:
# Helper: fit a formula model and report its information criteria
def fit_model(formula, data):
    """Fit `formula` on `data` with `smf.ols`.

    :param formula: model formula string passed to `smf.ols`.
    :param data: data set the formula is evaluated on.
    :return: tuple ``(aic, bic, fitted_results)``.
    """
    fitted = smf.ols(formula, data=data).fit()
    criteria = (fitted.aic, fitted.bic)
    return (*criteria, fitted)

# Stepwise regression driven by the p-values of the coefficient t-tests
def _coefficient_pvalue(pvalues, column):
    """Return the p-value of the single coefficient that belongs to `column`.

    The formula engine may report a column under an expanded name such as
    ``column[T.True]`` (e.g. for boolean dummies). When exactly one such
    coefficient exists its p-value is used; otherwise NaN is returned so the
    column is neither added nor removed on the basis of this fit.
    """
    if column in pvalues.index:
        return pvalues[column]
    expanded = [name for name in pvalues.index if str(name).startswith(f'{column}[')]
    if len(expanded) == 1:
        return pvalues[expanded[0]]
    return np.nan


def stepwise_selection(data, response, initial_list=None, threshold_in=0.01, threshold_out=0.05):
    """Select predictors by alternating forward and backward steps on t-test p-values.

    Each pass first tries every column not yet in the model and adds the one with
    the smallest p-value if it is below `threshold_in`. It then refits the current
    model and drops the predictor with the largest p-value if it exceeds
    `threshold_out`. The loop stops after a pass that changes nothing.

    :param data: DataFrame holding the response and all candidate predictor columns.
    :param response: name of the response column.
    :param initial_list: predictors to start from (not modified); None means none.
    :param threshold_in: a candidate is added only if its p-value is below this.
    :param threshold_out: a predictor is removed only if its p-value is above this.
    :return: list of selected predictor column names, in the order they were added.
    """
    included = [] if initial_list is None else list(initial_list)

    def build_formula(terms):
        # An empty right-hand side is not a valid formula; fall back to intercept-only
        return f'{response} ~ ' + (' + '.join(terms) if terms else '1')

    while True:
        changed = False

        # Forward step. Candidates are taken in column order so that ties are
        # resolved the same way on every run.
        excluded = [col for col in data.columns if col != response and col not in included]
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = smf.ols(build_formula(included + [new_column]), data=data).fit()
            new_pval[new_column] = _coefficient_pvalue(model.pvalues, new_column)
        best_pval = new_pval.min()  # NaN when there is nothing to score, so the test below is False
        if best_pval < threshold_in:
            included.append(new_pval.idxmin())
            changed = True

        # Backward step (nothing to remove while the model is intercept-only)
        if included:
            model = smf.ols(build_formula(included), data=data).fit()
            # Index the p-values by predictor name so the name handed to
            # included.remove() is always an element of `included`.
            pvalues = pd.Series(
                [_coefficient_pvalue(model.pvalues, col) for col in included],
                index=included,
                dtype=float,
            )
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                included.remove(pvalues.idxmax())
                changed = True

        if not changed:
            break

    return included

# Ensure categorical variables are properly encoded
cars_data_encoded = pd.get_dummies(cars, columns=['car_type', 'wheel_drive'], drop_first=True)

# Run stepwise selection
predictors_stepwise = stepwise_selection(cars_data_encoded, 'Weight')

# Fit the model with the selected predictors; with an empty selection the
# right-hand side would be empty, so fall back to the intercept-only model
formula_stepwise = 'Weight ~ ' + (' + '.join(predictors_stepwise) if predictors_stepwise else '1')
aic_stepwise, bic_stepwise, reduced_model_t = fit_model(formula_stepwise, cars_data_encoded)

# Output the selected predictors, AIC, BIC, and the final formula.
# (A bare tuple is only displayed when it is the last expression of a cell, so print instead.)
print(f"Selected predictors: {predictors_stepwise}")
print(f"Formula: {formula_stepwise}")
print(f"AIC: {aic_stepwise:.3f}, BIC: {bic_stepwise:.3f}")

print(reduced_model_t.summary())

In [None]:
# Nested-model comparison through an ANOVA F-test
def fit_and_compare_models(data, full_formula, sub_formula):
    """Fit a full model and a sub-model on `data` and compare them with `anova_lm`.

    :param data: data set both formulas are evaluated on.
    :param full_formula: formula of the larger model.
    :param sub_formula: formula of the smaller model.
    :return: tuple ``(f_pvalue, full_model)`` - the "Pr(>F)" entry of the second
        ANOVA row (the full model's row) and the fitted full model.
    """
    full_model, sub_model = (
        smf.ols(spec, data=data).fit() for spec in (full_formula, sub_formula)
    )
    # Row 0 belongs to the sub-model, row 1 to the full model
    comparison = anova_lm(sub_model, full_model)
    return comparison.loc[1, "Pr(>F)"], full_model

# Stepwise Selection using F-tests and ANOVA
def stepwise_selection(data, response, initial_list=None, threshold_in=0.01, threshold_out=0.05):
    """Select predictors by alternating forward and backward steps on F-test p-values.

    Every candidate change is judged by `fit_and_compare_models`, i.e. by comparing
    the model with and without the variable. Each pass adds the excluded variable
    with the smallest p-value if it is below `threshold_in`, then removes the
    included variable with the largest p-value if it exceeds `threshold_out`.
    The loop stops after a pass that changes nothing.

    :param data: DataFrame holding the response and all candidate predictor columns.
    :param response: name of the response column.
    :param initial_list: predictors to start from (not modified); None means none.
    :param threshold_in: a candidate is added only if its p-value is below this.
    :param threshold_out: a predictor is removed only if its p-value is above this.
    :return: list of selected predictor column names, in the order they were added.
    """
    included = [] if initial_list is None else list(initial_list)

    def build_formula(terms):
        # Intercept-only model when no predictor is left on the right-hand side
        return f'{response} ~ ' + (' + '.join(terms) if terms else '1')

    while True:
        changed = False

        # Forward step: try adding each excluded variable and test significance with F-test.
        # Candidates are taken in column order so that ties are resolved reproducibly.
        excluded = [col for col in data.columns if col != response and col not in included]
        new_pvalues = pd.Series(index=excluded, dtype=float)
        formula_without = build_formula(included)  # identical for every candidate
        for new_column in excluded:
            formula_with = build_formula(included + [new_column])
            try:
                f_pvalue, _ = fit_and_compare_models(data, formula_with, formula_without)
                new_pvalues[new_column] = f_pvalue
            except Exception as e:
                # Best effort: report the failure and leave this candidate unscored (NaN)
                print(f"Error fitting model with {new_column}: {e}")
                continue

        # Add the variable with the lowest F-test p-value if below threshold_in
        best_pvalue = new_pvalues.min()  # NaN when nothing was scored, so the test below is False
        if best_pvalue < threshold_in:
            included.append(new_pvalues.idxmin())
            changed = True

        # Backward step: try removing each variable in the model and test significance with F-test
        if included:
            formula_with = build_formula(included)  # identical for every removal
            pvalues = pd.Series(index=included, dtype=float)
            for column in included:
                remaining_columns = [col for col in included if col != column]
                formula_without = build_formula(remaining_columns)
                try:
                    f_pvalue, _ = fit_and_compare_models(data, formula_with, formula_without)
                    pvalues[column] = f_pvalue
                except Exception as e:
                    print(f"Error fitting model without {column}: {e}")
                    continue

            # Remove the variable with the highest p-value if above threshold_out
            worst_pvalue = pvalues.max()
            if worst_pvalue > threshold_out:
                included.remove(pvalues.idxmax())
                changed = True

        # Stop if no predictors were added or removed
        if not changed:
            break

    return included

# Ensure categorical variables are properly encoded
cars_data_encoded = pd.get_dummies(cars, columns=['car_type', 'wheel_drive'], drop_first=True)

# Run stepwise selection
predictors_stepwise = stepwise_selection(cars_data_encoded, 'Weight')

# Fit the model with selected predictors. With an empty selection the intercept-only
# model is fitted, so `reduced_model_F` is defined on every path and the summary
# below (and the later diagnostic cells) cannot fail with a NameError.
if predictors_stepwise:
    formula_stepwise = 'Weight ~ ' + ' + '.join(predictors_stepwise)
else:
    print("No predictors were selected.")
    formula_stepwise = 'Weight ~ 1'
aic_stepwise, bic_stepwise, reduced_model_F = fit_model(formula_stepwise, cars_data_encoded)

print(reduced_model_F.summary())

In [None]:
# 5 years old package
# !pip install stepwise-regression

# Install mlxtend if not already installed
!pip install mlxtend


In [None]:
# Older approach: forward selection with mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Predictors and response for the selector (no explicit constant column is added)
X = cars_data_encoded.drop(columns='Weight')
y = cars_data_encoded['Weight']

# Forward, non-floating selection scored by 5-fold cross-validated R^2
# (alternative scoring: neg_mean_squared_error)
lr = LinearRegression()
sfs = SFS(
    lr,
    k_features="best",
    forward=True,
    floating=False,
    scoring='r2',
    cv=5,
)
sfs = sfs.fit(X, y)

# Names of the features kept by the selector
selected_features = list(sfs.k_feature_names_)

# Refit the chosen specification with statsmodels to obtain the full summary table
formula_stepwise = 'Weight ~ ' + ' + '.join(selected_features)
final_model_mlx = smf.ols(formula_stepwise, data=cars_data_encoded).fit()
print(final_model_mlx.summary())


In [None]:
# Forward selection with scikit-learn's own SequentialFeatureSelector
from sklearn.feature_selection import SequentialFeatureSelector

# Predictors and response
X = cars_data_encoded.drop(columns=['Weight'])
y = cars_data_encoded['Weight']

lr = LinearRegression()

# Forward selection scored by 5-fold cross-validated R^2
# (other scoring options: 'neg_mean_squared_error').
# NOTE: with n_features_to_select="auto" and `tol` left unset, scikit-learn keeps
# half of the features; it does not search for an optimal number.
sfs = SequentialFeatureSelector(
    lr,
    n_features_to_select="auto",
    direction="forward",
    scoring="r2",
    cv=5,
)
sfs.fit(X, y)

# Column names picked by the selector (boolean support mask applied to the columns)
selected_features = X.columns[sfs.get_support()]

# Refit the chosen specification with statsmodels to obtain the full summary table
formula_stepwise = 'Weight ~ ' + ' + '.join(selected_features)
final_model_sk = smf.ols(formula_stepwise, data=cars_data_encoded).fit()
print(final_model_sk.summary())


In [None]:
# Summary table of the full model (main effects plus pairwise interactions) fitted earlier
full_model.summary()


In [None]:
# Conduct ANOVA (F-test) between the two selector-based fits: the scikit-learn
# selection (first argument) and the mlxtend selection (second argument).
# NOTE(review): this was described as "full model vs reduced model", but `full_model`
# is not one of the arguments. The F-test is only meaningful if one model is nested
# in the other - confirm that, or compare a reduced model against `full_model`.
anova_results = anova_lm(final_model_sk, final_model_mlx)
anova_results


## Residual Diagnostics and Plots

Residual analysis is critical for validating model assumptions. We focus on normality, linearity, and constant variance assumptions. Key diagnostic tools include:

### 1. **Q-Q Plot for Residual Normality**

Plots the quantiles of the residuals against theoretical quantiles of a normal distribution.

### 2. **Residuals vs. Fitted Values**

- **Homoscedasticity**: Residuals should be evenly scattered around zero.
- **Non-linearity**: A pattern in residuals suggests that the relationship between predictors and response may not be linear.






### 3. **Residuals plots**

## Component-Residual Plot (Partial Residual Plots):

* What to See: These plots show the relationship between each predictor and the response variable while controlling for the effect of other variables. They are useful for checking linearity and identifying outliers or influential points.
They visualize the isolated effect of each predictor by adjusting for the other variables.
* Why to Plot: To verify the assumption that the relationship between predictors and the response is linear, and to spot any non-linear patterns, outliers, or points that might have a disproportionate impact on the regression model.

## Added Variable Plot (Partial Regression Plots):

* What to See: These plots display the relationship between the response and a given predictor, after removing the effect of all other predictors. They help in understanding the individual contribution of a predictor to the model.
* Why to Plot: To assess the unique impact of each predictor on the response, checking for linearity, and identifying potential outliers or influential observations that might affect the slope of the regression line.

## Spread-Level Plot:

* What to See: This plot shows the spread or variance of the residuals against the predicted values or a predictor. It's used to check the assumption of homoscedasticity (constant variance of errors).
* Why to Plot: To ensure that the error variance is constant across all levels of the predictors. Non-constant variance (heteroscedasticity) can indicate that the model is not capturing some aspect of the data, possibly violating regression assumptions.

In [None]:
# Use the model chosen by the F-test stepwise selection in the diagnostics below
reduced_model = reduced_model_F

In [None]:
from statsmodels.graphics.regressionplots import plot_partregress_grid, plot_ccpr_grid


In [None]:
# Spread-Level Plot: raw residuals of the reduced model against its predicted values
plt.figure(figsize=(10, 8))
ax = sns.scatterplot(x=reduced_model.fittedvalues, y=reduced_model.resid)
ax.axhline(0, color='red', linestyle='--')  # zero-residual reference line
ax.set(xlabel='Predicted Values', ylabel='Residuals', title='Spread-Level Plot')
plt.show()


In [None]:
# Component-Residual Plot (Partial Residual Plots) for the regressors of the reduced model
ccpr_fig = plot_ccpr_grid(reduced_model)
ccpr_fig.tight_layout()
plt.show()


In [None]:
# Added Variable Plot (Partial Regression Plots), drawn into one large figure
fig = plt.figure(figsize=(16, 12))
plot_partregress_grid(reduced_model, fig=fig)
fig.tight_layout()
plt.show()


In [None]:

def plot_regression_diagnostics(model):
    """
    Generate diagnostic plots for a regression model.

    Panels, in order: fitted values vs residuals, one regressor-vs-residuals panel
    for every design-matrix column after the first, a normal Q-Q plot, and a
    scale-location plot. All panels use the internally studentized residuals.

    :param model: The fitted regression model object from statsmodels. Column 0 of
        its design matrix is treated as the intercept and is not plotted.
    :return: A matplotlib figure object containing the diagnostic plots.
    """
    residuals = model.get_influence().resid_studentized  # internal studentized residuals

    regressor_names = model.model.exog_names[1:]  # skip column 0 (intercept)
    num_regressors = len(regressor_names)
    # Panels actually drawn: fitted-vs-residuals + one per regressor + Q-Q + scale-location
    total_plots = num_regressors + 3
    rows = (total_plots + 2) // 3  # ceiling division: 3 panels per row

    # squeeze=False always yields a 2-D array, whatever the number of rows
    fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()  # Flatten to iterate easily

    # Plot Fitted Values vs Residuals
    axes[0].scatter(model.fittedvalues, residuals)
    axes[0].axhline(0, color='red', linestyle='--')
    axes[0].set_xlabel('Fitted Values')
    axes[0].set_ylabel('Studentized Residuals')
    axes[0].set_title('Fitted Values vs Residuals')

    # Plot each regressor against the residuals; `i` is both the panel index and the
    # column index in the design matrix (column 0 being the intercept)
    for i, col in enumerate(regressor_names, start=1):
        ax = axes[i]
        ax.scatter(model.model.exog[:, i], residuals)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel(col)
        ax.set_ylabel('Studentized Residuals')
        ax.set_title(f'{col} vs Residuals')

    # Normal Q-Q plot
    qq_ax = axes[num_regressors + 1]
    sm.qqplot(residuals, line='s', ax=qq_ax)
    qq_ax.set_title('Normal Q-Q')

    # Scale-Location plot: square root of the absolute studentized residuals
    scale_ax = axes[num_regressors + 2]
    scale_ax.scatter(model.fittedvalues, np.sqrt(np.abs(residuals)))
    scale_ax.set_xlabel('Fitted Values')
    scale_ax.set_ylabel('sqrt(|Studentized Residuals|)')
    scale_ax.set_title('Scale-Location')

    # Hide any unused subplots in the last row
    for unused_ax in axes[total_plots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    return fig

# Generate the diagnostic plots for the reduced model and render the returned figure
fig0 = plot_regression_diagnostics(reduced_model)
plt.show()


In [None]:
from statsmodels.graphics.regressionplots import plot_regress_exog

# Check residuals against each independent variable using plot_regress_exog
key_predictors = ['consumption', 'WheelBase', 'Width']

# `reduced_model` comes from a data-driven selection, so a hand-picked predictor is
# not guaranteed to be one of its regressors; skip the ones it does not contain.
model_regressors = set(reduced_model.model.exog_names)

for predictor in key_predictors:
    if predictor not in model_regressors:
        print(f"Skipping {predictor}: not a regressor of reduced_model")
        continue
    fig = plt.figure(figsize=(14, 10))
    plot_regress_exog(reduced_model, predictor, fig=fig)
    plt.show()


In [None]:
# Extract predictors from the reduced model, leaving out the intercept.
# A new list is built on purpose: `exog_names` is the model's own list of coefficient
# labels, so removing 'Intercept' from it in place would leave the fitted model with
# one label too few and would make this cell fail when run a second time.
predictors_stepwise = [name for name in reduced_model.model.exog_names if name != 'Intercept']


In [None]:
# Model specification written out by hand: Weight on five numeric regressors
formula = 'Weight ~ consumption + WheelBase + Width + RetailPrice + HP'

# Ordinary least squares fit of that specification; this rebinds `reduced_model`
reduced_model = smf.ols(formula, data=cars_data_encoded).fit()

# Coefficient table, to compare against the stepwise result
print(reduced_model.summary())

In [None]:

# Log transformation of the response: add log(Weight) as a new column of
# `cars_data_encoded` (in place) and regress it on the same five regressors
cars_data_encoded['log_Weight'] = np.log(cars_data_encoded['Weight'])
formula_log = 'log_Weight ~ consumption + WheelBase + Width + RetailPrice + HP'
model_log = smf.ols(formula_log, data=cars_data_encoded).fit()
print(model_log.summary())

In [None]:
import statsmodels.formula.api as smf
from scipy import stats
import numpy as np

# One-hot encoding for categorical variables
cars_data_encoded = pd.get_dummies(cars, columns=['car_type', 'wheel_drive'], drop_first=True)

# Extract predictors from the reduced model, excluding 'Intercept' if present
predictors_stepwise = [name for name in reduced_model_F.model.exog_names if name != 'Intercept']

# Right-hand side shared by the untransformed and both transformed models. Each
# formula is built from it directly: replacing the text "Weight" inside a finished
# formula would also rewrite any predictor whose name contains "Weight".
rhs_terms = 'consumption + WheelBase + Width + RetailPrice + HP'
formula = f'Weight ~ {rhs_terms}'

# Log Transformation of the Response
cars_data_encoded['log_Weight'] = np.log(cars_data_encoded['Weight'])
model_log = smf.ols(formula=f'log_Weight ~ {rhs_terms}', data=cars_data_encoded).fit()

# Box-Cox Transformation of the Response (lambda is estimated by scipy when not supplied)
box_cox_transformed, best_lambda = stats.boxcox(cars_data_encoded['Weight'])
cars_data_encoded['box_cox_Weight'] = box_cox_transformed
model_box_cox = smf.ols(formula=f'box_cox_Weight ~ {rhs_terms}', data=cars_data_encoded).fit()

# Collecting and printing summary statistics for comparison
print("Best Lambda for Box-Cox Transformation:", best_lambda)
print("\nLog-Transformed Model Summary:\n", model_log.summary())
print("\nBox-Cox Transformed Model Summary:\n", model_box_cox.summary())


In [None]:
from scipy import stats
from matplotlib import gridspec
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# Response variable whose Box-Cox profile is examined
x = cars_data_encoded['Weight']

# Box-Cox log-likelihood evaluated on an evenly spaced grid of lambda values
lmbdas = np.linspace(-2, 2, num=400)
llf = list(map(lambda candidate: stats.boxcox_llf(candidate, x), lmbdas))

# Grid point with the largest log-likelihood
lmbda_optimal = lmbdas[np.argmax(llf)]

# Log-likelihood curve with a horizontal line at its value for the best grid point
fig = plt.figure(figsize=(10, 6))
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0])
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
ax.set(xlabel='Lambda parameter', ylabel='Box-Cox log-likelihood')

# Inset probability plots of the transformed data for three lambda values
locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    xt = stats.boxcox(x, lmbda=lmbda)
    (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ax_inset.plot(osm, osr, 'c.')                      # ordered data against theoretical quantiles
    ax_inset.plot(osm, slope*osm + intercept, 'k-')    # least-squares line returned by probplot
    ax_inset.set_xticklabels([])
    ax_inset.set_yticklabels([])
    ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

plt.show()

lmbda_optimal


Scipy box cox functions:
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox_llf.html


In [None]:
# Let scipy estimate lambda itself and return a confidence interval for it (alpha = 0.05)
_, lmbda_optimal, (lmbda_ci_lower, lmbda_ci_upper) = stats.boxcox(x, alpha=0.05)

# Log-likelihood curve again, now with the estimate and its interval bounds marked
fig = plt.figure(figsize=(10, 6))
gs = gridspec.GridSpec(1, 1)
ax = fig.add_subplot(gs[0])
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
for position, colour, caption in [
    (lmbda_optimal, 'r', f'Optimal Lambda: {lmbda_optimal:.2f}'),
    (lmbda_ci_lower, 'g', f'CI Lower: {lmbda_ci_lower:.2f}'),
    (lmbda_ci_upper, 'g', f'CI Upper: {lmbda_ci_upper:.2f}'),
]:
    ax.axvline(position, color=colour, linestyle='--', label=caption)
ax.set(xlabel='Lambda parameter', ylabel='Box-Cox log-likelihood')
ax.legend()

# Inset probability plots of the transformed data for three lambda values
locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    xt = stats.boxcox(x, lmbda=lmbda)
    (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ax_inset.plot(osm, osr, 'c.')                      # ordered data against theoretical quantiles
    ax_inset.plot(osm, slope*osm + intercept, 'k-')    # least-squares line returned by probplot
    ax_inset.set_xticklabels([])
    ax_inset.set_yticklabels([])
    ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

plt.show()

(lmbda_optimal, lmbda_ci_lower, lmbda_ci_upper)
