<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/code/01RAD_Ex06_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

01RAD Exercise 06

Let's use the same dataset as in the last exercise.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import f,t,norm

import statsmodels.api as sm

import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



In [None]:
# Read the semicolon-separated cars CSV directly from its GitHub URL
cars_all = pd.read_csv("https://raw.githubusercontent.com/francji1/01RAD/refs/heads/main/data/carsdata2.csv", sep=";")
# Preview the first rows
cars_all.head()

In [None]:
# Display the raw data frame as the cell output
cars_all

In [None]:
# Number of missing values in each column
cars_all.isna().sum()


In [None]:
# Boolean masks: rows whose indicator column equals 1
sedan_condition = cars_all['Sedan'].eq(1)
sport_condition = cars_all['Sports'].eq(1)
suv_condition = cars_all['SUV'].eq(1)
# Wagons, minivans and pickups are pooled into one class
minivan_condition = cars_all[['Wagon', 'Minivan', 'Pickup']].eq(1).any(axis=1)
awd_condition = cars_all['AWD'].eq(1)
rwd_condition = cars_all['RWD'].eq(1)

# Average of city and highway mileage, used for the consumption conversion below
mean_mpg = (cars_all['CityMPG'] + cars_all['HwyMPG']) / 2

# Columns kept for the analysis (filter silently ignores names that are absent)
kept_columns = [
    'RetailPrice', 'car_type', 'consumption', 'wheel_drive',
    'DealerCost', 'EngineSize', 'Cyl', 'HP', 'Weight', 'WheelBase', 'Len', 'Width'
]

# Derive the new columns, make the two labels categorical, and keep the selected columns.
# np.select assigns the label of the first condition that holds, so the order matters.
cars_all = (
    cars_all
    .assign(
        car_type=np.select(
            [sedan_condition, sport_condition, suv_condition, minivan_condition],
            ['sedan', 'sport', 'suv', 'minivan'],
            default='Unknown'
        ),
        wheel_drive=np.select(
            [awd_condition, rwd_condition],
            ['AWD', 'RWD'],
            default='FWD'
        ),
        # miles per gallon -> litres per 100 km (1 mile = 1.60934 km, 1 gallon = 3.7854 l)
        consumption=100 / (1.60934 * mean_mpg / 3.7854)
    )
    .astype({'car_type': 'category', 'wheel_drive': 'category'})
    .filter(items=kept_columns)
)

cars_all.head()


In [None]:
# Remove the redundant columns, then keep only rows without missing values
cars = cars_all.drop(columns=['Cyl', 'DealerCost']).dropna()
# Confirm that no missing values remain
cars.isna().sum()


Show how to build the model formula with and without one-hot encoded variables.

In [None]:
# One-hot encoding for categorical variables (the first level of each is dropped)
categorical_columns = ['car_type', 'wheel_drive']
cars_data_encoded = pd.get_dummies(cars, columns=categorical_columns, drop_first=True)

# Map every generated dummy column back to the categorical variable it came from
dummy_origin = {
    column: source
    for source in categorical_columns
    for column in cars_data_encoded.columns
    if column not in cars.columns and column.startswith(f'{source}_')
}

# Building the full model with all predictors and their second-order interactions
predictors = cars_data_encoded.columns.drop('Weight')
interaction_terms = [
    f'{a}:{b}'
    for a, b in combinations(predictors, 2)
    # Two dummies of the same variable are never both 1, so their product is
    # identically zero. The original tried to remove one such term by subtracting
    # 'type_sport:type_suv', but the real column names carry the 'car_type_'
    # prefix, so nothing was removed. All such pairs are skipped here instead.
    if not (a in dummy_origin and dummy_origin.get(b) == dummy_origin[a])
]
formula_full = 'Weight ~ ' + ' + '.join(predictors) + ' + ' + ' + '.join(interaction_terms)
# Author's note: the R-style shorthand 'Weight ~ (.)^2' does not work here,
# so the terms are spelled out explicitly.
formula_full

In [None]:
# Estimate the full model by OLS
full_model = smf.ols(formula_full, data=cars_data_encoded).fit()

# Keep the information criteria and the summary table under their own names
full_model_aic, full_model_bic = full_model.aic, full_model.bic
full_model_summary = full_model.summary()

print(full_model_summary)

In [None]:
# Show the summary table again, this time as the cell's output value
full_model.summary()

In [None]:
# Helper: fit an OLS formula model and report its information criteria
def fit_model(formula, data):
    """Fit ``formula`` by OLS on ``data``.

    Returns a tuple ``(aic, bic, results)`` where ``results`` is the fitted
    statsmodels results object.
    """
    results = smf.ols(formula=formula, data=data).fit()
    return results.aic, results.bic, results

# Stepwise regression driven by the t-test p-values of the individual coefficients
def stepwise_selection(data, response, initial_list=None, threshold_in=0.01, threshold_out=0.05):
    """Select main-effect predictors by bidirectional stepwise regression.

    Each pass first tries to add the excluded column with the smallest
    coefficient p-value (if below ``threshold_in``), then tries to drop the
    included column with the largest p-value (if above ``threshold_out``).
    The search stops when a pass changes nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Response and candidate predictors; every column other than
        ``response`` is a candidate.
    response : str
        Name of the response column.
    initial_list : list of str, optional
        Predictors to start from (default: none). The list is copied.
    threshold_in, threshold_out : float
        Entry and removal p-value thresholds.

    Returns
    -------
    list of str
        Names of the selected predictors, in the order they were added.
    """
    # The formula interface treats boolean columns as categorical and names
    # their coefficient e.g. ``col[T.True]``. The lookup by column name would
    # then raise KeyError and the dummy would be skipped silently, so boolean
    # dummies are cast to 0/1 first.
    bool_columns = data.select_dtypes(include='bool').columns
    if len(bool_columns) > 0:
        data = data.astype({column: int for column in bool_columns})

    def _fit(columns):
        # An empty predictor list means the intercept-only model
        rhs = ' + '.join(columns) if columns else '1'
        return smf.ols(f'{response} ~ {rhs}', data=data).fit()

    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # Forward step (candidates kept in column order so ties resolve reproducibly)
        excluded = [col for col in data.columns if col != response and col not in included]
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            try:
                new_pval[new_column] = _fit(included + [new_column]).pvalues[new_column]
            except KeyError:
                # The term is not reported under its own column name (e.g. a
                # categorical column expanded into level terms); leave it as NaN
                continue
        if not new_pval.empty:
            best_pval = new_pval.min()  # NaN when no candidate could be evaluated
            if best_pval < threshold_in:
                included.append(new_pval.idxmin())
                changed = True

        # Backward step (skipped while nothing is included)
        if included:
            # Look p-values up by predictor name so the worst one is always
            # something that can be removed from `included`
            pvalues = _fit(included).pvalues.reindex(included)
            worst_pval = pvalues.max()  # NaN if no p-value is available
            if worst_pval > threshold_out:
                included.remove(pvalues.idxmax())
                changed = True

        if not changed:
            break

    return included

# Ensure categorical variables are properly encoded
cars_data_encoded = pd.get_dummies(cars, columns=['car_type', 'wheel_drive'], drop_first=True)

# Run stepwise selection
predictors_stepwise = stepwise_selection(cars_data_encoded, 'Weight')

# Fit the model with selected predictors (intercept-only if nothing was selected)
formula_stepwise = 'Weight ~ ' + (' + '.join(predictors_stepwise) or '1')
aic_stepwise, bic_stepwise, reduced_model_t = fit_model(formula_stepwise, cars_data_encoded)

# Output the selected predictors, AIC, BIC, and the final formula.
# A bare expression in the middle of a cell displays nothing, so print explicitly.
print("Selected predictors:", predictors_stepwise)
print("Formula:", formula_stepwise)
print(f"AIC: {aic_stepwise:.3f}, BIC: {bic_stepwise:.3f}")

print(reduced_model_t.summary())

In [None]:
# Helper: compare a model with a sub-model through the ANOVA F-test
def fit_and_compare_models(data, full_formula, sub_formula):
    """Fit both formulas by OLS on ``data`` and compare them with ``anova_lm``.

    Returns ``(p_value, full_model)``: the ``Pr(>F)`` entry of the second row
    of the ANOVA table (sub-model vs. full model) and the fitted full model.
    """
    full_model, sub_model = (
        smf.ols(formula, data=data).fit() for formula in (full_formula, sub_formula)
    )
    # Row 0 is the sub-model itself; row 1 holds the test of the added terms
    comparison = anova_lm(sub_model, full_model)
    return comparison.loc[1, "Pr(>F)"], full_model

# Stepwise Selection using F-tests and ANOVA
def stepwise_selection(data, response, initial_list=None, threshold_in=0.01, threshold_out=0.05):
    """Select main-effect predictors by bidirectional stepwise F-tests.

    Each pass first adds the excluded column whose inclusion gives the smallest
    F-test p-value (if below ``threshold_in``), then removes the included
    column whose removal gives the largest p-value (if above
    ``threshold_out``). The search stops when a pass changes nothing.

    Note: this function has the same name as the t-test based variant defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Response and candidate predictors; every column other than
        ``response`` is a candidate.
    response : str
        Name of the response column.
    initial_list : list of str, optional
        Predictors to start from (default: none). The list is copied.
    threshold_in, threshold_out : float
        Entry and removal p-value thresholds.

    Returns
    -------
    list of str
        Names of the selected predictors, in the order they were added.
    """
    def _formula(columns):
        # An empty predictor list means the intercept-only model
        return f'{response} ~ ' + (' + '.join(columns) if columns else '1')

    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # Forward step: try adding each excluded variable and test significance with F-test
        # (candidates kept in column order so ties resolve reproducibly)
        excluded = [col for col in data.columns if col != response and col not in included]
        new_pvalues = pd.Series(index=excluded, dtype=float)
        formula_without = _formula(included)  # same baseline for every candidate
        for new_column in excluded:
            formula_with = _formula(included + [new_column])
            try:
                f_pvalue, _ = fit_and_compare_models(data, formula_with, formula_without)
                new_pvalues[new_column] = f_pvalue
            except Exception as e:
                # Best effort: report the failure and leave the candidate as NaN
                print(f"Error fitting model with {new_column}: {e}")
                continue

        # Add the variable with the lowest F-test p-value if below threshold_in
        if not new_pvalues.empty:
            best_pvalue = new_pvalues.min()  # NaN when every candidate failed
            if best_pvalue < threshold_in:
                included.append(new_pvalues.idxmin())
                changed = True

        # Backward step: try removing each variable in the model and test significance with F-test
        if included:
            pvalues = pd.Series(index=included, dtype=float)
            formula_with = _formula(included)  # same larger model for every removal
            for column in included:
                remaining_columns = [col for col in included if col != column]
                formula_without = _formula(remaining_columns)
                try:
                    f_pvalue, _ = fit_and_compare_models(data, formula_with, formula_without)
                    pvalues[column] = f_pvalue
                except Exception as e:
                    print(f"Error fitting model without {column}: {e}")
                    continue

            # Remove the variable with the highest p-value if above threshold_out
            worst_pvalue = pvalues.max()
            if worst_pvalue > threshold_out:
                included.remove(pvalues.idxmax())
                changed = True

        # Stop if no predictors were added or removed
        if not changed:
            break

    return included

# Ensure categorical variables are properly encoded
cars_data_encoded = pd.get_dummies(cars, columns=['car_type', 'wheel_drive'], drop_first=True)

# Run stepwise selection
predictors_stepwise = stepwise_selection(cars_data_encoded, 'Weight')

# Fit the model with selected predictors
if predictors_stepwise:
    formula_stepwise = 'Weight ~ ' + ' + '.join(predictors_stepwise)
else:
    # Fall back to the intercept-only model so that reduced_model_F is always
    # defined by this cell (otherwise the print below raises NameError, or
    # silently reuses a stale model from an earlier run)
    print("No predictors were selected.")
    formula_stepwise = 'Weight ~ 1'
aic_stepwise, bic_stepwise, reduced_model_F = fit_model(formula_stepwise, cars_data_encoded)

print(reduced_model_F.summary())

In [None]:
# Conduct ANOVA (F-test) to compare the full model and the reduced model
# (the reduced model is passed first, followed by the larger model)
anova_results = anova_lm(reduced_model_F, full_model)
anova_results


## Confidence interval for the mean response $E(Y \mid x_0)$

We consider the linear model
$$
Y = X\beta + \varepsilon, \quad \varepsilon \sim N_n(0, \sigma^2 I_n),
$$
with design matrix $X$ of dimension $n \times (m+1)$ including intercept.

**Least squares estimator:**
$$
\hat\beta = (X^\top X)^{-1} X^\top Y.
$$

For a new point
$$
x_0 = (1, x_{0,1}, \dots, x_{0,m})^\top
$$
the point prediction of the conditional mean is
$$
\hat Y_{x_0} = x_0^\top \hat\beta.
$$

**Residual sum of squares and variance estimate:**
$$
\text{SSE} = \sum_{i=1}^n (Y_i - \hat Y_i)^2, \qquad
s_n^2 = \frac{\text{SSE}}{n - m - 1}.
$$

**Variance of $\hat Y_{x_0}$:**
$$
\operatorname{Var}(\hat Y_{x_0})
= \sigma^2 x_0^\top (X^\top X)^{-1} x_0
\approx s_n^2 \, x_0^\top (X^\top X)^{-1} x_0.
$$

Using normality and independence of $\hat\beta$ and $s_n^2$:
$$
\frac{\hat Y_{x_0} - E(Y \mid x_0)}
     {s_n \sqrt{x_0^\top (X^\top X)^{-1} x_0}}
\sim t_{n-m-1}.
$$

**$100(1-\alpha)\%$ confidence interval for $E(Y \mid x_0)$:**
$$
\boxed{
\hat Y_{x_0}
\;\pm\;
t_{1-\alpha/2,\,n-m-1}
\; s_n \sqrt{x_0^\top (X^\top X)^{-1} x_0}
}
$$

---

## Prediction interval for a new observation $Y_{x_0}$

Now $Y_{x_0}$ is a new future observation at $x_0$:
$$
Y_{x_0} = x_0^\top \beta + \varepsilon_0,
$$
independent of the sample.

Point prediction is again
$$
\hat Y_{x_0} = x_0^\top \hat\beta.
$$

**Variance of prediction error:**
$$
\operatorname{Var}(\hat Y_{x_0} - Y_{x_0})
= \operatorname{Var}(\hat Y_{x_0}) + \operatorname{Var}(Y_{x_0})
= \sigma^2 \bigl(1 + x_0^\top (X^\top X)^{-1} x_0 \bigr)
\approx s_n^2 \bigl(1 + x_0^\top (X^\top X)^{-1} x_0 \bigr).
$$

Hence
$$
\frac{\hat Y_{x_0} - Y_{x_0}}
     {s_n \sqrt{1 + x_0^\top (X^\top X)^{-1} x_0}}
\sim t_{n-m-1}.
$$

**$100(1-\alpha)\%$ prediction interval for $Y_{x_0}$:**
$$
\boxed{
\hat Y_{x_0}
\;\pm\;
t_{1-\alpha/2,\,n-m-1}
\; s_n \sqrt{1 + x_0^\top (X^\top X)^{-1} x_0}
}
$$

**Key comparison:**

- CI for $E(Y \mid x_0)$: only model (estimation) uncertainty
  $\Rightarrow$ term $x_0^\top (X^\top X)^{-1} x_0$.
- PI for $Y_{x_0}$: model uncertainty $+$ new-error variance
  $\Rightarrow$ term $1 + x_0^\top (X^\top X)^{-1} x_0$.

The prediction interval is always wider than the confidence interval at the same $x_0$ and confidence level.


In [None]:
# Regressor names of the reduced model; the first entry (presumably the intercept) is skipped
reduced_model_F.model.exog_names[1:]

In [None]:
# =========================================
# 1) Prepare data and fit reduced model
# =========================================

# Restrict to the modelled columns and keep complete rows only
cols = ["Weight", "consumption", "WheelBase", "Width", "RetailPrice", "HP"]
cars_model = cars.loc[:, cols].dropna()

model = smf.ols(
    "Weight ~ consumption + WheelBase + Width + RetailPrice + HP",
    data=cars_model
    ).fit()

print(model.summary())

# =========================================
# 2) Build prediction grid
#    - consumption varies along the x-axis
#    - the remaining regressors are held at their sample means
# =========================================

x_col = "consumption"

x_min, x_max = cars_model[x_col].min(), cars_model[x_col].max()
x_grid = np.linspace(x_min, x_max, 200)

# Sample means of the regressors that are held fixed
base_vals = {
    name: cars_model[name].mean()
    for name in ("WheelBase", "Width", "RetailPrice", "HP")
}

# The scalar means are broadcast along the grid
pred_df = pd.DataFrame({x_col: x_grid, **base_vals})

# =========================================
# MANUAL CI and PI
# =========================================

res = model  # shorthand

exog_names = res.model.exog_names  # e.g. ['Intercept','consumption',...]
n_grid = len(pred_df)

# Prediction design matrix with columns in the order of exog_names
pred_exog = np.column_stack([
    pred_df[name].to_numpy() if name != "Intercept" else np.ones(n_grid)
    for name in exog_names
])

beta = res.params.to_numpy()
cov_beta = res.cov_params().to_numpy()
s2 = float(res.mse_resid)
df_resid = int(res.df_resid)
t_crit = stats.t.ppf(0.975, df_resid)

# Point prediction
y_hat_manual = pred_exog @ beta

# Row-wise quadratic form x' cov(beta) x = Var(y_hat | x)
var_mean = ((pred_exog @ cov_beta) * pred_exog).sum(axis=1)
se_mean = np.sqrt(var_mean)
# A new observation adds the residual variance s^2
se_pred = np.sqrt(s2 + var_mean)

# CI for E(Y | x)
ci_lower_manual, ci_upper_manual = (
    y_hat_manual - t_crit * se_mean,
    y_hat_manual + t_crit * se_mean,
)

# PI for a new Y
pi_lower_manual, pi_upper_manual = (
    y_hat_manual - t_crit * se_pred,
    y_hat_manual + t_crit * se_pred,
)

# =========================================
# BUILT-IN CI and PI (statsmodels)
# =========================================

pred_res = res.get_prediction(pred_df)
sf = pred_res.summary_frame(alpha=0.05)

y_hat_sm, ci_lower_sm, ci_upper_sm, pi_lower_sm, pi_upper_sm = (
    sf[column].to_numpy()
    for column in ("mean", "mean_ci_lower", "mean_ci_upper", "obs_ci_lower", "obs_ci_upper")
)

# Optional numeric sanity check: the largest absolute differences should be ~0
print("Max |manual - sm| mean:", np.max(np.abs(y_hat_manual - y_hat_sm)))
print("Max |manual - sm| CI lower:", np.max(np.abs(ci_lower_manual - ci_lower_sm)))
print("Max |manual - sm| PI lower:", np.max(np.abs(pi_lower_manual - pi_lower_sm)))



In [None]:

# =========================================
# PLOTS with Generated Data
# =========================================

# Both figures draw the same five curves over x_grid; only the source arrays,
# labels and title differ. Plot 1 uses the manual results, Plot 2 statsmodels'.
for band_curves, fig_title in (
    (
        [
            (y_hat_manual, (), "Fit (manual)"),
            (ci_lower_manual, ("--",), "CI lower (manual)"),
            (ci_upper_manual, ("--",), "CI upper (manual)"),
            (pi_lower_manual, (":",), "PI lower (manual)"),
            (pi_upper_manual, (":",), "PI upper (manual)"),
        ],
        "Manual CI and PI for Weight vs consumption",
    ),
    (
        [
            (y_hat_sm, (), "Fit (statsmodels)"),
            (ci_lower_sm, ("--",), "CI lower (sm)"),
            (ci_upper_sm, ("--",), "CI upper (sm)"),
            (pi_lower_sm, (":",), "PI lower (sm)"),
            (pi_upper_sm, (":",), "PI upper (sm)"),
        ],
        "Statsmodels CI and PI for Weight vs consumption",
    ),
):
    plt.figure(figsize=(8, 5))
    for curve_values, line_fmt, curve_label in band_curves:
        # An empty line_fmt leaves the fitted line in the default style
        plt.plot(x_grid, curve_values, *line_fmt, label=curve_label)
    plt.xlabel(x_col)
    plt.ylabel("Weight")
    plt.title(fig_title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

In [None]:
# =========================================
# PLOTS USING MEASURED DATA
# =========================================

res = model

# Quantities taken from the fit; X_all is the design matrix used in the fit
# (its columns match the order of the parameters)
X_all = res.model.exog
beta = res.params.to_numpy()
cov_beta = res.cov_params().to_numpy()
s2 = float(res.mse_resid)
df_resid = int(res.df_resid)
t_crit = stats.t.ppf(0.975, df_resid)

# Observed values
x_obs = cars_model["consumption"].to_numpy()
y_obs = cars_model["Weight"].to_numpy()

# ----- Manual CI/PI for each observed row -----

y_hat_manual_all = X_all @ beta
var_mean_all = ((X_all @ cov_beta) * X_all).sum(axis=1)
se_mean_all = np.sqrt(var_mean_all)
se_pred_all = np.sqrt(s2 + var_mean_all)

ci_lower_manual_all, ci_upper_manual_all = (
    y_hat_manual_all - t_crit * se_mean_all,
    y_hat_manual_all + t_crit * se_mean_all,
)
pi_lower_manual_all, pi_upper_manual_all = (
    y_hat_manual_all - t_crit * se_pred_all,
    y_hat_manual_all + t_crit * se_pred_all,
)

# Order everything by consumption so the bands can be drawn as lines
idx = np.argsort(x_obs)
x_sorted, y_obs_sorted = x_obs[idx], y_obs[idx]
(
    y_hat_manual_sorted,
    ci_l_man_sorted,
    ci_u_man_sorted,
    pi_l_man_sorted,
    pi_u_man_sorted,
) = (
    values[idx]
    for values in (
        y_hat_manual_all,
        ci_lower_manual_all,
        ci_upper_manual_all,
        pi_lower_manual_all,
        pi_upper_manual_all,
    )
)

# ----- Built-in CI/PI via get_prediction for each observed row -----

pred_res_all = res.get_prediction(cars_model)
sf_all = pred_res_all.summary_frame(alpha=0.05)

y_hat_sm_all, ci_lower_sm_all, ci_upper_sm_all, pi_lower_sm_all, pi_upper_sm_all = (
    sf_all[column].to_numpy()
    for column in ("mean", "mean_ci_lower", "mean_ci_upper", "obs_ci_lower", "obs_ci_upper")
)

y_hat_sm_sorted, ci_l_sm_sorted, ci_u_sm_sorted, pi_l_sm_sorted, pi_u_sm_sorted = (
    values[idx]
    for values in (y_hat_sm_all, ci_lower_sm_all, ci_upper_sm_all, pi_lower_sm_all, pi_upper_sm_all)
)

# =========================================
# Plot 1: Manual CI and PI with measured data
# Plot 2: Statsmodels CI and PI with measured data
# (same layout: observed points plus five curves over sorted consumption)
# =========================================

for band_curves, fig_title in (
    (
        [
            (y_hat_manual_sorted, (), "Fit (manual)"),
            (ci_l_man_sorted, ("--",), "CI lower (manual)"),
            (ci_u_man_sorted, ("--",), "CI upper (manual)"),
            (pi_l_man_sorted, (":",), "PI lower (manual)"),
            (pi_u_man_sorted, (":",), "PI upper (manual)"),
        ],
        "Manual CI and PI for Weight vs consumption (all predictors used)",
    ),
    (
        [
            (y_hat_sm_sorted, (), "Fit (statsmodels)"),
            (ci_l_sm_sorted, ("--",), "CI lower (sm)"),
            (ci_u_sm_sorted, ("--",), "CI upper (sm)"),
            (pi_l_sm_sorted, (":",), "PI lower (sm)"),
            (pi_u_sm_sorted, (":",), "PI upper (sm)"),
        ],
        "Statsmodels CI and PI for Weight vs consumption (all predictors used)",
    ),
):
    plt.figure(figsize=(8, 5))
    plt.scatter(x_obs, y_obs, alpha=0.4, label="Observed Weight")
    for curve_values, line_fmt, curve_label in band_curves:
        plt.plot(x_sorted, curve_values, *line_fmt, label=curve_label)
    plt.xlabel("consumption")
    plt.ylabel("Weight")
    plt.title(fig_title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()


## Residual Diagnostics and Plots

Residual analysis is critical for validating model assumptions. We focus on normality, linearity, and constant variance assumptions. Key diagnostic tools include:

### 1. **Q-Q Plot for Residual Normality**

Plots the quantiles of the residuals against theoretical quantiles of a normal distribution.

### 2. **Residuals vs. Fitted Values**

- **Homoscedasticity**: Residuals should be evenly scattered around zero.
- **Non-linearity**: A pattern in residuals suggests that the relationship between predictors and response may not be linear.






### 3. **Residuals plots**

**Component-Residual Plot (Partial Residual Plots):**

* What to See: These plots show the relationship between each predictor and the response variable while controlling for the effect of other variables. They are useful for checking linearity and identifying outliers or influential points.
They visualize the isolated effect of each predictor after adjusting for the other variables.
* Why to Plot: To verify the assumption that the relationship between predictors and the response is linear, and to spot any non-linear patterns, outliers, or points that might have a disproportionate impact on the regression model.

**Added Variable Plot (Partial Regression Plots):**

* What to See: These plots display the relationship between the response and a given predictor, after removing the effect of all other predictors. They help in understanding the individual contribution of a predictor to the model.
* Why to Plot: To assess the unique impact of each predictor on the response, checking for linearity, and identifying potential outliers or influential observations that might affect the slope of the regression line.

**Spread-Level Plot:**

* What to See: This plot shows the spread or variance of the residuals against the predicted values or a predictor. It's used to check the assumption of homoscedasticity (constant variance of errors).
* Why to Plot: To ensure that the error variance is constant across all levels of the predictors. Non-constant variance (heteroscedasticity) can indicate that the model is not capturing some aspect of the data, possibly violating regression assumptions.

In [None]:
# Use the model selected by the F-test based stepwise procedure under the shorter name
reduced_model = reduced_model_F


In [None]:
from statsmodels.graphics.regressionplots import plot_partregress_grid, plot_ccpr_grid


In [None]:
# Spread-Level Plot (Residuals vs Predicted)
# Residuals of the reduced model against its fitted values; the dashed red
# line marks zero.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=reduced_model.fittedvalues, y=reduced_model.resid)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Spread-Level Plot')
plt.show()


In [None]:
import numpy as np
from scipy.stats import levene
from statsmodels.stats.diagnostic import het_breuschpagan, het_white, het_goldfeldquandt

# Extract residuals and exogenous variables from the reduced model
resid = reduced_model.resid
exog = reduced_model.model.exog
fitted = reduced_model.fittedvalues

# 1) Levene's test: compare variance of residuals across bins of fitted values
# (five quantile bins rather than a single median split)
quantiles = np.quantile(fitted, [0, 0.2, 0.4, 0.6, 0.8, 1.0])
last_bin = len(quantiles) - 2
groups = [
    resid[
        (fitted >= quantiles[i])
        # Bins are left-closed and right-open, except the last one, which is
        # closed on the right as well. Otherwise the observation(s) with the
        # largest fitted value (== quantiles[-1]) would belong to no group.
        & ((fitted <= quantiles[i + 1]) if i == last_bin else (fitted < quantiles[i + 1]))
    ]
    for i in range(len(quantiles) - 1)
]
# Keep only non-empty groups
groups = [g for g in groups if len(g) > 0]
lev_stat, lev_pvalue = levene(*groups, center="median")
print(f"Levene statistic: {lev_stat:.4f}, p-value: {lev_pvalue:.4g}")


# 2) Breusch-Pagan test on the residuals and the model's design matrix
bp_stat, bp_pvalue, bp_f, bp_f_pvalue = het_breuschpagan(resid, exog)
print(
    f"Breusch–Pagan: LM={bp_stat:.4f}, p={bp_pvalue:.4g}, "
    f"F={bp_f:.4f}, F p={bp_f_pvalue:.4g}"
)


# 3) White test (general heteroscedasticity, also in statsmodels)
w_stat, w_pvalue, w_f, w_f_pvalue = het_white(resid, exog)
print(
    f"White test: LM={w_stat:.4f}, p={w_pvalue:.4g}, "
    f"F={w_f:.4f}, F p={w_f_pvalue:.4g}"
)




In [None]:
# Component-Residual Plot (Partial Residual Plots)
# The panels for the reduced model are drawn into one pre-sized figure
fig = plt.figure(figsize=(12, 8))
sm.graphics.plot_ccpr_grid(reduced_model, fig=fig)
plt.tight_layout()
plt.show()



In [None]:
# Added Variable Plot (Partial Regression Plots)
# The panels for the reduced model are drawn into one pre-sized figure
fig = plt.figure(figsize=(16, 12))
plot_partregress_grid(reduced_model, fig=fig)
plt.tight_layout()
plt.show()


In [None]:

def plot_regression_diagnostics(model):
    """
    Generate diagnostic plots for a regression model.

    The figure is laid out in three columns and contains, in order: studentized
    residuals vs fitted values, studentized residuals vs each regressor, a
    normal Q-Q plot, and a scale-location plot.

    :param model: The fitted regression model object from statsmodels. The
        first design-matrix column is assumed to be the intercept and is
        skipped.
    :return: A matplotlib figure object containing the diagnostic plots.
    """

    # Internally studentized residuals are used in every panel
    residuals = model.get_influence().resid_studentized

    num_regressors = len(model.model.exog_names) - 1  # Exclude intercept
    # Panels actually drawn: fitted-vs-residuals + one per regressor + Q-Q +
    # scale-location. The original counted one panel too many, which could
    # allocate an entirely empty row of subplots.
    total_plots = num_regressors + 3
    rows = (total_plots + 2) // 3  # ceil(total_plots / 3)

    fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows))
    axes = axes.flatten()  # Flatten to iterate easily

    # Plot Fitted Values vs Residuals
    axes[0].scatter(model.fittedvalues, residuals)
    axes[0].axhline(0, color='red', linestyle='--')
    axes[0].set_xlabel('Fitted Values')
    axes[0].set_ylabel('Studentized residuals')
    axes[0].set_title('Fitted Values vs Residuals')

    # Plot residuals against each regressor (design-matrix column i belongs to
    # the i-th name because enumeration starts at 1, after the intercept)
    for i, col in enumerate(model.model.exog_names[1:], start=1):
        ax = axes[i]
        ax.scatter(model.model.exog[:, i], residuals)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel(col)
        ax.set_ylabel('Studentized residuals')
        ax.set_title(f'Response vs Residuals: {col}')

    # Normal Q-Q plot
    qq_ax = axes[num_regressors + 1]
    sm.qqplot(residuals, line='s', ax=qq_ax)
    qq_ax.set_title('Normal Q-Q')

    # Scale-Location plot
    scale_ax = axes[num_regressors + 2]
    scale_ax.scatter(model.fittedvalues, np.sqrt(np.abs(residuals)))
    scale_ax.axhline(0, color='red', linestyle='--')
    scale_ax.set_xlabel('Fitted Values')
    # The plotted quantity is sqrt(|r|); the label now states exactly that
    scale_ax.set_ylabel('sqrt(|Studentized residuals|)')
    scale_ax.set_title('Scale-Location')

    # Hide any unused subplots in the last row
    for j in range(total_plots, len(axes)):
        axes[j].set_visible(False)

    plt.tight_layout()
    return fig

# Generate the diagnostic plots for the reduced model and render the figure
fig0 = plot_regression_diagnostics(reduced_model)
plt.show()


In [None]:
from statsmodels.graphics.regressionplots import plot_regress_exog

# Check residuals against each independent variable using plot_regress_exog
# NOTE(review): these names are hard-coded and presumably must be regressors of
# reduced_model; confirm against the stepwise result
key_predictors = ['consumption', 'WheelBase', 'Width']

# One figure per predictor
for predictor in key_predictors:
    fig = plt.figure(figsize=(14, 10))
    plot_regress_exog(reduced_model, predictor, fig=fig)
    plt.show()


In [None]:
# Extract predictors from the reduced model
# predictors_stepwise = reduced_model.model.exog_names
# predictors_stepwise.remove('Intercept')  # Remove the intercept from the list
# predictors_stepwise