<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/python/01RAD_Ex08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 08

Let's use the same dataset as in the last exercise.

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from statsmodels.stats.anova import anova_lm
from itertools import combinations
from statsmodels.formula.api import ols
from scipy.stats import f,t,norm

In [None]:
# Load the cars dataset from the 01RAD GitHub repository; fields are separated by ';'.
cars_all = pd.read_csv("https://raw.githubusercontent.com/francji1/01RAD/main/data/carsdata2.csv", sep=";")
# Preview the first rows of the freshly loaded frame.
cars_all.head()

In [None]:
# Display the loaded frame.
cars_all

In [None]:
# Count missing values per column.
cars_all.isna().sum()


In [None]:
# Row masks for body style and drivetrain, read from the indicator columns.
is_sedan = cars_all['Sedan'] == 1
is_sport = cars_all['Sports'] == 1
is_suv = cars_all['SUV'] == 1
# Wagons, minivans and pickups are pooled into the single 'minivan' group.
is_minivan = (cars_all['Wagon'] == 1) | (cars_all['Minivan'] == 1) | (cars_all['Pickup'] == 1)
is_awd = cars_all['AWD'] == 1
is_rwd = cars_all['RWD'] == 1

# Average of city and highway mileage; the expression below looks like an
# MPG -> litres per 100 km conversion (1.60934 km per mile, 3.7854 l per gallon).
mean_mpg = (cars_all['CityMPG'] + cars_all['HwyMPG']) / 2
consumption_values = 100 / (1.60934 * mean_mpg / 3.7854)

# np.select picks the first condition that holds, so list order sets the priority.
body_type = np.select(
    [is_sedan, is_sport, is_suv, is_minivan],
    ['sedan', 'sport', 'suv', 'minivan'],
    default='Unknown',
)
drive_type = np.select([is_awd, is_rwd], ['AWD', 'RWD'], default='FWD')

# Columns carried forward into the analysis frame.
kept_columns = [
    'RetailPrice', 'type', 'consumption', 'wheel_drive', 'DealerCost',
    'EngineSize', 'Cyl', 'HP', 'Weight', 'WheelBase', 'Len', 'Width',
]

cars_all = (
    cars_all
    .assign(consumption=consumption_values, type=body_type, wheel_drive=drive_type)
    .astype({'type': 'category', 'wheel_drive': 'category'})
    .filter(items=kept_columns)
)

cars_all.head()

In [None]:
# Omit rows with NA values.
# Rebinding the name (instead of inplace=True) leaves the previously displayed
# frame object untouched and keeps the step chainable.
cars_all = cars_all.dropna()
# Confirm that no missing values remain.
cars_all.isna().sum()


In [None]:
# Modelling frame: an independent copy of cars_all without the 'Cyl' and 'DealerCost' columns.
cars = cars_all.drop(columns = ['Cyl','DealerCost']).copy()

In [None]:
# Display the modelling frame.
cars

In [None]:
#cars.to_csv('cars.csv', index=False)


Show how to build the model formula with and without one-hot encoded variables.

In [None]:
# One-hot encoding for categorical variables.
# dtype=int keeps the dummies numeric: pandas >= 2.0 produces bool dummies by
# default, and patsy treats bool columns as categorical factors, renaming the
# model terms (e.g. `type_sport[T.True]`) so that name-based lookups fail.
cars_data_encoded = pd.get_dummies(cars, columns=['type', 'wheel_drive'], drop_first=True, dtype=int)

# Full model: every predictor plus all second-order (pairwise) interactions.
predictors = cars_data_encoded.columns.drop('Weight')
interaction_terms = ['{}:{}'.format(a, b) for a, b in combinations(predictors, 2)]

# R's `Weight ~ (.)^2` shorthand does not work here (the `*` operator does),
# so the terms are spelled out explicitly.
# NOTE(review): only `type_sport:type_suv` is subtracted; products of other
# dummies coming from the same categorical column are identically zero as well
# and stay in the formula — confirm this is intended.
formula_full = (
    'Weight ~ '
    + ' + '.join(predictors)
    + ' + '
    + ' + '.join(interaction_terms)
    + '-type_sport:type_suv'
)
formula_full

In [None]:
# OLS fit of the specification held in formula_full
full_model = smf.ols(formula=formula_full, data=cars_data_encoded).fit()

# Keep AIC and BIC alongside the fit
full_model_aic, full_model_bic = full_model.aic, full_model.bic

# Text summary of the fit
full_model_summary = full_model.summary()
print(full_model_summary)

In [None]:
from sklearn.model_selection import train_test_split
import itertools

# Helper: fit an OLS model and report its information criteria
def fit_model(formula, data):
    """Fit an OLS model from a formula and return its AIC, BIC and the fit.

    Parameters
    ----------
    formula : str
        Model formula passed to ``smf.ols``.
    data : DataFrame-like
        Data the formula is evaluated against.

    Returns
    -------
    tuple
        ``(aic, bic, fitted_results)``.
    """
    fitted = smf.ols(formula, data=data).fit()
    aic, bic = fitted.aic, fitted.bic
    return aic, bic, fitted

# Stepwise Regression
def stepwise_selection(data, response, initial_list=[], threshold_in=0.01, threshold_out=0.05):
    """Select predictors for an OLS model by a p-value based stepwise search.

    Each round first tries to add the excluded column with the smallest
    p-value (if it is below ``threshold_in``) and then tries to drop the
    included term with the largest p-value (if it is above ``threshold_out``).
    The search stops after a round in which neither happens.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the response and all candidate predictor columns.
    response : str
        Name of the response column.
    initial_list : list of str, optional
        Predictors to start from. The list is copied, never modified.
    threshold_in : float
        A candidate is added when its p-value is below this value.
    threshold_out : float
        An included term is removed when its p-value exceeds this value.
        Keep ``threshold_in <= threshold_out``; otherwise a term could be
        added and removed in alternation and the loop would not terminate.

    Returns
    -------
    list of str
        Names of the selected predictors, in the order they were added.
    """
    # patsy treats boolean columns as categorical and renames their terms
    # (e.g. `x[T.True]`), which would break the lookups by column name below.
    bool_columns = list(data.select_dtypes(include='bool').columns)
    if bool_columns:
        data = data.astype({column: int for column in bool_columns})

    included = list(initial_list)  # copy, so the shared default list stays empty

    def fit(terms):
        # An empty right-hand side is not a valid formula; fall back to intercept-only.
        rhs = ' + '.join(terms) if terms else '1'
        return smf.ols(f'{response} ~ {rhs}', data=data).fit()

    while True:
        changed = False

        # Forward step: candidates are visited in column order, so ties are
        # resolved deterministically.
        excluded = [
            column for column in data.columns
            if column != response and column not in included
        ]
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            new_pval[new_column] = fit(included + [new_column]).pvalues[new_column]
        if not new_pval.empty:
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                included.append(new_pval.idxmin())
                changed = True

        # Backward step: nothing to test (or remove) while no predictor is included.
        if included:
            model = fit(included)
            # Use all coefs except intercept
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                included.remove(pvalues.idxmax())
                changed = True

        if not changed:
            break

    return included

# Stepwise search for predictors of Weight over the encoded frame
predictors_stepwise = stepwise_selection(cars_data_encoded, 'Weight')

# Refit on the selected predictors and collect AIC / BIC together with the fit
formula_stepwise = f"Weight ~ {' + '.join(predictors_stepwise)}"
aic_stepwise, bic_stepwise, reduced_model = fit_model(formula_stepwise, cars_data_encoded)

# Show the selection, its information criteria and the resulting formula
predictors_stepwise, aic_stepwise, bic_stepwise, formula_stepwise


In [None]:
# Print the text summary of the reduced (stepwise) model.
print(reduced_model.summary())

In [None]:
# Conduct ANOVA (F-test) to compare the full model and the reduced model
# (the reduced model is passed first, the full model second).
anova_results = anova_lm(reduced_model, full_model)
anova_results


### Component-Residual Plot (Partial Residual Plots):

* What to See: These plots show the relationship between each predictor and the response variable while controlling for the effect of other variables. They are useful for checking linearity and identifying outliers or influential points.
* Why to Plot: To verify the assumption that the relationship between predictors and the response is linear, and to spot any non-linear patterns, outliers, or points that might have a disproportionate impact on the regression model.

### Added Variable Plot (Partial Regression Plots):

* What to See: These plots display the relationship between the response and a given predictor, after removing the effect of all other predictors. They help in understanding the individual contribution of a predictor to the model.
* Why to Plot: To assess the unique impact of each predictor on the response, checking for linearity, and identifying potential outliers or influential observations that might affect the slope of the regression line.

### Spread-Level Plot:

* What to See: This plot shows the spread or variance of the residuals against the predicted values or a predictor. It's used to check the assumption of homoscedasticity (constant variance of errors).
* Why to Plot: To ensure that the error variance is constant across all levels of the predictors. Non-constant variance (heteroscedasticity) can indicate that the model is not capturing some aspect of the data, possibly violating regression assumptions.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.regressionplots import plot_partregress_grid, plot_ccpr_grid

# Component-Residual Plot (Partial Residual Plots)
plot_ccpr_grid(reduced_model)
plt.tight_layout()
plt.show()

# Added Variable Plot (Partial Regression Plots), drawn into an explicitly sized figure
fig = plt.figure(figsize=(16, 12))
plot_partregress_grid(reduced_model, fig=fig)
plt.tight_layout()
plt.show()

# Spread-Level Plot (Residuals vs Predicted), using the explicit axes interface
spread_fig, spread_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=reduced_model.fittedvalues, y=reduced_model.resid, ax=spread_ax)
spread_ax.axhline(0, color='red', linestyle='--')  # zero-residual reference line
spread_ax.set_xlabel('Predicted Values')
spread_ax.set_ylabel('Residuals')
spread_ax.set_title('Spread-Level Plot')
plt.show()


In [None]:
from statsmodels.graphics.regressionplots import plot_regress_exog

# Check residuals against each independent variable using plot_regress_exog
# NOTE(review): these names are hard-coded; presumably they are among the terms
# of reduced_model — verify against the stepwise result.
key_predictors = ['consumption', 'WheelBase', 'Width']

for predictor in key_predictors:
    # One diagnostic figure per predictor, drawn into an explicitly sized figure.
    fig = plt.figure(figsize=(14, 10))
    plot_regress_exog(reduced_model, predictor, fig=fig)
    plt.show()


In [None]:
# Extract predictors from the reduced model
# NOTE(review): exog_names may hand back the list object held by the model itself;
# in that case .remove() below also strips 'Intercept' from reduced_model's own
# names, and re-running this cell raises ValueError — TODO confirm and prefer
# building a filtered copy instead.
predictors_stepwise = reduced_model.model.exog_names
predictors_stepwise.remove('Intercept')  # Remove the intercept from the list


In [None]:
# Display the modelling frame.
cars

In [None]:
# Display the predictor names taken from the reduced model.
predictors_stepwise

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats
import matplotlib.pyplot as plt


# One-hot encoding for categorical variables.
# dtype=int keeps the dummies numeric: pandas >= 2.0 produces bool dummies by
# default, which patsy would treat as categorical factors and rename.
cars_data_encoded = pd.get_dummies(cars, columns=['type', 'wheel_drive'], drop_first=True, dtype=int)

# Predictor names of the reduced model without the intercept.
# A filtered copy is built instead of calling .remove() on exog_names, so the
# cell neither alters the model's own name list nor depends on another cell
# having already stripped 'Intercept' from it.
predictors_stepwise = [
    name for name in reduced_model.model.exog_names if name != 'Intercept'
]
predictors_rhs = ' + '.join(predictors_stepwise)

# Log Transformation of the Response
cars_data_encoded['log_Weight'] = np.log(cars_data_encoded['Weight'])
formula_log = 'log_Weight ~ ' + predictors_rhs
model_log = smf.ols(formula=formula_log, data=cars_data_encoded).fit()

# Box-Cox Transformation of the Response
# (with no lmbda given, scipy also returns the lambda it selected)
box_cox_transformed, best_lambda = stats.boxcox(cars_data_encoded['Weight'])
cars_data_encoded['box_cox_Weight'] = box_cox_transformed
formula_box_cox = 'box_cox_Weight ~ ' + predictors_rhs
model_box_cox = smf.ols(formula=formula_box_cox, data=cars_data_encoded).fit()

# Collecting summary statistics for comparison
log_model_summary = model_log.summary()
box_cox_model_summary = model_box_cox.summary()

print("Best Lambda for Box-Cox Transformation:", best_lambda)
print("\nLog-Transformed Model Summary:\n", log_model_summary)
print("\nBox-Cox Transformed Model Summary:\n", box_cox_model_summary)

In [None]:
from scipy import stats
from matplotlib import gridspec
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# Response whose Box-Cox likelihood profile is examined
x = cars_data_encoded['Weight']

# Box-Cox log-likelihood evaluated on a grid of lambda values
lmbdas = np.linspace(-2, 2, 400)
llf = [stats.boxcox_llf(candidate, x) for candidate in lmbdas]

# Grid point with the highest log-likelihood
lmbda_optimal = lmbdas[np.argmax(llf)]

# Likelihood profile with a horizontal line at its value for the best grid lambda
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
ax.set_xlabel('Lambda parameter')
ax.set_ylabel('Box-Cox log-likelihood')

# Probability-plot insets for three lambda values
locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
inset_lambdas = [-1, lmbda_optimal, 9]
for inset_lmbda, inset_loc in zip(inset_lambdas, locs):
    transformed = stats.boxcox(x, lmbda=inset_lmbda)
    (quantiles, ordered), (slope, intercept, r_value) = stats.probplot(transformed)
    ax_inset = inset_axes(ax, width="20%", height="20%", loc=inset_loc)
    # Points of the probability plot plus the fitted straight line
    ax_inset.plot(quantiles, ordered, 'c.', quantiles, slope*quantiles + intercept, 'k-')
    ax_inset.set_xticklabels([])
    ax_inset.set_yticklabels([])
    ax_inset.set_title(r'$\lambda=%1.2f$' % inset_lmbda)

plt.show()

lmbda_optimal


SciPy Box-Cox functions:
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox_llf.html


In [None]:
# Lambda estimate and its confidence interval as returned by scipy (alpha=0.05)
x_boxcox, lmbda_optimal, (lmbda_ci_lower, lmbda_ci_upper) = stats.boxcox(x, alpha=0.05)

# Likelihood profile again, now annotated with the estimate and the interval bounds
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(lmbdas, llf, 'b.-')
ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
ax.axvline(lmbda_optimal, color='r', linestyle='--', label=f'Optimal Lambda: {lmbda_optimal:.2f}')
ax.axvline(lmbda_ci_lower, color='g', linestyle='--', label=f'CI Lower: {lmbda_ci_lower:.2f}')
ax.axvline(lmbda_ci_upper, color='g', linestyle='--', label=f'CI Upper: {lmbda_ci_upper:.2f}')
ax.set_xlabel('Lambda parameter')
ax.set_ylabel('Box-Cox log-likelihood')
ax.legend()

# Probability-plot insets for three lambda values
locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
inset_lambdas = [-1, lmbda_optimal, 9]
for inset_lmbda, inset_loc in zip(inset_lambdas, locs):
    transformed = stats.boxcox(x, lmbda=inset_lmbda)
    (quantiles, ordered), (slope, intercept, r_value) = stats.probplot(transformed)
    ax_inset = inset_axes(ax, width="20%", height="20%", loc=inset_loc)
    # Points of the probability plot plus the fitted straight line
    ax_inset.plot(quantiles, ordered, 'c.', quantiles, slope*quantiles + intercept, 'k-')
    ax_inset.set_xticklabels([])
    ax_inset.set_yticklabels([])
    ax_inset.set_title(r'$\lambda=%1.2f$' % inset_lmbda)

plt.show()

(lmbda_optimal, lmbda_ci_lower, lmbda_ci_upper)
