<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/python/01RAD_Ex07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 07

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import f,t,norm

In [None]:
# Load the cars dataset from the course GitHub repository; the file is semicolon-delimited.
cars_all = pd.read_csv("https://raw.githubusercontent.com/francji1/01RAD/main/data/carsdata2.csv", sep=";")
# Preview the first rows of the loaded frame.
cars_all.head()

In [None]:
# Number of missing values in each column of the freshly loaded data.
cars_all.isna().sum()


In [None]:
# Boolean masks for body style and drivetrain, derived from the 0/1 indicator columns.
sedan_condition = cars_all['Sedan'].eq(1)
sport_condition = cars_all['Sports'].eq(1)
suv_condition = cars_all['SUV'].eq(1)
# Wagons, minivans and pickups are pooled under the single 'minivan' label.
minivan_condition = cars_all[['Wagon', 'Minivan', 'Pickup']].eq(1).any(axis=1)
awd_condition = cars_all['AWD'].eq(1)
rwd_condition = cars_all['RWD'].eq(1)

# Columns carried forward into the analysis frame (in this order).
keep_columns = [
    'RetailPrice', 'type', 'consumption', 'wheel_drive', 'DealerCost',
    'EngineSize', 'Cyl', 'HP', 'Weight', 'WheelBase', 'Len', 'Width',
]

# np.select takes the first matching mask per row; rows matching none get the default label.
body_type = np.select(
    [sedan_condition, sport_condition, suv_condition, minivan_condition],
    ['sedan', 'sport', 'suv', 'minivan'],
    default='Unknown',
)
drivetrain = np.select(
    [awd_condition, rwd_condition],
    ['AWD', 'RWD'],
    default='FWD',
)

cars_all = (
    cars_all
    .assign(
        # Mean of city and highway MPG turned into a fuel-consumption figure;
        # presumably litres per 100 km (1.60934 km per mile, 3.7854 L per US gallon) -- TODO confirm units.
        consumption=lambda d: 100 / (1.60934 * ((d['CityMPG'] + d['HwyMPG']) / 2) / 3.7854),
        type=body_type,
        wheel_drive=drivetrain,
    )
    .astype({'type': 'category', 'wheel_drive': 'category'})
    .filter(items=keep_columns)
)

cars_all.head()

In [None]:
# Number of missing values per column after the feature-engineering step.
cars_all.isna().sum()


In [None]:
# Keep only 4-, 6- and 8-cylinder cars, make 'Cyl' categorical, then drop incomplete rows.
# Written as a single chain that returns a new frame: assigning a column on a
# boolean-filtered frame raises pandas' SettingWithCopyWarning, and inplace=True
# mutation makes the cell harder to reason about when it is re-run.
cars_all = (
    cars_all
    .loc[cars_all['Cyl'].isin([4, 6, 8])]  # rows with a missing 'Cyl' are dropped here too
    .astype({'Cyl': 'category'})
    .dropna()
)

In [None]:
# Select only numeric columns for the pair plot
# (float64/int64 dtypes only, so the categorical columns are left out).
numeric_cols = cars_all.select_dtypes(include=['float64', 'int64'])
# Scatter-plot matrix over every pair of the selected columns.
sns.pairplot(numeric_cols)
plt.show()

In [None]:
# Frame holding just the category-typed columns.
categorical_cols = cars_all.select_dtypes(include=['category'])

# One bar chart of level frequencies per categorical column, each in its own figure.
for col in categorical_cols.columns:
    ax = sns.countplot(x=col, data=cars_all)
    ax.set_title(f'Count Plot for {col}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Cross-tabulated count plot of the first two categorical columns.
col1 = categorical_cols.columns[0]
col2 = categorical_cols.columns[1]

# sns.catplot is figure-level: it always creates its own figure, so a preceding
# plt.figure(figsize=...) would only leave an empty figure behind and its size would
# be ignored. The 12x6 inch size is requested through height/aspect instead
# (width = height * aspect).
grid = sns.catplot(
    x=col1, hue=col2, data=cars_all, kind='count', palette='Set2',
    height=6, aspect=2,
)
grid.ax.set_title(f'Count Plot for {col1} by {col2}')
grid.ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Independent copy of the cleaned frame, so later changes to one do not affect the other.
cars = cars_all.copy()

In [None]:
# Simple linear regression (m1): Weight ~ EngineSize
# NOTE(review): this model is fitted on `cars`, whereas the additive and interaction
# models are fitted on `cars_all`; the nested-model comparisons assume the same rows.
model_simple = smf.ols('Weight ~ EngineSize', data=cars).fit()
print(model_simple.summary())

In [None]:
# Additive multivariate regression (m2): Weight ~ EngineSize + HP
# Adds HP as a second main effect on top of the simple model.
model_additive = smf.ols('Weight ~ EngineSize + HP', data=cars_all).fit()
print(model_additive.summary())

In [None]:
# Multivariate regression with interaction (m3): Weight ~ EngineSize + HP + EngineSize:HP
# The EngineSize:HP term is the product interaction between the two predictors.
model_interaction = smf.ols('Weight ~ EngineSize + HP + EngineSize:HP', data=cars_all).fit()
print(model_interaction.summary())

### Compare the three models using ANOVA and the F-test

In [None]:
# Pairwise nested-model comparisons; each pair is (smaller model, larger model).
nested_pairs = [
    (model_simple, model_additive),       # m1 vs m2
    (model_additive, model_interaction),  # m2 vs m3
    (model_simple, model_interaction),    # m1 vs m3
]
for smaller_model, larger_model in nested_pairs:
    print(anova_lm(smaller_model, larger_model, typ=1))


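Calculating SSR (the regression sum of squares) and RSS (the residual sum of squares) manually and using them in the F-test. Note that the statsmodels attribute `.ssr` is the *residual* sum of squares, i.e. what is called RSS here.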


In [None]:

# Grand mean of the response; the regression sums of squares are centred on it.
weight_mean = cars_all['Weight'].mean()

# SSR (Sum of Squares due to Regression): squared distance of fitted values from the mean.
# Vectorised Series.sum() replaces the Python builtin sum(), which iterates the Series
# element by element at interpreter speed.
SSR3 = ((model_interaction.fittedvalues - weight_mean) ** 2).sum()
SSR1 = ((model_simple.fittedvalues - weight_mean) ** 2).sum()

# RSS (Residual Sum of Squares)
RSS3 = (model_interaction.resid ** 2).sum()
RSS1 = (model_simple.resid ** 2).sum()

# Mean Squared Error (MSE) for m3: residual sum of squares per residual degree of freedom
MSE = RSS3 / model_interaction.df_resid



In [None]:
# Partial F statistic for the nested pair m1 (simple) vs m3 (interaction).
# The statsmodels attribute `.ssr` is the residual sum of squares of a fitted model.
extra_df = model_simple.df_resid - model_interaction.df_resid  # parameters added by m3

# Drop in residual sum of squares per added parameter ...
numerator = (model_simple.ssr - model_interaction.ssr) / extra_df
# ... relative to the residual variance estimate of the larger model.
denominator = model_interaction.ssr / model_interaction.df_resid

F_statistic = numerator / denominator
F_statistic

In [None]:
# Same F statistic, this time built from the regression sums of squares (SSR).
# Vectorised Series.sum() replaces the Python builtin sum(), which loops over the
# Series element by element.
weight_mean = cars_all['Weight'].mean()

# Calculating SSR for each model
SSR1 = ((model_simple.fittedvalues - weight_mean) ** 2).sum()
SSR3 = ((model_interaction.fittedvalues - weight_mean) ** 2).sum()

# Calculating MSE for the more complex model (m3)
MSE3 = (model_interaction.resid ** 2).sum() / model_interaction.df_resid

# Calculating the F-statistic: gain in explained sum of squares per added parameter,
# divided by the residual variance estimate of m3
F_value = (SSR3 - SSR1) / (MSE3 * (model_simple.df_resid - model_interaction.df_resid))
F_value

In [None]:
# Upper-tail probability of the F statistic under the F(dfn, dfd) distribution.
dfn = model_simple.df_resid - model_interaction.df_resid  # numerator degrees of freedom
dfd = model_interaction.df_resid                          # denominator degrees of freedom
p_value = f.sf(F_statistic, dfn, dfd)

print("F-statistic:", F_statistic)
print("p-value:", p_value)


In [None]:
# Observed data: Weight against EngineSize as small black points.
fig, ax = plt.subplots()
ax.scatter(cars_all['EngineSize'], cars_all['Weight'], color='black', s=10)  # s is the marker size

# Overlay a straight-line fit with its confidence band; seaborn fits this line itself
# rather than reusing model_simple.
sns.regplot(x='EngineSize', y='Weight', data=cars_all, scatter=False, color='blue', label='Linear Fit', ax=ax)

ax.set_xlabel('EngineSize')
ax.set_ylabel('Weight')
ax.set_title('Weight vs EngineSize with Regression Line')
ax.legend()
plt.show()

In [None]:
# Alias for the simple model: binds a second name to the same fitted object (no copy).
model = model_simple

In [None]:
# Store the fitted values of the simple model next to the data (aligned on the row index).
cars_all['fit_m1'] = model_simple.fittedvalues

# Create the base plot with the observed points
plt.figure(figsize=(10, 6))
sns.scatterplot(x='EngineSize', y='Weight', data=cars_all, color='blue')

# Add the linear regression line
sns.lineplot(x='EngineSize', y='fit_m1', data=cars_all, color='green')

# Add red vertical segments from each observed value to its fitted value (the residuals).
# A single vectorised vlines call replaces a per-row iterrows() loop that issued one
# plt.plot call, and created one line artist, for every observation.
plt.vlines(
    cars_all['EngineSize'], cars_all['Weight'], cars_all['fit_m1'],
    colors='red', linewidth=0.3,
)

# Additional plot settings
plt.xlabel('EngineSize')
plt.ylabel('Weight')
plt.title('Weight vs EngineSize with Linear Model Fit and Residuals')
plt.show()


In [None]:
# statsmodels' built-in fit plot for the simple model, drawn against EngineSize.
fig, ax = plt.subplots(figsize=(10, 6))
sm.graphics.plot_fit(model_simple, 'EngineSize', ax=ax)

# Replace the labels and title set by plot_fit with our own.
ax.set(
    ylabel='Weight',
    xlabel='EngineSize',
    title='Weight vs EngineSize with Linear Model Fit and Residuals',
)

plt.show()


In [None]:
# Two residual diagnostics for the simple model, each in its own figure:
# residuals against the fitted values, and against EngineSize (as an example covariate).
residual_views = [
    (model_simple.fittedvalues, 'Fitted values', 'Residuals vs Fitted'),
    (cars_all['EngineSize'], 'EngineSize', 'Residuals vs EngineSize'),
]

for x_values, x_label, plot_title in residual_views:
    plt.scatter(x_values, model_simple.resid)
    # Zero reference line: residuals should scatter evenly around it.
    plt.axhline(y=0, color='grey', linestyle='--')
    plt.xlabel(x_label)
    plt.ylabel('Residuals')
    plt.title(plot_title)
    plt.show()



In [None]:
from statsmodels.graphics.gofplots import qqplot

# Normal Q-Q plot of the simple model's residuals.
# qqplot creates its own figure when no Axes is passed, which would leave the sized
# figure created beforehand empty; draw on an explicit Axes so the 8x6 size applies.
fig, ax = plt.subplots(figsize=(8, 6))
qqplot(model_simple.resid, line='s', ax=ax)  # line='s': standardized reference line
ax.set_title('Normal Q-Q')
plt.show()


In [None]:
# Scale-Location (Spread-Location) plot: square root of the absolute internally
# studentized residuals against the fitted values of the simple model.
standardized_resids = model_simple.get_influence().resid_studentized_internal
sqrt_abs_resids = np.sqrt(np.abs(standardized_resids))

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(model_simple.fittedvalues, sqrt_abs_resids)
ax.axhline(y=0, color='grey', linestyle='--')
ax.set(
    xlabel='Fitted values',
    ylabel='Sqrt(Abs(Standardized Residuals))',
    title='Scale-Location',
)
plt.show()


In [None]:
# Residuals of the simple model against the covariate EngineSize, with a LOWESS
# smoother in red to reveal any remaining trend.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(cars_all['EngineSize'], model_simple.resid)
sns.regplot(
    x='EngineSize', y=model_simple.resid, data=cars_all,
    lowess=True, scatter=False, color='red', ci=None, ax=ax,
)
# Set after regplot so these labels replace the ones seaborn derives from the inputs.
ax.set(xlabel='Engine Size', ylabel='Residuals', title='Residuals vs Engine Size')
plt.show()


In [None]:
# Durbin-Watson Test
# Computes the Durbin-Watson statistic on the simple model's residuals, taken in row order.
from statsmodels.stats.stattools import durbin_watson

dw_stat = durbin_watson(model_simple.resid)
print('Durbin-Watson statistic:', dw_stat)

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan

# Perform Breusch-Pagan test for heteroscedasticity on the simple model,
# passing its residuals and its design matrix (model.exog).
bp_test = het_breuschpagan(model_simple.resid, model_simple.model.exog)
# Pair each returned value with a readable name before printing.
labels = ['Lagrange Multiplier statistic', 'p-value', 'f-value', 'f p-value']
print(dict(zip(labels, bp_test)))


In [None]:
# Lag-1 residual plot for the interaction model: each residual r_i against its
# predecessor r_{i-1} (in row order), a visual check for serial dependence.
residuals = model_interaction.resid
# np.roll(..., -1) moves every value one position forward, so position i holds r_{i+1}.
shifted_residuals = np.roll(residuals, -1)

# Create a DataFrame for plotting; the last position is dropped because the roll
# wraps the first residual around to the end.
residuals_df = pd.DataFrame({
    'Residuals_i-1': residuals[:-1],
    'Residuals_i': shifted_residuals[:-1]
})

# Create scatter plot
plt.figure(figsize=(10, 6))
plt.scatter('Residuals_i-1', 'Residuals_i', data=residuals_df, color='blue')

# The title announces a LOWESS smoother and a linear fit, but neither was drawn;
# add both overlays so the figure matches its title.
sns.regplot(x='Residuals_i-1', y='Residuals_i', data=residuals_df,
            scatter=False, lowess=True, ci=None, color='red', label='LOWESS')
sns.regplot(x='Residuals_i-1', y='Residuals_i', data=residuals_df,
            scatter=False, ci=None, color='green', label='Linear Fit')

# Add horizontal line at 0
plt.axhline(y=0, color='grey', linestyle='--')

# Labels and title (set after regplot so they replace the column-name labels)
plt.xlabel(r'$r_{i-1}$')
plt.ylabel(r'$r_i$')
plt.title('Scatter plot of Residuals: $r_i$ vs $r_{i-1}$ with LOWESS and Linear Fit')
plt.legend()
plt.show()


In [None]:
from scipy.stats import shapiro, anderson
from statsmodels.stats.diagnostic import lilliefors

In [None]:
# Lilliefors test for normality, applied to the residuals of the interaction model.
lillie_result = lilliefors(model_interaction.resid)
print("Lilliefors test result:", lillie_result)

In [None]:
# Shapiro-Wilk test for normality, applied to the residuals of the interaction model.
shapiro_result = shapiro(model_interaction.resid)
print("Shapiro-Wilk test result:", shapiro_result)

In [None]:
# Anderson-Darling test for normality, applied to the residuals of the interaction model
# (called without a `dist` argument, so scipy's default distribution is used).
ad_result = anderson(model_interaction.resid)
print("Anderson-Darling test result:", ad_result)

### How to improve the model?

