<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/python/01RAD_Ex09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf


In [None]:
#import os
#current_directory = os.getcwd()
#print("Current Working Directory:", current_directory)

In [None]:
# Seed NumPy's global random number generator so that the np.random draws
# in the following cells give the same simulated data on every run.
np.random.seed(4242)

In [None]:
# Simulation settings: n observations, p regression coefficients
# (intercept plus three regressors)
n = 100
p = 4

# Random errors; drawn first so the sequence of random draws is unchanged
e = np.random.normal(0, 4, n)

# True coefficients as a column vector (intercept, X1, X2, X3)
beta0 = np.array([[5], [3], [2], [-5]])

# Regressors: constant, normal, shifted exponential, shifted binomial
X0 = np.ones(n)
X1 = np.random.normal(20, 3, n)
X2 = 10 + np.random.exponential(1/0.1, n)
X3 = 5 + np.random.binomial(15, 0.2, n)

# Response: linear predictor plus error
Y = (np.column_stack((X0, X1, X2, X3)) @ beta0).ravel() + e

# Collect everything in one data frame
data0 = pd.DataFrame(dict(X0=X0, X1=X1, X2=X2, X3=X3, Y=Y))

# First rows and summary statistics of the simulated data
print(data0.head())
print(data0.describe())


In [None]:
# Selecting the variables: X holds the three regressor columns and Y the
# response column of data0. Note that this rebinds Y from the NumPy array
# created in the simulation cell to a pandas Series.
X = data0[['X1', 'X2', 'X3']]
Y = data0['Y']

# Visualization: pairwise plots of the regressors and the response
sns.pairplot(data0, vars=['X1', 'X2', 'X3', 'Y'])

In [None]:
# Fitting the linear regression model of Y on X1, X2 and X3 by ordinary
# least squares through the formula interface; `model` is the fitted
# results object used by all diagnostics below.
model = smf.ols('Y ~ X1 + X2 + X3', data=data0).fit()

# Displaying the summary of the model
print(model.summary())

In [None]:
#X = sm.add_constant(X)  # Add an intercept term to the predictor variables
#model = sm.OLS(Y, X)  # Create the model object
#model_results = model.fit()


In [None]:
def plot_regression_diagnostics(model):
    """
    Generate diagnostic plots for a regression model.

    The figure contains, in this order: residuals against fitted values,
    residuals against every non-constant regressor, a normal Q-Q plot of the
    residuals and a scale-location plot. Panels are laid out three per row and
    the number of rows grows with the number of regressors, so the layout is
    the original 2 x 3 grid for a model with three regressors.

    :param model: The fitted regression model object from statsmodels.
    :return: A matplotlib figure object containing the diagnostic plots.
    """
    exog = np.asarray(model.model.exog)

    # Keep every design column that actually varies. This drops the intercept
    # wherever it sits instead of assuming it is the first column.
    regressors = [
        (name, exog[:, j])
        for j, name in enumerate(model.model.exog_names)
        if np.ptp(exog[:, j]) > 0
    ]

    n_cols = 3
    n_panels = len(regressors) + 3  # fitted-vs-residuals, Q-Q, scale-location
    n_rows = int(np.ceil(n_panels / n_cols))
    fig = plt.figure(figsize=(15, 4 * n_rows))

    # Plot of Fitted Values vs Residuals
    ax = fig.add_subplot(n_rows, n_cols, 1)
    ax.scatter(model.fittedvalues, model.resid)
    ax.axhline(0, color='red', linestyle='--')
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    ax.set_title('Fitted Values vs Residuals')

    # Residuals against each regressor
    for panel, (name, values) in enumerate(regressors, 2):
        ax = fig.add_subplot(n_rows, n_cols, panel)
        ax.scatter(values, model.resid)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel(name)
        ax.set_ylabel('Residuals')
        ax.set_title(f'Residuals vs {name}')

    # Normal Q-Q plot
    ax = fig.add_subplot(n_rows, n_cols, len(regressors) + 2)
    sm.qqplot(model.resid, line='s', ax=ax)
    ax.set_title('Normal Q-Q')

    # Scale-Location plot: square root of the absolute *standardized*
    # (internally studentized) residuals, so the axis label matches the data.
    standardized = model.get_influence().resid_studentized_internal
    ax = fig.add_subplot(n_rows, n_cols, len(regressors) + 3)
    ax.scatter(model.fittedvalues, np.sqrt(np.abs(standardized)))
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('sqrt(|Standardized Residuals|)')
    ax.set_title('Scale-Location')

    fig.tight_layout()
    return fig


In [None]:
# Draw the diagnostic panels for the fitted model and display them
fig0 = plot_regression_diagnostics(model)
plt.show()

In [None]:
from statsmodels.graphics.regressionplots import plot_ccpr_grid

def plot_component_residuals(model):
    """
    Draw Component-Residual (partial residual) plots for a fitted model.

    The panels are produced by statsmodels' ``plot_ccpr_grid`` on a freshly
    created 12 x 8 inch figure.

    :param model: The fitted regression model object from statsmodels.
    :return: The matplotlib figure holding the Component-Residual Plots.
    """
    canvas = plt.figure(figsize=(12, 8))
    plot_ccpr_grid(model, fig=canvas)
    plt.tight_layout()
    return canvas


In [None]:
from statsmodels.graphics.regressionplots import plot_partregress_grid

def plot_added_variable(model):
    """
    Draw Added Variable (partial regression) plots for a fitted model.

    The panels are produced by statsmodels' ``plot_partregress_grid`` on a
    freshly created 12 x 8 inch figure.

    :param model: The fitted regression model object from statsmodels.
    :return: The matplotlib figure holding the Added Variable Plots.
    """
    canvas = plt.figure(figsize=(12, 8))
    plot_partregress_grid(model, fig=canvas)
    plt.tight_layout()
    return canvas


In [None]:
# Component-residual (partial residual) plots for the fitted model
fig1 = plot_component_residuals(model)
plt.show()

# Added-variable (partial regression) plots for the same model
fig2 = plot_added_variable(model)
plt.show()

## Residuals in Linear Regression

In linear regression, residuals are the differences between the observed values and the values predicted by the model.

### 1. Classical (Ordinary) Residuals
Classical residuals are calculated as the difference between the observed values and the predicted values from the regression model.

**Equation:**
$$
e_i = y_i - \hat{y}_i
$$
where $e_i$ is the residual for the $i$-th observation, $y_i$ is the observed value, and $\hat{y}_i$ is the value predicted by the model.

### 2. Standardized Residuals
Standardized residuals are classical residuals scaled by an estimate of their standard deviation.

**Equation:**
$$
r_i = \frac{e_i}{\hat{\sigma} \sqrt{1 - h_{ii}} }
$$
where $r_i$ is the standardized residual, $\hat{\sigma}$ is the estimated standard deviation of the errors (the residual standard error), and $h_{ii}$ is the leverage of the $i$-th observation.

### 3. Studentized Residuals
Studentized (externally studentized) residuals are similar to standardized residuals, but the standard deviation is estimated from the model fitted without the $i$-th observation, so an outlying observation cannot inflate its own scale estimate.

**Equation:**
$$
t_i = \frac{e_i}{\hat{\sigma}_{(i)} \sqrt{1 - h_{ii}} }
$$
where $t_i$ is the studentized residual, $\hat{\sigma}_{(i)}$ is the residual standard deviation estimated with the $i$-th observation excluded, and $h_{ii}$ is the leverage of the $i$-th observation.



In [None]:
def calculate_classical_residuals(model, data):
    """
    Calculate classical (ordinary) residuals for a regression model.

    :param model: The fitted regression results object from statsmodels. An
        object that exposes ``endog_names`` directly is accepted as well.
    :param data: DataFrame-like object holding the response column the model
        was fitted on.
    :return: Observed response minus the in-sample predictions of ``model``.
    """
    # The response name is stored on the underlying model (``results.model``),
    # not on the fitted results object itself; fall back to ``model`` when it
    # has no ``model`` attribute.
    spec = getattr(model, 'model', model)
    observed = data[spec.endog_names]
    predicted = model.predict()
    classical_residuals = observed - predicted
    return classical_residuals

def manual_classical_residuals(observed, predicted):
    """
    Return the classical residuals: observed minus predicted values.
    """
    residuals = observed - predicted
    return residuals

In [None]:
def calculate_standardized_residuals(model):
    """
    Return the standardized residuals of a fitted regression model.

    The values are read from the ``resid_studentized_internal`` attribute of
    the influence object returned by ``model.get_influence()``.
    """
    return model.get_influence().resid_studentized_internal

def manual_standardized_residuals(observed, predicted, leverage, n_params=None):
    """
    Compute standardized (internally studentized) residuals by hand.

    r_i = e_i / (sigma_hat * sqrt(1 - h_ii)), where sigma_hat^2 is the residual
    sum of squares divided by the residual degrees of freedom n - n_params.

    :param observed: Observed response values.
    :param predicted: Fitted values from the model.
    :param leverage: Diagonal of the hat matrix (h_ii), one value per observation.
    :param n_params: Number of estimated coefficients including the intercept.
        When omitted it is taken from the trace of the hat matrix, i.e. the
        rounded sum of ``leverage``; this equals 2 for a simple regression with
        intercept.
    :return: Standardized residuals, same length as ``observed``.
    :raises ValueError: If there are no residual degrees of freedom left.
    """
    leverage = np.asarray(leverage, dtype=float)
    residuals = observed - predicted

    if n_params is None:
        # For least squares the leverages sum to the number of coefficients.
        n_params = int(round(float(np.sum(leverage))))

    dof = len(residuals) - n_params
    if dof <= 0:
        raise ValueError("need more observations than estimated parameters")

    residual_std = np.sqrt(np.sum(residuals**2) / dof)
    return residuals / (residual_std * np.sqrt(1 - leverage))


In [None]:
def calculate_studentized_residuals(model):
    """
    Return the studentized residuals of a fitted regression model.

    The values are read from the ``resid_studentized_external`` attribute of
    the influence object returned by ``model.get_influence()``.
    """
    return model.get_influence().resid_studentized_external

def manual_studentized_residuals(observed, predicted, leverage, n_params=None):
    """
    Compute externally studentized residuals by hand.

    t_i = e_i / (sigma_(i) * sqrt(1 - h_ii)), where sigma_(i) is the residual
    standard deviation of the model refitted without observation i. The refit
    is not carried out explicitly: removing observation i lowers the residual
    sum of squares by e_i^2 / (1 - h_ii) and the degrees of freedom by one.

    :param observed: Observed response values (1-D).
    :param predicted: Fitted values from the model (1-D).
    :param leverage: Diagonal of the hat matrix (h_ii), one value per observation.
    :param n_params: Number of estimated coefficients including the intercept.
        When omitted it is taken from the trace of the hat matrix, i.e. the
        rounded sum of ``leverage``.
    :return: NumPy array of studentized residuals.
    :raises ValueError: If the leave-one-out fit has no residual degrees of
        freedom.
    """
    residuals = np.asarray(observed, dtype=float) - np.asarray(predicted, dtype=float)
    leverage = np.asarray(leverage, dtype=float)

    if n_params is None:
        # For least squares the leverages sum to the number of coefficients.
        n_params = int(round(float(leverage.sum())))

    dof_without_i = residuals.size - n_params - 1
    if dof_without_i <= 0:
        raise ValueError("need at least n_params + 2 observations")

    # Residual sum of squares of each leave-one-out fit. Simply dropping e_i
    # from the full-data residuals would not give this quantity.
    sse_without_i = np.sum(residuals**2) - residuals**2 / (1 - leverage)
    std_without_i = np.sqrt(sse_without_i / dof_without_i)

    return residuals / (std_without_i * np.sqrt(1 - leverage))


In [None]:
# Get the influence object
influence = model.get_influence()

# Extract leverage values (diagonal of the hat matrix)
leverage = influence.hat_matrix_diag

# Observed response and in-sample predictions of the fitted model
observed = data0.Y
predicted = model.predict()

# Calculate residuals
classical_residuals = manual_classical_residuals(observed, predicted)
#standardized_residuals = manual_standardized_residuals(observed, predicted, leverage)
#studentized_residuals = manual_studentized_residuals(observed, predicted, leverage)


In [None]:
# Display the simulated data frame as the cell output
data0

## Influence Measures in Linear Regression

In linear regression, influence measures are used to identify observations that have a disproportionate impact on the model. These measures help in diagnosing the model's robustness and identifying outliers or influential points. Below are key influence measures commonly used:

### 1. DFBETAS
DFBETAS measures the change in each coefficient estimate when an observation is omitted, scaled by an estimate of that coefficient's standard error.

**Equation:**
$$
DFBETAS_{ij} = \frac{\hat{\beta}_j - \hat{\beta}_{j(i)}}{\sqrt{\hat{\sigma}^2_{(i)} (X^T X)^{-1}_{jj}}}
$$
where $\hat{\beta}_j$ is the estimate of the $j$-th coefficient, $\hat{\beta}_{j(i)}$ is the same coefficient estimated with the $i$-th observation omitted, $\hat{\sigma}_{(i)}$ is the residual standard deviation estimated without the $i$-th observation, and $(X^T X)^{-1}_{jj}$ is the $j$-th diagonal element of the inverse of $X^T X$.

### 2. DFFITS
DFFITS is an influence statistic that measures the effect of deleting a single observation.

**Equation:**
$$
DFFITS_i = \frac{\hat{y}_i - \hat{y}_{i(i)}}{\hat{\sigma}_{(i)} \sqrt{h_{ii}}}
$$
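where $\hat{y}_i$ is the predicted value with all observations, $\hat{y}_{i(i)}$ is the predicted value for the $i$-th observation when that observation is omitted from the fit, $\hat{\sigma}_{(i)}$ is the residual standard deviation estimated without the $i$-th observation, and $h_{ii}$ is the leverage of the $i$-th observation.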

### 3. Leverage Values (h values)
Leverage values measure the influence of each observation on its own fitted value. High leverage points can significantly alter the position of the regression line.

**Equation:**
$$
h_{ii} = X_i (X^T X)^{-1} X_i^T
$$
where $X_i$ is the $i$-th row of the design matrix $X$ (including the intercept column).

### 4. Covariance Ratios
Covariance ratios compare the determinants of the covariance matrices with and without each observation. They help identify observations that influence the variance of the parameter estimates.

**Equation:**
$$
CR_i = \frac{\det(\hat{\Sigma}_{(i)})}{\det(\hat{\Sigma})}
$$
where $\hat{\Sigma}_{(i)}$ is the estimated covariance matrix of the coefficients with the $i$-th observation omitted and $\hat{\Sigma}$ is the estimated covariance matrix with all observations.

### 5. Cook's Distances
Cook's distance measures the effect of deleting a single observation on the entire regression model. It is a commonly used metric to identify influential observations.

**Equation:**
$$
D_i = \frac{\sum_{j=1}^n (\hat{y}_j - \hat{y}_{j(i)})^2}{(p+1) \hat{\sigma}^2}
$$
where $\hat{y}_j$ is the predicted value for the $j$-th observation, $\hat{y}_{j(i)}$ is the predicted value for the $j$-th observation with the $i$-th observation omitted from the fit, $p$ is the number of predictors (so $p+1$ is the number of estimated coefficients, including the intercept), and $\hat{\sigma}^2$ is the estimated variance of the residuals.



## Rules of Thumb for Influence Measures

### 1. DFBETAS

**Rule of Thumb:** An observation is considered influential if the absolute value of DFBETAS for any coefficient exceeds $ \frac{2}{\sqrt{n}} $, where $ n $ is the number of observations.

### 2. DFFITS

**Rule of Thumb:** An observation is considered influential if the absolute value of DFFITS is larger than $2 \sqrt{\frac{p+1}{n}}$, where $p$ is the number of predictors and $n$ is the number of observations.

### 3. Leverage Values (h values)

**Rule of Thumb:** An observation is considered to have high leverage if its leverage value exceeds $\frac{2(p+1)}{n}$, where $p$ is the number of predictors and $n$ is the number of observations.

### 4. Covariance Ratios

**Rule of Thumb:** There is no widely accepted rule of thumb for covariance ratios, but observations with values far from 1 (either much larger or smaller) are generally considered influential.

### 5. Cook's Distances

**Rule of Thumb:** An observation is considered influential if its Cook's distance is greater than $\frac{4}{n}$, where $n$ is the number of observations.


In [None]:
def create_influence_dataframe(model):
    """
    Collect the influence statistics of a fitted model in one DataFrame.

    The frame has one row per observation and the columns DFFITS, Leverage,
    Covariance Ratio and Cook's Distance, followed by one DFBETA_<k> column
    for every column of the model's DFBETAS matrix.
    """
    infl = model.get_influence()
    beta_shifts = infl.dfbetas

    # dffits and cooks_distance are tuples; the statistic is the first item
    columns = {
        'DFFITS': infl.dffits[0],
        'Leverage': infl.hat_matrix_diag,
        'Covariance Ratio': infl.cov_ratio,
        'Cook\'s Distance': infl.cooks_distance[0],
    }

    # One DFBETA column per coefficient, appended after the scalar measures
    columns.update(
        (f'DFBETA_{k}', beta_shifts[:, k]) for k in range(beta_shifts.shape[1])
    )

    return pd.DataFrame(columns)

# Build the table of influence statistics for the fitted model and print it
influence_df = create_influence_dataframe(model)
print(influence_df)


In [None]:
def create_influence_dataframe_with_violations(model):
    """
    Tabulate influence statistics and flag rule-of-thumb violations.

    The returned DataFrame has one row per observation with the columns
    DFFITS, Leverage, Covariance Ratio, Cook's Distance and one DFBETA_<k>
    column per coefficient. The extra 'Violations' column holds a
    comma-separated list of the statistics whose rule of thumb is exceeded
    for that observation (an empty string when none is). The covariance ratio
    is reported but never flagged.
    """
    infl = model.get_influence()
    beta_shifts = infl.dfbetas
    beta_labels = [f'DFBETA_{k}' for k in range(beta_shifts.shape[1])]

    # Cut-offs: n observations, p regressors as reported by the model
    n_obs = model.nobs
    n_regressors = model.df_model
    dffits_cut = 2 * np.sqrt((n_regressors + 1) / n_obs)
    leverage_cut = 2 * (n_regressors + 1) / n_obs
    cooks_cut = 4 / n_obs
    dfbetas_cut = 2 / np.sqrt(n_obs)

    table = pd.DataFrame({
        'DFFITS': infl.dffits[0],
        'Leverage': infl.hat_matrix_diag,
        'Covariance Ratio': infl.cov_ratio,
        'Cook\'s Distance': infl.cooks_distance[0],
    })
    for k, label in enumerate(beta_labels):
        table[label] = beta_shifts[:, k]

    # One boolean mask per checked statistic, in the order they are reported
    checks = [
        ('DFFITS', table['DFFITS'].abs().to_numpy() > dffits_cut),
        ('Leverage', table['Leverage'].to_numpy() > leverage_cut),
        ('Cook\'s Distance', table['Cook\'s Distance'].to_numpy() > cooks_cut),
    ]
    checks.extend(
        (label, table[label].abs().to_numpy() > dfbetas_cut)
        for label in beta_labels
    )
    check_names = [name for name, _ in checks]
    hit_matrix = np.column_stack([mask for _, mask in checks])

    table['Violations'] = [
        ', '.join(name for name, hit in zip(check_names, hits) if hit)
        for hits in hit_matrix
    ]

    return table

# Influence statistics with flagged rule-of-thumb violations; print the
# first rows only
influence_df_with_violations = create_influence_dataframe_with_violations(model)
print(influence_df_with_violations.head())


In [None]:
def manual_dfbetas(X, y, betas, sigma=None):
    """
    Compute DFBETAS by refitting the model with each observation left out.

    :param X: Design matrix of shape (n, p); NumPy arrays and pandas objects
        are both accepted.
    :param y: Response vector of length n.
    :param betas: Coefficients estimated from the full data (length p).
    :param sigma: Residual scale used in the denominator. A scalar applies the
        same scale to every observation, an array of length n supplies one
        scale per observation, and ``None`` uses the leave-one-out estimate
        sigma_(i) obtained from each refit.
    :return: Array of shape (n, p); entry [i, j] is the scaled change in
        coefficient j when observation i is omitted.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    betas = np.asarray(betas, dtype=float).ravel()
    n, p = X.shape

    # Loop-invariant: the scaling always uses the full-data (X'X)^{-1}.
    coef_scale = np.sqrt(np.diag(np.linalg.inv(X.T @ X)))
    fixed_scale = None
    if sigma is not None:
        fixed_scale = np.broadcast_to(np.asarray(sigma, dtype=float), (n,))

    dfbetas = np.zeros((n, p))
    for i in range(n):
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)
        betas_exclude_i = np.linalg.inv(X_exclude_i.T @ X_exclude_i) @ (X_exclude_i.T @ y_exclude_i)

        if fixed_scale is None:
            # Residual standard deviation of the fit without observation i
            resid_exclude_i = y_exclude_i - X_exclude_i @ betas_exclude_i
            sigma_i = np.sqrt(resid_exclude_i @ resid_exclude_i / (n - 1 - p))
        else:
            sigma_i = fixed_scale[i]

        dfbetas[i] = (betas - betas_exclude_i) / (sigma_i * coef_scale)

    return dfbetas


In [None]:
def manual_dffits(X, y, y_hat, sigma=None):
    """
    Compute DFFITS by refitting the model with each observation left out.

    DFFITS_i = (y_hat_i - y_hat_i(i)) / (sigma_(i) * sqrt(h_ii)), where
    y_hat_i(i) is the prediction for observation i from the fit without it and
    h_ii is the leverage of observation i.

    :param X: Design matrix of shape (n, p); NumPy arrays and pandas objects
        are both accepted.
    :param y: Response vector of length n.
    :param y_hat: Fitted values from the full-data model (length n).
    :param sigma: Residual scale used in the denominator. A scalar applies the
        same scale to every observation, an array of length n supplies one
        scale per observation, and ``None`` uses the leave-one-out estimate
        sigma_(i) obtained from each refit.
    :return: Array of length n with the DFFITS values.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    n, p = X.shape

    # Leverages h_ii = x_i (X'X)^{-1} x_i'. The scaling needs one value per
    # observation, not an entry of the p x p inverse.
    xtx_inv = np.linalg.inv(X.T @ X)
    leverage = np.einsum('ij,jk,ik->i', X, xtx_inv, X)

    fixed_scale = None
    if sigma is not None:
        fixed_scale = np.broadcast_to(np.asarray(sigma, dtype=float), (n,))

    dffits = np.zeros(n)
    for i in range(n):
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)
        betas_exclude_i = np.linalg.inv(X_exclude_i.T @ X_exclude_i) @ (X_exclude_i.T @ y_exclude_i)
        y_hat_new_i = X[i] @ betas_exclude_i

        if fixed_scale is None:
            # Residual standard deviation of the fit without observation i
            resid_exclude_i = y_exclude_i - X_exclude_i @ betas_exclude_i
            sigma_i = np.sqrt(resid_exclude_i @ resid_exclude_i / (n - 1 - p))
        else:
            sigma_i = fixed_scale[i]

        dffits[i] = (y_hat[i] - y_hat_new_i) / (sigma_i * np.sqrt(leverage[i]))

    return dffits


In [None]:
def manual_leverage(X):
    """
    Compute the leverage values h_ii = x_i (X'X)^{-1} x_i' of a design matrix.

    Only the diagonal of the hat matrix is evaluated, so the full n x n matrix
    is never formed.

    :param X: Design matrix of shape (n, p); NumPy arrays and pandas objects
        are both accepted.
    :return: NumPy array of length n with the leverage of each observation.
    """
    # Work on a plain float array so pandas label alignment cannot interfere
    # with the matrix products.
    X = np.asarray(X, dtype=float)
    xtx_inv = np.linalg.inv(X.T @ X)
    leverage = np.einsum('ij,jk,ik->i', X, xtx_inv, X)
    return leverage


In [None]:
def manual_cooks_distances(X, y, y_hat, sigma):
    """
    Compute Cook's distances by refitting the model with each observation
    left out.

    D_i = sum_j (y_hat_j - y_hat_j(i))^2 / (p * sigma^2), where the sum runs
    over all n observations, y_hat_j(i) are the predictions of the fit without
    observation i and p is the number of columns of the design matrix.

    :param X: Design matrix of shape (n, p); NumPy arrays and pandas objects
        are both accepted.
    :param y: Response vector of length n.
    :param y_hat: Fitted values from the full-data model (length n).
    :param sigma: Residual standard deviation of the full-data model.
    :return: Array of length n with the Cook's distances.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()
    n, p = X.shape
    cooks_d = np.zeros(n)

    for i in range(n):
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)
        betas_exclude_i = np.linalg.inv(X_exclude_i.T @ X_exclude_i) @ (X_exclude_i.T @ y_exclude_i)

        # Predictions for ALL n observations from the leave-one-out fit, so
        # they can be compared with the full-data fitted values.
        y_hat_new = X @ betas_exclude_i

        cooks_d[i] = np.sum((y_hat - y_hat_new) ** 2) / (p * sigma**2)

    return cooks_d


In [None]:
# Set the display option to show all rows (or a specified large number)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
def create_influence_dataframe_with_violations(model):
    """
    Build the influence table of a fitted model and mark suspicious rows.

    Columns: DFFITS, Leverage, Covariance Ratio, Cook's Distance, one
    DFBETA_<k> per coefficient, and 'Violations', a comma-separated list of
    the statistics that exceed their rule-of-thumb cut-off for the row (empty
    string when none does).

    Note: this cell redefines the function of the same name that already
    appears in an earlier cell of the notebook.
    """
    diagnostics = model.get_influence()
    coef_changes = diagnostics.dfbetas
    coef_columns = [f'DFBETA_{j}' for j in range(coef_changes.shape[1])]

    # Rule-of-thumb cut-offs
    n = model.nobs
    p = model.df_model
    cutoffs = {
        'DFFITS': 2 * np.sqrt((p + 1) / n),
        'Leverage': 2 * (p + 1) / n,
        'Cook\'s Distance': 4 / n,
    }
    dfbetas_cutoff = 2 / np.sqrt(n)

    frame = pd.DataFrame({
        'DFFITS': diagnostics.dffits[0],
        'Leverage': diagnostics.hat_matrix_diag,
        'Covariance Ratio': diagnostics.cov_ratio,
        'Cook\'s Distance': diagnostics.cooks_distance[0],
    })
    for j, column in enumerate(coef_columns):
        frame[column] = coef_changes[:, j]

    def _violated(row):
        # Scalar measures first (DFFITS by absolute value), then the DFBETAS
        exceeded = (
            ('DFFITS', abs(row['DFFITS']) > cutoffs['DFFITS']),
            ('Leverage', row['Leverage'] > cutoffs['Leverage']),
            ('Cook\'s Distance', row['Cook\'s Distance'] > cutoffs['Cook\'s Distance']),
        )
        names = [name for name, over in exceeded if over]
        names += [column for column in coef_columns if abs(row[column]) > dfbetas_cutoff]
        return ', '.join(names)

    frame['Violations'] = [_violated(row) for _, row in frame.iterrows()]

    return frame

# Recompute the influence table with flagged violations for the fitted model
influence_df_with_violations = create_influence_dataframe_with_violations(model)



In [None]:
# Display the full influence table as the cell output
influence_df_with_violations


In [None]:
# def compare_influence_measures(model, X, y):
#     """
#     Compare manually calculated influence measures with those from statsmodels' built-in functions.

#     :param model: The fitted regression model.
#     :param X: Design matrix (predictor variables).
#     :param y: Response variable.
#     :return: DataFrame comparing manual and built-in influence measures.
#     """
#     # Manually calculate influence measures
#     y_hat = model.predict(X)
#     sigma = np.sqrt(np.sum((y - y_hat) ** 2) / (len(y) - X.shape[1] - 1))
#     betas = np.linalg.lstsq(X, y, rcond=None)[0]

#     manual_dfbetas = manual_dfbetas(X, y, betas, sigma)
#     manual_dffits = manual_dffits(X, y, y_hat, sigma)
#     manual_leverage = manual_leverage(X)
#     manual_cooks_d = manual_cooks_distances(X, y, y_hat, sigma)

#     # Create DataFrame for manual calculations
#     manual_df = pd.DataFrame({
#         'Manual_DFBETAS': np.max(np.abs(manual_dfbetas), axis=1),
#         'Manual_DFFITS': np.abs(manual_dffits),
#         'Manual_Leverage': manual_leverage,
#         'Manual_Cooks_Distance': manual_cooks_d
#     })

#     # Use built-in functions to get influence measures
#     built_in_df = create_influence_dataframe_with_violations(model)

#     # Combine the DataFrames for comparison
#     comparison_df = pd.concat([manual_df, built_in_df], axis=1)

#     return comparison_df

# comparison_df = compare_influence_measures(model, X, Y)
# print(comparison_df.head())


In [None]:
# Adding a good outlying point to predictors
outlier = pd.DataFrame({'X1': [max(data0['X1']) + 25],
                        'X2': [max(data0['X2']) + 35],
                        'X3': [max(data0['X3']) + 25]})
X_with_outlier = pd.concat([data0[['X1', 'X2', 'X3']], outlier], ignore_index=True)

# Recalculating Y with the new outlying point
# Create the design matrix for the model including the intercept
X_design = sm.add_constant(X_with_outlier)
# Calculate Y values including the outlier
Y_with_outlier = np.dot(X_design, beta0).flatten() + np.append(e, np.random.normal(0, 4))

# Simple Regression - only X2 as independent variable
plt.figure(figsize=(10, 8))
plt.scatter(X_with_outlier['X2'], Y_with_outlier)
plt.xlabel('X2')
plt.ylabel('Y')
plt.title('Simple Regression with at least one influential point')
plt.show()
