<a href="https://colab.research.google.com/github/francji1/01RAD/blob/main/code/01RAD_Ex08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression: Diagnostics and Influence Measures

In this lecture, we explore:
1. **Data Generation**: Simulating a dataset with multiple predictors and a response variable.
2. **Regression Modeling**: Fitting a linear regression model using ordinary least squares (OLS).
3. **Visualization**: Scatterplots for data exploration and regression diagnostic plots.
4. **Diagnostics**: Examining residuals, leverage, and influence measures to evaluate model assumptions and detect outliers or influential observations.

The goal is to understand how linear regression assumptions can be validated and how to identify problematic data points.


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import plot_ccpr_grid, plot_partregress_grid



In [None]:
# Set a random seed for reproducibility
# (all draws below come from NumPy's global generator, so their order matters)
np.random.seed(4242)


# Generate data for regression
# Sample size and predictors
n = 100
p = 4  # presumably the number of design columns X0..X3; it matches the length of beta0

# Error term
# Gaussian noise with mean 0 and standard deviation 4, one value per observation
e = np.random.normal(0, 4, n)

# Coefficients
# Column vector ordered like the design columns: X0 (constant), X1, X2, X3
beta0 = np.array([5, 3, 2, -5]).reshape(4, 1)

# Predictors
X0 = np.ones(n)  # constant column for the intercept
X1 = np.random.normal(20, 3, n)
X2 = 10 + np.random.exponential(1 / 0.1, n)  # exponential with scale 1/0.1 = 10, shifted by 10
X3 = 5 + np.random.binomial(15, 0.2, n)  # binomial(15, 0.2) counts, shifted by 5

# Response variable Y
# Linear predictor (n x 1) flattened to length n, plus the noise term
Y = np.dot(np.column_stack((X0, X1, X2, X3)), beta0).flatten() + e

# Create DataFrame
data0 = pd.DataFrame({'X0': X0, 'X1': X1, 'X2': X2, 'X3': X3, 'Y': Y})

# Selecting the variables
# X holds the three predictors without the constant; Y is rebound to the DataFrame column
X = data0[['X1', 'X2', 'X3']]
Y = data0['Y']

# Display basic information about the data
print(data0.head())
print(data0.describe())


## Scatterplots for Data Exploration

Scatterplots help visualize relationships between predictors and the response variable (`Y`). These plots provide an initial understanding of the data and potential linear relationships.


In [None]:
# Pairplot for exploratory data visualization
# Pairwise scatterplots of X1, X2, X3 and Y; diag_kind='kde' asks for density
# estimates on the diagonal. The constant column X0 is left out via `vars`.
sns.pairplot(data0, vars=['X1', 'X2', 'X3', 'Y'], diag_kind='kde')
plt.show()

## Fitting a Linear Regression Model

The dataset includes three predictors (`X1`, `X2`, `X3`) and one response variable (`Y`). We'll fit an ordinary least squares (OLS) regression model to examine the relationship between the predictors and the response.


In [None]:
# Fit the regression model
# Formula interface: Y is regressed on X1, X2 and X3 using the columns of data0.
# `model` is the fitted results object used by all the diagnostic cells below.
model = smf.ols('Y ~ X1 + X2 + X3', data=data0).fit()

# Alternative using the array interface (kept commented out for reference):
#X = sm.add_constant(X)  # Add an intercept term to the predictor variables
#model = sm.OLS(Y, X).fit()

# Displaying the summary of the model
print(model.summary())

## Diagnostic Plots

Regression diagnostic plots provide insights into:
- **Residual Behavior**: Check for non-linearity, heteroscedasticity, and outliers.
- **Normality**: Evaluate the distribution of residuals.
- **Influence and Leverage**: Identify observations that disproportionately affect the model.

The key plots include:
1. Fitted Values vs. Residuals
2. Scale-Location Plot (Spread-Location plot)
3. Normal Q-Q Plot
4. Component-Residual (Partial Residual) Plots
5. Added Variable (Partial Regression) Plots


In [None]:
def plot_regression_diagnostics(model):
    """
    Generate diagnostic plots for a regression model with LOWESS smoothing.

    Panels, in order:
      1. raw residuals against fitted values (with a LOWESS curve),
      2. scale-location plot: sqrt(|internally studentized residuals|)
         against fitted values (with a LOWESS curve),
      3. normal Q-Q plot of the raw residuals,
      4+. one scatter of raw residuals against each design-matrix column
         after the first.

    The grid has three columns and as many rows as needed, so models with
    more than three regressors are supported.

    :param model: The fitted regression model object from statsmodels.
    :return: A matplotlib figure object containing the diagnostic plots.
    """
    standardized_residuals = model.get_influence().resid_studentized_internal
    fitted_values = model.fittedvalues
    scale_residuals = np.sqrt(np.abs(standardized_residuals))

    # The first design-matrix column is skipped (assumed to be the intercept,
    # as it is for formula-based fits such as 'Y ~ X1 + X2 + X3').
    regressor_names = model.model.exog_names[1:]

    n_cols = 3
    n_panels = 3 + len(regressor_names)
    n_rows = -(-n_panels // n_cols)  # ceiling division
    # 4 inches of height per row keeps the original 15x8 size for two rows
    fig = plt.figure(figsize=(15, 4 * n_rows))

    # Plot of Fitted Values vs Residuals with LOWESS smoothing
    plt.subplot(n_rows, n_cols, 1)
    sns.regplot(
        x=fitted_values,
        y=model.resid,
        lowess=True,
        scatter_kws={'alpha': 0.7},
        line_kws={'color': 'red'}
    )
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Fitted Values vs Residuals')

    # Scale-Location plot with LOWESS smoothing
    plt.subplot(n_rows, n_cols, 2)
    sns.regplot(
        x=fitted_values,
        y=scale_residuals,
        lowess=True,
        scatter_kws={'alpha': 0.7},
        line_kws={'color': 'red'}
    )
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Fitted Values')
    plt.ylabel(r'$\sqrt{|Standardized\ Residuals|}$')  # LaTeX format
    plt.title('Scale-Location')

    # Normal Q-Q plot
    plt.subplot(n_rows, n_cols, 3)
    sm.qqplot(model.resid, line='s', ax=plt.gca())
    plt.title('Normal Q-Q')

    # Regressor vs Residuals, one panel per regressor.
    # `position` is both the column index in exog and the offset after the
    # three fixed panels.
    for position, col in enumerate(regressor_names, start=1):
        plt.subplot(n_rows, n_cols, 3 + position)
        plt.scatter(model.model.exog[:, position], model.resid, alpha=0.7)
        plt.axhline(0, color='red', linestyle='--')
        plt.xlabel(col)
        plt.ylabel('Residuals')
        plt.title(f'Regressor vs Residuals: {col}')

    plt.tight_layout()
    return fig


In [None]:
# Draw the diagnostic panels for the fitted model and render the figure
fig = plot_regression_diagnostics(model)
plt.show()

## Component-Residual (Partial Residual) Plots

Component-Residual Plots (Partial Residual Plots) are a useful diagnostic tool in regression analysis. They help to visualize the relationship between a predictor and the response variable while accounting for the effect of other predictors in the model.

- Helps determine if a transformation of a predictor is necessary.
- A **linear pattern** in the plot suggests that the relationship between the predictor and response is well-modeled.
- Deviations from linearity (e.g., curvature) may indicate that the predictor's relationship with the response is non-linear.
- **Outliers or influential points** may appear as points far away from the general pattern.



In [None]:
from statsmodels.graphics.regressionplots import plot_ccpr_grid

def plot_component_residuals(model):
    """
    Draw the grid of Component-Residual (Partial Residual) plots for a model.

    A 12x8 inch figure is created, handed to ``plot_ccpr_grid`` to be filled,
    and tightened with ``plt.tight_layout()`` before being returned.

    :param model: The fitted regression model object from statsmodels.
    :return: The matplotlib figure holding the Component-Residual Plots.
    """
    ccpr_figure = plt.figure(figsize=(12, 8))
    plot_ccpr_grid(model, fig=ccpr_figure)
    plt.tight_layout()
    return ccpr_figure


In [None]:
# Build the Component-Residual plot grid for the fitted model and render it
fig1 = plot_component_residuals(model)
plt.show()

## Added Variable Plots
Added Variable Plots are a useful diagnostic tool in regression. They help to visualize the contribution of each predictor variable to the response variable after accounting for other predictors.

### Key Points:
- **Purpose**: Show the partial relationship between a predictor and the response.
- **Usage**: Identify whether a variable has a significant relationship with the response after adjusting for others.
- **Interpretation**:
  - A strong linear pattern indicates a significant relationship.
  - Outliers or curvature may indicate a poor model fit or influential points.


In [None]:
from statsmodels.graphics.regressionplots import plot_partregress_grid

def plot_added_variable(model):
    """
    Draw the grid of Added Variable (Partial Regression) plots for a model.

    A 12x8 inch figure is created, handed to ``plot_partregress_grid`` to be
    filled, and tightened with ``plt.tight_layout()`` before being returned.

    :param model: The fitted regression model object from statsmodels.
    :return: The matplotlib figure holding the Added Variable Plots.
    """
    partregress_figure = plt.figure(figsize=(12, 8))
    plot_partregress_grid(model, fig=partregress_figure)
    plt.tight_layout()
    return partregress_figure


In [None]:
# Build the Added Variable plot grid for the fitted model and render it
fig2 = plot_added_variable(model)
plt.show()

### Types of Residuals in Linear Regression (Recap from Ex4)

$$
Y_i = X_i \beta + e_i, \ \text{where} \ e_i \sim N(0, \sigma^2)
$$

Residuals measure the difference between observed and predicted values.

#### 1. Raw Residuals

The raw residuals are simply the differences between each observed value $ Y_i $ and its corresponding predicted value $\hat{Y}_i $:
$$
\hat{e}_i = Y_i - \hat{Y}_i
$$

#### 2. Internally Studentized Residuals (unknown sigma)

Internally studentized residuals adjust each residual to account for the leverage $ h_{ii} $ of each observation.

$$
\hat{r_i} = \frac{\hat{e}_i}{s \sqrt{1 - h_{ii}}}
$$

where $s^2 = \hat{\sigma}^2 = \frac{1}{n - p}\sum_{j=1}^n \hat{e}_j^2 $ is the variance estimate from OLS, using all $n$ observations; here $p$ is the number of estimated coefficients (including the intercept).


Studentized residuals better reflect the influence of each observation on the fit by normalizing based on individual variances. However, internally studentized residuals do not fully capture how the fit would change if the observation were removed from the model.

#### 3. Externally Studentized Residuals

Externally Studentized Residuals $\hat{r}_{(-i)}$
 - taking the PRESS residuals, or leave-one-out residuals (the residuals when each observation is left out of the model fit), $\hat{e}_{(-i)} = \frac{\hat{e}_i}{1 - h_{ii}}$, and dividing by an estimate of their standard deviation, $\frac{s_{(-i)}}{\sqrt{1 - h_{ii}}}$, which gives
$$
\hat{r}_{(-i)} = \frac{\hat{e}_i}{s_{(-i)} \sqrt{1 - h_{ii}}}
$$
where
$$
s_{(-i)} = \sqrt{\frac{(n - p)s^2 - \frac{\hat{e}_i^2}{1 - h_{ii}}}{n - p - 1}}
$$
is the residual standard deviation estimated without the $i$-th observation.






In [None]:
# Extract model details
influence = model.get_influence()  # Built once and reused for every influence quantity below
residuals = model.resid  # Classical residuals
h_ii = influence.hat_matrix_diag  # Leverage values (h_ii)
n = int(model.nobs)  # Number of observations (ensure it's an integer)
p = int(model.df_model)  # Number of predictors (the intercept is not counted)
mse = model.mse_resid  # Mean squared error (s^2)

# 2. Internal Studentized Residuals (matches resid_studentized in statsmodels)
# With p predictors plus an intercept, the residual degrees of freedom are n - p - 1.
s_squared = np.sum(residuals**2) / (n - p - 1)  # OLS variance estimate
studentized_residuals_internal = residuals / np.sqrt(s_squared * (1 - h_ii))

# 3. External Studentized Residuals (matches resid_studentized_external in statsmodels)
# Vectorized over observations; plain arrays are used so that elements are
# matched by position rather than by the index labels of `residuals`.
resid_values = np.asarray(residuals)

# Leave-one-out standard deviation (s_{(-i)}):
# SSE without observation i is SSE - e_i^2 / (1 - h_ii), on one fewer degree of freedom.
s_minus_i = np.sqrt(((n - p - 1) * mse - resid_values**2 / (1 - h_ii)) / (n - p - 2))

# Externally studentized residual
studentized_residuals_external = resid_values / (s_minus_i * np.sqrt(1 - h_ii))

# Residuals from statsmodels for comparison
model_studentized_residuals_internal = influence.resid_studentized  # Internal
model_studentized_residuals_external = influence.resid_studentized_external  # External

# Create a DataFrame for comparison
residuals_df = pd.DataFrame({
    'Classical Residuals (StatsModels)': residuals,
    'Studentized Residuals (Internal - Hand)': studentized_residuals_internal,
    'Studentized Residuals (External - Hand)': studentized_residuals_external,
    'Studentized Residuals (Internal - StatsModels)': model_studentized_residuals_internal,
    'Studentized Residuals (External - StatsModels)': model_studentized_residuals_external
})

# Display the first few rows
residuals_df.head()


## Influence Measures in Linear Regression

In linear regression, influence measures are used to identify observations that have a disproportionate impact on the model. These measures help in diagnosing the model's robustness and identifying outliers or influential points. Below are key influence measures commonly used:

### 1. DFBETAS
DFBETAS measures the difference in each coefficient estimate when an observation is omitted.

**Equation:**
$$
DFBETAS_{ij} = \frac{\hat{\beta}_j - \hat{\beta}_{j(i)}}{\sqrt{\hat{\sigma}^2_{(i)} (X^T X)^{-1}_{jj}}}
$$
where $ \hat{\beta}_j $ is the estimated coefficient, $ \hat{\beta}_{j(i)} $ is the estimated coefficient with the $i$-th observation omitted, $ \hat{\sigma}^2_{(i)} $ is the residual variance estimated without the $i$-th observation, and $(X^T X)^{-1}_{jj} $ is the $j$-th diagonal element of the inverse of $X^T X $.

### 2. DFFITS
DFFITS is an influence statistic that measures the effect of deleting a single observation.

**Equation:**
$$
DFFITS_i = \frac{\hat{y}_i - \hat{y}_{i(i)}}{\hat{\sigma}_{(i)} \sqrt{h_{ii}}}
$$
where $ \hat{y}_i $ is the predicted value with all observations, $ \hat{y}_{i(i)} $ is the predicted value with the $i$-th observation omitted, $ \hat{\sigma}_{(i)} $ is the residual standard deviation estimated without the $i$-th observation, and $ h_{ii} $ is the leverage of the $i$-th observation.

### 3. Leverage Values (h values)
Leverage values measure the influence of each observation on its own fitted value. High leverage points can significantly alter the position of the regression line.

**Equation:**
$$
h_{ii} = X_i (X^T X)^{-1} X_i^T
$$
where $X_i$ is the $i$-th row of the matrix of predictors $X$.

### 4. Covariance Ratios
Covariance ratios compare the determinants of the covariance matrices with and without each observation. They help identify observations that influence the variance of the parameter estimates.

**Equation:**
$$
CR_i = \frac{\det(\hat{\Sigma}_{(i)})}{\det(\hat{\Sigma})}
$$
where $ \hat{\Sigma}_{(i)} $ is the covariance matrix with the $i$-th observation omitted and $ \hat{\Sigma} $ is the covariance matrix with all observations.

### 5. Cook's Distances
Cook's distance measures the effect of deleting a single observation on the entire regression model. It is a commonly used metric to identify influential observations.

**Equation:**
$$
D_i = \frac{\sum_{j=1}^n (\hat{y}_j - \hat{y}_{j(i)})^2}{p \hat{\sigma}^2}
$$
where $ \hat{y}_j $ is the predicted value for the $j$-th observation, $ \hat{y}_{j(i)} $ is the predicted value with the $i$-th observation omitted, $p$ is the number of estimated regression coefficients (the predictors plus the intercept), and $ \hat{\sigma}^2 $ is the estimated variance of the residuals.



## Rules of Thumb for Influence Measures

### 1. DFBETAS

**Rule of Thumb:** An observation is considered influential if the absolute value of DFBETAS for any coefficient exceeds $ \frac{2}{\sqrt{n}} $, where $ n $ is the number of observations.

### 2. DFFITS

**Rule of Thumb:** An observation is considered influential if the absolute value of DFFITS is larger than $ 2 \sqrt{\frac{p+1}{n}} $, where $p$ is the number of predictors (so $p+1$ also counts the intercept) and $n$ is the number of observations.

### 3. Leverage Values (h values)

**Rule of Thumb:** An observation is considered to have high leverage if its leverage value exceeds $ \frac{2(p+1)}{n} $, where $p$ is the number of predictors (so $p+1$ also counts the intercept) and $n$ is the number of observations.

### 4. Covariance Ratios

**Rule of Thumb:** There is no widely accepted rule of thumb for covariance ratios, but observations with values far from 1 (either much larger or smaller) are generally considered influential.

### 5. Cook's Distances

**Rule of Thumb:** An observation is considered influential if its Cook's distance $D_i > \frac{4}{n} $, where $n$ is the number of observations.


In [None]:
# Old version of influence measures data frame
def create_influence_dataframe(model):
    """
    Collect the influence measures of a fitted model into one DataFrame.

    :param model: Fitted regression model exposing ``get_influence()``.
    :return: DataFrame with the columns DFFITS, Leverage, Covariance Ratio and
             Cook's Distance, followed by one ``DFBETA_<k>`` column per column
             of the DFBETAS matrix.
    """
    infl = model.get_influence()

    # dffits and cooks_distance are tuples; only their first element is tabulated
    columns = {
        'DFFITS': infl.dffits[0],
        'Leverage': infl.hat_matrix_diag,
        'Covariance Ratio': infl.cov_ratio,
        'Cook\'s Distance': infl.cooks_distance[0],
    }

    # One column per coefficient, in the order of the DFBETAS matrix columns
    beta_shifts = infl.dfbetas
    for position in range(beta_shifts.shape[1]):
        columns[f'DFBETA_{position}'] = beta_shifts[:, position]

    return pd.DataFrame(columns)

# Tabulate the influence measures of the fitted model; the bare name on the
# last line makes the notebook display the table.
influence_df = create_influence_dataframe(model)
influence_df


In [None]:
def summarize_influence_measures(model):
    """
    Tabulate influence measures and mark rule-of-thumb violations.

    Cut-offs (n = ``model.nobs``, p = ``model.df_model``):
    leverage > 2(p+1)/n, Cook's distance > 4/n, |DFFITS| > 2*sqrt((p+1)/n),
    |DFBETAS| > 2/sqrt(n) for each DFBETAS column.

    :param model: Fitted regression model object from statsmodels.
    :return: DataFrame with the measures followed by one boolean column per rule.
    """
    infl = model.get_influence()
    hat_diag = infl.hat_matrix_diag
    cooks_d = infl.cooks_distance[0]
    dffits_values = infl.dffits[0]
    beta_shifts = infl.dfbetas

    n_obs = int(model.nobs)
    n_pred = int(model.df_model)

    # Measures first, in the same column order as the flags that follow
    summary_df = pd.DataFrame({
        'Leverage': hat_diag,
        'Cook\'s Distance': cooks_d,
        'DFFITS': dffits_values,
        'Covariance Ratio': infl.cov_ratio,
    })

    # Boolean flags, one rule of thumb per column
    summary_df['High Leverage'] = hat_diag > 2 * (n_pred + 1) / n_obs
    summary_df['High Cook\'s Distance'] = cooks_d > 4 / n_obs
    summary_df['High DFFITS'] = np.abs(dffits_values) > 2 * np.sqrt((n_pred + 1) / n_obs)

    beta_cutoff = 2 / np.sqrt(n_obs)
    for position in range(beta_shifts.shape[1]):
        label = f'High DFBETAS (Predictor {position})'
        summary_df[label] = np.abs(beta_shifts[:, position]) > beta_cutoff

    return summary_df

# Summarize the influence measures and rule-of-thumb flags for the fitted
# model; the bare name on the last line makes the notebook display the table.
summary = summarize_influence_measures(model)
summary


In [None]:
def summarize_influence_measures_with_data(model, data):
    """
    Tabulate influence measures next to the data and extract flagged rows.

    Cut-offs (n = ``model.nobs``, p = ``model.df_model``):
    leverage > 2(p+1)/n, Cook's distance > 4/n, |DFFITS| > 2*sqrt((p+1)/n),
    |DFBETAS| > 2/sqrt(n) for each DFBETAS column.

    :param model: Fitted regression model object from statsmodels.
    :param data: DataFrame used to fit the regression model.
    :return: Tuple ``(summary_with_data, flagged_observations)``: the data
             columns (index reset) followed by measures and flags, and the
             subset of rows where at least one flag is True.
    """
    infl = model.get_influence()
    hat_diag = infl.hat_matrix_diag
    cooks_d = infl.cooks_distance[0]
    dffits_values = infl.dffits[0]
    beta_shifts = infl.dfbetas

    n_obs = int(model.nobs)
    n_pred = int(model.df_model)

    # Four measure columns come first; the flag columns follow them
    summary_df = pd.DataFrame({
        'Leverage': hat_diag,
        'Cook\'s Distance': cooks_d,
        'DFFITS': dffits_values,
        'Covariance Ratio': infl.cov_ratio,
    })
    n_measure_columns = 4

    summary_df['High Leverage'] = hat_diag > 2 * (n_pred + 1) / n_obs
    summary_df['High Cook\'s Distance'] = cooks_d > 4 / n_obs
    summary_df['High DFFITS'] = np.abs(dffits_values) > 2 * np.sqrt((n_pred + 1) / n_obs)

    beta_cutoff = 2 / np.sqrt(n_obs)
    for position in range(beta_shifts.shape[1]):
        label = f'High DFBETAS (Predictor {position})'
        summary_df[label] = np.abs(beta_shifts[:, position]) > beta_cutoff

    # Data columns on the left, measures and flags on the right
    summary_with_data = pd.concat([data.reset_index(drop=True), summary_df], axis=1)

    # Everything after the data columns and the four measures is a flag column
    first_flag_position = len(data.columns) + n_measure_columns
    any_flag = summary_with_data.iloc[:, first_flag_position:].any(axis=1)
    flagged_observations = summary_with_data.loc[any_flag]
    return summary_with_data, flagged_observations

# Example usage:
# The first returned table holds every observation with its measures and flags,
# the second only the rows where at least one flag is True; the bare name on
# the last line makes the notebook display the flagged rows.
all_observations_with_im, flagged_observations = summarize_influence_measures_with_data(model, data0)
flagged_observations


In [None]:
def manual_leverage(model):
    """
    Compute the leverage values h_ii = x_i (X^T X)^{-1} x_i^T by hand.

    Only the diagonal of the hat matrix is evaluated, so no n x n matrix is
    built.

    :param model: Fitted model whose design matrix is ``model.model.exog``.
    :return: 1-D array with one leverage value per observation.
    """
    X = np.asarray(model.model.exog, dtype=float)  # Extract design matrix

    # Solve (X^T X) B = X^T rather than forming an explicit inverse
    solved = np.linalg.solve(X.T @ X, X.T)  # shape (p, n)

    # h_ii = sum_j X[i, j] * solved[j, i], i.e. diag(X @ solved)
    return np.einsum('ij,ji->i', X, solved)


In [None]:
def manual_dfbetas(model):
    """
    Compute DFBETAS by refitting the model with each observation left out.

    DFBETAS_ij = (beta_j - beta_j(i)) / sqrt(sigma_(i)^2 * [(X^T X)^{-1}]_jj),
    where beta_(i) and sigma_(i) come from the fit without observation i.

    :param model: Fitted model exposing ``model.model.exog``,
                  ``model.model.endog`` and ``model.params``.
    :return: Array of shape (n, p): one row per observation, one column per
             column of the design matrix.
    """
    X = np.asarray(model.model.exog, dtype=float)  # Design matrix
    y = np.asarray(model.model.endog, dtype=float)  # Response variable
    # Plain array so coefficients are addressed by position even when
    # model.params is a label-indexed Series
    betas = np.asarray(model.params, dtype=float)
    n, p = X.shape

    # The scaling term from the full design does not depend on i
    xtx_inv_diag_sqrt = np.sqrt(np.diag(np.linalg.inv(X.T @ X)))

    dfbetas = np.zeros((n, p))
    for i in range(n):
        # Leave-one-out X and y
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)

        # Recompute betas excluding observation i
        betas_exclude_i = np.linalg.solve(
            X_exclude_i.T @ X_exclude_i, X_exclude_i.T @ y_exclude_i
        )

        # Leave-one-out residual standard deviation: n - 1 observations, p coefficients
        resid_exclude_i = y_exclude_i - X_exclude_i @ betas_exclude_i
        sigma_exclude_i = np.sqrt(resid_exclude_i @ resid_exclude_i / (n - 1 - p))

        # DFBETAS for all coefficients of observation i at once
        dfbetas[i] = (betas - betas_exclude_i) / (sigma_exclude_i * xtx_inv_diag_sqrt)

    return dfbetas


In [None]:
def manual_dffits(model):
    """
    Compute DFFITS by refitting the model with each observation left out.

    DFFITS_i = (y_hat_i - y_hat_i(i)) / (sigma_(i) * sqrt(h_ii)), where
    y_hat_i(i) and sigma_(i) come from the fit without observation i. Here
    sigma_(i) is taken directly from the residuals of that refit.

    :param model: Fitted model exposing ``model.model.exog``,
                  ``model.model.endog`` and ``model.fittedvalues``.
    :return: 1-D array with one DFFITS value per observation.
    """
    # Plain arrays so observations are addressed by position, not index label
    X = np.asarray(model.model.exog, dtype=float)  # Design matrix
    y = np.asarray(model.model.endog, dtype=float)  # Response variable
    y_hat = np.asarray(model.fittedvalues, dtype=float)  # Fitted values
    n, p = X.shape

    # (X^T X)^{-1} of the full design is needed for every leverage value
    xtx_inv = np.linalg.inv(X.T @ X)

    dffits = np.zeros(n)
    for i in range(n):
        # Leave-one-out X and y
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)

        # Recompute predicted y for observation i
        betas_exclude_i = np.linalg.solve(
            X_exclude_i.T @ X_exclude_i, X_exclude_i.T @ y_exclude_i
        )
        y_hat_new_i = X[i] @ betas_exclude_i

        # Leave-one-out residual standard deviation: n - 1 observations, p coefficients
        resid_exclude_i = y_exclude_i - X_exclude_i @ betas_exclude_i
        sigma_exclude_i = np.sqrt(resid_exclude_i @ resid_exclude_i / (n - 1 - p))

        # Compute DFFITS
        h_ii = X[i] @ xtx_inv @ X[i]  # Leverage for observation i
        dffits[i] = (y_hat[i] - y_hat_new_i) / (sigma_exclude_i * np.sqrt(h_ii))

    return dffits


In [None]:
def manual_dffits(model):
    """
    Compute DFFITS using the closed-form leave-one-out variance.

    DFFITS_i = (y_hat_i - y_hat_i(i)) / (sigma_(i) * sqrt(h_ii)). The
    prediction y_hat_i(i) comes from refitting without observation i, while
    sigma_(i)^2 is obtained from the full-fit residuals as
    (SSE - e_i^2 / (1 - h_ii)) / (n - p - 1), with p the number of columns of
    the design matrix.

    :param model: Fitted model exposing ``model.model.exog``,
                  ``model.model.endog``, ``model.fittedvalues`` and
                  ``model.resid``.
    :return: 1-D array with one DFFITS value per observation.
    """
    # Plain arrays so observations are addressed by position, not index label
    X = np.asarray(model.model.exog, dtype=float)  # Design matrix
    y = np.asarray(model.model.endog, dtype=float)  # Response variable
    y_hat = np.asarray(model.fittedvalues, dtype=float)  # Fitted values
    residuals = np.asarray(model.resid, dtype=float)  # Residuals
    n, p = X.shape

    # Quantities that do not depend on the left-out observation
    xtx_inv = np.linalg.inv(X.T @ X)
    sse = np.sum(residuals**2)  # Sum of squared residuals

    dffits = np.zeros(n)
    for i in range(n):
        # Leave-one-out X and y
        X_exclude_i = np.delete(X, i, axis=0)
        y_exclude_i = np.delete(y, i)

        # Recompute betas and predicted y for observation i
        betas_exclude_i = np.linalg.solve(
            X_exclude_i.T @ X_exclude_i, X_exclude_i.T @ y_exclude_i
        )
        y_hat_new_i = X[i] @ betas_exclude_i

        # Compute leverage for observation i
        h_ii = X[i] @ xtx_inv @ X[i]

        # sigma^2_{(-i)}: the refit uses n - 1 observations and p coefficients,
        # hence n - p - 1 degrees of freedom (p already counts the intercept column)
        sigma_sq_minus_i = (sse - (residuals[i]**2 / (1 - h_ii))) / (n - p - 1)
        sigma_minus_i = np.sqrt(sigma_sq_minus_i)

        # Compute DFFITS using sigma_{(-i)} and leverage
        dffits[i] = (y_hat[i] - y_hat_new_i) / (sigma_minus_i * np.sqrt(h_ii))

    return dffits


In [None]:
def manual_cooks_distances(model):
    """
    Compute Cook's distances by refitting with each observation left out.

    D_i = sum_j (y_hat_j - y_hat_j(i))^2 / (p * sigma^2), where p is the
    number of columns of the design matrix and sigma^2 is the residual mean
    square of the full fit.

    :param model: Fitted model exposing ``model.model.exog``,
                  ``model.model.endog``, ``model.fittedvalues`` and
                  ``model.mse_resid``.
    :return: 1-D array with one Cook's distance per observation.
    """
    design = model.model.exog
    response = model.model.endog
    full_fit = model.fittedvalues
    resid_sd = np.sqrt(model.mse_resid)
    n_obs, n_params = design.shape

    # Common denominator p * sigma^2
    scale = n_params * resid_sd**2

    def _squared_shift(dropped):
        """Sum of squared changes in all fitted values when row `dropped` is removed."""
        design_loo = np.delete(design, dropped, axis=0)
        response_loo = np.delete(response, dropped)
        coef_loo = np.linalg.inv(design_loo.T @ design_loo) @ (design_loo.T @ response_loo)
        refit_all = design @ coef_loo
        return np.sum((full_fit - refit_all) ** 2)

    distances = np.zeros(n_obs)
    for obs in range(n_obs):
        distances[obs] = _squared_shift(obs) / scale

    return distances


In [None]:
# Compute manual influence measures
leverage = manual_leverage(model)
dfbetas = manual_dfbetas(model)
dffits = manual_dffits(model)
cooks_distances = manual_cooks_distances(model)

# Compare with statsmodels
influence = model.get_influence()
statsmodels_leverage = influence.hat_matrix_diag
statsmodels_dfbetas = influence.dfbetas
statsmodels_dffits = influence.dffits[0]
statsmodels_cooks = influence.cooks_distance[0]

# Print comparisons
# The absolute tolerance is kept far below the size of the statistics being
# compared (leverage and Cook's distance are often of order 1e-2), so a True
# here means agreement up to floating-point error rather than "roughly similar".
print("Manual Leverage vs Statsmodels Leverage:")
print(np.allclose(leverage, statsmodels_leverage, atol=1e-06))

print("Manual DFBETAS vs Statsmodels DFBETAS:")
print(np.allclose(dfbetas, statsmodels_dfbetas, atol=1e-06))

print("Manual DFFITS vs Statsmodels DFFITS:")
print(np.allclose(dffits, statsmodels_dffits, atol=1e-06))

print("Manual Cook's Distance vs Statsmodels Cook's Distance:")
print(np.allclose(cooks_distances, statsmodels_cooks, atol=1e-06))

In [None]:
def summarize_manual_influence_measures(model):
    """
    Summarize the hand-computed influence measures and flag observations.

    Uses ``manual_leverage``, ``manual_dfbetas``, ``manual_dffits`` and
    ``manual_cooks_distances``. With n observations and k columns in the
    design matrix (the intercept column is already counted in k), the rules
    of thumb are: leverage > 2k/n, Cook's distance > 4/n,
    |DFFITS| > 2*sqrt(k/n), |DFBETAS| > 2/sqrt(n).

    :param model: Fitted regression model object from statsmodels.
    :return: Tuple ``(flagged_observations, summary_with_data)``: the rows
             with at least one flag set, and the full table of data columns,
             measures and flags.
    """
    X = model.model.exog  # Design matrix
    data = pd.DataFrame(model.model.data.frame)  # Original data as a DataFrame

    # Compute influence measures using manual functions
    leverage = manual_leverage(model)
    dfbetas = manual_dfbetas(model)
    dffits = manual_dffits(model)
    cooks_distances = manual_cooks_distances(model)

    # Number of observations and design-matrix columns.
    # p counts every column of X, so it already plays the role of
    # "predictors + 1" in the rules of thumb and must not be incremented again.
    n, p = X.shape

    # Rule-of-thumb thresholds
    leverage_threshold = 2 * p / n
    cooks_distance_threshold = 4 / n
    dffits_threshold = 2 * np.sqrt(p / n)
    dfbetas_threshold = 2 / np.sqrt(n)

    # Flag outliers based on thresholds
    flagged = {
        'High Leverage': leverage > leverage_threshold,
        'High Cook\'s Distance': cooks_distances > cooks_distance_threshold,
        'High DFFITS': np.abs(dffits) > dffits_threshold,
    }

    # Flag observations with high DFBETAS for any design-matrix column
    for j in range(p):
        flagged[f'High DFBETAS (Predictor {j})'] = np.abs(dfbetas[:, j]) > dfbetas_threshold

    # Create summary DataFrame
    summary_df = pd.DataFrame({
        'Leverage': leverage,
        'Cook\'s Distance': cooks_distances,
        'DFFITS': dffits,
    })

    # Add DFBETAS for each design-matrix column
    for j in range(p):
        summary_df[f'DFBETAS (Predictor {j})'] = dfbetas[:, j]

    # Add flags for rule-of-thumb violations
    for key, flag in flagged.items():
        summary_df[key] = flag

    # Combine with original data
    summary_with_data = pd.concat([data.reset_index(drop=True), summary_df], axis=1)

    # Select flagged observations (flag columns are the ones named 'High ...')
    flagged_columns = [col for col in summary_with_data.columns if col.startswith('High')]
    flagged_observations = summary_with_data.loc[summary_with_data[flagged_columns].any(axis=1)]

    return flagged_observations, summary_with_data


In [None]:
# Flag observations using the hand-computed measures; note the return order
# (flagged rows first, full table second). The bare name on the last line
# makes the notebook display the flagged rows.
flagged_observations_manual, summary_with_data_manual = summarize_manual_influence_measures(model)
flagged_observations_manual

In [None]:
# Flagged rows from the statsmodels-based summary, shown for comparison with
# the hand-computed version
flagged_observations

In [None]:
# Append one point lying far outside the observed predictor range:
# each predictor is set to its sample maximum plus a fixed shift.
predictor_columns = ['X1', 'X2', 'X3']
outlier_shifts = {'X1': 25, 'X2': 35, 'X3': 25}
outlier = pd.DataFrame(
    {column: [max(data0[column]) + outlier_shifts[column]] for column in predictor_columns}
)
X_with_outlier = pd.concat([data0[predictor_columns], outlier], ignore_index=True)

# Rebuild the response for all rows, including the new one:
# design matrix with an intercept column, the same coefficients, and the
# original noise extended by one fresh N(0, 4) draw for the added point.
X_design = sm.add_constant(X_with_outlier)
noise_with_outlier = np.append(e, np.random.normal(0, 4))
Y_with_outlier = np.dot(X_design, beta0).flatten() + noise_with_outlier

# Scatter of Y against X2 only, to make the added point visible
plt.figure(figsize=(10, 8))
plt.scatter(X_with_outlier['X2'], Y_with_outlier)
plt.xlabel('X2')
plt.ylabel('Y')
plt.title('Simple Regression with at least one influential point')
plt.show()


## Playground

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Step 1: Generate synthetic data
# NOTE: this cell rebinds the notebook globals n, X1, X2, X3, e, beta0,
# X_design, Y and data0 that were defined by earlier cells.
np.random.seed(42)  # For reproducibility
n = 100
# Three independent Gaussian predictors; the draw order matters for reproducibility
X1 = np.random.normal(10, 2, n)
X2 = np.random.normal(20, 5, n)
X3 = np.random.normal(30, 3, n)
e = np.random.normal(0, 4, n)  # Gaussian noise with standard deviation 4
beta0 = [5, 2, -1, 3]  # Intercept and slopes for X1, X2, X3
X_design = sm.add_constant(pd.DataFrame({'X1': X1, 'X2': X2, 'X3': X3}))  # Add intercept
Y = np.dot(X_design, beta0) + e  # linear predictor plus noise

# Unlike the first dataset, this frame has no constant column X0
data0 = pd.DataFrame({'X1': X1, 'X2': X2, 'X3': X3, 'Y': Y})

# Step 2: Function to add outliers or leverage points
def add_outliers(data, n_outliers=1, leverage=False, extreme_y=False):
    """
    Add outliers or high-leverage points to the data.

    The predictor part and the response part of each new row are chosen
    independently from the two flags:

    - ``leverage=True``: X1, X2, X3 are placed far above their observed maxima
      (max + U(20, 30), max + U(30, 40), max + U(20, 30)).
    - ``extreme_y=True``: Y is placed far above its observed maximum
      (max + U(20, 40)).
    - A part whose flag is False is drawn uniformly from the observed range of
      that column when the other flag is set.
    - With both flags False a general outlier is added, moderately extreme in
      both X and Y (X: max + U(10, 20) / U(15, 25) / U(10, 20); Y: max + U(10, 20)).

    With both flags True the new points therefore have extreme predictors and
    an extreme response. Ranges always refer to the original ``data``, not to
    rows added earlier. Random numbers come from NumPy's global generator and
    are drawn in the order X1, X2, X3, Y for every row.

    :param data: Original data as a DataFrame with columns X1, X2, X3 and Y.
    :param n_outliers: Number of outliers to add.
    :param leverage: Whether to add high-leverage points (extreme predictors).
    :param extreme_y: Whether to add extreme Y values.
    :return: Updated DataFrame with added outliers (a copy; ``data`` is not modified).
    """
    if n_outliers <= 0:
        return data.copy()

    predictors = ('X1', 'X2', 'X3')

    # Observed ranges are loop-invariant, so compute them once
    lowest = {col: data[col].min() for col in (*predictors, 'Y')}
    highest = {col: data[col].max() for col in (*predictors, 'Y')}

    # Offsets added to the column maximum; None means "stay inside the observed range"
    if leverage:
        x_offsets = {'X1': (20, 30), 'X2': (30, 40), 'X3': (20, 30)}
    elif extreme_y:
        x_offsets = None
    else:
        x_offsets = {'X1': (10, 20), 'X2': (15, 25), 'X3': (10, 20)}

    if extreme_y:
        y_offset = (20, 40)
    elif leverage:
        y_offset = None
    else:
        y_offset = (10, 20)

    rows = []
    for _ in range(n_outliers):
        row = {}
        for col in predictors:
            if x_offsets is None:
                row[col] = np.random.uniform(lowest[col], highest[col])
            else:
                row[col] = highest[col] + np.random.uniform(*x_offsets[col])
        if y_offset is None:
            row['Y'] = np.random.uniform(lowest['Y'], highest['Y'])
        else:
            row['Y'] = highest['Y'] + np.random.uniform(*y_offset)
        rows.append(row)

    # Single concatenation instead of growing the frame row by row
    return pd.concat([data, pd.DataFrame(rows)], ignore_index=True)


In [None]:
# Step 3: Add outliers or leverage points
# Three extra rows are appended to a copy of data0; data0 itself is unchanged.
data_with_outliers = add_outliers(data0, n_outliers=3, leverage=True,extreme_y=True)

# Step 4: Fit a regression model and calculate influence measures
# Array interface: the intercept column is added explicitly.
# NOTE: this rebinds the global `model` (previously the formula-based fit).
X_with_outliers = sm.add_constant(data_with_outliers[['X1', 'X2', 'X3']])
model = sm.OLS(data_with_outliers['Y'], X_with_outliers).fit()

In [None]:
# Step 5: Visualize scatter plot
plt.figure(figsize=(10, 8))
plt.scatter(data_with_outliers['X2'], data_with_outliers['Y'], label="Data Points", alpha=0.7)
plt.xlabel('X2')
plt.ylabel('Y')
plt.title('Simple Regression with Added Outliers')
plt.legend()
plt.show()

# Step 6: Plot regression diagnostics
# The influence object and its summary table are built once and reused in Step 7.
influence = model.get_influence()
summary_frame = influence.summary_frame()
ax = summary_frame.plot(kind='scatter', x='hat_diag', y='student_resid', alpha=0.7)
plt.title('Regression Diagnostics: Leverage vs Studentized Residuals')
plt.show()

# Step 7: Highlight flagged observations
summary_frame['index'] = range(len(summary_frame))  # positional row number for .iloc below

# Thresholds must use the number of rows the model was actually fitted on
# (the original sample plus the added outliers), not the size of the
# original sample alone.
n_obs = int(model.nobs)
n_params = X_with_outliers.shape[1]  # design columns, intercept included
flagged_obs = summary_frame[
    (summary_frame['cooks_d'] > 4 / n_obs) | (summary_frame['hat_diag'] > 2 * n_params / n_obs)
]

plt.figure(figsize=(10, 8))
plt.scatter(data_with_outliers['X2'], data_with_outliers['Y'], label="Data Points", alpha=0.7)
plt.scatter(
    data_with_outliers.iloc[flagged_obs['index']]['X2'],
    data_with_outliers.iloc[flagged_obs['index']]['Y'],
    color='red',
    label='Flagged Observations',
    s=100
)
plt.xlabel('X2')
plt.ylabel('Y')
plt.title('Flagged Observations Highlighted')
plt.legend()
plt.show()
