# Regression Diagnosis #

In [89]:
# Import Libraries

# Data Management
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt

# Statistics
import statsmodels.api as sm 
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.diagnostic import het_goldfeldquandt
from statsmodels.stats.diagnostic import acorr_breusch_godfrey
from statsmodels.stats.diagnostic import linear_reset
from statsmodels.stats.stattools import jarque_bera
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import anderson
from scipy.stats import norm

# Handle Files
import sys
import os

# Import Local Functions
sys.path.append(os.path.abspath("../source"))
from functions import get_fred_data

In [90]:
# FRED API key
# Prefer the FRED_API_KEY environment variable so the credential does not have
# to live inside the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep
# working; it is stored in plain text here and should be rotated and removed.
key = os.environ.get('FRED_API_KEY') or '0174cb93931388a2bf305663e4117fd3'

In [91]:
# Dependent variable: FRED series WMTSECL1
# (Memorandum Items: Custody Holdings: Marketable U.S. Treasury Securities: Wednesday Level)
y_data = get_fred_data('WMTSECL1', key)

# Label the series and turn its index into datetimes
y_data.name = 'holdings'
y_data.index = pd.to_datetime(y_data.index)

y_data

In [92]:
# Explanatory variable: FRED series DTWEXBGS (Nominal Broad U.S. Dollar Index)
x_data = get_fred_data('DTWEXBGS', key)

# Label the series and turn its index into datetimes
x_data.name = 'dollar_index'
x_data.index = pd.to_datetime(x_data.index)

x_data

In [93]:
# Build the regression sample: dollar index as x, Treasury holdings as y
ols_df = pd.DataFrame({'x': x_data, 'y': y_data})

# Keep only the rows where both variables are strictly positive
both_positive = (ols_df['x'] > 0) & (ols_df['y'] > 0)
ols_df = ols_df[both_positive]

# Restrict the sample to observations dated 2018-01-01 or later
ols_df = ols_df["2018-01-01":]

ols_df

In [94]:
# Work with the natural log of both variables, discarding any row that
# ends up with a missing value
log_ols_df = np.log(ols_df).dropna()

# Make sure the index is a datetime index
log_ols_df.index = pd.to_datetime(log_ols_df.index)

log_ols_df

In [95]:
# Model specification: log(y) regressed on a constant and log(x).
# Both inputs are handed to statsmodels as plain NumPy arrays.
Y_Vector = log_ols_df['y'].values
X_Matrix = sm.add_constant(log_ols_df[['x']]).values

model = sm.OLS(Y_Vector, X_Matrix)

# Estimate the model by OLS
results = model.fit()

# Print the regression summary table
print(results.summary())

In [96]:
# Wrap the OLS residuals in a named Series that shares the sample's date index
residuals = pd.Series(
    results.resid,
    index=log_ols_df.index,
    name='residuals',
)

residuals

In [97]:
# Breusch-Pagan test for heteroskedasticity (null: homoskedasticity)

# Regressors for the test: a constant plus log(x)
bp_exog = sm.add_constant(log_ols_df['x'])
bp_test = het_breuschpagan(residuals, bp_exog)

# Pair each returned value with its label and print them as one dict
labels = ['LM statistic', 'p-value', 'F-statistic', 'F p-value']
print({label: stat for label, stat in zip(labels, bp_test)})

In [98]:
# White test for heteroskedasticity (null: homoskedasticity)

# Regressors for the test: a constant plus log(x)
white_exog = sm.add_constant(log_ols_df['x'])
white_test = het_white(residuals, white_exog)

# Report each returned value rounded to four decimals
labels = ['Test Statistic', 'Test p-value', 'F-Statistic', 'F-Test p-value']
for label, stat in zip(labels, white_test):
    print(f'{label}: {stat:.4f}')

In [99]:
# Goldfeld-Quandt Test for Heteroskedasticity (null: homoskedasticity)
#
# het_goldfeldquandt(y, x) re-estimates the regression on two sub-samples and
# compares their residual variances, so it must be given the dependent
# variable itself rather than the residuals of the full-sample fit.

gq_test = het_goldfeldquandt(
    log_ols_df['y'],
    sm.add_constant(log_ols_df['x'])
)

# Results: F statistic, its p-value, and the alternative that was tested
labels = ['F statistic', 'p-value', 'Alternative hypothesis']
for name, value in zip(labels, gq_test):
    print(f'{name}: {value}')

In [100]:
# Jarque-Bera test for normality of the residuals (null: normality)

jb_test = jarque_bera(residuals)

# Report the statistic, its p-value and the sample skewness and kurtosis
labels = ['JB statistic', 'p-value', 'skewness', 'kurtosis']
for label, stat in zip(labels, jb_test):
    print(f'{label}: {stat:.4f}')

In [101]:
# Anderson-Darling test for normality of the residuals (null: normality)

ad_test = anderson(residuals, dist='norm')

# The statistic is compared against the critical value listed for each
# significance level
print(f"Statistic: {ad_test.statistic:.4f}")
print("Critical values:", ad_test.critical_values)
print("Significance levels:", ad_test.significance_level)

In [102]:
# Histogram of the residuals with a fitted normal density on top

# Summary statistics used for the overlay and the reference lines
mu = residuals.mean()
sigma = residuals.std()
median = residuals.median()

# Normal density evaluated over the observed range of the residuals
x = np.linspace(residuals.min(), residuals.max(), 100)
y = norm.pdf(x, mu, sigma)

# Density-scaled histogram
plt.figure(figsize=(10, 6))
plt.hist(
    residuals,
    bins=30,
    density=True,
    color='orange',
    alpha=0.5,
    edgecolor='black',
    label='Residuals',
)

# Normal curve with the same mean and standard deviation as the residuals
plt.plot(x, y, color='black', linestyle='solid', linewidth=2, label='Normal Distribution')

# Vertical markers: mean, median, and one standard deviation either side of the mean
plt.axvline(x=mu, color='black', linestyle='dashed', label='Mean')
plt.axvline(x=median, color='red', linestyle='dashed', label='Median')
for offset in (sigma, -sigma):
    plt.axvline(x=mu + offset, color='grey', linestyle='dashed')

# Titles and axis labels
plt.title('Residuals Histogram with Normal Distribution')
plt.xlabel('Residuals')
plt.ylabel('Density')

# Legend and grid
plt.legend()
plt.grid(True)

plt.show()

In [103]:
# Durbin-Watson statistic for first-order autocorrelation in the residuals

dw = durbin_watson(residuals)

# Report the statistic rounded to four decimals
print(f'Durbin-Watson statistic: {dw:.4f}')

In [104]:
# Breusch-Godfrey test for autocorrelation (null: no autocorrelation)
# nlags sets how many lags are tested and can be adjusted
bg_test = acorr_breusch_godfrey(results, nlags=4)

# Report each returned value rounded to four decimals
labels = ['LM Statistic', 'LM p-value', 'F Statistic', 'F p-value']
for label, stat in zip(labels, bg_test):
    print(f'{label}: {stat:.4f}')

In [105]:
# Plot Residuals vs Fitted Values

# Fitted values as a named Series on the sample's date index, built the same
# way as `residuals` (a DataFrame has no `name`, so assigning one would only
# set a stray attribute)
y_fit = pd.Series(
    results.fittedvalues,
    index=log_ols_df.index,
    name='fitted_values',
)

fig, ax1 = plt.subplots(dpi = 600)

# Fitted values on the horizontal axis and residuals on the vertical axis,
# matching the "residuals vs fitted" title of this cell
ax1.scatter(y_fit, residuals)
ax1.set_xlabel('Fitted Values')
ax1.set_ylabel('Residuals')

plt.show()

In [113]:
# Ramsey RESET test for functional form (null: the linear model is adequate)

# F-version of the test with power=2
reset_test = linear_reset(
    results,
    power=2,
    use_f=True,
)

print(f"RESET F-statistic: {reset_test.fvalue:.4f}")
print(f"p-value: {reset_test.pvalue:.4f}")

In [114]:
# Ramsey RESET test for functional form (null: the linear model is adequate)

# F-version of the test with power=3
reset_test = linear_reset(
    results,
    power=3,
    use_f=True,
)

print(f"RESET F-statistic: {reset_test.fvalue:.4f}")
print(f"p-value: {reset_test.pvalue:.4f}")