# Weighted Least Squares Regression #

### Generalized Least Squares ###

In [51]:
# Import Libraries

# Data Management
import pandas as pd
import numpy as np

# Statistics
import statsmodels.api as sm 
import matplotlib.pyplot as plt
from statsmodels.stats.diagnostic import het_breuschpagan

# Handle Files
import sys
import os

# Import Local Functions
sys.path.append(os.path.abspath("../source"))
from functions import import_financial_data

# Pretty Notation
from IPython.display import display, Math

In [23]:
# Dependent variable: natural log of AMZN's adjusted close
data_y = import_financial_data('AMZN')
amzn_adj_close = data_y['adj_close']

y = np.log(amzn_adj_close)

y

In [25]:
# Explanatory variable: natural log of WMT's adjusted close
data_x = import_financial_data('WMT')
wmt_adj_close = data_x['adj_close']

x = np.log(wmt_adj_close)

x

In [26]:
# Load the S&P 500 series, indexed by date.
# The path is assembled with os.path.join so it resolves on any OS; a literal
# backslash path only works on Windows.
sp500_path = os.path.join('..', 'additional_data', 'sp500.csv')

# set_index returns a new frame, so re-running this cell is idempotent
data_sp500 = pd.read_csv(sp500_path).set_index('Date')
data_sp500.index = pd.to_datetime(data_sp500.index)

data_sp500

In [27]:
# Assemble the data set for the linear regression.
# Every column is aligned on AMZN's index; rows with any missing value are dropped.
ols_df = (
    pd.DataFrame(index=y.index)
    .assign(
        AMZN=y,
        WMT=x,
        SP500=np.log(data_sp500['SP500']),
    )
    .dropna()
)

# Convert the index to datetimes
ols_df.index = pd.to_datetime(ols_df.index)

# Add the intercept column ('const')
ols_df = sm.add_constant(ols_df)

ols_df

In [29]:
# Plot the two regression series on twin y-axes.
# Both columns hold the natural log of the adjusted close, so the axes are
# labelled as log prices rather than prices.
fig, ax1 = plt.subplots(dpi=300)

# AMZN log price (left axis)
ols_df['AMZN'].plot(color='blue', ax=ax1)
ax1.set_xlabel('Date')
ax1.set_ylabel('AMZN Log Price', color='blue')
ax1.set_title('AMZN vs. WMT Log Prices')

# WMT log price (right axis, sharing the same x-axis)
ax2 = ax1.twinx()
ols_df['WMT'].plot(color='red', ax=ax2)
ax2.set_ylabel('WMT Log Price', color='red')

plt.show()

In [30]:
# OLS specification: AMZN regressed on an intercept, WMT and SP500
model = sm.OLS(
    endog=ols_df['AMZN'],
    exog=ols_df[['const', 'WMT', 'SP500']],
)

# Fit the model and keep the estimated coefficients
results = model.fit()
betas = results.params

# Full regression summary
print(results.summary())

In [31]:
# Inspect the OLS residuals over time
residuals = results.resid

fig, ax1 = plt.subplots(dpi=300)

# Residual series
residuals.plot(ax=ax1, color='blue')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residuals', color='blue')

plt.show()

In [32]:
# Summary statistics of the OLS residuals
resid_mean = residuals.mean().round(3)
resid_var = residuals.var()

print(f"The Residuals Mean is: {resid_mean}")
print(f"The Residuals Variance is: {resid_var}")

In [33]:
# Breusch-Pagan test on the OLS residuals (null hypothesis: homoskedasticity)
bp_exog = sm.add_constant(ols_df[['WMT', 'SP500']])
bp_test = het_breuschpagan(residuals, bp_exog)

# Pair each returned statistic with its name for printing
labels = ['LM statistic', 'p-value', 'F-statistic', 'F p-value']
print({label: value for label, value in zip(labels, bp_test)})

In [35]:
# Omega: diagonal matrix whose entries are the squared OLS residuals
squared_residuals = np.square(residuals)
Omega = np.diag(squared_residuals)

Omega

In [36]:
# Estimator in matrix form, followed by the definition of the weight matrix
estimator_formula = Math(r"\beta_{gls}=(X^⊤WX)^{-1}(X^⊤WY)")
weights_formula = Math(r"\Omega^{-1}=W")

display(estimator_formula)

# In this case the weights are the inverse of Omega
display(weights_formula)

In [38]:
# To replicate the estimator by hand we need Y, X and the weight matrix W

# Y vector
Y_Vector = ols_df["AMZN"].to_numpy()

# X matrix (intercept, WMT, SP500)
Information_Matrix = ols_df[['const', 'WMT', 'SP500']].to_numpy()

# Weights: W = Omega^{-1}.
# Omega is diagonal, so its inverse is the reciprocal of its diagonal. This
# avoids a dense O(n^3) inversion of an n x n matrix.
omega_diag = np.diag(Omega)
if np.any(omega_diag == 0):
    # Same failure mode as np.linalg.inv on a singular matrix
    raise np.linalg.LinAlgError("Singular matrix")

W = np.diag(1.0 / omega_diag)

W.round(4)

In [39]:
# Pre-multiply the regressors and the response by the weight matrix
Weighted_X = W @ Information_Matrix
Weighted_Y = W @ Y_Vector

In [40]:
# Building blocks of the estimator: X'WX, X'WY and the inverse of X'WX
Information_Matrix_T = Information_Matrix.T

X_Weighted_Var = Information_Matrix_T @ Weighted_X
X_Y_Weighted_Covar = Information_Matrix_T @ Weighted_Y

X_Weighted_Var_Inv = np.linalg.inv(X_Weighted_Var)

In [41]:
# Coefficient estimates: (X'WX)^{-1} (X'WY)
betas_gls = X_Weighted_Var_Inv @ X_Y_Weighted_Covar

np.round(betas_gls, 4)

In [44]:
# Same regression estimated with statsmodels' GLS, using Omega as sigma
gls_model = sm.GLS(
    endog=ols_df['AMZN'],
    exog=sm.add_constant(ols_df[['WMT', 'SP500']]),
    sigma=Omega,
)

# Fit the model
gls_results = gls_model.fit()

# Full regression summary
print(gls_results.summary())

In [45]:
# Inspect the GLS residuals over time
residuals_gls = gls_results.resid

fig, ax1 = plt.subplots(dpi=300)

# Residual series
residuals_gls.plot(ax=ax1, color='blue')
ax1.set_xlabel('Date')
ax1.set_ylabel('Residuals', color='blue')

plt.show()

In [46]:
# Calculate the Sums of Squares

# First the weighted mean of the dependent variable.
# Use ols_df['AMZN'] (the rows actually used in the regression) rather than the
# unfiltered `y`: W has one weight per row of ols_df, and `y` can be longer
# than that after the dropna() step.
weights = W.diagonal()
y_w_mean = np.sum(weights * ols_df['AMZN']) / np.sum(weights)

# The fitted values from the GLS model
y_hat_gls = gls_results.fittedvalues

In [47]:
# Calculate the weighted Total Sum of Squares.
# ols_df['AMZN'] is used instead of the unfiltered `y` so the observations line
# up one-to-one with the weights on the diagonal of W.
SST_wls = np.sum(W.diagonal() * (ols_df['AMZN'] - y_w_mean)**2)

SST_wls

In [48]:
# Calculate the weighted Residual Sum of Squares.
# ols_df['AMZN'] shares its index with the fitted values and has one row per
# weight in W; the unfiltered `y` may contain rows that were dropped.
SSR_wls = np.sum(W.diagonal() * (ols_df['AMZN'] - y_hat_gls)**2)

SSR_wls

In [49]:
# Weighted R-squared: one minus the share of weighted variation left unexplained
unexplained_share = SSR_wls / SST_wls
R_Squared_GLS = 1 - unexplained_share

R_Squared_GLS

In [50]:
# Test for Heteroskedasticity (null: homoskedasticity)

bp_exog = sm.add_constant(ols_df[['WMT', 'SP500']])
labels = ['LM statistic', 'p-value', 'F-statistic', 'F p-value']

# Residuals on the original scale (y - X b). Weighting does not rescale these,
# so this test alone cannot show whether the GLS transformation worked.
bp_test = het_breuschpagan(
    residuals_gls,
    bp_exog
)
print(dict(zip(labels, bp_test)))

# Whitened residuals: the residuals of the transformed (weighted) regression,
# which are the ones GLS is meant to make homoskedastic.
bp_test_whitened = het_breuschpagan(
    gls_results.wresid,
    bp_exog
)
print("Whitened residuals:", dict(zip(labels, bp_test_whitened)))

Why didn’t we eliminate heteroskedasticity?

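Even though we applied GLS to correct for heteroskedasticity, it may still persist if the variables in the regression are non-stationary. This is one of the reasons why we use returns instead of prices when modeling financial data: returns tend to be stationary, which reduces the presence of structural heteroskedasticity. Note also that the test above is run on `gls_results.resid`, the residuals on the original (unweighted) scale; GLS only rescales the whitened residuals (`gls_results.wresid`), so those are the ones to test when checking whether the weighting worked.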