# Lesson 3: OLS Regression Analysis


This tutorial covers Ordinary Least Squares (OLS) regression analysis using Python. It includes:

1. Running an OLS Regression
2. Running Regression Diagnostics
   - Multicollinearity
   - Heteroskedasticity
   - Serial Correlation (Breusch-Godfrey Test)


In [None]:
import pandas as pd

# Location of the Stata (.dta) extract; adjust it to wherever the file is stored.
file_path = '2021FIESvol1_puf_SELECTED.dta'
fies_data = pd.read_stata(file_path)

# Derive Per Capita Total Expenditure: TOTEX divided element-wise by FSIZE
# (presumably total expenditure and family size -- confirm against the codebook).
fies_data['PC_TOTEX'] = fies_data['TOTEX'].div(fies_data['FSIZE'])


## 1. Running an OLS Regression

In [None]:

import statsmodels.api as sm

# Outcome variable: per capita total expenditure.
Y = fies_data['PC_TOTEX']

# Explanatory variables, with an intercept column added by add_constant.
regressor_names = ['FSIZE', 'FOODoTOTEX', 'lnTOINC']
X = sm.add_constant(fies_data[regressor_names])

# Estimate the model by ordinary least squares and print the results table.
ols_model = sm.OLS(Y, X).fit()
print(ols_model.summary())


                            OLS Regression Results                            
Dep. Variable:               PC_TOTEX   R-squared:                       0.526
Model:                            OLS   Adj. R-squared:                  0.526
Method:                 Least Squares   F-statistic:                 6.104e+04
Date:                Fri, 15 Nov 2024   Prob (F-statistic):               0.00
Time:                        11:35:44   Log-Likelihood:            -1.9933e+06
No. Observations:              165029   AIC:                         3.987e+06
Df Residuals:                  165025   BIC:                         3.987e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.844e+05   2583.025   -187.528      0.0

## 2.1 Multicollinearity

In [None]:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute the Variance Inflation Factor (VIF) for every column of the design
# matrix X (the constant column included), one row per column.
design_matrix = X.values
vif_data = pd.DataFrame({
    'Variable': X.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

# Show the VIF table
print(vif_data)


     Variable         VIF
0       const  606.513501
1       FSIZE    1.305823
2  FOODoTOTEX    1.624127
3     lnTOINC    1.711840


## 2.2 Heteroskedasticity

In [None]:

from statsmodels.stats.diagnostic import het_breuschpagan

# Run the Breusch-Pagan heteroskedasticity test on the OLS residuals, passing
# the design matrix X as the second argument.
bp_test = het_breuschpagan(ols_model.resid, X)

# Name each of the four returned values and print them as a dictionary
bp_lm, bp_lm_pvalue, bp_f, bp_f_pvalue = bp_test
bp_results = {
    'Lagrange multiplier statistic': bp_lm,
    'p-value': bp_lm_pvalue,
    'f-value': bp_f,
    'f p-value': bp_f_pvalue,
}
print(bp_results)


{'Lagrange multiplier statistic': 892.703973732243, 'p-value': 3.3851240742789013e-193, 'f-value': 299.1791513754853, 'f p-value': 1.0187850325972879e-193}


## 2.3 Serial Correlation: Breusch-Godfrey Test

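* Skip this section if you are not working with time series data; tests for serial correlation are only meaningful when the observations are ordered in time.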

In [None]:

from statsmodels.stats.diagnostic import acorr_breusch_godfrey

# Run the Breusch-Godfrey serial correlation test on the fitted OLS results,
# leaving the number of lags at the library default.
bg_test = acorr_breusch_godfrey(ols_model)

# Name each of the four returned values and print them as a dictionary
bg_lm, bg_lm_pvalue, bg_f, bg_f_pvalue = bg_test
bg_results = {
    'Lagrange multiplier statistic': bg_lm,
    'p-value': bg_lm_pvalue,
    'f-value': bg_f,
    'f p-value': bg_f_pvalue,
}
print(bg_results)


{'Lagrange multiplier statistic': 13448.157830290767, 'p-value': 0.0, 'f-value': 1464.0027938892704, 'f p-value': 0.0}
