In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import scipy as sp
import faraway.utils

In [3]:
# Load the Longley dataset that ships with statsmodels.
# `gd` is the dataset bundle returned by `load_pandas()`; `gd.data` is the
# table previewed below (TOTEMP plus the six other columns).
gd = sm.datasets.longley.load_pandas()
longley = gd.data
# Show only the first rows rather than dumping the whole table.
longley.head()

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
0,60323.0,83.0,234289.0,2356.0,1590.0,107608.0,1947.0
1,61122.0,88.5,259426.0,2325.0,1456.0,108632.0,1948.0
2,60171.0,88.2,258054.0,3682.0,1616.0,109773.0,1949.0
3,61187.0,89.5,284599.0,3351.0,1650.0,110929.0,1950.0
4,63221.0,96.2,328975.0,2099.0,3099.0,112075.0,1951.0


The Longley dataset contains various US macroeconomic variables that are known to be highly collinear. It has been used to appraise the accuracy of least squares routines.

Number of Observations - 16

Number of Variables - 7 (the response TOTEMP plus 6 predictors)

Variable name definitions:

        TOTEMP - Total Employment
        GNPDEFL - GNP deflator
        GNP - GNP
        UNEMP - Number of unemployed
        ARMED - Size of armed forces
        POP - Population
        YEAR - Year (1947 - 1962)

In [10]:
# Execute the companion notebook `utils.ipynb` in this kernel's namespace.
# NOTE(review): the helpers called in later cells (`get_cond_nums`,
# `get_partial_rsquareds`, `get_dsg_mtx`) are not defined in this notebook and
# are presumably defined by this run — confirm against utils.ipynb.
%run utils.ipynb

In [13]:
# Fit an ordinary least squares model of total employment (TOTEMP) on all six
# remaining Longley variables, using the formula interface.
lmod = smf.ols(
    "TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR",
    data=longley,
).fit()

# Compact coefficient table (coefs / stderr / t-values / p-values plus n, p,
# residual SD and R-squared).
# NOTE(review): `sumary` is not a stock statsmodels results method; it is
# presumably attached by the `faraway.utils` import in the notebook's first
# cell, so that import is not repeated here — confirm.
lmod.sumary()

                   coefs      stderr tvalues pvalues
Intercept -3,482,258.635 890,420.384   -3.91  0.0036
GNPDEFL           15.062      84.915    0.18  0.8631
GNP               -0.036       0.033   -1.07  0.3127
UNEMP             -2.020       0.488   -4.14  0.0025
ARMED             -1.033       0.214   -4.82  0.0009
POP               -0.051       0.226   -0.23  0.8262
YEAR           1,829.151     455.478    4.02  0.0030

n=16 p=7 Residual SD=304.854 R-squared=1.00


In [14]:
# Condition numbers for the fitted model, computed by a helper defined outside
# this notebook.
# NOTE(review): the recorded output echoes the helper's line
# `cond_nums = np.sqrt(eigs[-1] / eigs)` and its first entry is `nan`. This
# looks like a NumPy runtime warning caused by a non-positive eigenvalue under
# the square root — confirm, and consider guarding against it inside the helper.
get_cond_nums(lmod)

  cond_nums = np.sqrt(eigs[-1] / eigs)


array([           nan, 4.56040296e+05, 3.99022436e+04, 1.05119570e+03,
       4.88280556e+02, 1.98292801e+01, 1.00000000e+00])

In [19]:
# One R-squared value per model term (including the Intercept), returned by a
# helper defined outside this notebook; the recorded output is a Series
# listed from largest to smallest.
# NOTE(review): presumably each value is the R-squared from regressing that
# design-matrix column on the remaining columns — confirm against the helper.
part_rsq = get_partial_rsquareds(lmod)
part_rsq

Intercept    1.000000
GNP          0.999441
YEAR         0.998682
POP          0.997495
GNPDEFL      0.992622
UNEMP        0.970255
ARMED        0.721365
dtype: float64

In [22]:
# Variance inflation factor for each term, 1 / (1 - R^2_j), computed
# element-wise from the per-term R-squared values in `part_rsq`.
# NOTE(review): `part_rsq` carries an Intercept entry, so an Intercept row
# also appears here; a VIF for the intercept is not usually interpreted —
# consider excluding it when reading the results.
VIFs = 1 / (1 - part_rsq)
VIFs

Intercept    1.364980e+08
GNP          1.788513e+03
YEAR         7.589806e+02
POP          3.991510e+02
GNPDEFL      1.355324e+02
UNEMP        3.361889e+01
ARMED        3.588930e+00
dtype: float64

In [24]:
# Pairwise correlation matrix of the model's design-matrix columns; the
# recorded output covers the six predictors (no Intercept row/column).
# NOTE(review): `get_dsg_mtx` is defined outside this notebook; presumably it
# returns a pandas DataFrame, in which case `.corr()` defaults to Pearson
# correlation — confirm.
get_dsg_mtx(lmod).corr()

Unnamed: 0,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
GNPDEFL,1.0,0.991589,0.620633,0.464744,0.979163,0.991149
GNP,0.991589,1.0,0.604261,0.446437,0.99109,0.995273
UNEMP,0.620633,0.604261,1.0,-0.177421,0.686552,0.668257
ARMED,0.464744,0.446437,-0.177421,1.0,0.364416,0.417245
POP,0.979163,0.99109,0.686552,0.364416,1.0,0.993953
YEAR,0.991149,0.995273,0.668257,0.417245,0.993953,1.0
