In this notebook, the statistical features of the linear regression model are evaluated.

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
import pickle

# Widen pandas' display limits so large tables are shown in full rather than
# being elided with "...".
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)


In [2]:
def log_plus_one(x):
    """Return the natural logarithm of ``x + 1``, element-wise.

    Uses ``np.log1p`` rather than ``np.log(x + 1)``: the result agrees up to
    floating-point rounding, but it stays accurate when ``x`` is so close to
    zero that ``x + 1`` would round to exactly 1.0, and it also accepts plain
    Python sequences.

    NOTE(review): nothing in this notebook calls this function directly; it is
    presumably defined here so that pickled transformers referencing it by
    name can be loaded -- confirm before renaming or removing it.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); values below -1 yield NaN, exactly -1 yields -inf.

    Returns
    -------
    Same kind of object NumPy returns for a ufunc applied to ``x``.
    """
    return np.log1p(x)


In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only use this on artifacts you produced yourself or otherwise trust.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


df = pd.read_csv('nyc_bldg2021_model_ready.csv')

# Previously saved modelling artifacts (presumably written by the training
# notebook -- TODO confirm provenance).
lr = _load_pickle("lr_model.pickle")
X_train = _load_pickle("X_train_ct.pickle")
y = _load_pickle("y_train.pickle")
ct = _load_pickle("column_transformer.pickle")
oh = _load_pickle("one_hot_encoder.pickle")



In [7]:
# Only the last expression of a cell is echoed, so the bare `lr.coef_` line
# below produces no output; just the fitted intercept is displayed.
lr.coef_
lr.intercept_

94.39994644348454

In [5]:
# Design matrix with an explicit intercept term: a leading column of ones
# followed by the training features.
n_obs = X_train.shape[0]
X0 = np.ones((n_obs, 1))
X = np.hstack((X0, X_train))

In [8]:
# Ordinary least squares via the normal equations: beta = (X'X)^-1 X'y.
# The inverse is kept in its own variable because the covariance computation
# further down reuses it.
gram = X.T @ X
XTX_inv = np.linalg.inv(gram)
betas = XTX_inv @ (X.T @ y)
betas


array([ 9.43999428e+01,  1.79625404e+03, -1.03939873e+04,  1.13677152e+04,
        1.11612974e+03, -6.68351158e+02, -1.11517849e+03,  2.74067093e+02,
        2.64043132e+02,  1.48963214e+02,  2.85398761e+03,  2.91836306e+03,
        1.91331893e+01, -2.64234917e+01,  1.79814481e+02, -3.09932010e+03,
        4.14024828e+00,  6.94575896e+00, -1.14842267e+03, -7.95710928e+00,
       -4.57170369e+00,  3.50369137e+00,  5.86430312e+01,  2.86290657e+02,
       -1.93733076e+03, -9.40297390e+00, -5.06835572e+01, -1.20083832e+04,
        3.30848343e+01, -1.37014696e+01,  1.52735016e+02, -3.85709725e+02,
        4.62087541e+03,  9.68429259e+00,  1.30073935e+01,  1.55578217e+04,
       -4.35731834e+01,  1.58705727e+01, -2.38053843e+02, -6.84162295e+02,
       -9.92200274e-01,  1.81445764e+02,  4.34981651e+02, -1.79505932e+01,
        1.31933764e+02,  1.36803898e+01,  3.66277092e+02, -3.74870663e+03,
       -7.29228047e+02, -1.38209192e+02, -3.22151393e+03, -3.86536913e+02,
       -1.26958057e+00, -

In [9]:
# Fitted values from the hand-computed coefficients, then the squared
# residual of every training observation.
y_hat = np.matmul(X, betas)
resid_sq = np.square(y - y_hat)

In [17]:
# Diagonal matrix carrying the squared residuals on its main diagonal.
# NOTE(review): this is a dense n x n array (n = number of training rows), so
# memory grows quadratically with n. Scaling the columns of X.T by resid_sq
# yields the same X' D X product without materialising it.
D = np.diag(resid_sq)


In [18]:
# Sanity check: D should be square, with one row/column per training row.
D.shape

(8624, 8624)

In [19]:
# Heteroskedasticity-robust (White / HC0) sandwich estimator of the
# coefficient covariance:
#     (X'X)^-1 . X' diag(e^2) X . (X'X)^-1
# The middle term is formed by scaling the columns of X.T with the squared
# residuals. This is the same product as X.T @ diag(e^2) @ X, but needs only
# O(n*p) memory and work instead of going through a dense n x n matrix.
resid_sq_weights = np.asarray(resid_sq)  # plain 1-D array so broadcasting is unambiguous
meat = (X.T * resid_sq_weights) @ X
beta_vcov = XTX_inv @ meat @ XTX_inv

In [20]:
# Diagonal of the covariance matrix: the estimated variance of each
# coefficient (intercept first, matching the column order of X).
np.diag(beta_vcov)

array([5.53642730e+02, 2.74831264e+06, 2.24390415e+07, 2.52275993e+07,
       1.53540003e+05, 1.20774207e+05, 1.30610466e+05, 2.39642006e+06,
       1.61734703e+04, 1.40063561e+05, 7.22521129e+05, 5.45440079e+05,
       4.99281434e+01, 6.72423140e+01, 3.76076045e+03, 1.72583259e+05,
       2.83308811e+00, 3.90418213e+00, 2.02066674e+06, 6.89613682e+01,
       1.42133152e+00, 5.16277471e-01, 3.05586629e+02, 3.31135674e+04,
       3.63408746e+06, 6.49515119e+01, 3.02579218e+02, 1.88164892e+07,
       9.66030732e+02, 1.77807537e+02, 1.75435568e+04, 6.06955597e+04,
       4.28064613e+06, 1.13759332e+02, 1.06271296e+02, 2.17558113e+07,
       1.46546624e+03, 2.29022442e+02, 2.08782122e+04, 1.15006452e+04,
       1.99372947e+02, 1.37007076e+04, 1.06599115e+05, 8.13695880e+01,
       9.56862452e+02, 5.45749715e+01, 1.13313772e+04, 4.12535372e+05,
       1.05493039e+05, 2.51102216e+04, 6.50371396e+04, 3.30002504e+04,
       1.81262835e+01, 9.44766203e+04, 2.26987221e+00, 2.19994651e+00,
      

In [21]:
# Standard errors are the square roots of the covariance diagonal.
coef_variances = np.diag(beta_vcov)
beta_se = np.sqrt(coef_variances)
# Test statistic for H0: coefficient == 0.
t_stat = np.divide(betas, beta_se)
t_stat.shape

(87,)

In [22]:
# Labels in the same order as the columns of X: the prepended constant
# column first, then the columns of X_train.
features = ["Intercept", *X_train.columns]

In [23]:
# Two-sided p-values against the standard normal distribution.
# norm.sf is the survival function (1 - cdf) evaluated directly, so it avoids
# the catastrophic cancellation of `1 - norm.cdf(...)`, which rounds to
# exactly 0.0 once |t| is large (roughly above 8).
p_value = 2 * norm.sf(np.abs(t_stat))
p_value

array([6.02159127e-05, 2.78580160e-01, 2.82196483e-02, 2.36194179e-02,
       4.39370066e-03, 5.44587415e-02, 2.03065374e-03, 8.59475683e-01,
       3.78736536e-02, 6.90607637e-01, 7.86270229e-04, 7.76516223e-05,
       6.77332738e-03, 1.27156328e-03, 3.36619406e-03, 8.61533067e-14,
       1.39022034e-02, 4.39360864e-04, 4.19151574e-01, 3.37966121e-01,
       1.25722217e-04, 1.08132062e-06, 7.94609738e-04, 1.15655534e-01,
       3.09504193e-01, 2.43319170e-01, 3.57152192e-03, 5.63472572e-03,
       2.87115764e-01, 3.04173684e-01, 2.48855374e-01, 1.17441179e-01,
       2.55215334e-02, 3.63890884e-01, 2.07029371e-01, 8.51456782e-04,
       2.55022974e-01, 2.94313646e-01, 9.94528848e-02, 1.77473147e-10,
       9.43979240e-01, 1.21104138e-01, 1.82769578e-01, 4.65936254e-02,
       1.99795315e-05, 6.40498726e-02, 5.79844599e-04, 5.33157785e-09,
       2.47564806e-02, 3.83104130e-01, 0.00000000e+00, 3.33530385e-02,
       7.65551442e-01, 1.47784675e-02, 1.32998566e-01, 6.39456671e-01,
      

In [24]:
# One row per model term: estimate, standard error, test statistic and
# two-sided p-value, in the same order as `features`.
column_names = ["feature", "coef", "standard_error", "t_stat", "p_value"]
column_values = [features, betas, beta_se, t_stat, p_value]
coef_stat = pd.DataFrame(dict(zip(column_names, column_values)))

In [27]:
# Show the terms ordered from the largest to the smallest coefficient.
coef_stat.sort_values("coef", ascending=False)

Unnamed: 0,feature,coef,standard_error,t_stat,p_value
35,Property.GFA.Calculated.Buildings.and.Parking....,15557.821671,4664.312525,3.335502,0.0008514568
3,Property.GFA.Calculated.Buildings.and.Parking.ft,11367.715199,5022.708363,2.263264,0.02361942
32,Property.GFA.Calculated.Buildings.and.Parking....,4620.875411,2068.97224,2.233416,0.02552153
11,Percent.of.Electricity.Green.Power,2918.363058,738.539152,3.951535,7.765162e-05
10,ENERGY.STAR.Score,2853.98761,850.012429,3.357583,0.0007862702
63,Longitude Percent.of.Electricity.Green.Power,2551.649808,621.413552,4.106202,4.022171e-05
1,Weather.Normalized.Site.Natural.Gas.Intensity....,1796.25404,1657.803559,1.083514,0.2785802
4,Year.Built,1116.129737,391.841809,2.848419,0.004393701
42,Year.Built Longitude,434.981651,326.495199,1.332276,0.1827696
46,Latitude Multifamily.Housing.Percent.That.Can....,366.277092,106.448942,3.440871,0.0005798446
