In [55]:
import statsmodels.api as sm
import pandas as pd
from sklearn.linear_model import LinearRegression

## Website Tutorial with fictitious economy

In [56]:
# Fictitious monthly economy data, listed newest month first
# (December 2017 back to January 2016); each value list has one row per year.
stock_market = {
    'Year': [2017] * 12 + [2016] * 12,
    'Month': list(range(12, 0, -1)) * 2,
    'Interest_Rate': [
        2.75, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.25, 2.25, 2.25, 2, 2,
        2, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75, 1.75,
    ],
    'Unemployment_Rate': [
        5.3, 5.3, 5.3, 5.3, 5.4, 5.6, 5.5, 5.5, 5.5, 5.6, 5.7, 5.9,
        6, 5.9, 5.8, 6.1, 6.2, 6.1, 6.1, 6.1, 5.9, 6.2, 6.2, 6.1,
    ],
    'Stock_Index_Price': [
        1464, 1394, 1357, 1293, 1256, 1254, 1234, 1195, 1159, 1167, 1130, 1075,
        1047, 965, 943, 958, 971, 949, 884, 866, 876, 822, 704, 719,
    ],
}

# The dict keeps insertion order, so its keys double as the column order.
df = pd.DataFrame(data=stock_market, columns=list(stock_market))

In [57]:
# Response: the stock index price.
Y = df['Stock_Index_Price']
# Predictors: the two rate columns, with a constant column added so the
# statsmodels fit includes an intercept term.
X = sm.add_constant(df.loc[:, ['Interest_Rate', 'Unemployment_Rate']])

In [58]:
# Ordinary least squares of Y on the columns of X (constant included).
model = sm.OLS(endog=Y, exog=X).fit()
# In-sample predictions for the same rows the model was fitted on.
predictions = model.predict(exog=X)

In [59]:
# Build the regression summary for the fitted model and print it as text.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:      Stock_Index_Price   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     92.07
Date:                Wed, 27 Apr 2022   Prob (F-statistic):           4.04e-11
Time:                        17:45:01   Log-Likelihood:                -134.61
No. Observations:                  24   AIC:                             275.2
Df Residuals:                      21   BIC:                             278.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1798.4040    899.24

## LHL Walkthrough (continuing with the housing data from Kaggle)

In [None]:
# Load the housing training data. This rebinds `df`, which earlier held the
# fictitious-economy frame.
df = pd.read_csv('df_train.csv')

# Every column except the target. NOTE: despite its name, `dep_var` holds the
# independent (predictor) columns; SalePrice is the dependent variable.
# The boolean mask is used directly as the indexer: wrapping it in a list
# (`df.columns[[mask]]`) makes a 2-D indexer for a 1-D Index, which is
# deprecated in older NumPy and raises an IndexError in current versions.
dep_var = df.columns[df.columns != 'SalePrice'].tolist()

X = df[dep_var]       # predictors
y = df['SalePrice']   # target

In [78]:
# Add the constant (intercept) column, then define the OLS model.
# The model is only constructed here; fitting happens in a later cell.
X = sm.add_constant(X)
lin_reg = sm.OLS(endog=y, exog=X)

In [80]:
# Fit the OLS model defined above and print its summary table.
# NOTE: this rebinds `model`, which earlier held the fictitious-economy fit.
model = lin_reg.fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     732.0
Date:                Wed, 27 Apr 2022   Prob (F-statistic):               0.00
Time:                        17:53:29   Log-Likelihood:                -17206.
No. Observations:                1458   AIC:                         3.443e+04
Df Residuals:                    1447   BIC:                         3.449e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -8.992e+05   8.93e+04    -10.069   

##### The p-value of BsmtQual is very large; let's see how our model does without it

In [88]:
# Predictor columns for the refit; BsmtQual is deliberately left out because
# of its large p-value in the previous fit.
X = df.loc[:, [
    'OverallQual', 'YearBuilt', 'ExterQual',
    'TotalBsmtSF', 'GrLivArea', 'FullBath',
    'KitchenQual', 'GarageCars', 'OverallGrade',
]]

In [89]:
# Refit on the reduced predictor set: add the constant column, define the
# OLS model, fit it, and keep the summary for printing in the next cell.
X = sm.add_constant(X)
lin_reg = sm.OLS(endog=y, exog=X)
model = lin_reg.fit()
model_summary = model.summary()

In [90]:
# Print the summary of the model refit without BsmtQual.
print(model_summary)
# Last expression of the cell: displays the class of the (unfitted) model object.
type(lin_reg)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     813.6
Date:                Wed, 27 Apr 2022   Prob (F-statistic):               0.00
Time:                        18:00:18   Log-Likelihood:                -17206.
No. Observations:                1458   AIC:                         3.443e+04
Df Residuals:                    1448   BIC:                         3.449e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -9.15e+05   8.58e+04    -10.665   

statsmodels.regression.linear_model.OLS

###### Adj. R-squared did not change, but we got rid of an unneeded column

### Let's repeat everything using the sklearn library

In [93]:
# Fit the same regression with scikit-learn.
# NOTE: X still carries the 'const' column added by sm.add_constant above.
# LinearRegression fits its own intercept by default, so that column is
# redundant here and its coefficient comes out as 0 (see coef_ below).
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression()

In [97]:
regressor.coef_  # one coefficient per column of X; the leading 0 belongs to the redundant 'const' column

array([     0.        ,   5658.76697148,    401.0126101 ,  14555.29412402,
           42.72498789,     66.83083518, -11277.90029621,  11495.44094134,
         9265.51545575,   1080.4781385 ])

In [98]:
regressor.score(X, y)  # R^2 (coefficient of determination) on the training data

0.8349027776571498