# EDA notebook - Interaction
The steps below show how polynomial and interaction terms were evaluated and how the final model for the analysis was validated.

In [1]:
import pandas as pd
import numpy as np

import data_preparation as dp

import statsmodels.api as sm

# Load the house-sales CSV and discard, in one chained expression, the
# columns that are not used as model inputs in this notebook.
unused_columns = [
    'id',
    'date',
    'zipcode',
    'lat',
    'long',
    'sqft_above',
    'sqft_living15',
    'sqft_lot15',
]
house = pd.read_csv("../data/kc_house_data.csv").drop(columns=unused_columns)

In [2]:
# Clean and transform the raw frame before looking for interaction terms.
# All helpers come from the project-local data_preparation module (dp); their
# internals are not visible here, so the notes below are hedged.
house = dp.missing(house)  # presumably handles missing values -- confirm in data_preparation
house = dp.cleaning(house)
# The numeric features are transformed into a separate frame first ...
house_num_final = dp.numeric_transform(house)
# ... then `house` is rebound to the categorical transformation's result.
# NOTE: "categorical_tansformation" is spelled exactly as this notebook calls it.
house = dp.categorical_tansformation(house)
# Combine the numeric and categorical parts into the modelling frame.
house_final = dp.concatenation(house_num_final, house)

In [3]:
from sklearn.model_selection import train_test_split

# Separate the target (price) from the predictors, then hold out a test set.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the split is reproducible.
y = house_final['price']
X = house_final.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [4]:
from sklearn.preprocessing import PolynomialFeatures

In [5]:
# Baseline: fit on sqft_living alone, chosen because it has the highest
# correlation with the house price column.
# dp.scores is a project helper; judging by the output printed below it
# reports train, validation and test scores plus an error metric.
dp.scores(X_train[['sqft_living']], y_train, X_test[['sqft_living']], y_test)

Train score:      0.4534361083157941
Validation score: 0.45952300008481545
X-test score: 0.45641912249708183
R2 score: 0.45641912249708183
Mean**2 Error 0.7337180230010705


In [6]:
# Expand sqft_living (and only that column) into degree-2 polynomial features.
# The expansion is fitted on the training rows and then applied unchanged to
# the test rows.
living = ['sqft_living']
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train[living])
X_poly_test = poly.transform(X_test[living])

dp.scores(X_poly_train, y_train, X_poly_test, y_test)

Train score:      0.4848214267770404
Validation score: 0.48714093663144115
X-test score: 0.484503445512511
R2 score: 0.484503445512511
Mean**2 Error 0.7145127541661467


In [7]:
# Same experiment with a degree-3 expansion of sqft_living, to check whether
# a cubic term improves the fit.
living = ['sqft_living']
poly = PolynomialFeatures(degree=3)
X_poly_train = poly.fit_transform(X_train[living])
X_poly_test = poly.transform(X_test[living])

dp.scores(X_poly_train, y_train, X_poly_test, y_test)

Train score:      0.48548684435400985
Validation score: 0.4870417343473496
X-test score: 0.4857001733102616
R2 score: 0.4857001733102616
Mean**2 Error 0.7136828998143384


In [8]:
# Scores with all columns.
# The scores are much higher than in the previous single-feature steps
# (validation ~0.656 vs ~0.487), and the reported error is lower
# (~0.599 vs ~0.714).
dp.scores(X_train, y_train, X_test, y_test)

Train score:      0.6554961568004584
Validation score: 0.6562382822720109
X-test score: 0.637612306512999
R2 score: 0.637612306512999
Mean**2 Error 0.59907862233999


In [9]:
# OLS on every predictor, with an explicit intercept column.
# The summary below reports a condition number of 21.9, which is somewhat
# elevated and suggests some multicollinearity among the predictors.
design_all = sm.add_constant(X_train)
ols_all = sm.OLS(y_train, design_all)
ols_all.fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.656
Model:,OLS,Adj. R-squared:,0.655
Method:,Least Squares,F-statistic:,1713.0
Date:,"Sun, 08 May 2022",Prob (F-statistic):,0.0
Time:,11:57:15,Log-Likelihood:,-14370.0
No. Observations:,16197,AIC:,28780.0
Df Residuals:,16178,BIC:,28920.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9537,0.024,40.235,0.000,0.907,1.000
bedrooms,-0.0747,0.006,-11.783,0.000,-0.087,-0.062
bathrooms,0.0789,0.009,9.135,0.000,0.062,0.096
sqft_living,0.3492,0.011,32.585,0.000,0.328,0.370
sqft_lot,-0.0641,0.006,-11.071,0.000,-0.075,-0.053
floors,0.0670,0.007,9.778,0.000,0.054,0.080
yr_built,-0.3067,0.007,-45.943,0.000,-0.320,-0.294
waterfront,0.9412,0.058,16.167,0.000,0.827,1.055
view,0.2408,0.017,13.977,0.000,0.207,0.275

0,1,2,3
Omnibus:,41.176,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51.243
Skew:,-0.034,Prob(JB):,7.46e-12
Kurtosis:,3.267,Cond. No.,21.9


In [10]:
# Restrict the model to five numerical columns only.
# Compared with the all-column fit, the scores drop and the reported error
# rises (see the output below).
var = ['sqft_living', 'bathrooms', 'bedrooms', 'floors', 'view']
X_train_subset, X_test_subset = X_train[var], X_test[var]

dp.scores(X_train_subset, y_train, X_test_subset, y_test)

Train score:      0.5003532396509703
Validation score: 0.5042691681991028
X-test score: 0.5019864253772119
R2 score: 0.5019864253772119
Mean**2 Error 0.7022919532225378


In [11]:
# Remove bathrooms and bedrooms from the predictors for every cell below;
# the next cell checks what this does to the condition number.
#
# X_train / X_test are rebound here, so without errors='ignore' a second run
# of this cell would raise KeyError (the columns are already gone). Ignoring
# missing labels keeps the cell safe to re-run without a kernel restart.
dropped_columns = ['bathrooms', 'bedrooms']
X_train = X_train.drop(columns=dropped_columns, errors='ignore').copy()
X_test = X_test.drop(columns=dropped_columns, errors='ignore').copy()

dp.scores(X_train, y_train, X_test, y_test)

Train score:      0.6515747022621045
Validation score: 0.6513176974301091
X-test score: 0.6346523693995985
R2 score: 0.6346523693995985
Mean**2 Error 0.6015202461894136


In [12]:
# Refit OLS on the reduced predictor set.
# The condition number improves (17.6 vs 21.9 before the two columns were
# removed); the validation score drops only slightly, so the cost is minor.
design_reduced = sm.add_constant(X_train)
ols_reduced = sm.OLS(y_train, design_reduced)
ols_reduced.fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.652
Model:,OLS,Adj. R-squared:,0.651
Method:,Least Squares,F-statistic:,1892.0
Date:,"Sun, 08 May 2022",Prob (F-statistic):,0.0
Time:,11:57:15,Log-Likelihood:,-14468.0
No. Observations:,16197,AIC:,28970.0
Df Residuals:,16180,BIC:,29100.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9790,0.024,41.423,0.000,0.933,1.025
sqft_living,0.3301,0.008,39.854,0.000,0.314,0.346
sqft_lot,-0.0659,0.006,-11.384,0.000,-0.077,-0.055
floors,0.0817,0.007,12.101,0.000,0.068,0.095
yr_built,-0.2838,0.006,-45.379,0.000,-0.296,-0.272
waterfront,0.9704,0.059,16.586,0.000,0.856,1.085
view,0.2549,0.017,14.736,0.000,0.221,0.289
is_renovated,0.0743,0.027,2.750,0.006,0.021,0.127
has_basement,0.1176,0.012,10.096,0.000,0.095,0.140

0,1,2,3
Omnibus:,38.697,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.738
Skew:,-0.034,Prob(JB):,4.3e-11
Kurtosis:,3.257,Cond. No.,17.6


In [13]:
# Degree-2 polynomial expansion of all remaining predictors.
# The polynomial method will not be used in this analysis: the OLS summary
# for this design reports a very high condition number (about 1.29e+16).
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_poly_train = poly.transform(X_train)
X_poly_test = poly.transform(X_test)

dp.scores(X_poly_train, y_train, X_poly_test, y_test)

Train score:      0.6756152843579682
Validation score: 0.668807038828558
X-test score: 0.6555229266242255
R2 score: 0.6555229266242255
Mean**2 Error 0.5840866276161898


In [14]:
# OLS on the polynomial design matrix, to inspect its condition number.
# NOTE(review): PolynomialFeatures emits a bias column by default, so
# add_constant may leave the matrix unchanged here -- confirm.
design_poly = sm.add_constant(X_poly_train)
ols_poly = sm.OLS(y_train, design_poly)
ols_poly.fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.675
Model:,OLS,Adj. R-squared:,0.672
Method:,Least Squares,F-statistic:,264.8
Date:,"Sun, 08 May 2022",Prob (F-statistic):,0.0
Time:,11:57:16,Log-Likelihood:,-13907.0
No. Observations:,16197,AIC:,28070.0
Df Residuals:,16070,BIC:,29050.0
Df Model:,126,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6685,0.073,9.181,0.000,0.526,0.811
x1,0.5885,0.049,12.125,0.000,0.493,0.684
x2,-0.2336,0.026,-8.931,0.000,-0.285,-0.182
x3,-0.1739,0.036,-4.856,0.000,-0.244,-0.104
x4,-0.2922,0.039,-7.474,0.000,-0.369,-0.216
x5,0.9019,0.348,2.592,0.010,0.220,1.584
x6,0.1081,0.035,3.076,0.002,0.039,0.177
x7,0.2198,0.066,3.307,0.001,0.090,0.350
x8,-0.0078,0.029,-0.270,0.787,-0.064,0.049

0,1,2,3
Omnibus:,63.18,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,83.302
Skew:,-0.044,Prob(JB):,8.15e-19
Kurtosis:,3.34,Cond. No.,1.29e+16


In [15]:
# Add a single interaction term, int4 = sqft_living * yr_built, to both the
# training and the test predictors, then re-score the model.
for features in (X_train, X_test):
    features.loc[:, 'int4'] = features['sqft_living'].mul(features['yr_built'])

dp.scores(X_train, y_train, X_test, y_test)

Train score:      0.6532701480220614
Validation score: 0.6523981556506147
X-test score: 0.6352544653210463
R2 score: 0.6352544653210463
Mean**2 Error 0.601024386635919


In [16]:
# OLS including the int4 interaction term.
# Validation score rises slightly while the condition number rises only
# slightly as well (17.9 vs 17.6), so this is taken as the final model.
design_final = sm.add_constant(X_train)
ols_final = sm.OLS(y_train, design_final)
ols_final.fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.653
Model:,OLS,Adj. R-squared:,0.653
Method:,Least Squares,F-statistic:,1792.0
Date:,"Sun, 08 May 2022",Prob (F-statistic):,0.0
Time:,11:57:16,Log-Likelihood:,-14432.0
No. Observations:,16197,AIC:,28900.0
Df Residuals:,16179,BIC:,29040.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9193,0.025,37.368,0.000,0.871,0.968
sqft_living,0.3313,0.008,40.086,0.000,0.315,0.348
sqft_lot,-0.0679,0.006,-11.737,0.000,-0.079,-0.057
floors,0.0881,0.007,12.994,0.000,0.075,0.101
yr_built,-0.2792,0.006,-44.584,0.000,-0.292,-0.267
waterfront,0.9788,0.058,16.765,0.000,0.864,1.093
view,0.2604,0.017,15.076,0.000,0.227,0.294
is_renovated,0.1019,0.027,3.751,0.000,0.049,0.155
has_basement,0.1355,0.012,11.473,0.000,0.112,0.159

0,1,2,3
Omnibus:,48.702,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,61.796
Skew:,-0.039,Prob(JB):,3.81e-14
Kurtosis:,3.293,Cond. No.,17.9
