### Run regression on internal and external validation scores

In [51]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels
import statsmodels.formula.api as smf

In [52]:
# Load both score tables in one step: internal validation scores first,
# then the external (housing) validation scores.
scores_i, scores_e = map(pd.read_csv, ('scores_cluster2.csv', 'housing_scores2.csv'))

In [53]:
# Join internal and external scores on the shared configuration keys.
# how='inner' is the pandas default, spelled out so the join type is explicit:
# only (num_clus, num_feats, scale) combinations present in both tables are kept.
merged_df = scores_i.merge(
    scores_e,
    how='inner',
    on=['num_clus', 'num_feats', 'scale'],
)

In [54]:
# Add a squared copy of each predictor (raw, uncentered) so the quadratic
# models below can reference them by name in their formulas.
merged_df['scale_sq'] = merged_df['scale'].pow(2)
merged_df['num_clus_sq'] = merged_df['num_clus'].pow(2)
merged_df['num_feats_sq'] = merged_df['num_feats'].pow(2)

### Fit linear regression

In [55]:
# Silhouette score (sil_score): linear OLS on scale, number of clusters
# and number of features.
mod1 = smf.ols(
    data=merged_df,
    formula='sil_score ~ scale + num_clus + num_feats',
)
res1 = mod1.fit()
# Last expression of the cell: renders the fitted-model summary table.
res1.summary()

0,1,2,3
Dep. Variable:,sil_score,R-squared:,0.76
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,164.8
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,3.72e-48
Time:,14:45:49,Log-Likelihood:,335.39
No. Observations:,160,AIC:,-662.8
Df Residuals:,156,BIC:,-650.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3159,0.007,45.748,0.000,0.302,0.330
scale,8.922e-05,2.13e-05,4.192,0.000,4.72e-05,0.000
num_clus,-0.0062,0.000,-21.589,0.000,-0.007,-0.006
num_feats,-1.106e-05,3.35e-06,-3.301,0.001,-1.77e-05,-4.44e-06

0,1,2,3
Omnibus:,7.412,Durbin-Watson:,0.863
Prob(Omnibus):,0.025,Jarque-Bera (JB):,11.457
Skew:,0.194,Prob(JB):,0.00325
Kurtosis:,4.252,Cond. No.,3360.0


In [56]:
# DB score (db_score): linear OLS on scale, number of clusters
# and number of features.
mod2 = smf.ols(
    data=merged_df,
    formula='db_score ~ scale + num_clus + num_feats',
)
res2 = mod2.fit()
# Last expression of the cell: renders the fitted-model summary table.
res2.summary()

0,1,2,3
Dep. Variable:,db_score,R-squared:,0.55
Model:,OLS,Adj. R-squared:,0.541
Method:,Least Squares,F-statistic:,63.56
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,6.66e-27
Time:,14:45:50,Log-Likelihood:,103.25
No. Observations:,160,AIC:,-198.5
Df Residuals:,156,BIC:,-186.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4357,0.029,48.718,0.000,1.377,1.494
scale,-0.0011,9.08e-05,-12.336,0.000,-0.001,-0.001
num_clus,-0.0059,0.001,-4.789,0.000,-0.008,-0.003
num_feats,5.639e-05,1.43e-05,3.944,0.000,2.81e-05,8.46e-05

0,1,2,3
Omnibus:,0.952,Durbin-Watson:,1.152
Prob(Omnibus):,0.621,Jarque-Bera (JB):,0.997
Skew:,-0.079,Prob(JB):,0.607
Kurtosis:,2.647,Cond. No.,3360.0


In [57]:
# CH score (cal_score): linear OLS on scale, number of clusters
# and number of features.
mod3 = smf.ols(
    data=merged_df,
    formula='cal_score ~ scale + num_clus + num_feats',
)
res3 = mod3.fit()
# Last expression of the cell: renders the fitted-model summary table.
res3.summary()

0,1,2,3
Dep. Variable:,cal_score,R-squared:,0.404
Model:,OLS,Adj. R-squared:,0.393
Method:,Least Squares,F-statistic:,35.24
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.9e-17
Time:,14:45:51,Log-Likelihood:,-1310.0
No. Observations:,160,AIC:,2628.0
Df Residuals:,156,BIC:,2640.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2156.5713,202.029,10.675,0.000,1757.507,2555.636
scale,-5.9934,0.623,-9.625,0.000,-7.223,-4.763
num_clus,-30.4412,8.417,-3.617,0.000,-47.067,-13.815
num_feats,-0.0073,0.098,-0.074,0.941,-0.201,0.186

0,1,2,3
Omnibus:,119.373,Durbin-Watson:,0.317
Prob(Omnibus):,0.0,Jarque-Bera (JB):,967.474
Skew:,2.769,Prob(JB):,8.23e-211
Kurtosis:,13.699,Cond. No.,3360.0


In [58]:
# House-price score (std_price_divided): linear OLS on scale,
# number of clusters and number of features.
mod4 = smf.ols(
    data=merged_df,
    formula='std_price_divided ~ scale + num_clus + num_feats',
)
res4 = mod4.fit()
# Last expression of the cell: renders the fitted-model summary table.
res4.summary()

0,1,2,3
Dep. Variable:,std_price_divided,R-squared:,0.709
Model:,OLS,Adj. R-squared:,0.703
Method:,Least Squares,F-statistic:,126.4
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.47e-41
Time:,14:45:52,Log-Likelihood:,-1866.2
No. Observations:,160,AIC:,3740.0
Df Residuals:,156,BIC:,3753.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.422e+05,6533.987,52.374,0.000,3.29e+05,3.55e+05
scale,-93.6243,20.139,-4.649,0.000,-133.404,-53.845
num_clus,-5123.8890,272.219,-18.823,0.000,-5661.600,-4586.178
num_feats,-5.7477,3.170,-1.813,0.072,-12.010,0.515

0,1,2,3
Omnibus:,1.714,Durbin-Watson:,0.942
Prob(Omnibus):,0.424,Jarque-Bera (JB):,1.517
Skew:,-0.106,Prob(JB):,0.468
Kurtosis:,2.573,Cond. No.,3360.0


In [59]:
# House-area score (std_area_divided): linear OLS on scale,
# number of clusters and number of features.
mod5 = smf.ols(
    data=merged_df,
    formula='std_area_divided ~ scale + num_clus + num_feats',
)
res5 = mod5.fit()
# Last expression of the cell: renders the fitted-model summary table.
res5.summary()

0,1,2,3
Dep. Variable:,std_area_divided,R-squared:,0.607
Model:,OLS,Adj. R-squared:,0.6
Method:,Least Squares,F-statistic:,80.41
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.7100000000000001e-31
Time:,14:45:53,Log-Likelihood:,-497.77
No. Observations:,160,AIC:,1004.0
Df Residuals:,156,BIC:,1016.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,64.3912,1.261,51.062,0.000,61.900,66.882
scale,-0.0053,0.004,-1.368,0.173,-0.013,0.002
num_clus,-0.8088,0.053,-15.394,0.000,-0.913,-0.705
num_feats,-0.0009,0.001,-1.535,0.127,-0.002,0.000

0,1,2,3
Omnibus:,3.179,Durbin-Watson:,1.014
Prob(Omnibus):,0.204,Jarque-Bera (JB):,3.28
Skew:,0.112,Prob(JB):,0.194
Kurtosis:,3.665,Cond. No.,3360.0


In [60]:
# House-age score (std_conyr_divided): linear OLS on scale,
# number of clusters and number of features.
mod6 = smf.ols(
    data=merged_df,
    formula='std_conyr_divided ~ scale + num_clus + num_feats',
)
res6 = mod6.fit()
# Last expression of the cell: renders the fitted-model summary table.
res6.summary()

0,1,2,3
Dep. Variable:,std_conyr_divided,R-squared:,0.769
Model:,OLS,Adj. R-squared:,0.765
Method:,Least Squares,F-statistic:,173.3
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.89e-49
Time:,14:45:53,Log-Likelihood:,-311.58
No. Observations:,160,AIC:,631.2
Df Residuals:,156,BIC:,643.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.8804,0.394,58.091,0.000,22.102,23.658
scale,-0.0018,0.001,-1.510,0.133,-0.004,0.001
num_clus,-0.3730,0.016,-22.729,0.000,-0.405,-0.341
num_feats,-0.0002,0.000,-1.004,0.317,-0.001,0.000

0,1,2,3
Omnibus:,1.935,Durbin-Watson:,0.812
Prob(Omnibus):,0.38,Jarque-Bera (JB):,1.959
Skew:,-0.216,Prob(JB):,0.375
Kurtosis:,2.673,Cond. No.,3360.0


### Fit linear regression with quadratic terms

In [61]:
# DB score (db_score): OLS with linear and squared terms for scale,
# number of clusters and number of features.
mod7 = smf.ols(
    data=merged_df,
    formula='db_score ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res7 = mod7.fit()
# Last expression of the cell: renders the fitted-model summary table.
res7.summary()

0,1,2,3
Dep. Variable:,db_score,R-squared:,0.585
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,36.0
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,5.9500000000000004e-27
Time:,14:45:56,Log-Likelihood:,109.8
No. Observations:,160,AIC:,-205.6
Df Residuals:,153,BIC:,-184.1
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.3342,0.047,28.204,0.000,1.241,1.428
scale,-0.0003,0.000,-0.784,0.434,-0.001,0.000
scale_sq,-2.107e-06,9.33e-07,-2.259,0.025,-3.95e-06,-2.64e-07
num_clus,-0.0057,0.005,-1.059,0.291,-0.016,0.005
num_clus_sq,-6.223e-06,0.000,-0.033,0.974,-0.000,0.000
num_feats,0.0002,5.72e-05,3.723,0.000,9.99e-05,0.000
num_feats_sq,-7.196e-08,2.55e-08,-2.821,0.005,-1.22e-07,-2.16e-08

0,1,2,3
Omnibus:,1.721,Durbin-Watson:,1.132
Prob(Omnibus):,0.423,Jarque-Bera (JB):,1.688
Skew:,0.247,Prob(JB):,0.43
Kurtosis:,2.903,Cond. No.,9940000.0


In [62]:
# Silhouette score (sil_score): OLS with linear and squared terms for scale,
# number of clusters and number of features.
mod8 = smf.ols(
    data=merged_df,
    formula='sil_score ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res8 = mod8.fit()
# Last expression of the cell: renders the fitted-model summary table.
res8.summary()

0,1,2,3
Dep. Variable:,sil_score,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.871
Method:,Least Squares,F-statistic:,179.8
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.1700000000000001e-66
Time:,14:45:56,Log-Likelihood:,388.03
No. Observations:,160,AIC:,-762.1
Df Residuals:,153,BIC:,-740.5
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3821,0.008,45.973,0.000,0.366,0.399
scale,-5.572e-05,6.61e-05,-0.843,0.400,-0.000,7.48e-05
scale_sq,3.699e-07,1.64e-07,2.256,0.025,4.6e-08,6.94e-07
num_clus,-0.0166,0.001,-17.553,0.000,-0.018,-0.015
num_clus_sq,0.0004,3.3e-05,11.267,0.000,0.000,0.000
num_feats,-4.245e-05,1e-05,-4.226,0.000,-6.23e-05,-2.26e-05
num_feats_sq,1.443e-08,4.48e-09,3.221,0.002,5.58e-09,2.33e-08

0,1,2,3
Omnibus:,15.446,Durbin-Watson:,1.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33.237
Skew:,0.384,Prob(JB):,6.06e-08
Kurtosis:,5.097,Cond. No.,9940000.0


In [63]:
# CH score (cal_score): OLS with linear and squared terms for scale,
# number of clusters and number of features.
mod9 = smf.ols(
    data=merged_df,
    formula='cal_score ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res9 = mod9.fit()
# Last expression of the cell: renders the fitted-model summary table.
res9.summary()

0,1,2,3
Dep. Variable:,cal_score,R-squared:,0.629
Model:,OLS,Adj. R-squared:,0.614
Method:,Least Squares,F-statistic:,43.17
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,1.4899999999999998e-30
Time:,14:45:57,Log-Likelihood:,-1272.1
No. Observations:,160,AIC:,2558.0
Df Residuals:,153,BIC:,2580.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3737.2027,266.685,14.014,0.000,3210.343,4264.062
scale,-25.3817,2.120,-11.974,0.000,-29.570,-21.194
scale_sq,0.0495,0.005,9.408,0.000,0.039,0.060
num_clus,-89.2711,30.362,-2.940,0.004,-149.254,-29.288
num_clus_sq,2.1011,1.058,1.987,0.049,0.012,4.190
num_feats,-0.1202,0.322,-0.373,0.710,-0.757,0.516
num_feats_sq,5.194e-05,0.000,0.361,0.718,-0.000,0.000

0,1,2,3
Omnibus:,116.657,Durbin-Watson:,0.424
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1012.258
Skew:,2.642,Prob(JB):,1.5499999999999999e-220
Kurtosis:,14.132,Cond. No.,9940000.0


In [64]:
# House-price score (std_price_divided): OLS with linear and squared terms
# for scale, number of clusters and number of features.
mod10 = smf.ols(
    data=merged_df,
    formula='std_price_divided ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res10 = mod10.fit()
# Last expression of the cell: renders the fitted-model summary table.
res10.summary()

0,1,2,3
Dep. Variable:,std_price_divided,R-squared:,0.813
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,111.0
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,3.65e-53
Time:,14:45:58,Log-Likelihood:,-1830.6
No. Observations:,160,AIC:,3675.0
Df Residuals:,153,BIC:,3697.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.005e+05,8747.204,45.782,0.000,3.83e+05,4.18e+05
scale,-356.0658,69.529,-5.121,0.000,-493.427,-218.705
scale_sq,0.6697,0.172,3.882,0.000,0.329,1.011
num_clus,-1.317e+04,995.864,-13.229,0.000,-1.51e+04,-1.12e+04
num_clus_sq,287.5149,34.688,8.289,0.000,218.986,356.043
num_feats,-20.2416,10.570,-1.915,0.057,-41.124,0.640
num_feats_sq,0.0067,0.005,1.413,0.160,-0.003,0.016

0,1,2,3
Omnibus:,0.385,Durbin-Watson:,1.171
Prob(Omnibus):,0.825,Jarque-Bera (JB):,0.539
Skew:,-0.068,Prob(JB):,0.764
Kurtosis:,2.75,Cond. No.,9940000.0


In [65]:
# House-area score (std_area_divided): OLS with linear and squared terms
# for scale, number of clusters and number of features.
mod11 = smf.ols(
    data=merged_df,
    formula='std_area_divided ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res11 = mod11.fit()
# Last expression of the cell: renders the fitted-model summary table.
res11.summary()

0,1,2,3
Dep. Variable:,std_area_divided,R-squared:,0.694
Model:,OLS,Adj. R-squared:,0.682
Method:,Least Squares,F-statistic:,57.81
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,6.91e-37
Time:,14:45:59,Log-Likelihood:,-477.83
No. Observations:,160,AIC:,969.7
Df Residuals:,153,BIC:,991.2
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,73.0150,1.862,39.215,0.000,69.337,76.693
scale,-0.0378,0.015,-2.551,0.012,-0.067,-0.009
scale_sq,8.277e-05,3.67e-05,2.254,0.026,1.02e-05,0.000
num_clus,-2.0624,0.212,-9.730,0.000,-2.481,-1.644
num_clus_sq,0.0448,0.007,6.064,0.000,0.030,0.059
num_feats,-0.0036,0.002,-1.584,0.115,-0.008,0.001
num_feats_sq,1.207e-06,1e-06,1.203,0.231,-7.76e-07,3.19e-06

0,1,2,3
Omnibus:,0.294,Durbin-Watson:,1.115
Prob(Omnibus):,0.863,Jarque-Bera (JB):,0.149
Skew:,0.071,Prob(JB):,0.928
Kurtosis:,3.044,Cond. No.,9940000.0


In [66]:
# House-age score (std_conyr_divided): OLS with linear and squared terms
# for scale, number of clusters and number of features.
mod12 = smf.ols(
    data=merged_df,
    formula='std_conyr_divided ~ scale + scale_sq + num_clus + num_clus_sq + num_feats + num_feats_sq',
)
res12 = mod12.fit()
# Last expression of the cell: renders the fitted-model summary table.
res12.summary()

0,1,2,3
Dep. Variable:,std_conyr_divided,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.877
Method:,Least Squares,F-statistic:,189.2
Date:,"Tue, 27 Feb 2024",Prob (F-statistic):,3.93e-68
Time:,14:46:00,Log-Likelihood:,-258.44
No. Observations:,160,AIC:,530.9
Df Residuals:,153,BIC:,552.4
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.0017,0.473,57.140,0.000,26.068,27.935
scale,-0.0214,0.004,-5.705,0.000,-0.029,-0.014
scale_sq,5.001e-05,9.32e-06,5.367,0.000,3.16e-05,6.84e-05
num_clus,-0.9287,0.054,-17.262,0.000,-1.035,-0.822
num_clus_sq,0.0198,0.002,10.591,0.000,0.016,0.024
num_feats,-0.0012,0.001,-2.102,0.037,-0.002,-7.19e-05
num_feats_sq,4.637e-07,2.55e-07,1.820,0.071,-3.97e-08,9.67e-07

0,1,2,3
Omnibus:,0.25,Durbin-Watson:,1.23
Prob(Omnibus):,0.882,Jarque-Bera (JB):,0.229
Skew:,-0.09,Prob(JB):,0.892
Kurtosis:,2.953,Cond. No.,9940000.0
