In [50]:
import re
import patsy
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale
import warnings
import pickle

In [51]:
def makeFormula(target, numeric=None, categoric=None, noInter = False, scale = False):
    """Build a formula string of the form ``target~term+term+...``.

    Parameters
    ----------
    target : str
        Name placed on the left-hand side of ``~``.
    numeric : sequence of str, optional
        Names emitted as plain terms, or as ``scale(name)`` when ``scale``
        is true. ``None`` or an empty sequence contributes no terms.
    categoric : sequence of str, optional
        Names emitted as ``C(name)``. ``None`` or an empty sequence
        contributes no terms.
    noInter : bool, default False
        When true, ``+0`` is appended to the formula.
    scale : bool, default False
        When true, every numeric name is wrapped in the text ``scale(...)``.
        Inside this function the parameter shadows any module-level name
        ``scale``; only the literal text is written into the formula.

    Returns
    -------
    str
        The assembled formula. The input sequences are not modified.
    """
    terms = []

    # `is not None` (rather than `!= None`) keeps the check a plain identity
    # test, so array-like inputs do not trigger element-wise comparison.
    if numeric is not None:
        if scale:
            terms.extend("scale(" + c + ")" for c in numeric)
        else:
            terms.extend(numeric)

    if categoric is not None:
        terms.extend("C(" + c + ")" for c in categoric)

    # Joining one flat list means an empty `numeric` or `categoric` can never
    # leave a leading or trailing "+" in the formula.
    formula = target + "~" + "+".join(terms)

    if noInter:
        formula += "+0"

    return formula

In [52]:
# Directory that holds the two input CSVs. It is an absolute, machine-specific
# path; edit this single constant to run the notebook elsewhere.
DATA_DIR = "C:/Users/user/Desktop/newww/house-prj"

# Train and test tables (presumably already imputed, judging by the
# "no_missing" file names -- confirm against the upstream notebook).
df_train = pd.read_csv("{}/train_no_missing.csv".format(DATA_DIR))
df_test = pd.read_csv("{}/test_no_missing.csv".format(DATA_DIR))

# Preprocessing

In [53]:
# Columns that are re-typed as strings in both tables so that get_dummies()
# below one-hot encodes them (it only encodes object/category columns).
nuTocaList = ["MSSubClass", "MoSold", "YrSold", "OverallQual", "OverallCond"]
df_train[nuTocaList] = df_train[nuTocaList].astype("str")
df_test[nuTocaList] = df_test[nuTocaList].astype("str")

# Encode train and test together so both halves share one set of dummy columns.
df_all = pd.concat([df_train, df_test])
df_all.reset_index(drop = True, inplace = True)

df_all = pd.get_dummies(df_all)

# Split back at the real length of the training table instead of a hard-coded
# row count, so the cell stays correct if the input files change.
n_train = len(df_train)

# reset_index()/drop() without inplace=True return new frames, so nothing is
# modified through an iloc slice of df_all (the in-place calls on the slices
# are what raised SettingWithCopyWarning).
df_train_dummy = df_all.iloc[:n_train].reset_index(drop=True)
df_test_dummy = df_all.iloc[n_train:].reset_index(drop=True)

# The target column only exists in the combined frame because of the concat;
# presumably it is absent from the test CSV -- remove it from the test half.
df_test_dummy = df_test_dummy.drop("SalePriceLog", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [54]:
# Sanity check: (rows, columns) of the raw and the one-hot encoded tables.
tuple(frame.shape for frame in (df_train, df_test, df_train_dummy, df_test_dummy))

((1460, 81), (1459, 80), (1460, 351), (1459, 350))

# Split into numerical and categorical data 

In [55]:
# dtypes that count as "numeric" for the column selection below.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Selected from df_test rather than df_train, so the list only contains
# columns that exist in the test table.
numeric_list = df_test.select_dtypes(include=numerics).columns.tolist()

print(len(numeric_list))

32


In [56]:
# Standardise every numeric training column except the "Id" column; each
# result is stored under "<original name>Scale".
df_scale_numeric = pd.DataFrame({
    name + "Scale": scale(df_train[name])
    for name in numeric_list
    if name != "Id"
})

# These two scaled columns are excluded from the feature set.
df_scale_numeric = df_scale_numeric.drop(["TotalBsmtSFScale", "GrLivAreaScale"], axis = 1)

# Feature names are captured before the target is attached, so the target
# never appears in scale_list.
scale_list = list(df_scale_numeric.columns)

df_scale_numeric["SalePriceLog"] = df_train["SalePriceLog"]



In [57]:
# Names of the string-typed (object) columns of the training table.
categoric_list = df_train.select_dtypes(include=['object']).columns.tolist()

# Every encoded column whose name contains an underscore is treated as a dummy.
categoric_dummy_list = df_train_dummy.filter(like="_").columns.tolist()

# Scaled numerics + target, side by side with the raw categoricals ...
df_scale_cate = pd.concat(
    [df_scale_numeric, df_train[categoric_list]],
    axis=1,
)
# ... and side by side with their one-hot encoded counterparts.
df_scale_cate_dummy = pd.concat(
    [df_scale_numeric, df_train_dummy[categoric_dummy_list]],
    axis=1,
)

df_scale_cate.shape, df_scale_cate_dummy.shape

((1460, 78), (1460, 348))

In [58]:
# Working list of candidate regressors: scaled numerics first, then dummies.
# The cells below shrink this list in place.
column_ls = list(scale_list)
column_ls.extend(categoric_dummy_list)

# Modeling

- scale numerical data + categorical dummy data
- test score : 0.1498

In [59]:
# Baseline fit: every column in column_ls as a plain term, with "+0" appended
# to the formula (noInter=True).
model = sm.OLS.from_formula(
    makeFormula("SalePriceLog", numeric=column_ls, categoric=None, noInter=True),
    data=df_scale_cate_dummy,
)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.935
Method:,Least Squares,F-statistic:,73.74
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:47,Log-Likelihood:,1423.3
No. Observations:,1460,AIC:,-2269.0
Df Residuals:,1171,BIC:,-740.8
Df Model:,288,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.0893,0.010,9.140,0.000,0.070,0.108
SecondFlrSFScale,0.0986,0.012,8.521,0.000,0.076,0.121
ThreeSsnPorchScale,0.0047,0.003,1.572,0.116,-0.001,0.011
BedroomAbvGrScale,0.0019,0.005,0.366,0.715,-0.008,0.012
BsmtFinSF1Scale,0.0617,0.011,5.732,0.000,0.041,0.083
BsmtFinSF2Scale,0.0205,0.007,3.139,0.002,0.008,0.033
BsmtFullBathScale,0.0127,0.005,2.702,0.007,0.003,0.022
BsmtHalfBathScale,0.0030,0.003,0.912,0.362,-0.003,0.010
BsmtUnfSFScale,0.0363,0.009,3.835,0.000,0.018,0.055

0,1,2,3
Omnibus:,398.481,Durbin-Watson:,1.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5949.495
Skew:,-0.851,Prob(JB):,0.0
Kurtosis:,12.742,Cond. No.,1.17e+16


## trial 1
- drop every variable whose p-value in the previous fit is above 0.01
- test score: 0.1449

In [60]:
# Terms of the previous fit whose p-value is above the 0.01 cut-off.
# NOTE: expects `result` to be the baseline fit; re-run the baseline cell
# first if `result` has been rebound since.
over_pvalues = list(result.pvalues[result.pvalues > 0.01].index)

# Prune column_ls in place. The membership guard keeps the cell re-runnable:
# a bare list.remove() raises ValueError once a term is already gone.
for c in over_pvalues:
    if c in column_ls:
        column_ls.remove(c)

model1 = sm.OLS.from_formula(makeFormula("SalePriceLog", column_ls, None, True),df_scale_cate_dummy)
result1 = model1.fit()
result1.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.941
Model:,OLS,Adj. R-squared:,0.931
Method:,Least Squares,F-statistic:,98.49
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:48,Log-Likelihood:,1333.4
No. Observations:,1460,AIC:,-2259.0
Df Residuals:,1256,BIC:,-1180.0
Df Model:,203,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.0968,0.008,11.487,0.000,0.080,0.113
SecondFlrSFScale,0.1043,0.009,11.504,0.000,0.087,0.122
BsmtFinSF1Scale,0.0615,0.010,5.928,0.000,0.041,0.082
BsmtFinSF2Scale,0.0242,0.006,3.766,0.000,0.012,0.037
BsmtFullBathScale,0.0104,0.004,2.376,0.018,0.002,0.019
BsmtUnfSFScale,0.0357,0.009,3.953,0.000,0.018,0.053
FullBathScale,0.0177,0.005,3.347,0.001,0.007,0.028
GarageAreaScale,0.0349,0.005,6.815,0.000,0.025,0.045
HalfBathScale,0.0145,0.005,3.083,0.002,0.005,0.024

0,1,2,3
Omnibus:,346.982,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4472.659
Skew:,-0.729,Prob(JB):,0.0
Kurtosis:,11.45,Cond. No.,1.22e+16


## trial2
- test score: 0.1441

In [61]:
# Terms of the trial-1 fit whose p-value is above the 0.01 cut-off.
over_pvalues = list(result1.pvalues[result1.pvalues > 0.01].index)

# Prune column_ls in place. The membership guard keeps the cell re-runnable:
# a bare list.remove() raises ValueError once a term is already gone.
for c in over_pvalues:
    if c in column_ls:
        column_ls.remove(c)

model2 = sm.OLS.from_formula(makeFormula("SalePriceLog", column_ls, None, True),df_scale_cate_dummy)
result2 = model2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.932
Model:,OLS,Adj. R-squared:,0.925
Method:,Least Squares,F-statistic:,127.4
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:48,Log-Likelihood:,1232.8
No. Observations:,1460,AIC:,-2180.0
Df Residuals:,1317,BIC:,-1424.0
Df Model:,142,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.0984,0.008,12.322,0.000,0.083,0.114
SecondFlrSFScale,0.1099,0.006,18.836,0.000,0.098,0.121
BsmtFinSF1Scale,0.0777,0.010,8.067,0.000,0.059,0.097
BsmtFinSF2Scale,0.0294,0.006,4.580,0.000,0.017,0.042
BsmtUnfSFScale,0.0442,0.009,5.103,0.000,0.027,0.061
FullBathScale,0.0103,0.005,1.980,0.048,9.57e-05,0.021
GarageAreaScale,0.0339,0.005,6.864,0.000,0.024,0.044
HalfBathScale,0.0098,0.005,2.161,0.031,0.001,0.019
LotAreaScale,0.0300,0.005,6.500,0.000,0.021,0.039

0,1,2,3
Omnibus:,324.983,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4436.325
Skew:,-0.636,Prob(JB):,0.0
Kurtosis:,11.445,Cond. No.,1.3e+16


## trial3
- test score: 0.1398

In [62]:
# Terms of the trial-2 fit whose p-value is above the 0.01 cut-off.
over_pvalues = list(result2.pvalues[result2.pvalues > 0.01].index)

# Prune column_ls in place. The membership guard keeps the cell re-runnable:
# a bare list.remove() raises ValueError once a term is already gone.
for c in over_pvalues:
    if c in column_ls:
        column_ls.remove(c)

model3 = sm.OLS.from_formula(makeFormula("SalePriceLog", column_ls, None, True),df_scale_cate_dummy)
result3 = model3.fit()
result3.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.931
Model:,OLS,Adj. R-squared:,0.924
Method:,Least Squares,F-statistic:,132.9
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:49,Log-Likelihood:,1217.6
No. Observations:,1460,AIC:,-2165.0
Df Residuals:,1325,BIC:,-1452.0
Df Model:,134,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.1045,0.008,13.540,0.000,0.089,0.120
SecondFlrSFScale,0.1202,0.004,29.306,0.000,0.112,0.128
BsmtFinSF1Scale,0.0743,0.010,7.698,0.000,0.055,0.093
BsmtFinSF2Scale,0.0289,0.006,4.491,0.000,0.016,0.042
BsmtUnfSFScale,0.0413,0.009,4.755,0.000,0.024,0.058
GarageAreaScale,0.0342,0.005,6.904,0.000,0.024,0.044
LotAreaScale,0.0278,0.005,6.058,0.000,0.019,0.037
ScreenPorchScale,0.0141,0.003,4.504,0.000,0.008,0.020
WoodDeckSFScale,0.0108,0.003,3.203,0.001,0.004,0.017

0,1,2,3
Omnibus:,319.651,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4239.177
Skew:,-0.627,Prob(JB):,0.0
Kurtosis:,11.253,Cond. No.,1.26e+16


## trial4
- test score: 0.1395

In [63]:
# Terms of the trial-3 fit whose p-value is above the 0.01 cut-off.
over_pvalues = list(result3.pvalues[result3.pvalues > 0.01].index)

# Prune column_ls in place. The membership guard keeps the cell re-runnable:
# a bare list.remove() raises ValueError once a term is already gone.
for c in over_pvalues:
    if c in column_ls:
        column_ls.remove(c)

model4 = sm.OLS.from_formula(makeFormula("SalePriceLog", column_ls, None, True),df_scale_cate_dummy)
result4 = model4.fit()
result4.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.93
Model:,OLS,Adj. R-squared:,0.923
Method:,Least Squares,F-statistic:,136.7
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:49,Log-Likelihood:,1208.7
No. Observations:,1460,AIC:,-2157.0
Df Residuals:,1330,BIC:,-1470.0
Df Model:,129,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.1032,0.008,13.356,0.000,0.088,0.118
SecondFlrSFScale,0.1214,0.004,29.673,0.000,0.113,0.129
BsmtFinSF1Scale,0.0749,0.010,7.737,0.000,0.056,0.094
BsmtFinSF2Scale,0.0296,0.006,4.584,0.000,0.017,0.042
BsmtUnfSFScale,0.0423,0.009,4.868,0.000,0.025,0.059
GarageAreaScale,0.0334,0.005,6.727,0.000,0.024,0.043
LotAreaScale,0.0275,0.005,5.962,0.000,0.018,0.037
ScreenPorchScale,0.0134,0.003,4.286,0.000,0.007,0.020
WoodDeckSFScale,0.0104,0.003,3.082,0.002,0.004,0.017

0,1,2,3
Omnibus:,317.96,Durbin-Watson:,1.936
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4055.524
Skew:,-0.634,Prob(JB):,0.0
Kurtosis:,11.066,Cond. No.,1.28e+16


## trial5
- drop the categorical variables whose one-way ANOVA p-value (`PR(>F)`, stored in the column named `Fvalue`) is above 0.01
- test score: 0.1353

In [64]:
# One single-term OLS fit of SalePriceLog per categorical column, each
# summarised with anova_lm.
# NOTE: the "Fvalue" column stores anova_lm's 'PR(>F)' entry, i.e. a p-value,
# not the F statistic. The name is kept because later cells filter on it.
anova_rows = []
for c in categoric_list:
    # Local names on purpose: the notebook-level `model` / `result` are read
    # by other cells and must not be rebound here.
    anova_fit = sm.OLS.from_formula("SalePriceLog ~ C({})".format(c), data=df_train).fit()
    anova_table = sm.stats.anova_lm(anova_fit)
    # .iloc[0] reads the first row explicitly by position; plain [0] on this
    # label-indexed Series relies on a deprecated positional fallback.
    anova_rows.append({"name": c, "Fvalue": anova_table['PR(>F)'].iloc[0]})

# Build the frame once from the collected records instead of growing it
# row by row, then order it from the largest p-value to the smallest.
df_one_anova_Fvalue = pd.DataFrame(anova_rows, columns = ["name", "Fvalue"])
df_one_anova_Fvalue = (
    df_one_anova_Fvalue
    .sort_values(by=["Fvalue"], ascending=False)
    .reset_index(drop=True)
)
df_one_anova_Fvalue

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Unnamed: 0,name,Fvalue
0,Utilities,0.6296086
1,YrSold,0.5656386
2,MoSold,0.4496614
3,LandSlope,0.3388339
4,Street,0.02837965
5,Condition2,0.01382043
6,PoolQC,0.007695369
7,MiscFeature,0.006060004
8,RoofMatl,0.0004504077
9,Functional,4.250494e-06


In [65]:
# Categorical columns whose stored ANOVA value (a p-value, despite the
# "Fvalue" column name) is above the 0.01 cut-off.
over_one_Fvalue_list = list(df_one_anova_Fvalue[df_one_anova_Fvalue["Fvalue"] > 0.01]["name"])

# Expand each of those columns into the names of its dummy columns.
# A prefix test is used instead of an unanchored regex search, so "<name>_"
# cannot match in the middle of an unrelated column name and characters in
# the name are never interpreted as regex syntax.
over_one_Fvalue_dummy_list=[]
for c in over_one_Fvalue_list:
    prefix = c+"_"
    over_one_Fvalue_dummy_list.extend(
        col for col in df_test_dummy.columns if str(col).startswith(prefix)
    )

In [66]:
# Remove each listed dummy column from column_ls, skipping names that are
# not (or no longer) in the list.
for dummy_name in over_one_Fvalue_dummy_list:
    try:
        column_ls.remove(dummy_name)
    except ValueError:
        # list.remove() raises ValueError exactly when the name is absent.
        pass

In [67]:
# Refit on the columns that remain in column_ls after the pruning above.
model5 = sm.OLS.from_formula(
    makeFormula("SalePriceLog", numeric=column_ls, categoric=None, noInter=True),
    data=df_scale_cate_dummy,
)
result5 = model5.fit()
result5.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.922
Model:,OLS,Adj. R-squared:,0.916
Method:,Least Squares,F-statistic:,146.9
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:05:50,Log-Likelihood:,1133.0
No. Observations:,1460,AIC:,-2046.0
Df Residuals:,1350,BIC:,-1464.0
Df Model:,109,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.1036,0.008,12.960,0.000,0.088,0.119
SecondFlrSFScale,0.1213,0.004,28.764,0.000,0.113,0.130
BsmtFinSF1Scale,0.0686,0.010,6.866,0.000,0.049,0.088
BsmtFinSF2Scale,0.0282,0.007,4.214,0.000,0.015,0.041
BsmtUnfSFScale,0.0400,0.009,4.451,0.000,0.022,0.058
GarageAreaScale,0.0325,0.005,6.337,0.000,0.022,0.043
LotAreaScale,0.0168,0.004,4.255,0.000,0.009,0.025
ScreenPorchScale,0.0136,0.003,4.222,0.000,0.007,0.020
WoodDeckSFScale,0.0105,0.004,2.991,0.003,0.004,0.017

0,1,2,3
Omnibus:,909.603,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44108.377
Skew:,-2.231,Prob(JB):,0.0
Kurtosis:,29.555,Cond. No.,1.29e+16


## trial6
- drop the individual dummy (category-level) columns whose one-way ANOVA p-value (`PR(>F)`, stored in the column named `Fvalue`) is above 0.01
- test score: 0.1418

In [73]:
# One single-term OLS fit of SalePriceLog per dummy column, each summarised
# with anova_lm.
# NOTE: the "Fvalue" column stores anova_lm's 'PR(>F)' entry, i.e. a p-value,
# not the F statistic. The name is kept because later cells filter on it.
anova_class_rows = []
for c in categoric_dummy_list:
    # Local names on purpose: the notebook-level `model` / `result` are read
    # by other cells and must not be rebound here.
    anova_fit = sm.OLS.from_formula("SalePriceLog ~ C({})".format(c), data=df_train_dummy).fit()
    anova_table = sm.stats.anova_lm(anova_fit)
    # .iloc[0] reads the first row explicitly by position; plain [0] on this
    # label-indexed Series relies on a deprecated positional fallback.
    anova_class_rows.append({"name": c, "Fvalue": anova_table['PR(>F)'].iloc[0]})

# Build the frame once from the collected records instead of growing it
# row by row, then order it from the largest p-value to the smallest.
df_one_anova_class_Fvalue = pd.DataFrame(anova_class_rows, columns = ["name", "Fvalue"])
df_one_anova_class_Fvalue = (
    df_one_anova_class_Fvalue
    .sort_values(by=["Fvalue"], ascending=False)
    .reset_index(drop=True)
)
df_one_anova_class_Fvalue

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Unnamed: 0,name,Fvalue
0,MSSubClass_80,9.654636e-01
1,MiscFeature_Gar2,9.505640e-01
2,BsmtFinType2_GLQ,9.286725e-01
3,MoSold_10,9.240021e-01
4,RoofMatl_ClyTile,9.180190e-01
5,SaleType_ConLI,9.070738e-01
6,LotConfig_Corner,8.790832e-01
7,MoSold_3,8.707371e-01
8,MoSold_6,8.571153e-01
9,LotConfig_FR2,8.521263e-01


In [74]:
# Dummy columns whose stored ANOVA value (a p-value, despite the "Fvalue"
# column name) is above the 0.01 cut-off.
is_over_cutoff = df_one_anova_class_Fvalue["Fvalue"] > 0.01
over_one_FvalueClass_list = df_one_anova_class_Fvalue.loc[is_over_cutoff, "name"].tolist()

# Remove them from column_ls, skipping names that are no longer in the list.
for dummy_name in over_one_FvalueClass_list:
    try:
        column_ls.remove(dummy_name)
    except ValueError:
        # list.remove() raises ValueError exactly when the name is absent.
        pass

In [75]:
# Refit on the columns that remain in column_ls after the pruning above.
model6 = sm.OLS.from_formula(
    makeFormula("SalePriceLog", numeric=column_ls, categoric=None, noInter=True),
    data=df_scale_cate_dummy,
)
result6 = model6.fit()
result6.summary()

0,1,2,3
Dep. Variable:,SalePriceLog,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.891
Method:,Least Squares,F-statistic:,134.5
Date:,"Mon, 12 Mar 2018",Prob (F-statistic):,0.0
Time:,21:20:35,Log-Likelihood:,930.23
No. Observations:,1460,AIC:,-1680.0
Df Residuals:,1370,BIC:,-1205.0
Df Model:,89,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FirstFlrSFScale,0.1150,0.009,13.073,0.000,0.098,0.132
SecondFlrSFScale,0.1089,0.005,23.189,0.000,0.100,0.118
BsmtFinSF1Scale,0.0040,0.010,0.386,0.700,-0.016,0.024
BsmtFinSF2Scale,0.0183,0.007,2.637,0.008,0.005,0.032
BsmtUnfSFScale,0.0013,0.010,0.133,0.894,-0.018,0.020
GarageAreaScale,0.0334,0.006,5.777,0.000,0.022,0.045
LotAreaScale,0.0203,0.004,4.748,0.000,0.012,0.029
ScreenPorchScale,0.0155,0.004,4.224,0.000,0.008,0.023
WoodDeckSFScale,0.0123,0.004,3.088,0.002,0.004,0.020

0,1,2,3
Omnibus:,922.409,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38710.81
Skew:,-2.328,Prob(JB):,0.0
Kurtosis:,27.792,Cond. No.,1.32e+16
