In [1]:
import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict,cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split

pd.options.display.max_columns = None
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [17]:
housing = pd.read_csv('data/Ames_Housing_Price_Data.csv', index_col = 0)
real_estate = pd.read_csv('data/Ames_Real_Estate_Data.csv')
housing_df = pd.merge(housing,real_estate,left_on='PID',right_on='MapRefNo',how='inner')

house_df = pd.read_csv('data/final_df.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
# Checking house price dataset, contains 2580 observations
housing.shape

(2580, 91)

In [19]:
# Checking final_df after generating geoCode, found duplicated observations
house_df.shape

(2603, 179)

In [20]:
# Test cell, ignored! 
house_df_test = house_df.copy()
len(house_df_test.drop_duplicates(subset=['PID'],keep = 'first'))

2558

In [21]:
# drop duplicated observations
house_df = house_df.drop_duplicates(subset=['PID'],keep = 'first')
house_df.shape

(2558, 179)

In [22]:
len(house_df[house_df['YrSold']==2010])

314

## Yi's Feature engineer

In [23]:
# calculate age of building
house_df['BldgAge'] = house_df['YrSold'] - house_df['YearBuilt']

# convert MSSubClass to str
house_df['MSSubClass'] = house_df[['MSSubClass']].astype('str')
house_df['TxD_S'] = house_df[['TxD_S']].astype('str')
house_df['SchD_S'] = house_df[['SchD_S']].astype('str')

# convert NA's to 0s in LotFrontage
house_df['LotFrontage'] = np.where(np.isnan(house_df['LotFrontage']), 0, house_df['LotFrontage'])

# binarize YearRemodAdd
house_df['Remodeled'] = np.where(house_df['YearRemodAdd'] == house_df['YearBuilt'], 0, 1)

# binarize Alley
house_df['Alley'] = np.where(pd.isnull(house_df['Alley']), 0, 1)

# binarize MSSubClass to PUD or not PUD
house_df['IsPUD'] = np.where(house_df['MSSubClass'].isin(['120','150','160','180']), 1, 0)

# binarize LotShape to Reg or not Reg
house_df['LotIsReg'] = np.where(house_df['LotShape']=='Reg', 1, 0)

# binarize Condition1/2 to positive feature or no positive feature
house_df['PosFeat'] = np.where(house_df['Condition1'].isin(['PosN','PosA'])|\
                                house_df['Condition2'].isin(['PosN','PosA']), 1, 0)

# binarize condition 1/2 to normal sorroundings or not normal surroundings
house_df['SurrIsNormal'] = np.where((house_df['Condition1']=='Norm') & (house_df['Condition2']=='Norm'), 1, 0)

# binarize condition 1/2 to normal sale condition or not sale condition
house_df['Salecondition'] = np.where((house_df['SaleCondition']=='Normal'), 1, 0)

house_df['ExtMatl'] = np.where((house_df['Exterior1st']==house_df['Exterior2nd']),house_df['Exterior1st'], 'Mixed')


## Daniel's Feature engineer

In [24]:
house_df['BsmtExposure2']=np.where((house_df['BsmtExposure']=='Av')|(house_df['BsmtExposure']=='Gd'),1,0)



## Danny's Feature engineer

In [25]:
house_df['Total_porch_sf'] = house_df['OpenPorchSF'] + house_df['3SsnPorch'] + house_df['EnclosedPorch'] +\
                            house_df['ScreenPorch'] + house_df['WoodDeckSF']

### These are the features I selected from the first 27 features: 
- GrLivArea
- LotArea
- BldgAge
- Remodeled
- IsPUD
- LotIsReg
- PosFeat
- SurrIsNormal (unsure about this one)

In [26]:
house_df.corr()

Unnamed: 0,index,PID,GrLivArea,SalePrice,Distance,LotFrontage,LotArea,Alley,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MasVnrArea2,GarageArea2,PoolArea2,HalfBath2,BsmtHalfBath2,BasmtFinSF1,BasmtFinSF2,BasmtFinSF,total_LivArea,num_bathroom,GeoRefNo,Tier,Range,MA_Line2,MA_Zip1,MA_Zip2,Rcrd_Yr,Rcrd_Mo,Inst1_Yr,Inst1_Mo,LndAc_S,ImpAc_S,OthAc_S,TtlVal_AsrYr,X2TPr_D,X2TSc_D,X2TPr_S,X2TSc_S,LndAcX1S,ImpAcX1S,ImpAcX2S,HSTtl_D,MilVal_D,HSTtl_S,MilVal_S,AcreX_S1,AcreGr,AcreNt_S,BldgNo_S,DwlgNo_S,YrBuilt,GLA,TtlBsmtSF,GarYrBlt,Cars,YrSold_YYYY,MoSold_MM,PA-Nmbr,PA-PostD,NmbrBRs,address3,latitude2,longitude2,altitude2,ISU_lat,ISU_long,BldgAge,Remodeled,IsPUD,LotIsReg,PosFeat,SurrIsNormal,Salecondition,BsmtExposure2,Total_porch_sf
index,1.0,0.02791739,-0.01261419,-0.05739388,-0.02427344,-0.02894691,-0.03329784,0.0007034381,-0.04428859,0.03092424,-0.05361013,-0.04334595,-0.0316182,-0.0481302,0.0255029,-0.005648592,-0.04471576,-0.03418459,0.01391125,0.01790329,-0.01857289,0.01160716,-0.005058203,-0.01928123,0.03279934,0.01963806,0.01298555,-0.01102082,-0.04594916,-0.0404462,-0.04090368,0.003144988,-0.02671177,-0.015811,-0.02880524,0.01735681,0.007539172,-0.005091314,-0.02684664,0.0368959,-0.0160614,-0.03984139,0.007539172,-0.01659909,0.01862167,-0.05715182,0.01034314,-0.05359411,-0.04152677,-0.01946676,0.02791739,,,,-0.02712432,0.02387474,0.02733173,-0.01727746,0.02733173,-0.01727746,-0.06837646,,-0.06538777,-0.06619668,,,,,-0.01626887,0.01419935,,-0.004009877,,-0.003215938,,-0.02663358,-0.002993367,-0.02496089,,0.02637898,0.00437795,-0.01233659,-0.04134013,-0.01520349,-0.04402276,0.01343176,-0.0203621,0.0144309,,0.03544494,0.0144309,-0.02112852,0.0170944,,4.625062e-14,1.895199e-14,0.05523547,0.001922232,-0.01925056,0.01681331,0.00157269,-0.02253894,0.2841051,-0.0530114,-0.01294672
PID,0.02791739,1.0,-0.1035972,-0.2280929,-0.1759347,-0.03066857,0.04140111,0.0909619,-0.2475504,0.1218971,-0.3379339,-0.1285462,-0.2287169,-0.1120034,-0.001270685,-0.07265161,-0.1901503,-0.1543005,0.01133257,0.05905988,-0.04080844,0.0002159106,-0.1559727,-0.1749148,0.002151015,0.04928582,-0.07507457,-0.1065517,-0.2580216,-0.2207168,-0.198456,-0.03987737,-0.05932387,0.1715473,-0.02064363,-0.03075141,-0.006196802,-0.02198603,-0.06365961,0.004873327,-0.2338928,-0.1991345,-0.006196802,-0.1730355,-0.0002187923,-0.09993216,0.02911749,-0.09067746,-0.1250181,-0.189145,1.0,,,,0.0127237,0.508269,0.006735089,-0.01124304,0.006735089,-0.01124304,-0.2188482,,-0.2174008,-0.2179672,,,,,0.04016809,0.01839779,,-0.03889049,,-0.02245859,,0.03573399,-0.003972935,0.03621383,,-0.04047514,-0.1833764,-0.09640373,-0.1862921,-0.1537132,-0.2149999,0.05513727,-0.00536977,-0.1692794,,-0.001571142,-0.1692794,-0.8054421,-0.2050632,,6.760472e-16,1.131389e-15,0.3380931,0.2227591,-0.1462217,0.1300698,-0.07540392,0.04872405,0.06993464,0.03878275,-0.001829076
GrLivArea,-0.01261419,-0.1035972,1.0,0.7198022,0.01797821,0.1467564,0.2611281,-0.002800647,0.5600393,-0.1109892,0.2287499,0.3037251,0.3886588,0.1642282,-0.02042461,0.2438666,0.4095695,0.5389424,0.6644188,0.08921815,0.04247845,-0.05339892,0.6435057,0.4446041,0.5371289,0.1301704,0.8066151,0.4570315,0.2662565,0.4947489,0.4764588,0.2477108,0.3261422,0.002695094,0.008648365,0.09768094,0.05812793,-0.00405906,0.03969921,-0.006148179,0.2658441,0.4753268,0.05812793,0.4155383,-0.05947592,0.2091788,0.02080369,0.2122893,0.7969358,0.5919273,-0.1035972,,,,-0.01655518,-0.1488834,-0.1188268,-0.03350571,-0.1188268,-0.03350571,0.7396092,,0.7359414,0.7375508,,,,,0.03018461,-0.0004127749,,0.0001852881,,-0.00961874,,0.03502031,-0.0005646206,0.0352182,,0.05288054,0.1323056,0.9769274,0.4039663,0.1401467,0.4821516,-0.06990034,0.02667751,0.178819,,0.5275113,0.178819,0.1837838,-0.1429528,,-7.914311000000001e-17,3.3925450000000003e-17,-0.2289839,0.06968665,-0.1305533,-0.1953207,0.1347751,-0.01516946,-0.06938826,0.07135388,0.3676199
SalePrice,-0.05739388,-0.2280929,0.7198022,1.0,0.1917935,0.1830315,0.2775087,-0.1135083,0.7904479,-0.10665,0.5443873,0.5112947,0.5011804,0.4596804,0.02297305,0.1655287,0.6525424,0.6431167,0.2619293,-0.03126661,0.2878717,-0.03400554,0.533528,0.2847139,0.1501915,-0.1152058,0.4888353,0.487162,0.5197258,0.6397939,0.6363632,0.3324387,0.3134009,-0.1202169,0.0312127,0.1214687,0.03090348,-0.01923813,0.01382741,0.0009976053,0.3861365,0.6360379,0.03090348,0.292754,-0.03622162,0.5369109,0.07718349,0.5514363,0.8200507,0.6443445,-0.2280929,,,,-0.007395858,-0.2321861,-0.1144998,-0.03603198,-0.1144998,-0.03603198,0.9335096,,0.9297388,0.9315562,,,,,0.04332761,-0.009399524,,0.02895883,,0.01771794,,0.0565467,0.01480496,0.05282043,,0.06635029,0.3023201,0.7065622,0.6509866,0.2417052,0.6277424,-0.06320549,0.01832749,0.3312907,,0.1465696,0.3312907,0.2769226,-0.2547675,,6.908338e-16,4.820486e-16,-0.5442518,-0.05251723,-0.0318789,-0.3016197,0.1251696,0.09683324,-0.1306464,0.3540085,0.3935828
Distance,-0.02427344,-0.1759347,0.01797821,0.1917935,1.0,0.03625969,0.02482471,-0.1673797,0.1970943,-0.1951583,0.5037821,0.3052331,0.0939496,0.1476595,-0.06330687,0.05026121,0.177032,0.1009873,-0.05919403,-0.09205896,0.07574803,-0.003128923,0.1698193,0.1090021,-0.0394129,-0.04808382,0.01380792,-0.02936968,0.4605336,0.2613653,0.2416769,0.1391437,0.06151985,-0.1949026,0.0007182714,-0.04251422,0.0130343,-0.005906718,0.01466214,0.0209896,0.1504596,0.2425528,0.0130343,0.1087199,0.002686541,0.2248348,-0.0369233,0.2118746,0.1429393,0.2018255,-0.1759347,,,,-0.04140618,0.06018293,-0.03316618,-0.00759837,-0.03316618,-0.00759837,0.1840594,,0.183223,0.1836045,,,,,0.01253532,0.01825005,,0.01781997,,0.03420654,,0.003699051,0.01859538,0.004526661,,0.008701215,0.2308256,0.008716863,0.1722675,0.1230009,0.2705646,-0.02639433,-0.008263678,0.2587681,,-0.04009903,0.2587681,-0.03970183,0.1535105,,-3.53258e-15,-3.550788e-15,-0.5027679,-0.2786132,0.141998,-0.1324773,0.0270585,0.1205835,-0.05465299,0.2083371,0.04404404
LotFrontage,-0.02894691,-0.03066857,0.1467564,0.1830315,0.03625969,1.0,0.111682,-0.03982029,0.09768482,-0.007863784,-0.005303897,0.03823654,0.08693296,0.03232091,0.005679443,0.1264613,0.1647084,0.2090961,-0.01716411,0.006954397,0.01640248,-0.0355744,0.06021676,-0.04273413,0.1097266,0.02512731,0.1768216,0.03603861,-0.00145533,0.1169021,0.1485955,-0.0006984672,0.06526557,0.02017828,0.001732722,0.05459756,0.07373101,-0.02120158,0.01323784,-0.001617785,0.02657995,0.148611,0.07373101,-0.03612753,-0.03929336,0.03024188,-0.002937496,0.02905663,0.1156584,0.03442856,-0.03066857,,,,0.02204385,-0.08128475,-0.02969793,0.01013139,-0.02969793,0.01013139,0.184909,,0.1850514,0.1851898,,,,,0.02077792,0.03251308,,0.02148574,,0.02077907,,0.00638676,0.002943809,-0.006371806,,-0.008036762,-0.008653341,0.1412528,0.1613805,0.0128692,0.1029626,-0.008637632,-0.04507816,-0.01446533,,0.1045765,-0.01446533,0.03951699,-0.008712292,,-3.095722e-16,-4.459545e-16,0.005231347,0.05385079,-0.2834603,0.1894725,0.01775014,-0.04610574,-0.08940115,0.06239503,0.05342777
LotArea,-0.03329784,0.04140111,0.2611281,0.2775087,0.02482471,0.111682,1.0,-0.08201672,0.08576787,-0.02996216,0.01381973,0.01318153,0.1034781,0.1652521,0.08211746,0.02187946,0.2271427,0.3123574,0.02373022,-0.000188249,0.1195313,0.02839087,0.1259523,0.02589115,0.1413649,-0.01430858,0.2053451,0.2460588,-0.0168003,0.1770735,0.1993697,0.156507,0.08720849,0.01530201,0.01791993,0.05422574,0.05285549,0.04047462,0.01238165,-0.01687676,0.05044683,0.199285,0.05285549,0.02181924,0.02780101,0.1292514,0.02319492,0.1340171,0.2566783,0.1738537,0.04140111,,,,0.006170346,0.001664544,-0.08013355,-0.01380088,-0.08013355,-0.01380088,0.3683861,,0.3645275,0.3658353,,,,,0.6116445,-0.006214918,,-0.000869042,,0.002187559,,0.7212732,0.275016,0.7136931,,-0.00222707,0.004798147,0.2782153,0.2509437,0.06767474,0.1828345,0.00355471,0.01958263,0.1071023,,0.1320732,0.1071023,-0.03934019,-0.1001378,,4.846374e-16,5.184562e-16,-0.01456498,0.01386224,-0.273422,-0.2137336,0.05789837,-0.03120355,-0.02985662,0.1548578,0.1890244
Alley,0.0007034381,0.0909619,-0.002800647,-0.1135083,-0.1673797,-0.03982029,-0.08201672,1.0,-0.05676741,0.08815912,-0.2652618,-0.03542423,-0.06672259,-0.1556357,-0.04799506,0.05585174,-0.1236419,-0.1331482,0.1097165,0.04913321,-0.09359842,-0.0212163,-0.02515201,-0.02822764,-0.006703384,0.04878416,-0.01616548,-0.1089556,-0.1518936,-0.07631548,-0.07457678,-0.1068461,0.05390591,0.1852304,-0.02627287,0.0001653533,-0.01449609,0.003616082,-0.009116279,-0.002580334,-0.1369358,-0.0742702,-0.01449609,-0.02158286,-0.01997345,-0.1347555,-0.02639302,-0.140332,-0.08861117,-0.08898855,0.0909619,,,,-0.009984805,0.09444319,0.01158207,-0.01454834,0.01158207,-0.01454834,-0.1119251,,-0.1064215,-0.107898,,,,,-0.01205999,-0.008524983,,0.02304772,,0.01087354,,-0.00888788,-0.008256552,-0.0090269,,-0.02973753,-0.1165088,-0.0009031683,-0.1224428,-0.0766493,-0.06539913,0.06686663,6.809935e-05,-0.146346,,-0.007245977,-0.146346,-0.011743,0.1534624,,-2.326555e-16,-1.854037e-16,0.2651031,0.1891761,0.06591109,0.1104089,-0.02856661,0.02038873,-0.0152721,-0.1317786,0.005595959
OverallQual,-0.04428859,-0.2475504,0.5600393,0.7904479,0.1970943,0.09768482,0.08576787,-0.05676741,1.0,-0.1088278,0.5772398,0.5433255,0.4068631,0.2720194,-0.03075553,0.2515978,0.5242939,0.4554951,0.2458026,-0.04525454,0.1698409,-0.05473239,0.5066125,0.2738761,0.06327082,-0.1562801,0.3584261,0.387083,0.5616591,0.58145,0.5389895,0.2481711,0.2898395,-0.1306628,0.01398699,0.04932785,-0.002734206,-0.02651688,0.01775889,0.005345872,0.3823611,0.5390742,-0.002734206,0.2840494,-0.05804076,0.39174,0.04090932,0.3981014,0.6189107,0.5432385,-0.2475504,,,,-0.00849105,-0.2135948,-0.1009293,-0.03552599,-0.1009293,-0.03552599,0.76862,,0.7679889,0.7688689,,,,,0.007107495,0.01061747,,0.03155383,,0.02408726,,0.001768037,-0.01382824,-0.001302385,,0.1035249,0.3444141,0.5510593,0.5197408,0.2691667,0.5731321,-0.1032188,-0.007097006,0.3484157,,0.06143403,0.3484157,0.3053151,-0.2619683,,-3.570272e-16,-8.158533e-16,-0.5769062,-0.09755341,0.1484987,-0.2602125,0.08679656,0.09592992,-0.1099514,0.2875474,0.2839615
OverallCond,0.03092424,0.1218971,-0.1109892,-0.10665,-0.1951583,-0.007863784,-0.02996216,0.08815912,-0.1088278,1.0,-0.3998847,0.06076586,-0.1445893,-0.06596254,0.03042528,-0.1300143,-0.1879033,-0.1580432,0.0103418,0.01906727,-0.05763038,0.07922416,-0.2162102,-0.0975643,0.001794514,-0.07637709,-0.07767538,-0.04758149,-0.3379142,-0.2011803,-0.1730981,0.01330188,-0.07635431,0.08987416,0.04211397,0.05029738,-0.02139462,0.04625595,0.01238936,0.01745003,-0.1745768,-0.173222,-0.02139462,-0.08792787,0.08753292,-0.07982181,0.02588694,-0.07170651,-0.1182121,-0.2028079,0.1218971,,,,0.01854338,0.1487148,0.0401194,0.03136082,0.0401194,0.03136082,-0.1613502,,-0.1565479,-0.1578961,,,,,-0.002045618,-0.01842971,,0.002749432,,0.000418618,,0.007610502,-0.006924945,0.01165177,,0.08383274,-0.09974694,-0.1001627,-0.1849241,-0.003896394,-0.1942264,0.08667454,-0.04666519,-0.2292929,,0.007328363,-0.2292929,-0.1014847,0.1838556,,-6.907505e-16,-3.724736e-16,0.4005906,0.297135,-0.1344332,0.08096236,0.0078836,-0.05343551,0.1050309,-0.09684046,0.04010978


In [27]:
pd.options.display.max_rows = None
house_df.corr()['SalePrice'].sort_values()

BldgAge          -5.442518e-01
LotIsReg         -3.016197e-01
longitude2       -2.547675e-01
MA_Zip2          -2.321861e-01
PID              -2.280929e-01
GeoRefNo         -2.280929e-01
Salecondition    -1.306464e-01
EnclosedPorch    -1.202169e-01
KitchenAbvGr     -1.152058e-01
Rcrd_Yr          -1.144998e-01
Inst1_Yr         -1.144998e-01
Alley            -1.135083e-01
OverallCond      -1.066500e-01
YrSold_YYYY      -6.320549e-02
index            -5.739388e-02
Remodeled        -5.251723e-02
BsmtHalfBath2    -3.622162e-02
Inst1_Mo         -3.603198e-02
Rcrd_Mo          -3.603198e-02
BsmtHalfBath     -3.400554e-02
IsPUD            -3.187890e-02
LowQualFinSF     -3.126661e-02
MiscVal          -1.923813e-02
ImpAcX1S         -9.399524e-03
MA_Zip1          -7.395858e-03
ISU_long          4.820486e-16
ISU_lat           6.908338e-16
YrSold            9.976053e-04
MoSold            1.382741e-02
AcreGr            1.480496e-02
HSTtl_S           1.771794e-02
MoSold_MM         1.832749e-02
BsmtFinS

In [28]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2580 entries, 1 to 763
Data columns (total 91 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PID            2580 non-null   int64  
 1   GrLivArea      2580 non-null   int64  
 2   SalePrice      2580 non-null   int64  
 3   MSSubClass     2580 non-null   int64  
 4   MSZoning       2580 non-null   object 
 5   LotFrontage    2118 non-null   float64
 6   LotArea        2580 non-null   int64  
 7   Street         2580 non-null   object 
 8   Alley          168 non-null    object 
 9   LotShape       2580 non-null   object 
 10  LandContour    2580 non-null   object 
 11  Utilities      2580 non-null   object 
 12  LotConfig      2580 non-null   object 
 13  LandSlope      2580 non-null   object 
 14  Neighborhood   2580 non-null   object 
 15  Condition1     2580 non-null   object 
 16  Condition2     2580 non-null   object 
 17  BldgType       2580 non-null   object 
 18  HouseStyl

In [29]:
real_estate.isnull().sum()

MapRefNo            0
GeoRefNo            0
Tier                0
Range               0
Prop_Addr          20
ZngCdPr           106
ZngCdSc         22187
ZngOLPr         20243
ZngOLSc         22060
ClassPr_S           0
ClassSc_S           0
Legal_Pr            0
SchD_S              0
TxD_S               0
MA_Ownr1            0
MA_Ownr2        19337
MA_Line1           14
MA_Line2        22213
MA_City            14
MA_State           14
MA_Zip1            14
MA_Zip2          8890
Rcrd_Yr          1907
Rcrd_Mo          1907
Inst1_No         1857
Inst1_Yr         1907
Inst1_Mo         1907
Inst1TPr         2677
LndAc_S             0
ImpAc_S             0
OthAc_S             0
TtlVal_AsrYr        0
ValType             0
X1TPr_D         21519
X1TSc_D         21519
X2TPr_D         22212
X2TSc_D         22212
X1TPr_S         21522
X1TSc_S         21522
X2TPr_S         22212
X2TSc_S         22212
LndAcX1S            0
ImpAcX1S            0
ImpAcX2S            0
HSTtl_D         13024
MilVal_D  

In [31]:
pd.set_option('display.max_rows',None)

X_train.isnull().sum()

NameError: name 'X_train' is not defined

## Features that drop because of too many missing values
- PoolArea
- PoolQC
- MiscFeature
- MiscVal
- YrSold_YYYY
- MoSold_MM
- SaleCond
- PA-PreD
- PA-PostD
- PA-UnTyp
- PA-UntNo
- X1TPr_D
- X1TSc_D
- X2TPr_D
- X2TSc_D
- X1TPr_S
- X1TSc_S
- X2TPr_S
- X2TSc_S

## Features that drop because of multicollinearity
- ISU_lat_long
- address_lat_long
- FullBath 
- HalfBath
- MA_Ownr1
- MA_Ownr2
- MA_Line1
- MA_Line2
- MA_City
- MA_State
- location
- point


## Features that drop because of p value
- MasVnrArea2
- BsmtQual
- BsmtCond
- BsmtExposure
- Electrical

In [36]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [53]:
house_df_test = house_df.drop(['Alley','PoolArea','PoolQC','MiscFeature','MiscVal','YrSold_YYYY','MoSold_MM','SaleCond','PA-PreD','PA-PostD',\
                        'PA-UnTyp','PA-UntNo','X1TPr_D','X1TSc_D','X2TPr_D','X2TSc_D','X1TPr_S','X1TSc_S','X2TPr_S','X2TSc_S',\
                       'ISU_lat_long','address','FullBath','HalfBath','MA_Ownr1','MA_Ownr2','MA_Line1','MA_Line2',\
                        'MA_City','MA_State','address3','location2','point2','Street','index','PID','Utilities','BsmtHalfBath2',\
                        'HalfBath2','BsmtFullBath','BsmtHalfBath','BsmtFinSF1','BsmtFinSF2','BsmtFinType1','BsmtFinType2',\
                        'Source','Date','ParType','BldgNo_S','DwlgNo_S','YrBuilt','Ext1','Ext2','Central Air','GLA','TtlBsmtSF',\
                        'GarYrBlt','Cars','MA_Zip1','MA_Zip2','ZngCdPr','ZngCdSc','ZngOLPr','ZngOLSc','PA-Nmbr','PA-Strt',\
                        'PA-StSfx','Inst1_No','Inst1_Yr','Inst1_Mo','Inst1TPr','TtlVal_AsrYr','ValType','OthAc_S','ImpAc_S',\
                        'LndAc_S','Prop_Addr','HSTtl_D','MilVal_D','HSTtl_S','MilVal_S','GeoRefNo','Tier','Range','AcreX_S1',\
                        'ClassPr_S','ClassSc_S','LndAcX1S','ImpAcX1S','ImpAcX2S','AcreGr','AcreNt_S','ParclRel','WoodDeckSF',\
                        'OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','SaleCondition','Rcrd_Yr','Rcrd_Mo','Legal_Pr',\
                        'GrLivArea','Condition1','Condition2','1stFlrSF','2ndFlrSF','YearRemodAdd','YearBuilt','MasVnrType',\
                        'MasVnrArea','GarageArea2','PoolArea2','BsmtQual','BsmtCond','BsmtUnfSF','LowQualFinSF','BasmtFinSF2',\
                        'BasmtFinSF1','NmbrBRs','altitude2','LotShape','Neighborhood','HouseStyle','BedroomAbvGr','num_bathroom',\
                        'MSSubClass','BsmtExposure','TotalBsmtSF','GarageYrBlt','Exterior1st','Exterior2nd','address2',\
                        'MSZoning','latitude2','longitude2','SchD_S','KitchenQual','ExterCond','ISU_lat','ISU_long',\
                        'Functional','TotRmsAbvGrd',],axis=1)
house_df_test.info()
house_df_test.corr()['SalePrice'].sort_values()
# house_df_test.isnull().sum()

# Impute None to Nan value in columns
impute_feature1 = house_df_test[['FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','Fence']]
for feature in impute_feature1.columns:
    house_df_test[feature] = house_df_test[feature].fillna('None')
    
house_df_test = house_df_test.drop(2465,axis=0)
house_df_test = house_df_test.drop(437,axis=0)

house_df_test.isnull().sum()

# Train-test split 
train = house_df_test[house_df_test['YrSold']!=2010]
test = house_df_test[house_df_test['YrSold']==2010]
print(train.shape)
print(test.shape)
X_train = train.drop(['SalePrice'],axis=1)
y_train = train['SalePrice']
X_test = test.drop(['SalePrice'],axis=1)
y_test = test['SalePrice']

X_train_dum = pd.get_dummies(X_train, drop_first=True)
print(X_train.shape)
print(len(X_train_dum.columns))

# apply log to area and house price
X_train_dum['LotFrontage'] = np.log(X_train_dum['LotFrontage']+1)
# X_train_dum['LotArea'] = np.log(X_train_dum['LotArea']+1)
X_train_dum['total_LivArea'] = np.log(X_train_dum['total_LivArea']+1)
X_train_dum['Total_porch_sf'] = np.log(X_train_dum['Total_porch_sf']+1)
y_train = np.log(y_train)

lin_reg = LinearRegression().fit(X_train_dum, y_train)
lin_reg.score(X_train_dum,y_train)



x_feature = sm.add_constant(X_train_dum)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
model = sm.OLS(y_train, x_feature)
results_feature = model.fit()
print(results_feature.summary())
pValue = results_feature.pvalues
pValue[pValue<0.05]

X_vif = X_train_dum[pValue[pValue<0.05].drop('const').index]
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2558 entries, 0 to 2602
Data columns (total 46 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SalePrice       2558 non-null   int64  
 1   Distance        2558 non-null   float64
 2   LotFrontage     2558 non-null   float64
 3   LotArea         2558 non-null   int64  
 4   LandContour     2558 non-null   object 
 5   LotConfig       2558 non-null   object 
 6   LandSlope       2558 non-null   object 
 7   BldgType        2558 non-null   object 
 8   OverallQual     2558 non-null   int64  
 9   OverallCond     2558 non-null   int64  
 10  RoofStyle       2558 non-null   object 
 11  RoofMatl        2558 non-null   object 
 12  ExterQual       2558 non-null   object 
 13  Foundation      2558 non-null   object 
 14  Heating         2558 non-null   object 
 15  HeatingQC       2558 non-null   object 
 16  CentralAir      2558 non-null   object 
 17  Electrical      2557 non-null   o

BldgAge          -0.544252
LotIsReg         -0.301620
Salecondition    -0.130646
KitchenAbvGr     -0.115206
OverallCond      -0.106650
Remodeled        -0.052517
IsPUD            -0.031879
YrSold            0.000998
MoSold            0.013827
SurrIsNormal      0.096833
PosFeat           0.125170
LotFrontage       0.183032
Distance          0.191794
LotArea           0.277509
BsmtExposure2     0.354009
MasVnrArea2       0.386136
Total_porch_sf    0.393583
Fireplaces        0.487162
BasmtFinSF        0.551436
GarageArea        0.636363
GarageCars        0.639794
OverallQual       0.790448
total_LivArea     0.820051
SalePrice         1.000000
Name: SalePrice, dtype: float64

SalePrice         0
Distance          0
LotFrontage       0
LotArea           0
LandContour       0
LotConfig         0
LandSlope         0
BldgType          0
OverallQual       0
OverallCond       0
RoofStyle         0
RoofMatl          0
ExterQual         0
Foundation        0
Heating           0
HeatingQC         0
CentralAir        0
Electrical        0
KitchenAbvGr      0
Fireplaces        0
FireplaceQu       0
GarageType        0
GarageFinish      0
GarageCars        0
GarageArea        0
GarageQual        0
GarageCond        0
PavedDrive        0
Fence             0
MoSold            0
YrSold            0
SaleType          0
MasVnrArea2       0
BasmtFinSF        0
total_LivArea     0
TxD_S             0
BldgAge           0
Remodeled         0
IsPUD             0
LotIsReg          0
PosFeat           0
SurrIsNormal      0
Salecondition     0
ExtMatl           0
BsmtExposure2     0
Total_porch_sf    0
dtype: int64

(2242, 46)
(314, 46)
(2242, 45)
120


0.9184228757131621

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.914
Method:                 Least Squares   F-statistic:                     202.6
Date:                Sat, 28 Nov 2020   Prob (F-statistic):               0.00
Time:                        15:55:56   Log-Likelihood:                 1784.0
No. Observations:                2242   AIC:                            -3330.
Df Residuals:                    2123   BIC:                            -2650.
Df Model:                         118                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 11.3166      4

const                 1.151790e-02
Distance              6.957098e-15
LotArea               1.418851e-16
OverallQual           2.879209e-78
OverallCond           3.495586e-48
Fireplaces            9.686927e-09
GarageCars            1.414246e-02
GarageArea            1.045198e-10
BasmtFinSF            4.910214e-32
total_LivArea        6.218876e-204
BldgAge               4.382104e-30
LotIsReg              1.389483e-02
PosFeat               9.529080e-07
SurrIsNormal          2.516685e-13
Salecondition         1.444250e-04
BsmtExposure2         7.311831e-05
Total_porch_sf        2.697526e-03
LandContour_HLS       1.299628e-04
LandContour_Lvl       2.227785e-02
LandSlope_Mod         3.953542e-03
LandSlope_Sev         2.945125e-03
BldgType_2fmCon       1.329780e-03
BldgType_Duplex       5.032628e-03
BldgType_Twnhs        1.323527e-03
RoofMatl_WdShngl      7.569903e-03
ExterQual_Fa          8.232561e-05
ExterQual_Gd          3.718748e-06
ExterQual_TA          1.497897e-08
Foundation_PConc    

              feature         VIF
0            Distance   16.038015
1             LotArea    4.263066
2         OverallQual   70.927145
3         OverallCond   38.492015
4          Fireplaces   10.773061
5          GarageCars   38.879297
6          GarageArea   32.451859
7          BasmtFinSF    3.131391
8       total_LivArea  393.548578
9             BldgAge   11.994065
10           LotIsReg    3.343739
11            PosFeat    1.278835
12       SurrIsNormal    9.348573
13      Salecondition   29.936926
14      BsmtExposure2    1.837632
15     Total_porch_sf    7.101155
16    LandContour_HLS    1.776506
17    LandContour_Lvl   22.431807
18      LandSlope_Mod    1.400092
19      LandSlope_Sev    1.513611
20    BldgType_2fmCon    1.186182
21    BldgType_Duplex    1.293425
22     BldgType_Twnhs    1.194879
23   RoofMatl_WdShngl    1.058009
24       ExterQual_Fa    1.847927
25       ExterQual_Gd   15.809890
26       ExterQual_TA   36.885223
27   Foundation_PConc    4.515656
28    Foundati

In [None]:
house_df_test.isnull().sum()

In [None]:
house_df_test.groupby(['Functional']).count()

In [None]:
# Impute None to Nan value in columns
impute_feature1 = house_df_test[['FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','Fence']]
for feature in impute_feature1.columns:
    house_df_test[feature] = house_df_test[feature].fillna('None')

In [None]:
house_df_test.isnull().sum()

In [None]:
house_df_test = house_df_test.drop(2465,axis=0)
house_df_test = house_df_test.drop(437,axis=0)

In [None]:
house_df_test[house_df_test['Electrical'].isnull()]

In [None]:
house_df_test[house_df_test['GarageCars'].isnull()]

In [None]:
house_df_test[house_df_test['TotalBath'].isnull()]

In [None]:
house_df_test.loc[house_df_test.index == 2328,'TotalBath'] = 3.5

In [None]:
house_df_test.shape

In [None]:
house_df_test[house_df_test.index == 918 ]['TotalBath']

In [None]:
house_df_test.shape

In [None]:
house_df_test.isnull().sum()

In [None]:
house_df_test.info()

### Train-test split

In [44]:
# Train-test split 
train = house_df_test[house_df_test['YrSold']!=2010]
test = house_df_test[house_df_test['YrSold']==2010]
print(train.shape)
print(test.shape)
X_train = train.drop(['SalePrice'],axis=1)
y_train = train['SalePrice']
X_test = test.drop(['SalePrice'],axis=1)
y_test = test['SalePrice']

(2242, 44)
(314, 44)


In [45]:
X_train_dum = pd.get_dummies(X_train, drop_first=True)
print(X_train.shape)
print(len(X_train_dum.columns))

(2242, 43)
118


In [46]:
X_train_dum.columns[-50:]

Index(['FireplaceQu_None', 'FireplaceQu_Po', 'FireplaceQu_TA',
       'GarageType_Attchd', 'GarageType_Basment', 'GarageType_BuiltIn',
       'GarageType_CarPort', 'GarageType_Detchd', 'GarageType_None',
       'GarageFinish_None', 'GarageFinish_RFn', 'GarageFinish_Unf',
       'GarageQual_Fa', 'GarageQual_Gd', 'GarageQual_None', 'GarageQual_Po',
       'GarageQual_TA', 'GarageCond_Fa', 'GarageCond_Gd', 'GarageCond_None',
       'GarageCond_Po', 'GarageCond_TA', 'PavedDrive_P', 'PavedDrive_Y',
       'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw', 'Fence_None', 'SaleType_CWD',
       'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI', 'SaleType_ConLw',
       'SaleType_New', 'SaleType_Oth', 'SaleType_VWD', 'SaleType_WD ',
       'TxD_S_45', 'ExtMatl_AsphShn', 'ExtMatl_BrkFace', 'ExtMatl_CBlock',
       'ExtMatl_HdBoard', 'ExtMatl_ImStucc', 'ExtMatl_MetalSd',
       'ExtMatl_Mixed', 'ExtMatl_Plywood', 'ExtMatl_PreCast', 'ExtMatl_Stucco',
       'ExtMatl_VinylSd', 'ExtMatl_Wd Sdng'],
      dt

In [47]:
# apply log to area and house price
X_train_dum['LotFrontage'] = np.log(X_train_dum['LotFrontage']+1)
# X_train_dum['LotArea'] = np.log(X_train_dum['LotArea']+1)
X_train_dum['total_LivArea'] = np.log(X_train_dum['total_LivArea']+1)
X_train_dum['Total_porch_sf'] = np.log(X_train_dum['Total_porch_sf']+1)
y_train = np.log(y_train)

lin_reg = LinearRegression().fit(X_train_dum, y_train)
lin_reg.score(X_train_dum,y_train)

0.9014220574129522

In [48]:
house_df_test_dum = pd.get_dummies(house_df_test, drop_first=True)

house_df_test_dum['LotFrontage'] = np.log(house_df_test_dum['LotFrontage']+1)
# house_df_test_dum['LotArea'] = np.log(house_df_test_dum['LotArea']+1)
house_df_test_dum['total_LivArea'] = np.log(house_df_test_dum['total_LivArea']+1)
house_df_test_dum['Total_porch_sf'] = np.log(house_df_test_dum['Total_porch_sf']+1)
house_df_test_dum['SalePrice'] = np.log(house_df_test_dum['SalePrice'])

train1 = house_df_test_dum[house_df_test['YrSold']!=2010]
test1 = house_df_test_dum[house_df_test['YrSold']==2010]
# print(train1.shape)
# print(test1.shape)
X_train1 = train1.drop(['SalePrice'],axis=1)
y_train1 = train1['SalePrice']
X_test1 = test1.drop(['SalePrice'],axis=1)
y_test1 = test1['SalePrice']


# linear regresion model
lin_reg1 = LinearRegression().fit(X_train1, y_train1)
print(f'R^2 of Train set: {lin_reg1.score(X_train1,y_train1)}')
print(f'R^2 of Test set: {lin_reg1.score(X_test1,y_test1)}')

R^2 of Train set: 0.9014220574129522
R^2 of Test set: 0.828677627011333


In [49]:
# Decision Tree Model
tree_reg = DecisionTreeRegressor(max_depth=10).fit(X_train1,y_train1)
print(f'R^2 of Train set: {tree_reg.score(X_train1,y_train1)}')
print(f'R^2 Test set: {tree_reg.score(X_test1,y_test1)}')



R^2 of Train set: 0.9644156075606799
R^2 Test set: 0.7224647639522689


In [50]:
# Random Forest Model
forest_reg = RandomForestRegressor(n_estimators=100,max_features=5).fit(X_train1,y_train1)
print(f'R^2 of Train set: {forest_reg.score(X_train1,y_train1)}')
print(f'R^2 Test set: {forest_reg.score(X_test1,y_test1)}')

R^2 of Train set: 0.9769357687919005
R^2 Test set: 0.7631898385412298


In [51]:
import statsmodels.api as sm
x_feature = sm.add_constant(X_train_dum)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
model = sm.OLS(y_train, x_feature)
results_feature = model.fit()
print(results_feature.summary())
pValue = results_feature.pvalues
pValue[pValue<0.05]

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.901
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     167.5
Date:                Sat, 28 Nov 2020   Prob (F-statistic):               0.00
Time:                        14:14:20   Log-Likelihood:                 1571.8
No. Observations:                2242   AIC:                            -2910.
Df Residuals:                    2125   BIC:                            -2241.
Df Model:                         116                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 10.5685      4

const                  3.170057e-02
Distance               4.148648e-17
LotFrontage            8.971901e-03
OverallCond            1.982452e-54
Fireplaces             3.176940e-12
GarageCars             2.985997e-04
GarageArea             9.559297e-11
MasVnrArea2            7.108003e-03
BasmtFinSF             6.545987e-48
total_LivArea         7.278540e-250
BldgAge                4.051281e-42
LotIsReg               6.968003e-04
PosFeat                8.063277e-07
SurrIsNormal           3.891390e-12
Salecondition          3.154759e-04
BsmtExposure2          2.725090e-07
Total_porch_sf         5.111690e-04
LandContour_HLS        1.235277e-04
BldgType_2fmCon        1.055428e-02
BldgType_Duplex        1.479951e-03
BldgType_Twnhs         6.995538e-04
BldgType_TwnhsE        1.967910e-02
RoofMatl_WdShngl       1.148289e-04
ExterQual_Fa           1.561429e-15
ExterQual_Gd           9.331052e-15
ExterQual_TA           1.787257e-25
Foundation_Slab        4.775095e-07
HeatingQC_Gd           9.793

In [52]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

X_vif = X_train_dum[pValue[pValue<0.05].index]
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)


KeyError: "['const'] not in index"

In [None]:
x_feature = sm.add_constant(X_train_dum)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
model = sm.OLS(y_train, x_feature)
results_feature = model.fit()
print(results_feature.summary())
pValue = results_feature.pvalues
pValue[pValue<0.05]

In [None]:
X_vif = X_train_dum[pValue[pValue<0.05].index]
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

In [None]:
x_feature = sm.add_constant(X_train_dum)
# Notice that the dependent variable (y) comes before the independent variables (x)
# Use df.columns.values to get all the column names
model = sm.OLS(y_train, x_feature)
results_feature = model.fit()
print(results_feature.summary())
pValue = results_feature.pvalues
pValue[pValue<0.05]

In [None]:
X_vif = X_train_dum[pValue[pValue<0.05].index]
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

In [None]:
# from sklearn.metrics import r2_score
# lin_reg_pred = lin_reg.predict(X_test)
# sum((y_test - lin_reg_pred)**2)/sum((y_test - np.mean(y_test))**2)
# mean_squared_error(X_test,y_test)
# lin_reg_pred
# len(y_test)
# r2_score(y_test,lin_reg_pred)

In [None]:
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import r2_score
# housing_prediction = lin_reg.predict(X_test)
# mean_squared_error(y_train,y_test)
# lin_reg2 = LinearRegression().fit(X_train,y_train)
# lin_reg.r2_score

### How to make money

In [43]:
import pickle