In [1]:
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
data = pd.concat([train, test])

# data dimensions: # rows, # columns
print train.shape
print test.shape
print data.shape

(1460, 81)
(1459, 80)
(2919, 81)


# Explore variables one at a time

In [3]:
data = data.drop(["Id"], axis = 1)  # drop this variable
data.columns.values  # print all column names

array(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual',
       'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces',
       'Foundation', 'FullBath', 'Functional', 'GarageArea', 'GarageCars',
       'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
       'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC',
       'HouseStyle', 'KitchenAbvGr', 'KitchenQual', 'LandContour',
       'LandSlope', 'LotArea', 'LotConfig', 'LotFrontage', 'LotShape',
       'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea',
       'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood',
       'OpenPorchSF', 'OverallCond', 'OverallQual', 'PavedDrive',
       'PoolArea', 'P

## MSSubClass

In [4]:
data["MSSubClass"].isnull().sum()

0

In [5]:
freq = data["MSSubClass"].value_counts()  # frequency table
freq

20     1079
60      575
50      287
120     182
30      139
160     128
70      128
80      118
90      109
190      61
85       48
75       23
45       18
180      17
40        6
150       1
Name: MSSubClass, dtype: int64

MSSubClass is categorical, though it is coded as numeric

In [6]:
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

In [7]:
MSSubClass = data["MSSubClass"].replace([20, 60], 1) \
.replace([50, 120, 30, 160, 70, 80, 90], 2) \
.replace([190, 85, 75, 45, 180, 40, 150], 3)

In [8]:
new = pd.DataFrame({"MSSubClass": MSSubClass})  # create new dataframe
print new["MSSubClass"].value_counts()
new.head()

1    1654
2    1091
3     174
Name: MSSubClass, dtype: int64


Unnamed: 0,MSSubClass
0,1
1,1
2,1
3,2
4,1


## MSZoning

In [9]:
data["MSZoning"].isnull().sum()

4

In [10]:
data["MSZoning"].value_counts()

RL         2265
RM          460
FV          139
RH           26
C (all)      25
Name: MSZoning, dtype: int64

Low residential density?: Y (1) or N (0)

In [11]:
MSZoning = data["MSZoning"].map(lambda x: 1 if (x == "RL") or (x == "RM") else 0)
MSZoning.value_counts()

1    2725
0     194
Name: MSZoning, dtype: int64

Drop this variable since majority of the houses are located on low residential density zones and doesn't deviate much from that value.

## LotFrontage

In [12]:
data["LotFrontage"].describe()

count    2433.000000
mean       69.305795
std        23.344905
min        21.000000
25%        59.000000
50%        68.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [13]:
new["LotFrontage"] = data["LotFrontage"].fillna(data["LotFrontage"].mean())
new["LotFrontage"].describe()

count    2919.000000
mean       69.305795
std        21.312345
min        21.000000
25%        60.000000
50%        69.305795
75%        78.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [14]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage
0,1,65.0
1,1,80.0
2,1,68.0
3,2,60.0
4,1,84.0


## LotArea

In [15]:
data["LotArea"].describe()

count      2919.000000
mean      10168.114080
std        7886.996359
min        1300.000000
25%        7478.000000
50%        9453.000000
75%       11570.000000
max      215245.000000
Name: LotArea, dtype: float64

In [16]:
new["LogLotArea"] = np.log(data["LotArea"])
new["LogLotArea"].describe()

count    2919.000000
mean        9.094785
std         0.509966
min         7.170120
25%         8.919721
50%         9.154087
75%         9.356171
max        12.279532
Name: LogLotArea, dtype: float64

In [17]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea
0,1,65.0,9.041922
1,1,80.0,9.169518
2,1,68.0,9.328123
3,2,60.0,9.164296
4,1,84.0,9.565214


## Street and PavedDrive

In [18]:
data["Street"].isnull().sum()

0

In [19]:
data["Street"].value_counts()

Pave    2907
Grvl      12
Name: Street, dtype: int64

Paved street?: Y (1) or No (0)

In [20]:
Street = pd.Series(np.where(data["Street"] == "Pave", 1, 0))
Street.value_counts()

1    2907
0      12
dtype: int64

In [21]:
data["PavedDrive"].isnull().sum()

0

In [22]:
data["PavedDrive"].value_counts()

Y    2641
N     216
P      62
Name: PavedDrive, dtype: int64

Paved driveway?: Y (1) or No (0)

In [23]:
PavedDrive = data["PavedDrive"].map({"Y": 1, "N": 0, "P": 0})
PavedDrive.value_counts()

1    2641
0     278
Name: PavedDrive, dtype: int64

Drop Street and PavedDrive since they don't have much variability.

## Alley

In [24]:
data["Alley"].isnull().sum()

2721

NA means that houses have no alley, which is not the same as a missing value.

Alley access?: Y (1) or N (0)

In [25]:
data["Alley"] = data["Alley"].fillna("None")
Alley = pd.Series(np.where(data["Alley"] == "None", 1, 0))
Alley.value_counts()

1    2721
0     198
dtype: int64

Drop this variable since it lacks variability.

## LotShape

In [26]:
data["LotShape"].isnull().sum()

0

In [27]:
data["LotShape"].value_counts()

Reg    1859
IR1     968
IR2      76
IR3      16
Name: LotShape, dtype: int64

Regular lot shape?: Y (1) or No (0)

In [28]:
new["LotShape"] = np.where(data["LotShape"] == "Reg", 1, 0)
new["LotShape"].value_counts()

1    1859
0    1060
Name: LotShape, dtype: int64

In [29]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape
0,1,65.0,9.041922,1
1,1,80.0,9.169518,1
2,1,68.0,9.328123,0
3,2,60.0,9.164296,0
4,1,84.0,9.565214,0


## LandContour

In [30]:
data["LandContour"].isnull().sum()

0

In [31]:
data["LandContour"].value_counts()

Lvl    2622
HLS     120
Bnk     117
Low      60
Name: LandContour, dtype: int64

Flat lot?: Y (1) or N (0)

In [32]:
LandContour = pd.Series(np.where(data["LandContour"] == "Lvl", 1, 0))
LandContour.value_counts()

1    2622
0     297
dtype: int64

Drop this variable it lacks variability.

## Utilities

In [33]:
data["Utilities"].value_counts()

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

Drop this variable. It has a severe lack of variability!

## LotConfig

In [34]:
data["LotConfig"].isnull().sum()

0

In [35]:
data["LotConfig"].value_counts()

Inside     2133
Corner      511
CulDSac     176
FR2          85
FR3          14
Name: LotConfig, dtype: int64

In [36]:
new["LotConfig"] = data["LotConfig"].replace(["FR2", "FR3"], "FR")
new["LotConfig"].value_counts()

Inside     2133
Corner      511
CulDSac     176
FR           99
Name: LotConfig, dtype: int64

In [37]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig
0,1,65.0,9.041922,1,Inside
1,1,80.0,9.169518,1,FR
2,1,68.0,9.328123,0,Inside
3,2,60.0,9.164296,0,Corner
4,1,84.0,9.565214,0,FR


## LandSlope

In [38]:
data["LandSlope"].isnull().sum()

0

In [39]:
data["LandSlope"].value_counts()

Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64

Gentle slope?: Y (1) or No (0)

In [40]:
LandSlope = pd.Series(np.where(data["LandSlope"] == "Gtl", 1, 0))
LandSlope.value_counts()

1    2778
0     141
dtype: int64

Drop this variable since it lacks variability.

## Neighborhood

In [41]:
data["Neighborhood"].isnull().sum()

0

In [42]:
data["Neighborhood"].value_counts()

NAmes      443
CollgCr    267
OldTown    239
Edwards    194
Somerst    182
NridgHt    166
Gilbert    165
Sawyer     151
NWAmes     131
SawyerW    125
Mitchel    114
BrkSide    108
Crawfor    103
IDOTRR      93
Timber      72
NoRidge     71
StoneBr     51
SWISU       48
ClearCr     44
MeadowV     37
BrDale      30
Blmngtn     28
Veenker     24
NPkVill     23
Blueste     10
Name: Neighborhood, dtype: int64

In [43]:
new["Neighborhood"] = data["Neighborhood"]
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood
0,1,65.0,9.041922,1,Inside,CollgCr
1,1,80.0,9.169518,1,FR,Veenker
2,1,68.0,9.328123,0,Inside,CollgCr
3,2,60.0,9.164296,0,Corner,Crawfor
4,1,84.0,9.565214,0,FR,NoRidge


## Condition1 and Condition2

In [44]:
data["Condition1"].value_counts()

Norm      2511
Feedr      164
Artery      92
RRAn        50
PosN        39
RRAe        28
PosA        20
RRNn         9
RRNe         6
Name: Condition1, dtype: int64

In [45]:
data["Condition2"].value_counts()

Norm      2889
Feedr       13
Artery       5
PosA         4
PosN         4
RRNn         2
RRAn         1
RRAe         1
Name: Condition2, dtype: int64

Most houses have normal condition. If either conditions are normal, record the overall condition as normal. 

In [46]:
Condition = np.logical_or(data["Condition1"] == "Norm", data["Condition2"] == "Norm")
Condition.value_counts()

True     2889
False      30
Name: Condition1, dtype: int64

Drop this variable since it lacks variability.

## BldgType

In [47]:
data["BldgType"].isnull().sum()

0

In [48]:
data["BldgType"].value_counts()

1Fam      2425
TwnhsE     227
Duplex     109
Twnhs       96
2fmCon      62
Name: BldgType, dtype: int64

Single-family detached?: Y (1) or N (0)

In [49]:
BldgType = pd.Series(np.where(data["BldgType"] == "1Fam", 1, 0))
BldgType.value_counts()

1    2425
0     494
dtype: int64

Drop this variable due to its general lack of variability.

## HouseStyle

In [50]:
data["HouseStyle"].isnull().sum()

0

In [51]:
data["HouseStyle"].value_counts()

1Story    1471
2Story     872
1.5Fin     314
SLvl       128
SFoyer      83
2.5Unf      24
1.5Unf      19
2.5Fin       8
Name: HouseStyle, dtype: int64

Combine 1-1.5 story as 1, 2-2.5 story as 2, and the rest as 0.

In [52]:
new["HouseStyle"] = data["HouseStyle"].replace(["1Story", "1.5Fin", "1.5Unf"], 1) \
.replace(["2Story", "2.5Fin", "2.5Unf"], 2) \
.replace(["SFoyer", "SLvl"], 0)
new["HouseStyle"].value_counts()

1    1804
2     904
0     211
Name: HouseStyle, dtype: int64

In [53]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle
0,1,65.0,9.041922,1,Inside,CollgCr,2
1,1,80.0,9.169518,1,FR,Veenker,1
2,1,68.0,9.328123,0,Inside,CollgCr,2
3,2,60.0,9.164296,0,Corner,Crawfor,2
4,1,84.0,9.565214,0,FR,NoRidge,2


## OverallQual

In [54]:
data["OverallQual"].isnull().sum()

0

In [55]:
freq = data["OverallQual"].value_counts()
freq

5     825
6     731
7     600
8     342
4     226
9     107
3      40
10     31
2      13
1       4
Name: OverallQual, dtype: int64

In [56]:
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

Combine very poor/poor/fair/less than average  as -1, average/above average/good as 0, very good/excellent/very excellent as 1.

In [57]:
new["OverallQual"] = data["OverallQual"].map(lambda x: -1 if x < 4 else 0 if x < 8 else 1)
new["OverallQual"].value_counts()

 0    2382
 1     480
-1      57
Name: OverallQual, dtype: int64

Note that this variable generally lacks variability. However, it is added to the model.

In [58]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual
0,1,65.0,9.041922,1,Inside,CollgCr,2,0
1,1,80.0,9.169518,1,FR,Veenker,1,0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0
3,2,60.0,9.164296,0,Corner,Crawfor,2,0
4,1,84.0,9.565214,0,FR,NoRidge,2,1


## OverallCond

In [59]:
data["OverallCond"].isnull().sum()

0

In [60]:
freq = data["OverallCond"].value_counts()
freq

5    1645
6     531
7     390
8     144
4     101
3      50
9      41
2      10
1       7
Name: OverallCond, dtype: int64

In [61]:
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

Average conndition?: Y (1) or No (0) 

In [62]:
OverallCond = pd.Series(np.where(data["OverallCond"] > 4, 1, 0))
OverallCond.value_counts()

1    2751
0     168
dtype: int64

Drop this variable since it is a heavily unimodal and skewed.

## YearBuilt 

In [63]:
data["YearBuilt"].describe()

count    2919.000000
mean     1971.312778
std        30.291442
min      1872.000000
25%      1953.500000
50%      1973.000000
75%      2001.000000
max      2010.000000
Name: YearBuilt, dtype: float64

Years usually needs to be binned.

In [64]:
new["YearBuilt"] = pd.qcut(data["YearBuilt"], q = 4, labels = ["ancient", "older", "newer", "modern"])
pd.concat((new["YearBuilt"], data["YearBuilt"]), axis = 1).head()

Unnamed: 0,YearBuilt,YearBuilt.1
0,modern,2003
1,newer,1976
2,newer,2001
3,ancient,1915
4,newer,2000


In [65]:
new["YearBuilt"].value_counts()

newer      748
older      741
ancient    730
modern     700
Name: YearBuilt, dtype: int64

In [66]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer


## YearRemodAdd

In [67]:
data["YearRemodAdd"].describe()

count    2919.000000
mean     1984.264474
std        20.894344
min      1950.000000
25%      1965.000000
50%      1993.000000
75%      2004.000000
max      2010.000000
Name: YearRemodAdd, dtype: float64

In [68]:
remodel = np.subtract(data["YearRemodAdd"], data["YearBuilt"])
remodel.describe()

count    2919.000000
mean       12.951696
std        24.065624
min        -1.000000
25%         0.000000
50%         0.000000
75%        18.000000
max       127.000000
Name: YearRemodAdd, dtype: float64

In [69]:
new["YearRemodAdd"] = pd.Series(remodel.map(lambda x: "Never" if x <= 0 else "recent" if x <= 10 else "long ago"))
new["YearRemodAdd"].value_counts()

Never       1561
long ago     807
recent       551
Name: YearRemodAdd, dtype: int64

In [70]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never


## RoofStyle

In [71]:
data["RoofStyle"].isnull().sum()

0

In [72]:
data["RoofStyle"].value_counts()

Gable      2310
Hip         551
Gambrel      22
Flat         20
Mansard      11
Shed          5
Name: RoofStyle, dtype: int64

In [73]:
RoofStyle = pd.Series(data["RoofStyle"].replace(["Gambrel", "Flat", "Mansard", "Shed"], "Other"))
RoofStyle.value_counts()

Gable    2310
Hip       551
Other      58
Name: RoofStyle, dtype: int64

Lacks variability. Drop this variable.

## RoofMatl

In [74]:
data["RoofMatl"].isnull().sum()

0

In [75]:
data["RoofMatl"].value_counts()

CompShg    2876
Tar&Grv      23
WdShake       9
WdShngl       7
Membran       1
ClyTile       1
Metal         1
Roll          1
Name: RoofMatl, dtype: int64

Standard shingle?: Y (1) or N (0)

In [76]:
RoofMatl = pd.Series(np.where(data["RoofMatl"] == "CompShg", 1, 0))
RoofMatl.value_counts()

1    2876
0      43
dtype: int64

Drop this variable since it heavily lacks variability.

## Exterior1st and Exterior2nd

In [77]:
print data["Exterior1st"].isnull().sum()
print data["Exterior2nd"].isnull().sum()

1
1


In [78]:
data["Exterior1st"].value_counts()

VinylSd    1025
MetalSd     450
HdBoard     442
Wd Sdng     411
Plywood     221
CemntBd     126
BrkFace      87
WdShing      56
AsbShng      44
Stucco       43
BrkComm       6
AsphShn       2
Stone         2
CBlock        2
ImStucc       1
Name: Exterior1st, dtype: int64

In [79]:
Exterior1st = data["Exterior1st"].fillna("Other") \
.replace(["BrkFace", "WdShing", "AsbShng", "Stucco", "BrkComm", "AsphShn", "Stone", "CBlock", "ImStucc", "Other"], "Other")
Exterior1st.value_counts()

VinylSd    1025
MetalSd     450
HdBoard     442
Wd Sdng     411
Other       244
Plywood     221
CemntBd     126
Name: Exterior1st, dtype: int64

In [80]:
data["Exterior2nd"].value_counts()

VinylSd    1014
MetalSd     447
HdBoard     406
Wd Sdng     391
Plywood     270
CmentBd     126
Wd Shng      81
BrkFace      47
Stucco       47
AsbShng      38
Brk Cmn      22
ImStucc      15
Stone         6
AsphShn       4
CBlock        3
Other         1
Name: Exterior2nd, dtype: int64

In [81]:
Exterior2nd = data["Exterior2nd"].fillna("Other") \
.replace(["Wd Shng", "BrkFace", "Stucco", "AsbShng", "Brk Cmn", "ImStucc", "Stone", "AsphShn", "CBlock", "Other"], "Other")
Exterior2nd.value_counts()

VinylSd    1014
MetalSd     447
HdBoard     406
Wd Sdng     391
Plywood     270
Other       265
CmentBd     126
Name: Exterior2nd, dtype: int64

In [82]:
np.equal(Exterior1st, Exterior2nd).value_counts()  # check if both columns are same

True     2544
False     375
Name: Exterior1st, dtype: int64

Since both variables roughly have the same distribution and most houses have only one exterior material, keep only one of them in the model.

In [83]:
new["Exterior"] = Exterior1st
new["Exterior"].value_counts()

VinylSd    1025
MetalSd     450
HdBoard     442
Wd Sdng     411
Other       244
Plywood     221
CemntBd     126
Name: Exterior, dtype: int64

In [84]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd


## MasVnrType and MasVnrArea

In [85]:
data["MasVnrArea"].describe()

count    2896.000000
mean      102.201312
std       179.334253
min         0.000000
25%         0.000000
50%         0.000000
75%       164.000000
max      1600.000000
Name: MasVnrArea, dtype: float64

In [86]:
data["MasVnrArea"] = data["MasVnrArea"].fillna(0)
np.sum(data["MasVnrArea"] == 0)

1761

Since most houses don't have masonry veneer, the area is mostly 0.

In [87]:
data["MasVnrType"].isnull().sum()

24

In [88]:
data["MasVnrType"] = data["MasVnrType"].fillna("None")
data["MasVnrType"].value_counts()

None       1766
BrkFace     879
Stone       249
BrkCmn       25
Name: MasVnrType, dtype: int64

It makes more sense to simply record if a house has masonry veneer rather than worrying about unnecessary details, such as its area and type.

In [89]:
new["MasVnr"] = np.where(data["MasVnrArea"] == 0, 0, 1)
new["MasVnr"].value_counts()

0    1761
1    1158
Name: MasVnr, dtype: int64

In [90]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1


## ExterQual and ExterCond

In [91]:
print data["ExterQual"].isnull().sum()
print data["ExterCond"].isnull().sum()

0
0


Compare the original quality of the material to its current condition.

In [92]:
original = data["ExterQual"].map({"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1})
current = data["ExterCond"].map({"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1})
original.value_counts()

3    1798
4     979
5     107
2      35
Name: ExterQual, dtype: int64

In [93]:
current.value_counts()

3    2538
4     299
2      67
5      12
1       3
Name: ExterCond, dtype: int64

Get the change in condition overtime.

In [94]:
cond = np.subtract(current, original)
cond.value_counts()

 0    1620
-1     951
 1     237
-2     104
 2       7
Name: ExterCond, dtype: int64

In [95]:
new["ExterCond"] = pd.Series(cond.map(lambda x: "depreciated" if x < 0 else "improved" if x > 0 else "no change"))
new["ExterCond"].value_counts()

no change      1620
depreciated    1055
improved        244
Name: ExterCond, dtype: int64

In [96]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated


## Foundation

In [97]:
data["Foundation"].isnull().sum()

0

In [98]:
data["Foundation"].value_counts()

PConc     1308
CBlock    1235
BrkTil     311
Slab        49
Stone       11
Wood         5
Name: Foundation, dtype: int64

In [99]:
new["Foundation"] = data["Foundation"].replace(["BrkTil", "Slab", "Stone", "Wood"], "Other")
new["Foundation"].value_counts()

PConc     1308
CBlock    1235
Other      376
Name: Foundation, dtype: int64

In [100]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc


## BsmtQual

This variable records the basement height.

In [101]:
data["BsmtQual"].isnull().sum()

81

In [102]:
data["BsmtQual"].value_counts()

TA    1283
Gd    1209
Ex     258
Fa      88
Name: BsmtQual, dtype: int64

NA means no basement, and not missing.

In [103]:
new["BsmtQual"] = data["BsmtQual"].fillna("None").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
new["BsmtQual"].value_counts()

2    1283
3    1209
4     258
1      88
0      81
Name: BsmtQual, dtype: int64

In [104]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3


## BsmtCond

In [105]:
data["BsmtCond"].isnull().sum()

82

In [106]:
data["BsmtCond"].value_counts()

TA    2606
Gd     122
Fa     104
Po       5
Name: BsmtCond, dtype: int64

NA means no basement, and not missing.

In [107]:
BsmtCond = pd.Series(data["BsmtCond"].fillna("None").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0}))
BsmtCond.value_counts()

2    2606
3     122
1     109
0      82
Name: BsmtCond, dtype: int64

Drop this variable since it lacks variability.

## BsmtExposure

In [108]:
data["BsmtExposure"].isnull().sum()

82

NA means no basement, and not missing.

In [109]:
data["BsmtExposure"].value_counts()

No    1904
Av     418
Gd     276
Mn     239
Name: BsmtExposure, dtype: int64

In [110]:
new["BsmtExposure"] = data["BsmtExposure"].fillna("None").map({"Gd": 3, "Av": 2, "Mn": 1, "No": 0, "None": 0})
new["BsmtExposure"].value_counts()

0    1986
2     418
3     276
1     239
Name: BsmtExposure, dtype: int64

In [111]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3,0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3,3
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2,0
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3,2


## BsmtFinType1 and BsmtFinType2

In [112]:
print data["BsmtFinType1"].isnull().sum()
print data["BsmtFinType2"].isnull().sum()

79
80


NA means no basement, and not missing.

In [113]:
data["BsmtFinType1"].value_counts()

Unf    851
GLQ    849
ALQ    429
Rec    288
BLQ    269
LwQ    154
Name: BsmtFinType1, dtype: int64

In [114]:
data["BsmtFinType2"].value_counts()

Unf    2493
Rec     105
LwQ      87
BLQ      68
ALQ      52
GLQ      34
Name: BsmtFinType2, dtype: int64

In [115]:
type1 = data["BsmtFinType1"].fillna("None").map({"GLQ": 3, "ALQ": 2, "Rec": 2, "BLQ": 1, "LwQ": 1, "Unf": 0, "None": 0})
type2 = data["BsmtFinType2"].fillna("None").map({"GLQ": 3, "ALQ": 2, "Rec": 2, "BLQ": 1, "LwQ": 1, "Unf": 0, "None": 0})
type1.value_counts()

0    930
3    849
2    717
1    423
Name: BsmtFinType1, dtype: int64

In [116]:
type2.value_counts()

0    2573
2     157
1     155
3      34
Name: BsmtFinType2, dtype: int64

In [117]:
np.equal(type1, type2).value_counts()  # most basements have a 2nd rating

False    1899
True     1020
Name: BsmtFinType1, dtype: int64

Most basements can be used as a living quarter, however that space is unfinished. Hence, it makes more sense here to record if the basement is unfinished and use TotalBsmtSF to provide the total basement area.

In [118]:
new["BsmtFinType"] = pd.Series(np.logical_or(type1 == 0.0, type2 == 0.0))
new["BsmtFinType"] = np.where(new["BsmtFinType"] == True, 1, 0)
new["BsmtFinType"].value_counts()

1    2573
0     346
Name: BsmtFinType, dtype: int64

In [119]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3,0,1
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3,3,1
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3,1,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2,0,1
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3,2,1


## BsmtFinSF1, BsmtFinSF2, BsmtUnfSF

In [120]:
data["BsmtFinSF1"].describe()

count    2918.000000
mean      441.423235
std       455.610826
min         0.000000
25%         0.000000
50%       368.500000
75%       733.000000
max      5644.000000
Name: BsmtFinSF1, dtype: float64

In [121]:
data["BsmtFinSF2"].describe()

count    2918.000000
mean       49.582248
std       169.205611
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1526.000000
Name: BsmtFinSF2, dtype: float64

In [122]:
print (data["BsmtFinSF1"] == 0).sum()
print (data["BsmtFinSF2"] == 0).sum()

929
2571


Since most basements are unfinished, there are just as many zeroes as in type1 and type2.

In [123]:
BsmtFinSF = np.logical_or(data["BsmtFinSF1"] == 0, data["BsmtFinSF2"] == 0)
(BsmtFinSF == True).sum()

2571

Drop this variable since it is redundant with BsmtFinType.

In [124]:
data["BsmtUnfSF"].describe()

count    2918.000000
mean      560.772104
std       439.543659
min         0.000000
25%       220.000000
50%       467.000000
75%       805.500000
max      2336.000000
Name: BsmtUnfSF, dtype: float64

In [125]:
(data["BsmtUnfSF"] != 0).sum()

2678

There are many nonzero values since most basements are unfinished. Drop this variable since it is redundant with BsmtFinType.

## TotalBsmtSF

In [126]:
data["TotalBsmtSF"].describe()

count    2918.000000
mean     1051.777587
std       440.766258
min         0.000000
25%       793.000000
50%       989.500000
75%      1302.000000
max      6110.000000
Name: TotalBsmtSF, dtype: float64

In [127]:
data["TotalBsmtSF"] = data["TotalBsmtSF"].fillna(0)
new["LogTotalBsmtSF"] = np.log(data["TotalBsmtSF"] + 1)
new["LogTotalBsmtSF"].describe()

count    2919.000000
mean        6.730833
std         1.182460
min         0.000000
25%         6.677083
50%         6.897705
75%         7.172425
max         8.717846
Name: LogTotalBsmtSF, dtype: float64

In [128]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3,0,1,6.753438
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3,3,1,7.141245
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3,1,1,6.82546
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2,0,1,6.629363
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3,2,1,7.044033


## Heating

In [129]:
data["Heating"].value_counts()

GasA     2874
GasW       27
Grav        9
Wall        6
OthW        2
Floor       1
Name: Heating, dtype: int64

Has gas air furnace?: Y (1) or No (0).

In [130]:
Heating = pd.Series(np.where(data["Heating"] == "GasA", 1, 0))
Heating.value_counts()

1    2874
0      45
dtype: int64

Drop this variable since it lacks variability.

## HeatingQC

In [131]:
data["HeatingQC"].isnull().sum()

0

In [132]:
data["HeatingQC"].value_counts()

Ex    1493
TA     857
Gd     474
Fa      92
Po       3
Name: HeatingQC, dtype: int64

In [133]:
new["HeatingQC"] = data["HeatingQC"].map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1})
new["HeatingQC"].value_counts()

4    1493
2     857
3     474
1      95
Name: HeatingQC, dtype: int64

In [134]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF,HeatingQC
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3,0,1,6.753438,4
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3,3,1,7.141245,4
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3,1,1,6.82546,4
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2,0,1,6.629363,3
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3,2,1,7.044033,4


## CentralAir

In [135]:
data["CentralAir"].isnull().sum()

0

In [136]:
data["CentralAir"].value_counts()

Y    2723
N     196
Name: CentralAir, dtype: int64

Drop this variable since it lacks variability.

## Electrical

In [137]:
data["Electrical"].isnull().sum()

1

In [138]:
data["Electrical"].value_counts()

SBrkr    2671
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: Electrical, dtype: int64

Standard breaker?: Y (1) or No (0)

In [139]:
Electrical = pd.Series(np.where(data["Electrical"] == "SBrkr", 1, 0))
Electrical.value_counts()

1    2671
0     248
dtype: int64

Drop this variable since it lacks variability.

## 1stFlrSF

In [140]:
data["1stFlrSF"].describe()

count    2919.000000
mean     1159.581706
std       392.362079
min       334.000000
25%       876.000000
50%      1082.000000
75%      1387.500000
max      5095.000000
Name: 1stFlrSF, dtype: float64

In [141]:
new["Log1stFlrSF"] = np.log(data["1stFlrSF"])
new["Log1stFlrSF"].describe()

count    2919.000000
mean        7.003229
std         0.322956
min         5.811141
25%         6.775366
50%         6.986566
75%         7.235259
max         8.536015
Name: Log1stFlrSF, dtype: float64

In [142]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,Exterior,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF,HeatingQC,Log1stFlrSF
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,VinylSd,1,depreciated,PConc,3,0,1,6.753438,4,6.75227
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,MetalSd,0,no change,CBlock,3,3,1,7.141245,4,7.140453
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,VinylSd,1,depreciated,PConc,3,1,1,6.82546,4,6.824374
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,Wd Sdng,0,no change,Other,2,0,1,6.629363,3,6.867974
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,VinylSd,1,depreciated,PConc,3,2,1,7.044033,4,7.04316


## 2ndFlrSF

In [143]:
data["2ndFlrSF"].describe()

count    2919.000000
mean      336.483727
std       428.701456
min         0.000000
25%         0.000000
50%         0.000000
75%       704.000000
max      2065.000000
Name: 2ndFlrSF, dtype: float64

In [144]:
np.sum(data["2ndFlrSF"] == 0)

1668

Has 2nd floor?: Y (1) or No (0).

In [145]:
new["2ndFlr"] = np.where(data["2ndFlrSF"] == 0, 0, 1)
new["2ndFlr"].value_counts()

0    1668
1    1251
Name: 2ndFlr, dtype: int64

In [146]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,MasVnr,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF,HeatingQC,Log1stFlrSF,2ndFlr
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,1,depreciated,PConc,3,0,1,6.753438,4,6.75227,1
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,0,no change,CBlock,3,3,1,7.141245,4,7.140453,0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,1,depreciated,PConc,3,1,1,6.82546,4,6.824374,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,0,no change,Other,2,0,1,6.629363,3,6.867974,1
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,1,depreciated,PConc,3,2,1,7.044033,4,7.04316,1


## GrLivArea

In [147]:
data["GrLivArea"].describe()

count    2919.000000
mean     1500.759849
std       506.051045
min       334.000000
25%      1126.000000
50%      1444.000000
75%      1743.500000
max      5642.000000
Name: GrLivArea, dtype: float64

In [148]:
np.corrcoef(data["GrLivArea"], data["1stFlrSF"])

array([[ 1.        ,  0.56253825],
       [ 0.56253825,  1.        ]])

There is a positive, moderate correlation between GrLivArea and 1stFlrSF.

In [149]:
new["LogGrLivArea"] = np.log(data["GrLivArea"])
new["LogGrLivArea"].describe()

count    2919.000000
mean        7.260762
std         0.324991
min         5.811141
25%         7.026427
50%         7.275172
75%         7.463650
max         8.637994
Name: LogGrLivArea, dtype: float64

In [150]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,ExterCond,Foundation,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF,HeatingQC,Log1stFlrSF,2ndFlr,LogGrLivArea
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,depreciated,PConc,3,0,1,6.753438,4,6.75227,1,7.444249
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,no change,CBlock,3,3,1,7.141245,4,7.140453,0,7.140453
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,depreciated,PConc,3,1,1,6.82546,4,6.824374,1,7.487734
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,no change,Other,2,0,1,6.629363,3,6.867974,1,7.448334
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,depreciated,PConc,3,2,1,7.044033,4,7.04316,1,7.695303


## TotalArea: NEW feature

Ground living area only considers the space that is above the grade of the ground, and not the space that is below the grade of the ground, such as a basement. Lets combine the two to inform homeowners how much total area will be available to them.

In [151]:
new["TotalArea"] = np.sum([data["GrLivArea"], data["TotalBsmtSF"]], axis = 0)
new["TotalArea"].describe()

count     2919.000000
mean      2552.177115
std        806.018663
min        334.000000
25%       2004.000000
50%       2453.000000
75%       2996.000000
max      11752.000000
Name: TotalArea, dtype: float64

In [152]:
new["LogTotalArea"] = np.log(new["TotalArea"])
new["LogTotalArea"].describe()

count    2919.000000
mean        7.796990
std         0.312961
min         5.811141
25%         7.602900
50%         7.805067
75%         8.005033
max         9.371779
Name: LogTotalArea, dtype: float64

In [153]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,BsmtQual,BsmtExposure,BsmtFinType,LogTotalBsmtSF,HeatingQC,Log1stFlrSF,2ndFlr,LogGrLivArea,TotalArea,LogTotalArea
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,3,0,1,6.753438,4,6.75227,1,7.444249,2566.0,7.850104
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,3,3,1,7.141245,4,7.140453,0,7.140453,2524.0,7.8336
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,3,1,1,6.82546,4,6.824374,1,7.487734,2706.0,7.903227
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,2,0,1,6.629363,3,6.867974,1,7.448334,2473.0,7.813187
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,3,2,1,7.044033,4,7.04316,1,7.695303,3343.0,8.114624


## LowQualFinSF

In [154]:
data["LowQualFinSF"].describe()

count    2919.000000
mean        4.694416
std        46.396825
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1064.000000
Name: LowQualFinSF, dtype: float64

In [155]:
np.sum(data["LowQualFinSF"] == 0)

2879

Drop this variable due to lack of data.

## BsmtFullBath, BsmtHalfBath, FullBath, HalfBath

In [156]:
print data["BsmtFullBath"].isnull().sum()
print data["BsmtHalfBath"].isnull().sum()
print data["FullBath"].isnull().sum()
print data["HalfBath"].isnull().sum()

2
2
0
0


In [157]:
data["BsmtFullBath"].value_counts()

0.0    1705
1.0    1172
2.0      38
3.0       2
Name: BsmtFullBath, dtype: int64

In [158]:
data["FullBath"].value_counts()

2    1530
1    1309
3      64
0      12
4       4
Name: FullBath, dtype: int64

Combine all full bathrooms regardless of whether it is in the basement or not.

In [159]:
data["BsmtFullBath"] = data["BsmtFullBath"].fillna(0)
full = pd.Series(np.sum([data["BsmtFullBath"], data["FullBath"]], axis = 0))
full.value_counts()

2.0    1466
1.0     758
3.0     645
4.0      44
6.0       5
0.0       1
dtype: int64

In [160]:
new["FullBath"] = full.replace([0], 1).replace([3, 4, 6], 3)
new["FullBath"].value_counts()

2.0    1499
1.0     744
3.0     676
Name: FullBath, dtype: int64

In [161]:
data["BsmtHalfBath"].value_counts()

0.0    2742
1.0     171
2.0       4
Name: BsmtHalfBath, dtype: int64

In [162]:
data["HalfBath"].value_counts()

0    1834
1    1060
2      25
Name: HalfBath, dtype: int64

Similarly, combine all half bathrooms.

In [163]:
data["BsmtHalfBath"] = data["BsmtHalfBath"].fillna(0)
half = pd.Series(np.sum([data["BsmtHalfBath"], data["HalfBath"]], axis = 0))
half.value_counts()

0.0    1700
1.0    1154
2.0      61
3.0       3
4.0       1
dtype: int64

Has half bathroom?: Y (1) or No (0)

In [164]:
new["HalfBath"] = half.replace([2, 3, 4], 1)
new["HalfBath"].value_counts()

0.0    1710
1.0    1209
Name: HalfBath, dtype: int64

In [165]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,BsmtFinType,LogTotalBsmtSF,HeatingQC,Log1stFlrSF,2ndFlr,LogGrLivArea,TotalArea,LogTotalArea,FullBath,HalfBath
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,1,6.753438,4,6.75227,1,7.444249,2566.0,7.850104,3.0,1.0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,1,7.141245,4,7.140453,0,7.140453,2524.0,7.8336,2.0,1.0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,1,6.82546,4,6.824374,1,7.487734,2706.0,7.903227,3.0,1.0
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,1,6.629363,3,6.867974,1,7.448334,2473.0,7.813187,2.0,0.0
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,1,7.044033,4,7.04316,1,7.695303,3343.0,8.114624,3.0,1.0


## Bedroom

In [166]:
data["BedroomAbvGr"].value_counts()

3    1596
2     742
4     400
1     103
5      48
6      21
0       8
8       1
Name: BedroomAbvGr, dtype: int64

In [167]:
data["BedroomAbvGr"].isnull().sum()

0

In [168]:
new["BedroomAbvGr"] = data["BedroomAbvGr"].replace([5, 6, 8], 5)
new["BedroomAbvGr"].value_counts()

3    1596
2     742
4     400
1     103
5      70
0       8
Name: BedroomAbvGr, dtype: int64

In [169]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,LogTotalBsmtSF,HeatingQC,Log1stFlrSF,2ndFlr,LogGrLivArea,TotalArea,LogTotalArea,FullBath,HalfBath,BedroomAbvGr
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,6.753438,4,6.75227,1,7.444249,2566.0,7.850104,3.0,1.0,3
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,7.141245,4,7.140453,0,7.140453,2524.0,7.8336,2.0,1.0,3
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,6.82546,4,6.824374,1,7.487734,2706.0,7.903227,3.0,1.0,3
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,6.629363,3,6.867974,1,7.448334,2473.0,7.813187,2.0,0.0,3
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,7.044033,4,7.04316,1,7.695303,3343.0,8.114624,3.0,1.0,4


## KitchenAbvGr

In [170]:
data["KitchenAbvGr"].value_counts()

1    2785
2     129
0       3
3       2
Name: KitchenAbvGr, dtype: int64

In [171]:
data["KitchenAbvGr"].isnull().sum()

0

Drop this variable due to lack of data. Also, KitchenQual assumes that a kitchen is available.

## KitchenQual

In [172]:
data["KitchenQual"].value_counts()

TA    1492
Gd    1151
Ex     205
Fa      70
Name: KitchenQual, dtype: int64

In [173]:
data["KitchenQual"].isnull().sum()

1

In [174]:
new["KitchenQual"] = data["KitchenQual"].fillna("TA").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1})
new["KitchenQual"].value_counts()

2    1493
3    1151
4     205
1      70
Name: KitchenQual, dtype: int64

In [175]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,HeatingQC,Log1stFlrSF,2ndFlr,LogGrLivArea,TotalArea,LogTotalArea,FullBath,HalfBath,BedroomAbvGr,KitchenQual
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,4,6.75227,1,7.444249,2566.0,7.850104,3.0,1.0,3,3
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,4,7.140453,0,7.140453,2524.0,7.8336,2.0,1.0,3,2
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,4,6.824374,1,7.487734,2706.0,7.903227,3.0,1.0,3,3
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,3,6.867974,1,7.448334,2473.0,7.813187,2.0,0.0,3,3
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,4,7.04316,1,7.695303,3343.0,8.114624,3.0,1.0,4,3


## TotRmsAbvGrd

In [176]:
freq = data["TotRmsAbvGrd"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

In [177]:
new["TotRmsAbvGrd"] = data["TotRmsAbvGrd"].replace([2, 3], 4).replace([10, 11, 12, 13, 14, 15], 9)
new["TotRmsAbvGrd"].value_counts()

6    844
7    649
5    583
8    347
9    274
4    222
Name: TotRmsAbvGrd, dtype: int64

## Functional

In [178]:
data["Functional"].value_counts()

Typ     2717
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64

In [179]:
data["Functional"].isnull().sum()

2

Typical home functionality?: Y (1) or No (0)

In [180]:
data["Functional"] = data["Functional"].fillna("Typ")
Functional = pd.Series(np.where(data["Functional"] == "Typ", 1, 0))
Functional.value_counts()

1    2719
0     200
dtype: int64

Drop this variable since it lacks variability.

## Fireplaces

In [181]:
data["Fireplaces"].isnull().sum()

0

In [182]:
data["Fireplaces"].value_counts()

0    1420
1    1268
2     219
3      11
4       1
Name: Fireplaces, dtype: int64

Has a fireplace?: Y (1) or No (0)

In [183]:
new["Fireplaces"] = np.where(data["Fireplaces"] > 0, 1, 0)
new["Fireplaces"].value_counts()

1    1499
0    1420
Name: Fireplaces, dtype: int64

In [184]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,2ndFlr,LogGrLivArea,TotalArea,LogTotalArea,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,1,7.444249,2566.0,7.850104,3.0,1.0,3,3,8,0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,0,7.140453,2524.0,7.8336,2.0,1.0,3,2,6,1
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,1,7.487734,2706.0,7.903227,3.0,1.0,3,3,6,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,1,7.448334,2473.0,7.813187,2.0,0.0,3,3,7,1
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,1,7.695303,3343.0,8.114624,3.0,1.0,4,3,9,1


## FireplaceQu

In [185]:
data["FireplaceQu"].isnull().sum()

1420

In [186]:
data["FireplaceQu"].value_counts()

Gd    744
TA    592
Fa     74
Po     46
Ex     43
Name: FireplaceQu, dtype: int64

NA means no fireplace, and not missing.

In [187]:
new["FireplaceQu"] = data["FireplaceQu"].fillna("None").map({"Ex": 3, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
new["FireplaceQu"].value_counts()

0    1420
3     787
2     592
1     120
Name: FireplaceQu, dtype: int64

In [188]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,LogGrLivArea,TotalArea,LogTotalArea,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,7.444249,2566.0,7.850104,3.0,1.0,3,3,8,0,0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,7.140453,2524.0,7.8336,2.0,1.0,3,2,6,1,2
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,7.487734,2706.0,7.903227,3.0,1.0,3,3,6,1,2
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,7.448334,2473.0,7.813187,2.0,0.0,3,3,7,1,3
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,7.695303,3343.0,8.114624,3.0,1.0,4,3,9,1,2


## GarageType

In [189]:
data["GarageType"].isnull().sum()

157

NA means No Garage, and doesn't mean missing.

In [190]:
data["GarageType"] = data["GarageType"].fillna("None")
data["GarageType"].value_counts()

Attchd     1723
Detchd      779
BuiltIn     186
None        157
Basment      36
2Types       23
CarPort      15
Name: GarageType, dtype: int64

In [191]:
new["GarageType"] = data["GarageType"].replace(["Basment", "2Types", "CarPort"], "Other")
new["GarageType"].value_counts()

Attchd     1723
Detchd      779
BuiltIn     186
None        157
Other        74
Name: GarageType, dtype: int64

In [192]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,TotalArea,LogTotalArea,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,2566.0,7.850104,3.0,1.0,3,3,8,0,0,Attchd
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,2524.0,7.8336,2.0,1.0,3,2,6,1,2,Attchd
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,2706.0,7.903227,3.0,1.0,3,3,6,1,2,Attchd
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,2473.0,7.813187,2.0,0.0,3,3,7,1,3,Detchd
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,3343.0,8.114624,3.0,1.0,4,3,9,1,2,Attchd


## GarageYrBlt

In [193]:
data["GarageYrBlt"].describe()

count    2760.000000
mean     1978.113406
std        25.574285
min      1895.000000
25%      1960.000000
50%      1979.000000
75%      2002.000000
max      2207.000000
Name: GarageYrBlt, dtype: float64

In [194]:
data["GarageYrBlt"].isnull().sum()

159

Since 159 houses don't have garages, replace NA with 0, including the max year!

In [195]:
data["GarageYrBlt"] = data["GarageYrBlt"].fillna(0).replace([data["GarageYrBlt"].max()], 0)
data["GarageYrBlt"].describe()

count    2919.000000
mean     1869.608085
std       450.973653
min         0.000000
25%      1957.000000
50%      1977.000000
75%      2001.000000
max      2010.000000
Name: GarageYrBlt, dtype: float64

In [196]:
new["GarageYrBlt"] = pd.qcut(data["YearBuilt"], q = 4, labels = ["ancient", "older", "newer", "modern"])
new["GarageYrBlt"].value_counts()

newer      748
older      741
ancient    730
modern     700
Name: GarageYrBlt, dtype: int64

## GarageFinish

In [197]:
data["GarageFinish"].isnull().sum()

159

NA means no garage, and not missing.

In [198]:
data["GarageFinish"] = data["GarageFinish"].fillna("None")
data["GarageFinish"].value_counts()

Unf     1230
RFn      811
Fin      719
None     159
Name: GarageFinish, dtype: int64

In [199]:
new["GarageFinish"] = data["GarageFinish"]
new["GarageFinish"].value_counts()

Unf     1230
RFn      811
Fin      719
None     159
Name: GarageFinish, dtype: int64

In [200]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,3.0,1.0,3,3,8,0,0,Attchd,modern,RFn
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,2.0,1.0,3,2,6,1,2,Attchd,newer,RFn
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,3.0,1.0,3,3,6,1,2,Attchd,newer,RFn
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,2.0,0.0,3,3,7,1,3,Detchd,ancient,Unf
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,3.0,1.0,4,3,9,1,2,Attchd,newer,RFn


## GarageCars and GarageArea

In [201]:
data["GarageCars"].isnull().sum()

1

In [202]:
data["GarageCars"].value_counts()

2.0    1594
1.0     776
3.0     374
0.0     157
4.0      16
5.0       1
Name: GarageCars, dtype: int64

In [203]:
data["GarageArea"].describe()

count    2918.000000
mean      472.874572
std       215.394815
min         0.000000
25%       320.000000
50%       480.000000
75%       576.000000
max      1488.000000
Name: GarageArea, dtype: float64

In [204]:
data["GarageCars"] = data["GarageCars"].fillna(2)
data["GarageArea"] = data["GarageArea"].fillna(data["GarageArea"].mean())
py.iplot([go.Scatter(x = data["GarageArea"], y = data["GarageCars"], mode = "markers")])

In [205]:
np.corrcoef(data["GarageCars"], data["GarageArea"])

array([[ 1.       ,  0.8896861],
       [ 0.8896861,  1.       ]])

It is clear from the graph that there is a relationship between GarageCars and GarageArea. In fact, both variables have a strong positive correlation of 0.89. Garages that can fit 1-3 cars form the three biggest clusters in the graph. The minimum square footage in each cluster increases, with no bound on the maximum, allowing more cars to be fit in those garages. We choose to merge garages that can fit more than 3 cars with the garages that can fit exactly three cars since their area square footage is in the same range and that data doesn't have much density. Also, in order to reduce redundancy, we keep only one of the two variables. We picked GarageCars since most people understand how many cars they can fit in their garage as opposed to its area.

In [206]:
new["GarageCars"] = data["GarageCars"].replace([4, 5], 3)
new["GarageCars"].value_counts()

2.0    1595
1.0     776
3.0     391
0.0     157
Name: GarageCars, dtype: int64

In [207]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,1.0,3,3,8,0,0,Attchd,modern,RFn,2.0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,1.0,3,2,6,1,2,Attchd,newer,RFn,2.0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,1.0,3,3,6,1,2,Attchd,newer,RFn,2.0
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,0.0,3,3,7,1,3,Detchd,ancient,Unf,3.0
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,1.0,4,3,9,1,2,Attchd,newer,RFn,3.0


## GarageQual and GarageCond

In [208]:
data["GarageQual"].isnull().sum()

159

NA means no garage, and not missing.

In [209]:
data["GarageQual"] = data["GarageQual"].fillna("None")
data["GarageQual"].value_counts()

TA      2604
None     159
Fa       124
Gd        24
Po         5
Ex         3
Name: GarageQual, dtype: int64

In [210]:
data["GarageCond"].isnull().sum()

159

In [211]:
data["GarageCond"] = data["GarageCond"].fillna("None")
data["GarageCond"].value_counts()

TA      2654
None     159
Fa        74
Gd        15
Po        14
Ex         3
Name: GarageCond, dtype: int64

Let's compare the original garage quality (GarageQual) to the current garage condition (GarageCond).

In [212]:
original = data["GarageQual"].map({"Ex": 4, "Gd": 4, "TA": 2, "Fa": 1, "Po": 0, "None": 0})
current = data["GarageCond"].map({"Ex": 4, "Gd": 4, "TA": 2, "Fa": 1, "Po": 0, "None": 0})
cond = np.subtract(current, original)
cond.value_counts()

 0    2770
 1      78
-1      42
-2      20
 2       9
Name: GarageCond, dtype: int64

In [213]:
new["GarageRemod"] = pd.Series(cond.map(lambda x: "depreciated" if x < 0 else "improved" if x > 0 else "no change"))
new["GarageRemod"].value_counts()

no change      2770
improved         87
depreciated      62
Name: GarageRemod, dtype: int64

Was garage remodeled?: Y (1) or No (0).

In [214]:
GarageRemod = pd.Series(np.where(new["GarageRemod"] == "no change", 1, 0))
GarageRemod.value_counts()

1    2770
0     149
dtype: int64

Drop these variables since they do not provide much information.

## WoodDeckSF

In [215]:
data["WoodDeckSF"].describe()

count    2919.000000
mean       93.709832
std       126.526589
min         0.000000
25%         0.000000
50%         0.000000
75%       168.000000
max      1424.000000
Name: WoodDeckSF, dtype: float64

In [216]:
np.sum(data["WoodDeckSF"] == 0)

1523

Has a wood deck?: Y(1) or No (0)

In [217]:
new["WoodDeck"] = np.where(data["WoodDeckSF"] == 0, 0, 1)
new["WoodDeck"].value_counts()

0    1523
1    1396
Name: WoodDeck, dtype: int64

In [218]:
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageRemod,WoodDeck
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,3,8,0,0,Attchd,modern,RFn,2.0,no change,0
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,2,6,1,2,Attchd,newer,RFn,2.0,no change,1
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,3,6,1,2,Attchd,newer,RFn,2.0,no change,0
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,3,7,1,3,Detchd,ancient,Unf,3.0,no change,0
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,3,9,1,2,Attchd,newer,RFn,3.0,no change,1


## OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch

In [219]:
data["OpenPorchSF"].describe()

count    2919.000000
mean       47.486811
std        67.575493
min         0.000000
25%         0.000000
50%        26.000000
75%        70.000000
max       742.000000
Name: OpenPorchSF, dtype: float64

In [220]:
np.sum(data["OpenPorchSF"] == 0)

1298

Has an open porch?: Y (1) or No (0)

In [221]:
new["OpenPorch"] = np.where(data["OpenPorchSF"] == 0, 0, 1)
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageRemod,WoodDeck,OpenPorch
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,8,0,0,Attchd,modern,RFn,2.0,no change,0,1
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,6,1,2,Attchd,newer,RFn,2.0,no change,1,0
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,6,1,2,Attchd,newer,RFn,2.0,no change,0,1
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,7,1,3,Detchd,ancient,Unf,3.0,no change,0,1
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,9,1,2,Attchd,newer,RFn,3.0,no change,1,1


In [222]:
data["EnclosedPorch"].describe()

count    2919.000000
mean       23.098321
std        64.244246
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1012.000000
Name: EnclosedPorch, dtype: float64

In [223]:
np.sum(data["EnclosedPorch"] == 0)

2460

Majority of the houses don't have an enclosed porch. Drop this variable since it lacks variability.

In [224]:
data["3SsnPorch"].describe()

count    2919.000000
mean        2.602261
std        25.188169
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       508.000000
Name: 3SsnPorch, dtype: float64

In [225]:
np.sum(data["3SsnPorch"] == 0)

2882

An overwhelming number of houses don't have a three season porch. Drop this variable since it lacks variability.

In [226]:
data["ScreenPorch"].describe()

count    2919.000000
mean       16.062350
std        56.184365
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       576.000000
Name: ScreenPorch, dtype: float64

In [227]:
np.sum(data["ScreenPorch"] == 0)

2663

Majority of the houses don't have a screened porch. Drop this variable.

## PoolArea and PoolQC

In [228]:
data["PoolArea"].describe()

count    2919.000000
mean        2.251799
std        35.663946
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       800.000000
Name: PoolArea, dtype: float64

In [229]:
np.sum(data["PoolArea"] == 0)

2906

An overwhelming majority of the houses don't have a pool! Drop both PoolArea and PoolQC.

In [230]:
data["PoolQC"].value_counts()

Ex    4
Gd    4
Fa    2
Name: PoolQC, dtype: int64

## Fence

In [231]:
data["Fence"].isnull().sum()

2348

NA means no fence, and not missing. Since vast majority of the houses don't have a fence, drop this variable.

## MiscFeature and MiscVal

In [232]:
data["MiscFeature"].isnull().sum()

2814

NA means no miscellaneous features, and not missing. Since vast majority of the houses don't have miscellaneous features, there is no need to investigate its dollar value. Drop both variables.

In [233]:
data["MiscVal"].describe()

count     2919.000000
mean        50.825968
std        567.402211
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      17000.000000
Name: MiscVal, dtype: float64

In [234]:
np.sum(data["MiscVal"] == 0)

2816

## MoSold

In [235]:
data["MoSold"].isnull().sum()

0

In [236]:
freq = data["MoSold"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

In [237]:
new["MoSold"] = data["MoSold"]
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageRemod,WoodDeck,OpenPorch,MoSold
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,0,0,Attchd,modern,RFn,2.0,no change,0,1,2
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,1,2,Attchd,newer,RFn,2.0,no change,1,0,5
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,1,2,Attchd,newer,RFn,2.0,no change,0,1,9
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,1,3,Detchd,ancient,Unf,3.0,no change,0,1,2
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,1,2,Attchd,newer,RFn,3.0,no change,1,1,12


## YrSold

In [238]:
data["YrSold"].value_counts()

2007    692
2009    647
2008    622
2006    619
2010    339
Name: YrSold, dtype: int64

In [239]:
data["YrSold"].describe()

count    2919.000000
mean     2007.792737
std         1.314964
min      2006.000000
25%      2007.000000
50%      2008.000000
75%      2009.000000
max      2010.000000
Name: YrSold, dtype: float64

In [240]:
new["YrSold"] = data["YrSold"]
new.head()

Unnamed: 0,MSSubClass,LotFrontage,LogLotArea,LotShape,LotConfig,Neighborhood,HouseStyle,OverallQual,YearBuilt,YearRemodAdd,...,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageRemod,WoodDeck,OpenPorch,MoSold,YrSold
0,1,65.0,9.041922,1,Inside,CollgCr,2,0,modern,Never,...,0,Attchd,modern,RFn,2.0,no change,0,1,2,2008
1,1,80.0,9.169518,1,FR,Veenker,1,0,newer,Never,...,2,Attchd,newer,RFn,2.0,no change,1,0,5,2007
2,1,68.0,9.328123,0,Inside,CollgCr,2,0,newer,recent,...,2,Attchd,newer,RFn,2.0,no change,0,1,9,2008
3,2,60.0,9.164296,0,Corner,Crawfor,2,0,ancient,long ago,...,3,Detchd,ancient,Unf,3.0,no change,0,1,2,2006
4,1,84.0,9.565214,0,FR,NoRidge,2,1,newer,Never,...,2,Attchd,newer,RFn,3.0,no change,1,1,12,2008


## SaleType

In [241]:
data["SaleType"].isnull().sum()

1

In [242]:
data["SaleType"].value_counts()

WD       2525
New       239
COD        87
ConLD      26
CWD        12
ConLI       9
ConLw       8
Oth         7
Con         5
Name: SaleType, dtype: int64

In [243]:
SaleType = pd.Series(data["SaleType"] \
.replace(["CWD", "VWD"], "WD") \
.replace(["COD", "Con", "ConLw", "ConLI", "ConLD", "Oth"], "Other") \
.fillna("WD"))
SaleType.value_counts()

WD       2538
New       239
Other     142
Name: SaleType, dtype: int64

Drop this variable due to lack of variability.

## SaleCondition

In [244]:
data["SaleCondition"].isnull().sum()

0

In [245]:
data["SaleCondition"].value_counts()

Normal     2402
Partial     245
Abnorml     190
Family       46
Alloca       24
AdjLand      12
Name: SaleCondition, dtype: int64

In [246]:
SaleCondition = pd.Series(data["SaleCondition"].replace(["Family", "Alloca", "AdjLand"], "Other"))
SaleCondition.value_counts()

Normal     2402
Partial     245
Abnorml     190
Other        82
Name: SaleCondition, dtype: int64

Drop this variable due to lack of variability.

# Save cleaned-up data

In [247]:
data["SalePrice"].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [248]:
new[:1460].to_csv("trainEng.csv", index = False)  # extract rows 0 to 1459
  # append SalePrice column to CSV file
tmp = pd.read_csv("trainEng.csv")
tmp["SalePrice"] = np.log(train["SalePrice"])
tmp.to_csv("trainEng.csv", index = False)
new[1460:].to_csv("testEng.csv", index = False)  # extract rest of the rows