In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
data = pd.concat([train, test])

# data dimensions: # rows, # columns
print train.shape
print test.shape
print data.shape

(1460, 81)
(1459, 80)
(2919, 81)


# Explore variables one at a time

In [3]:
data = data.drop(["Id"], axis = 1)  # drop this variable
data.columns.values  # print all column names

array(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual',
       'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces',
       'Foundation', 'FullBath', 'Functional', 'GarageArea', 'GarageCars',
       'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
       'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC',
       'HouseStyle', 'KitchenAbvGr', 'KitchenQual', 'LandContour',
       'LandSlope', 'LotArea', 'LotConfig', 'LotFrontage', 'LotShape',
       'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea',
       'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood',
       'OpenPorchSF', 'OverallCond', 'OverallQual', 'PavedDrive',
       'PoolArea', 'P

## MSSubClass

In [None]:
data["MSSubClass"].isnull().sum()

0

MSSubClass is categorical, though it is coded as numeric

In [None]:
freq = data["MSSubClass"].value_counts()  # frequency table
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

Combine all the 1 and 1.5 story dwelling types as 1, 2 and 2.5 story types as 2, and the rest as 0.

In [None]:
MSSubClass = data["MSSubClass"].replace([20, 30, 40, 45, 50, 120, 150], 1) \
.replace([60, 70, 75, 160], 2) \
.replace([80, 85, 90, 180, 190], 0)

In [None]:
new = pd.DataFrame({"MSSubClass": MSSubClass})  # create new dataframe
print new["MSSubClass"].value_counts()
new.head()

1    1712
2     854
0     353
Name: MSSubClass, dtype: int64


Unnamed: 0,MSSubClass
0,2
1,1
2,2
3,2
4,2


## MSZoning

In [None]:
data["MSZoning"].isnull().sum()

4

In [None]:
data["MSZoning"].value_counts()

RL         2265
RM          460
FV          139
RH           26
C (all)      25
Name: MSZoning, dtype: int64

Residential zone?: Y (1) or N (0)

In [None]:
new["MSZoning"] = data["MSZoning"].map(lambda x: 1 if (x == "RL") or (x == "RM") else 0)
new["MSZoning"].value_counts()

1    2725
0     194
Name: MSZoning, dtype: int64

## LotFrontage

In [None]:
data["LotFrontage"].describe()

count    2433.000000
mean       69.305795
std        23.344905
min        21.000000
25%        59.000000
50%        68.000000
75%        80.000000
max       313.000000
Name: LotFrontage, dtype: float64

In [None]:
new["LotFrontage"] = data["LotFrontage"].fillna(data["LotFrontage"].mean())
new["LotFrontage"].describe()

count    2919.000000
mean       69.305795
std        21.312345
min        21.000000
25%        60.000000
50%        69.305795
75%        78.000000
max       313.000000
Name: LotFrontage, dtype: float64

## LotArea

In [None]:
data["LotArea"].describe()

count      2919.000000
mean      10168.114080
std        7886.996359
min        1300.000000
25%        7478.000000
50%        9453.000000
75%       11570.000000
max      215245.000000
Name: LotArea, dtype: float64

In [None]:
new["LogLotArea"] = np.log(data["LotArea"])
new["LogLotArea"].describe()

count    2919.000000
mean        9.094785
std         0.509966
min         7.170120
25%         8.919721
50%         9.154087
75%         9.356171
max        12.279532
Name: LogLotArea, dtype: float64

## Street and PavedDrive

In [None]:
data["Street"].isnull().sum()

0

In [None]:
data["Street"].value_counts()

Pave    2907
Grvl      12
Name: Street, dtype: int64

Paved street?: Y (1) or No (0)

In [None]:
Street = pd.Series(np.where(data["Street"] == "Pave", 1, 0))
Street.value_counts()

1    2907
0      12
dtype: int64

Street has a severe lack of variability, providing little-to-no information in predicting SalePrice. Thus, this variable will not be used in the model.

In [None]:
data["PavedDrive"].isnull().sum()

0

In [None]:
data["PavedDrive"].value_counts()

Y    2641
N     216
P      62
Name: PavedDrive, dtype: int64

Paved driveway?: Y (1) or No (0)

In [None]:
new["PavedDrive"] = np.where(data["PavedDrive"] == "Y", 1, 0)
new["PavedDrive"].value_counts()

1    2641
0     278
Name: PavedDrive, dtype: int64

## Alley

In [None]:
data["Alley"].isnull().sum()

2721

NA means that houses have no alley, which is not the same as a missing value. Alley access?: Y (1) or N (0)

In [None]:
new["Alley"] = np.where(data["Alley"].isnull(), 0, 1)
new["Alley"].value_counts()

0    2721
1     198
Name: Alley, dtype: int64

## LotShape

In [None]:
data["LotShape"].isnull().sum()

0

In [None]:
data["LotShape"].value_counts()

Reg    1859
IR1     968
IR2      76
IR3      16
Name: LotShape, dtype: int64

Regular lot shape?: Y (1) or No (0)

In [None]:
new["LotShape"] = np.where(data["LotShape"] == "Reg", 1, 0)
new["LotShape"].value_counts()

1    1859
0    1060
Name: LotShape, dtype: int64

## LandContour

In [None]:
data["LandContour"].isnull().sum()

0

In [None]:
data["LandContour"].value_counts()

Lvl    2622
HLS     120
Bnk     117
Low      60
Name: LandContour, dtype: int64

Flat lot?: Y (1) or N (0)

In [None]:
new["LandContour"] = np.where(data["LandContour"] == "Lvl", 1, 0)
new["LandContour"].value_counts()

1    2622
0     297
Name: LandContour, dtype: int64

## Utilities

In [None]:
data["Utilities"].value_counts()

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

Drop this variable. It has a severe lack of variability!

## LotConfig

In [None]:
data["LotConfig"].isnull().sum()

0

In [None]:
data["LotConfig"].value_counts()

Inside     2133
Corner      511
CulDSac     176
FR2          85
FR3          14
Name: LotConfig, dtype: int64

Inside lot?: Y (1) or No (0)

In [None]:
new["LotConfig"] = np.where(data["LotConfig"] == "Inside", 1, 0)
new["LotConfig"].value_counts()

1    2133
0     786
Name: LotConfig, dtype: int64

## LandSlope

In [None]:
data["LandSlope"].isnull().sum()

0

In [None]:
data["LandSlope"].value_counts()

Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64

Gentle slope?: Y (1) or No (0)

In [None]:
new["LandSlope"] = np.where(data["LandSlope"] == "Gtl", 1, 0)
new["LandSlope"].value_counts()

1    2778
0     141
Name: LandSlope, dtype: int64

## Neighborhood

In [None]:
data["Neighborhood"].isnull().sum()

0

In [None]:
data["Neighborhood"].value_counts()

NAmes      443
CollgCr    267
OldTown    239
Edwards    194
Somerst    182
NridgHt    166
Gilbert    165
Sawyer     151
NWAmes     131
SawyerW    125
Mitchel    114
BrkSide    108
Crawfor    103
IDOTRR      93
Timber      72
NoRidge     71
StoneBr     51
SWISU       48
ClearCr     44
MeadowV     37
BrDale      30
Blmngtn     28
Veenker     24
NPkVill     23
Blueste     10
Name: Neighborhood, dtype: int64

In [None]:
new["Neighborhood"] = data["Neighborhood"]

## Condition1 and Condition2

In [None]:
data["Condition1"].value_counts()

Norm      2511
Feedr      164
Artery      92
RRAn        50
PosN        39
RRAe        28
PosA        20
RRNn         9
RRNe         6
Name: Condition1, dtype: int64

In [None]:
data["Condition2"].value_counts()

Norm      2889
Feedr       13
Artery       5
PosA         4
PosN         4
RRNn         2
RRAn         1
RRAe         1
Name: Condition2, dtype: int64

Most houses have normal condition. If either conditions are normal, record the overall condition as normal. 

In [None]:
Condition = np.logical_or(data["Condition1"] == "Norm", data["Condition2"] == "Norm")
Condition.value_counts()

True     2889
False      30
Name: Condition1, dtype: int64

Drop this variable since it lacks variability.

## BldgType

In [None]:
data["BldgType"].isnull().sum()

0

In [None]:
data["BldgType"].value_counts()

1Fam      2425
TwnhsE     227
Duplex     109
Twnhs       96
2fmCon      62
Name: BldgType, dtype: int64

Single-family detached?: Y (1) or N (0)

In [None]:
new["BldgType"] = np.where(data["BldgType"] == "1Fam", 1, 0)
new["BldgType"].value_counts()

1    2425
0     494
Name: BldgType, dtype: int64

## HouseStyle

In [None]:
data["HouseStyle"].isnull().sum()

0

In [None]:
data["HouseStyle"].value_counts()

1Story    1471
2Story     872
1.5Fin     314
SLvl       128
SFoyer      83
2.5Unf      24
1.5Unf      19
2.5Fin       8
Name: HouseStyle, dtype: int64

Combine 1-1.5 story as 1, 2-2.5 story as 2, and the rest as 0.

In [None]:
new["HouseStyle"] = data["HouseStyle"].replace(["1Story", "1.5Fin", "1.5Unf"], 1) \
.replace(["2Story", "2.5Fin", "2.5Unf"], 2) \
.replace(["SFoyer", "SLvl"], 0)
new["HouseStyle"].value_counts()

1    1804
2     904
0     211
Name: HouseStyle, dtype: int64

## OverallQual

In [None]:
data["OverallQual"].isnull().sum()

0

In [None]:
freq = data["OverallQual"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

Combine very poor/poor/fair/less than average  as -1, average/above average/good as 0, very good/excellent/very excellent as 1.

In [None]:
new["OverallQual"] = data["OverallQual"].map(lambda x: -1 if x < 4 else 0 if x < 8 else 1)
new["OverallQual"].value_counts()

 0    2382
 1     480
-1      57
Name: OverallQual, dtype: int64

## OverallCond

In [None]:
data["OverallCond"].isnull().sum()

0

In [None]:
freq = data["OverallCond"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

Above average condition?: Y (1) or No (0) 

In [None]:
new["OverallCond"] = np.where(data["OverallCond"] > 4, 1, 0)
new["OverallCond"].value_counts()

## YearBuilt 

In [None]:
data["YearBuilt"].describe()

Years usually needs to be binned.

In [None]:
new["YearBuilt"] = pd.qcut(data["YearBuilt"], q = 4, labels = ["ancient", "older", "newer", "modern"])
pd.concat((new["YearBuilt"], data["YearBuilt"]), axis = 1).head()

In [None]:
new["YearBuilt"].value_counts()

## YearRemodAdd

In [None]:
data["YearRemodAdd"].describe()

In [None]:
remodel = np.subtract(data["YearRemodAdd"], data["YearBuilt"])
remodel.describe()

In [None]:
new["YearRemodAdd"] = pd.Series(remodel.map(lambda x: "Never" if x <= 0 else "recent" if x <= 10 else "long ago"))
new["YearRemodAdd"].value_counts()

## RoofStyle

In [None]:
data["RoofStyle"].isnull().sum()

In [None]:
data["RoofStyle"].value_counts()

Gable roof?: Y (1) or No (0)

In [None]:
new["RoofStyle"] = np.where(data["RoofStyle"] == "Gable", 1, 0)
new["RoofStyle"].value_counts()

## RoofMatl

In [None]:
data["RoofMatl"].isnull().sum()

In [None]:
data["RoofMatl"].value_counts()

Standard shingle?: Y (1) or N (0)

In [None]:
RoofMatl = pd.Series(np.where(data["RoofMatl"] == "CompShg", 1, 0))
RoofMatl.value_counts()

Drop this variable since it heavily lacks variability.

## Exterior1st and Exterior2nd

In [None]:
print data["Exterior1st"].isnull().sum()
print data["Exterior2nd"].isnull().sum()

In [None]:
data["Exterior1st"].value_counts()

In [None]:
Exterior1st = data["Exterior1st"].fillna("Other") \
.replace(["BrkFace", "WdShing", "AsbShng", "Stucco", "BrkComm", "AsphShn", "Stone", "CBlock", "ImStucc", "Other"], "Other")
Exterior1st.value_counts()

In [None]:
data["Exterior2nd"].value_counts()

In [None]:
Exterior2nd = data["Exterior2nd"].fillna("Other") \
.replace(["Wd Shng", "BrkFace", "Stucco", "AsbShng", "Brk Cmn", "ImStucc", "Stone", "AsphShn", "CBlock", "Other"], "Other")
Exterior2nd.value_counts()

In [None]:
np.equal(Exterior1st, Exterior2nd).value_counts()  # check if both columns are same

Since both variables roughly have the same distribution and most houses have only one exterior material, keep only one of them in the model.

In [None]:
new["Exterior"] = Exterior1st
new["Exterior"].value_counts()

## MasVnrType and MasVnrArea

In [None]:
data["MasVnrArea"].describe()

In [None]:
data["MasVnrArea"] = data["MasVnrArea"].fillna(0)
np.sum(data["MasVnrArea"] == 0)

Since most houses don't have masonry veneer walls, the area is mostly 0.

In [None]:
data["MasVnrType"].isnull().sum()

In [None]:
data["MasVnrType"] = data["MasVnrType"].fillna("None")
data["MasVnrType"].value_counts()

It makes more sense to simply record if a house has masonry veneer rather than worrying about unnecessary details, such as its area and type.

In [None]:
new["MasVnr"] = np.where(data["MasVnrArea"] == 0, 0, 1)
new["MasVnr"].value_counts()

## ExterQual and ExterCond

In [None]:
print data["ExterQual"].isnull().sum()
print data["ExterCond"].isnull().sum()

Compare the original quality of the material to its current condition.

In [None]:
original = data["ExterQual"].map({"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1})
current = data["ExterCond"].map({"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1})
original.value_counts()

In [None]:
current.value_counts()

Get the change in condition over time.

In [None]:
cond = np.subtract(current, original)
cond.value_counts()

In [None]:
new["ExterCond"] = pd.Series(cond.map(lambda x: "depreciated" if x < 0 else "improved" if x > 0 else "no change"))
new["ExterCond"].value_counts()

## Foundation

In [None]:
data["Foundation"].isnull().sum()

In [None]:
data["Foundation"].value_counts()

In [None]:
new["Foundation"] = data["Foundation"].replace(["BrkTil", "Slab", "Stone", "Wood"], "Other")
new["Foundation"].value_counts()

## BsmtQual

This variable records the basement height.

In [None]:
data["BsmtQual"].isnull().sum()

In [None]:
data["BsmtQual"].value_counts()

NA means no basement, and not missing.

In [None]:
new["BsmtQual"] = data["BsmtQual"].fillna("None").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
new["BsmtQual"].value_counts()

## BsmtCond

In [None]:
data["BsmtCond"].isnull().sum()

In [None]:
data["BsmtCond"].value_counts()

NA means no basement, and not missing.

In [None]:
new["BsmtCond"] = data["BsmtCond"].fillna("None").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
new["BsmtCond"].value_counts()

Atleast average basement condition?: Y (1) or No (0)

In [None]:
new["BsmtCond"] = np.where(new["BsmtCond"] > 1, 1, 0)
new["BsmtCond"].value_counts()

## BsmtExposure

In [None]:
data["BsmtExposure"].isnull().sum()

NA means no basement, and not missing.

In [None]:
data["BsmtExposure"].value_counts()

Has a walkout or garden level walls?: Y (1) or No (0)

In [None]:
new["BsmtExposure"] = data["BsmtExposure"].fillna("None").map({"Gd": 1, "Av": 1, "Mn": 1, "No": 0, "None": 0})
new["BsmtExposure"].value_counts()

## BsmtFinType1 and BsmtFinType2

In [None]:
print data["BsmtFinType1"].isnull().sum()
print data["BsmtFinType2"].isnull().sum()

NA means no basement, and not missing.

In [None]:
data["BsmtFinType1"].value_counts()

In [None]:
data["BsmtFinType2"].value_counts()

In [None]:
type1 = data["BsmtFinType1"].fillna("None").map({"GLQ": 3, "ALQ": 2, "Rec": 2, "BLQ": 1, "LwQ": 1, "Unf": 0, "None": 0})
type2 = data["BsmtFinType2"].fillna("None").map({"GLQ": 3, "ALQ": 2, "Rec": 2, "BLQ": 1, "LwQ": 1, "Unf": 0, "None": 0})
type1.value_counts()

In [None]:
type2.value_counts()

In [None]:
np.equal(type1, type2).value_counts()  # most basements have a 2nd rating

Most basements can be used as a living quarter, however that space is unfinished. Hence, it makes more sense here to record if the basement is unfinished and use TotalBsmtSF to provide the total basement area.

In [None]:
new["BsmtFinType"] = pd.Series(np.logical_or(type1 == 0.0, type2 == 0.0))
new["BsmtFinType"] = np.where(new["BsmtFinType"] == True, 1, 0)
new["BsmtFinType"].value_counts()

## BsmtFinSF1, BsmtFinSF2, BsmtUnfSF

In [None]:
data["BsmtFinSF1"].describe()

In [None]:
data["BsmtFinSF2"].describe()

In [None]:
print (data["BsmtFinSF1"] == 0).sum()
print (data["BsmtFinSF2"] == 0).sum()

Since most basements are unfinished, there are just as many zeroes as in type1 and type2.

In [None]:
BsmtFinSF = np.logical_or(data["BsmtFinSF1"] == 0, data["BsmtFinSF2"] == 0)
(BsmtFinSF == True).sum()

Drop this variable since it is redundant with BsmtFinType.

In [None]:
data["BsmtUnfSF"].describe()

In [None]:
(data["BsmtUnfSF"] != 0).sum()

There are many nonzero values since most basements are unfinished. Drop this variable since it is redundant with BsmtFinType.

## TotalBsmtSF

In [None]:
data["TotalBsmtSF"].describe()

In [None]:
data["TotalBsmtSF"] = data["TotalBsmtSF"].fillna(0)
new["LogTotalBsmtSF"] = np.log(data["TotalBsmtSF"] + 1)
new["LogTotalBsmtSF"].describe()

## Heating

In [None]:
data["Heating"].value_counts()

Has gas air furnace?: Y (1) or No (0).

In [None]:
Heating = pd.Series(np.where(data["Heating"] == "GasA", 1, 0))
Heating.value_counts()

Drop this variable since it severely lacks variability.

## HeatingQC

In [None]:
data["HeatingQC"].isnull().sum()

In [None]:
data["HeatingQC"].value_counts()

In [None]:
new["HeatingQC"] = data["HeatingQC"].map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1})
new["HeatingQC"].value_counts()

## CentralAir

In [None]:
data["CentralAir"].isnull().sum()

In [None]:
data["CentralAir"].value_counts()

In [None]:
new["CentralAir"] = np.where(data["CentralAir"] == "Y", 1, 0)
new["CentralAir"].value_counts()

## Electrical

In [None]:
data["Electrical"].isnull().sum()

In [None]:
data["Electrical"].value_counts()

Standard breaker?: Y (1) or No (0)

In [None]:
new["Electrical"] = np.where(data["Electrical"] == "SBrkr", 1, 0)
new["Electrical"].value_counts()

## 1stFlrSF

In [None]:
data["1stFlrSF"].describe()

In [None]:
new["Log1stFlrSF"] = np.log(data["1stFlrSF"])
new["Log1stFlrSF"].describe()

## 2ndFlrSF

In [None]:
data["2ndFlrSF"].describe()

In [None]:
np.sum(data["2ndFlrSF"] == 0)

Has 2nd floor?: Y (1) or No (0).

In [None]:
new["SecondFlr"] = np.where(data["2ndFlrSF"] == 0, 0, 1)
new["SecondFlr"].value_counts()

## GrLivArea

In [None]:
data["GrLivArea"].describe()

In [None]:
np.corrcoef(data["GrLivArea"], data["1stFlrSF"])

There is a positive moderate correlation between GrLivArea and 1stFlrSF.

In [None]:
new["LogGrLivArea"] = np.log(data["GrLivArea"])
new["LogGrLivArea"].describe()

## TotalArea: NEW feature

Ground living area is calculated by measuring the outside perimeter of the house and includes only finished, habitable, above-grade living space. Finished basements and unfinished attic areas are not included in total gross living area. Let's combine the ground living area with the total basement area to inform homeowners how much total area will be available to them. Note that the living area on the first floor is included in the ground living area calculations, which explains the correlation between the two variables.

In [None]:
TotalArea = pd.Series(np.sum([data["GrLivArea"], data["TotalBsmtSF"]], axis = 0))
TotalArea.describe()

In [None]:
new["LogTotalArea"] = np.log(TotalArea)
new["LogTotalArea"].describe()

## LowQualFinSF

In [None]:
data["LowQualFinSF"].describe()

In [None]:
np.sum(data["LowQualFinSF"] == 0)

Drop this variable due to lack of data.

## BsmtFullBath, BsmtHalfBath, FullBath, HalfBath

In [None]:
print data["BsmtFullBath"].isnull().sum()
print data["BsmtHalfBath"].isnull().sum()
print data["FullBath"].isnull().sum()
print data["HalfBath"].isnull().sum()

In [None]:
data["BsmtFullBath"].value_counts()

In [None]:
data["FullBath"].value_counts()

Combine all full bathrooms, regardless of whether it is in the basement or not.

In [None]:
data["BsmtFullBath"] = data["BsmtFullBath"].fillna(0)
full = pd.Series(np.sum([data["BsmtFullBath"], data["FullBath"]], axis = 0))
full.value_counts()

In [None]:
new["FullBath"] = full.replace([0], 1).replace([3, 4, 6], 3)
new["FullBath"].value_counts()

In [None]:
data["BsmtHalfBath"].value_counts()

In [None]:
data["HalfBath"].value_counts()

Similarly, combine all half bathrooms.

In [None]:
data["BsmtHalfBath"] = data["BsmtHalfBath"].fillna(0)
half = pd.Series(np.sum([data["BsmtHalfBath"], data["HalfBath"]], axis = 0))
half.value_counts()

Has half bathroom?: Y (1) or No (0)

In [None]:
new["HalfBath"] = half.replace([2, 3, 4], 1)
new["HalfBath"].value_counts()

## Bedroom

In [None]:
data["BedroomAbvGr"].value_counts()

In [None]:
data["BedroomAbvGr"].isnull().sum()

In [None]:
new["BedroomAbvGr"] = data["BedroomAbvGr"].replace([0], 1).replace([4, 5, 6, 8], 4)
new["BedroomAbvGr"].value_counts()

## KitchenAbvGr

In [None]:
data["KitchenAbvGr"].value_counts()

In [None]:
data["KitchenAbvGr"].isnull().sum()

Drop this variable due to lack of data. Also, KitchenQual assumes that a kitchen is available.

## KitchenQual

In [None]:
data["KitchenQual"].value_counts()

In [None]:
data["KitchenQual"].isnull().sum()

In [None]:
new["KitchenQual"] = data["KitchenQual"].fillna("TA").map({"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1})
new["KitchenQual"].value_counts()

Above average kitchen quality?: Y (1) or No (0)

In [None]:
new["KitchenQual"] = np.where(new["KitchenQual"] > 2, 1, 0)
new["KitchenQual"].value_counts()

## TotRmsAbvGrd

In [None]:
freq = data["TotRmsAbvGrd"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

In [None]:
new["TotRmsAbvGrd"] = data["TotRmsAbvGrd"].replace([2, 3], 4).replace([10, 11, 12, 13, 14, 15], 9)
new["TotRmsAbvGrd"].value_counts()

## Functional

In [None]:
data["Functional"].value_counts()

In [None]:
data["Functional"].isnull().sum()

Typical home functionality?: Y (1) or No (0)

In [None]:
data["Functional"] = data["Functional"].fillna("Typ")
new["Functional"] = np.where(data["Functional"] == "Typ", 1, 0)
new["Functional"].value_counts()

## Fireplaces

In [None]:
data["Fireplaces"].isnull().sum()

In [None]:
data["Fireplaces"].value_counts()

Has a fireplace?: Y (1) or No (0)

In [None]:
new["Fireplaces"] = np.where(data["Fireplaces"] > 0, 1, 0)
new["Fireplaces"].value_counts()

## FireplaceQu

In [None]:
data["FireplaceQu"].isnull().sum()

In [None]:
data["FireplaceQu"].value_counts()

NA means no fireplace, and not missing.

In [None]:
new["FireplaceQu"] = data["FireplaceQu"].fillna("None").map({"Ex": 3, "Gd": 3, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
new["FireplaceQu"].value_counts()

## GarageType

In [None]:
data["GarageType"].isnull().sum()

NA means No Garage, and doesn't mean missing.

In [None]:
data["GarageType"] = data["GarageType"].fillna("None")
data["GarageType"].value_counts()

Attached garage?: Y (1) or No (0)

In [None]:
new["GarageType"] = np.where(data["GarageType"] == "Attchd", 1, 0)
new["GarageType"].value_counts()

## GarageYrBlt

In [None]:
data["GarageYrBlt"].describe()

In [None]:
data["GarageYrBlt"].isnull().sum()

Since 159 houses don't have garages, replace NA with 0, including the max year!

In [None]:
data["GarageYrBlt"] = data["GarageYrBlt"].fillna(0).replace([data["GarageYrBlt"].max()], 0)
data["GarageYrBlt"].describe()

In [None]:
new["GarageYrBlt"] = pd.qcut(data["YearBuilt"], q = 4, labels = ["ancient", "older", "newer", "modern"])
new["GarageYrBlt"].value_counts()

## GarageFinish

In [None]:
data["GarageFinish"].isnull().sum()

NA means no garage, and not missing.

In [None]:
data["GarageFinish"] = data["GarageFinish"].fillna("None")
data["GarageFinish"].value_counts()

In [None]:
new["GarageFinish"] = data["GarageFinish"]
new["GarageFinish"].value_counts()

## GarageCars and GarageArea

In [None]:
data["GarageCars"].isnull().sum()

In [None]:
data["GarageCars"].value_counts()

In [None]:
data["GarageArea"].describe()

In [None]:
data["GarageCars"] = data["GarageCars"].fillna(2)
data["GarageArea"] = data["GarageArea"].fillna(data["GarageArea"].mean())
py.iplot([go.Scatter(x = data["GarageArea"], y = data["GarageCars"], mode = "markers")])

In [None]:
np.corrcoef(data["GarageCars"], data["GarageArea"])

It is clear from the graph that there is a relationship between GarageCars and GarageArea. In fact, both variables have a strong positive correlation of 0.89. Garages that can fit 1-3 cars form the three biggest clusters in the graph. We choose to merge garages that can fit more than 3 cars with the garages that can fit exactly three cars since they share same area square footage range and that data doesn't have much density. Also, in order to reduce redundancy, we keep only one of the two variables. We picked GarageCars since most people understand how many cars they can fit in their garage as opposed to its area. In fact, the number of cars that a garage can fit can be viewed as binning classes for the garage area.

In [None]:
new["GarageCars"] = data["GarageCars"].replace([4, 5], 3)
new["GarageCars"].value_counts()

## GarageQual and GarageCond

In [None]:
data["GarageQual"].isnull().sum()

NA means no garage, and not missing.

In [None]:
data["GarageQual"] = data["GarageQual"].fillna("None")
data["GarageQual"].value_counts()

In [None]:
data["GarageCond"].isnull().sum()

In [None]:
data["GarageCond"] = data["GarageCond"].fillna("None")
data["GarageCond"].value_counts()

Let's compare the original garage quality (GarageQual) to the current garage condition (GarageCond).

In [None]:
original = data["GarageQual"].map({"Ex": 4, "Gd": 4, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
current = data["GarageCond"].map({"Ex": 4, "Gd": 4, "TA": 2, "Fa": 1, "Po": 1, "None": 0})
cond = np.subtract(current, original)
cond.value_counts()

In [None]:
new["GarageRemod"] = pd.Series(cond.map(lambda x: "depreciated" if x < 0 else "improved" if x > 0 else "no change"))
new["GarageRemod"].value_counts()

Was garage remodeled?: Y (1) or No (0).

In [None]:
new["GarageRemod"] = np.where(new["GarageRemod"] == "no change", 1, 0)
new["GarageRemod"].value_counts()

## WoodDeckSF

In [None]:
data["WoodDeckSF"].describe()

In [None]:
np.sum(data["WoodDeckSF"] == 0)

Has a wood deck?: Y(1) or No (0)

In [None]:
new["WoodDeck"] = np.where(data["WoodDeckSF"] == 0, 0, 1)
new["WoodDeck"].value_counts()

## OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch

In [None]:
data["OpenPorchSF"].describe()

In [None]:
np.sum(data["OpenPorchSF"] == 0)

In [None]:
data["EnclosedPorch"].describe()

In [None]:
np.sum(data["EnclosedPorch"] == 0)

In [None]:
data["3SsnPorch"].describe()

In [None]:
np.sum(data["3SsnPorch"] == 0)

In [None]:
data["ScreenPorch"].describe()

In [None]:
np.sum(data["ScreenPorch"] == 0)

Has porch?: Y (1) or No (0)

In [None]:
new["TotalPorchSF"] = np.sum([data["OpenPorchSF"], data["EnclosedPorch"], data["3SsnPorch"], data["ScreenPorch"]], axis = 0)
new["TotalPorchSF"].describe()

In [None]:
np.sum(new["TotalPorchSF"] == 0)

## PoolArea and PoolQC

In [None]:
data["PoolArea"].describe()

In [None]:
np.sum(data["PoolArea"] == 0)

An overwhelming majority of the houses don't have a pool! Drop both PoolArea and PoolQC.

In [None]:
data["PoolQC"].value_counts()

## Fence

In [None]:
data["Fence"].isnull().sum()

NA means no fence, and not missing. Has Fence?: Y (1) or No (0)

In [None]:
new["Fence"] = np.where(data["Fence"].isnull(), 0 , 1)
new["Fence"].value_counts()

## MiscFeature and MiscVal

In [None]:
data["MiscFeature"].isnull().sum()

NA means no miscellaneous features, and not missing. Since vast majority of the houses don't have miscellaneous features, there is no need to investigate its dollar value. Drop both variables.

In [None]:
data["MiscVal"].describe()

In [None]:
np.sum(data["MiscVal"] == 0)

## MoSold

In [None]:
data["MoSold"].isnull().sum()

In [None]:
freq = data["MoSold"].value_counts()
py.iplot([go.Bar(x = freq.keys(), y = freq.values)])

In [None]:
new["MoSold"] = data["MoSold"]

## YrSold

In [None]:
data["YrSold"].value_counts()

In [None]:
data["YrSold"].describe()

In [None]:
new["YrSold"] = data["YrSold"]

## SaleType

In [None]:
data["SaleType"].isnull().sum()

In [None]:
data["SaleType"].value_counts()

Combine all types of warranty deed and all types of contracts, but keep newly constructed and sold homes separate.

In [None]:
new["SaleType"] = data["SaleType"] \
.replace(["CWD", "VWD"], "WD") \
.replace(["COD", "Con", "ConLw", "ConLI", "ConLD", "Oth"], "Other") \
.fillna("WD")
new["SaleType"].value_counts()

## SaleCondition

In [None]:
data["SaleCondition"].isnull().sum()

In [None]:
data["SaleCondition"].value_counts()

Normal sale?: Y (1) or No (0)

In [None]:
new["SaleCondition"] = np.where(data["SaleCondition"] == "Normal", 1, 0)
new["SaleCondition"].value_counts()

# Save cleaned-up data

In [None]:
data["SalePrice"].describe()

In [None]:
new[:1460].to_csv("../data/trainEng.csv", index = False)  # extract rows 0 to 1459
  # append SalePrice column to CSV file
tmp = pd.read_csv("../data/trainEng.csv")
tmp["LogSalePrice"] = np.log(train["SalePrice"])
tmp.to_csv("../data/trainEng.csv", index = False)
new[1460:].to_csv("../data/testEng.csv", index = False)  # extract rest of the rows