In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
from feature import is_outlier
from sklearn.preprocessing import Normalizer,Imputer
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from scipy.stats import skew
from sklearn.feature_selection import f_regression
plt.style.use('ggplot')

In [2]:
train_file = "../data/train.csv"
test_file = "../data/test.csv"
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

In [3]:
cols = train_data.columns.values
print(cols)

['Id' 'MSSubClass' 'MSZoning' 'LotFrontage' 'LotArea' 'Street' 'Alley'
 'LotShape' 'LandContour' 'Utilities' 'LotConfig' 'LandSlope'
 'Neighborhood' 'Condition1' 'Condition2' 'BldgType' 'HouseStyle'
 'OverallQual' 'OverallCond' 'YearBuilt' 'YearRemodAdd' 'RoofStyle'
 'RoofMatl' 'Exterior1st' 'Exterior2nd' 'MasVnrType' 'MasVnrArea'
 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond' 'BsmtExposure'
 'BsmtFinType1' 'BsmtFinSF1' 'BsmtFinType2' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' '1stFlrSF'
 '2ndFlrSF' 'LowQualFinSF' 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath'
 'FullBath' 'HalfBath' 'BedroomAbvGr' 'KitchenAbvGr' 'KitchenQual'
 'TotRmsAbvGrd' 'Functional' 'Fireplaces' 'FireplaceQu' 'GarageType'
 'GarageYrBlt' 'GarageFinish' 'GarageCars' 'GarageArea' 'GarageQual'
 'GarageCond' 'PavedDrive' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch'
 '3SsnPorch' 'ScreenPorch' 'PoolArea' 'PoolQC' 'Fence' 'MiscFeature'
 'MiscVal' 'MoSold' 'YrSold' 'SaleTy

In [4]:
train_data.shape

(1460, 81)

In [5]:
common = list(set(cols).intersection(set(test_data.columns.values)))
print(common)

['GrLivArea', 'Foundation', 'BldgType', 'TotRmsAbvGrd', 'ExterCond', 'FullBath', 'LowQualFinSF', 'BsmtFinSF2', 'BsmtExposure', 'MiscFeature', 'Condition1', 'BsmtFinType1', 'KitchenQual', 'PavedDrive', 'MSSubClass', 'Electrical', 'GarageQual', 'WoodDeckSF', 'Utilities', 'GarageCars', 'MSZoning', 'ScreenPorch', 'Id', 'MoSold', 'LotArea', 'PoolArea', 'SaleType', 'Neighborhood', 'Fireplaces', 'BsmtFinSF1', 'Alley', 'MiscVal', 'Street', 'HeatingQC', 'Condition2', 'OpenPorchSF', 'Exterior2nd', 'LandContour', 'CentralAir', 'KitchenAbvGr', 'OverallQual', 'LotShape', 'RoofMatl', 'BsmtQual', 'BsmtFinType2', 'TotalBsmtSF', 'LotFrontage', 'FireplaceQu', 'YearRemodAdd', 'RoofStyle', 'SaleCondition', 'BsmtHalfBath', '3SsnPorch', 'MasVnrType', 'GarageYrBlt', 'MasVnrArea', 'Heating', '1stFlrSF', 'ExterQual', 'BsmtUnfSF', 'YearBuilt', 'BsmtFullBath', 'GarageFinish', 'GarageCond', 'EnclosedPorch', 'PoolQC', 'Exterior1st', 'Functional', 'BedroomAbvGr', 'LandSlope', 'GarageArea', 'Fence', 'BsmtCond', '2nd

In [6]:
all_data = pd.concat((train_data.loc[:, common],
                      test_data.loc[:, common]), ignore_index=True)
all_data.shape

(2919, 80)

In [12]:
col = all_data.isnull().sum()
print (col[col > 0])

BsmtFinSF2         1
BsmtExposure      82
MiscFeature     2814
BsmtFinType1      79
KitchenQual        1
Electrical         1
GarageQual       159
Utilities          2
GarageCars         1
MSZoning           4
SaleType           1
BsmtFinSF1         1
Alley           2721
Exterior2nd        1
BsmtQual          81
BsmtFinType2      80
TotalBsmtSF        1
LotFrontage      486
FireplaceQu     1420
BsmtHalfBath       2
MasVnrType        24
GarageYrBlt      159
MasVnrArea        23
BsmtUnfSF          1
BsmtFullBath       2
GarageFinish     159
GarageCond       159
PoolQC          2909
Exterior1st        1
Functional         2
GarageArea         1
Fence           2348
BsmtCond          82
GarageType       157
dtype: int64


# Simple tranform
Since the raw data has some "dirty" data that needs to be clean up, this section we will go through each feature according to the data description.

In [None]:
all_data[["BsmtFinType1", "BsmtCond", "BsmtFinType2", "BsmtExposure", "BsmtQual"]].isnull().any(axis=1)

In [None]:
def fill_na_gen(data, cols):
    if not isinstance(data, pd.DataFrame):
        raise ValueError("The passing dataset is not a DataFrame")

    if not isinstance(cols, list):
        raise ValueError("The passing columns is not a list")

    for val in cols:
        if data[val].dtypes == "object":
            data[val].fillna(data[val].mode(), inplace=True)
        else:
            data[val].fillna(data[val].median(), inplace=True)
    col = data.isnull().sum()
    print (col[col > 0])

In [None]:
fill_cols = ["MSZoning", "Utilities", "BsmtUnfSF", "Electrical","KitchenQual",
             "BsmtFinSF1", "Utilities", "SaleType", "Functional", "Exterior1st", "Exterior2nd"]
fill_na_gen(all_data, fill_cols)

In [None]:
col = all_data.isnull().sum()
print (col[col > 0])

In [None]:
SubClass_mapping = { 20: "1S-New",
                     30: "1S-Old",
                     40: "1S-Attic",
                     45: "1.5S-Finish",
                     50: "1.5S-Unfinished",
                     60: "2S-New",
                     70: "2S-Old",
                     75: "2.5S-All",
                     80: "Split-Level",
                     85: "Split-Foyer",
                     90: "Duplex",
                     120: "1S-PUD-New",
                     150: "1.5S-PUD-All",
                     160: "2S-PUD-New",
                     180: "Split-Level-PUD",
                     190: "2F-All"}
train_data['style'] = train_data['MSSubClass'].map(SubClass_mapping)
train_data = train_data.drop(['MSSubClass'], axis=1)

In [None]:
train_data['Alley'].fillna('NoAc', inplace=True)
train_data['Alley'].value_counts()

In [None]:
train_data.MasVnrType.fillna("None", inplace=True)
train_data.MasVnrType.value_counts()

In [None]:
train_data.loc[train_data.MasVnrType=="None", 'MasVnrArea'] = 0

# Remove Skew from Price values
After exploring the distribution of the price values, we can take the logarithm to change the shape of the variable to reduce the skewness of the variables.

In [None]:
import seaborn as sns
price = train_data[train_data.columns.values[-1]]
log_price = np.log(price)
print ("The skewness of the price is {0}".format(skew(price)))

plt.figure(figsize = (10, 5))
plt.subplot(1,2,1)
sns.distplot(price, color = 'r', hist_kws = {'alpha': 0.9})
plt.title('Original Data')
plt.xlabel('Sale Price')

plt.subplot(1,2,2)
sns.distplot(log_price, color = 'r', hist_kws = {'alpha': 0.9})
plt.title('Natual log of Data')
plt.xlabel('Natual log of Sale Price')
plt.tight_layout()

# Handling missing values for numerical variables
* Remove outliers using Median Absolute Deviation
* Calculate skewness of each variable and transform it if greater than 0.75
* Apply the sklearn.Normalizer to each column

  We can compare two different numerical variables: one with skewness handling and normalization, one without it.

In [None]:
train_num_m = train_data.select_dtypes(include = ['float64', 'int64']).iloc[:,1:]
train_num = train_data.select_dtypes(include = ['float64', 'int64']).iloc[:,1:]
for col in train_num_m.columns.values:
    if np.sum(train_num_m[col].isnull() ) > 0:
        median = train_num_m[col].median()
        idx = np.where(train_num_m.isnull())[0]
        train_num_m[col].iloc[idx] = median
        train_num[col].iloc[idx] = median
        
        outliers = np.where(is_outlier(train_num_m[col]))
        train_num_m[col].iloc[outliers] = median
        
        if skew(train_num_m[col]) > 0.75:
            train_num_m[col] = np.log(train_num[col])
            train_num_m[col] = train_num_m[col].apply(lambda x: 0 if x == -np.inf else x)
            
        train_num_m[col] = Normalizer().fit_transform(train_num_m[col].reshape(1,-1))[0]

In [None]:
train_num_m[train_num_m.columns.values[-1]] = np.log(train_num_m[train_num_m.columns.values[-1]])
corr_m = train_num_m.corr()
corr = train_num.corr()
plt.figure(figsize = (15,15))
sns.heatmap(corr_m, vmax =1 , square = True)
plt.figure(figsize = (15,15))
sns.heatmap(corr, vmax=1, square = True)

In [None]:
cor_dict_m = corr_m['SalePrice'].to_dict()
del cor_dict_m['SalePrice']

# raw numerical features
cor_dict = corr["SalePrice"].to_dict()
del cor_dict['SalePrice']

print("The raw numerical features sorted by their correlation with Sale price:\n")
for item in sorted(cor_dict.items(), key = lambda x: -abs(x[1])):
    print("{0}: \t {1}".format(*item))

print("\n\nThe modified numerical features sorted by their correlation with Sale price:\n")
for item in sorted(cor_dict_m.items(), key = lambda x: -abs(x[1])):
    print("{0}: \t {1}".format(*item))

In [None]:
sns.regplot(x='OverallQual', y='SalePrice', data = train_num_m, color='Orange')

In [None]:
fig = plt.figure()
plt.scatter(train_num_m.YearBuilt.values, train_num_m.YearRemodAdd.values)
plt.title("Year Built")
fig.text(-0.01,0.5, 'Year Remodel Added', va="center", rotation = 'vertical', fontsize = 12)
print (corr_m['YearBuilt']['YearRemodAdd'])

In [None]:
plt.figure(figsize = (10, 5))
plt.subplot(1,2,1)
plt.scatter(train_num_m.GarageCars.values, train_num_m.GarageArea.values)
plt.title("Size of Garage for Cars")
fig.text(-0.01,0.5, 'Size of Garage area', va="center", rotation = 'vertical', fontsize = 12)
print(corr_m['GarageCars']['GarageArea'])
plt.subplot(1,2,2)
sns.regplot(x='GarageCars', y='GarageArea', data = train_num_m, color='Orange')

In [None]:
fig = plt.figure()
plt.scatter(train_num_m.GarageArea.values, train_num_m.GarageYrBlt.values)
plt.title("Size of Garage")
fig.text(-0.01,0.5, 'The building year for garage', va="center", rotation = 'vertical', fontsize = 12)
print(corr_m['GarageArea']['GarageYrBlt'])

In [None]:
fig = plt.figure()
plt.scatter(train_num_m.YearBuilt.values, train_num_m.GarageYrBlt.values)
plt.title("Year Built")
fig.text(-0.01,0.5, 'The year build the garage', va="center", rotation = 'vertical', fontsize = 12)
print(corr_m['YearBuilt']['GarageYrBlt'])

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Campare the importances of RF for raw and modified features
rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
y = train_num['SalePrice']
X= train_num.drop('SalePrice', axis=1)

rfr.fit(X,y)
coef = pd.Series(rfr.feature_importances_, index = X.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
plt.subplot(1,2,1)
coef.head(25).plot(kind='bar')
plt.title('Raw Feature Significance for random forest')
plt.tight_layout()

rfr_m = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
y_m = train_num_m['SalePrice']
X_m= train_num_m.drop('SalePrice', axis=1)

rfr_m.fit(X_m,y_m)
coef_m = pd.Series(rfr_m.feature_importances_, index = X_m.columns).sort_values(ascending=False)

plt.subplot(1,2,2)
coef_m.head(25).plot(kind='bar')
plt.title('Modified Feature Significance for random forest')
plt.tight_layout()

In [None]:
gbr = GradientBoostingRegressor(loss='ls', n_estimators=1000)
gbr.fit(X,y)
coef = pd.Series(gbr.feature_importances_, index = X.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
plt.subplot(1,2,1)
coef.head(25).plot(kind='bar')
plt.title('Raw Feature Significance for gradient boosting trees')
plt.tight_layout()

gbr_m = GradientBoostingRegressor(loss='ls', n_estimators=1000)
y_m = train_num_m['SalePrice']
X_m= train_num_m.drop('SalePrice', axis=1)

gbr_m.fit(X_m,y_m)
coef_m = pd.Series(gbr_m.feature_importances_, index = X_m.columns).sort_values(ascending=False)

plt.subplot(1,2,2)
coef_m.head(25).plot(kind='bar')
plt.title('Modified Feature Significance for gradient boosting trees')
plt.tight_layout()

In [None]:
_,pval = f_regression(X, y)
coef = pd.Series(pval, index=X.columns).sort_values(ascending=True)

plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
coef.head(25).plot(kind='bar')
plt.title('Raw Feature Significance')
plt.tight_layout()

_,pval_m = f_regression(X_m, y)
coef_m = pd.Series(pval_m, index=X_m.columns).sort_values(ascending=True)

plt.subplot(1,2,2) 
coef_m.head(25).plot(kind='bar')
plt.title('Modified Feature Significance')
plt.tight_layout()

# Categorical Variables

In [None]:
cat_value = train_data.select_dtypes(include=['object']).iloc[:,1:]

In [None]:
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice', data = train_data)
xt = plt.xticks(rotation = 45)

In [None]:
plt.figure (figsize = (12, 6))
sns.countplot(x = 'Neighborhood', data = cat_value)
xt = plt.xticks(rotation = 45)