# Initial Setup and Data Load

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from scipy.stats import norm, skew

import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error

import string
import warnings
warnings.filterwarnings('ignore')

In [3]:
PATH = "../../../data/house_pricing/"

In [4]:
df_train=pd.read_csv(f'{PATH}train.csv')#, index_col='Id')
df_test=pd.read_csv(f'{PATH}test.csv')#, index_col='Id')

### Y (target value) to Log, as stated at Kaggle Evaluation page

In [5]:
# for the purpose of evaluation of current competition
#df_train.SalePrice = np.log1p(df_train.SalePrice)
df_train.SalePrice = np.log1p(df_train.SalePrice)

In [6]:
print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['SalePrice'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
#print(df_train.columns)
#print(df_test.columns)

Number of Training Examples = 1460
Number of Test Examples = 1459

Training X Shape = (1460, 81)
Training y Shape = 1460

Test X Shape = (1459, 80)
Test y Shape = 1459



In [7]:
#print(df_train.info())
#df_train.sample(3)
#print(df_test.info())
#df_test.sample(3)

# DataFrame concatination and Y separation

In [8]:
def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set on axis 0
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]

df_all.shape

(2919, 81)

In [152]:
#remember where to divide train and test
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

In [10]:
#Dividing Target column (Y)
y_train_full = df_train.SalePrice.values
df_all.drop(['SalePrice'], axis=1, inplace=True)

# Dealing with Missing Values

### Create columns to mark originally missed values

In [11]:
def mark_missing (df):
    for col in df.columns:
        if df_all[col].isnull().sum()>0:
            df_all[col+'_missed']=df_all[col].isnull()

In [12]:
mark_missing(df_all)

In [13]:
df_all.shape

(2919, 114)

### Replace Missing

In [22]:
def display_missing(df):
    for col in df.columns:
        print(col, df[col].isnull().sum())
    print('\n')
    
for df in dfs:
    print(format(df.name))
    display_missing(df)
    
    
    
#Check remaining missing values if any 
def display_only_missing(df):
    all_data_na = (df.isnull().sum() / len(df)) * 100
    all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
    print(missing_data)

Training Set
Id 0
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinSF1 0
BsmtFinType2 38
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
M

In [23]:
display_only_missing(df_all)

              Missing Ratio
PoolQC            99.657417
MiscFeature       96.402878
Alley             93.216855
Fence             80.438506
FireplaceQu       48.646797
LotFrontage       16.649538
GarageQual         5.447071
GarageCond         5.447071
GarageFinish       5.447071
GarageYrBlt        5.447071
GarageType         5.378554
BsmtExposure       2.809181
BsmtCond           2.809181
BsmtQual           2.774923
BsmtFinType2       2.740665
BsmtFinType1       2.706406
MasVnrType         0.822199
MasVnrArea         0.787941
MSZoning           0.137033
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Utilities          0.068517
Functional         0.068517
Electrical         0.034258
BsmtUnfSF          0.034258
Exterior1st        0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageArea         0.034258
GarageCars         0.034258
BsmtFinSF2         0.034258
BsmtFinSF1         0.034258
KitchenQual        0.034258
SaleType           0.034258


### Replace non-missing but "NA", "None", etc values by Data description

##### Replace NA in Object columns

In [29]:
display_only_missing(df_all)

              Missing Ratio
LotFrontage       16.649538
GarageYrBlt        5.447071
MasVnrArea         0.787941
MSZoning           0.137033
Utilities          0.068517
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Functional         0.068517
Exterior1st        0.034258
BsmtFinSF2         0.034258
BsmtUnfSF          0.034258
Electrical         0.034258
GarageArea         0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageCars         0.034258
KitchenQual        0.034258
SaleType           0.034258
BsmtFinSF1         0.034258


In [30]:
# fill NA values (not missed) with None - based on data description -  - for non-Numerical (object) Columns
for col in ('Alley','MasVnrType','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','FireplaceQu','GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'PoolQC','Fence','MiscFeature'):
    df_all[col] = df_all[col].fillna('None')

##### Replace NA in Numerical columns

In [26]:
display_only_missing(df_all)

              Missing Ratio
LotFrontage       16.649538
GarageYrBlt        5.447071
MasVnrArea         0.787941
MSZoning           0.137033
Utilities          0.068517
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Functional         0.068517
Exterior1st        0.034258
BsmtFinSF2         0.034258
BsmtUnfSF          0.034258
Electrical         0.034258
GarageArea         0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageCars         0.034258
KitchenQual        0.034258
SaleType           0.034258
BsmtFinSF1         0.034258


In [37]:
#fill NA numerical value with '0' - based on data description of correspondent Object columns - for Numerical Columns
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars','BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','MasVnrArea'):
    df_all[col] = df_all[col].fillna(0)

##### Replace NA missing values by most often in column (only for columns with 2 and less NA values, where do not make sense to invest hugely into Analysis)

In [38]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538
MSZoning          0.137033
Utilities         0.068517
Functional        0.068517
SaleType          0.034258
KitchenQual       0.034258
Exterior2nd       0.034258
Exterior1st       0.034258
Electrical        0.034258


In [45]:
# Fill missing value in corresponding columns with most frequent value in column
for col in ('Utilities','Functional','SaleType','KitchenQual','Exterior2nd','Exterior1st','Electrical'):
    df_all[col].fillna(df_all[col].mode()[0], inplace=True)
    
# Functional : data description says NA means typical
# BTW we just used df_all.Functional.mode() = use most frequent value (as 'Typ' is most frequent value)
#df_all["Functional"] = df_all["Functional"].fillna("Typ")

### Replacing real missing values

##### Dealing with missing values left

In [44]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538
MSZoning          0.137033


In [None]:
# Dealing with MSZoning

In [46]:
df_all.MSZoning.isnull().sum()

4

In [48]:
df_all["MSZoning"] = df_all["MSZoning"].fillna("None")

In [49]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538


In [51]:
# Dealing with LotFrontage

In [56]:
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='LotFrontage').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('LotFrontage'), inplace=True)
df_corr.tail

<bound method NDFrame.tail of                LotFrontage
LotFrontage       1.000000
1stFlrSF          0.457181
LotArea           0.426095
GrLivArea         0.402797
TotalBsmtSF       0.392075
SalePrice         0.355879
TotRmsAbvGrd      0.352096
GarageArea        0.344997
GarageCars        0.285691
Fireplaces        0.266639
BedroomAbvGr      0.263170
OverallQual       0.251646
BsmtFinSF1        0.233633
PoolArea          0.206167
FullBath          0.198769
MasVnrArea        0.193458
OpenPorchSF       0.151972
BsmtUnfSF         0.132644
YearBuilt         0.123349
BsmtFullBath      0.100949
YearRemodAdd      0.088866
WoodDeckSF        0.088521
2ndFlrSF          0.080177
GarageYrBlt       0.070250
3SsnPorch         0.070029
HalfBath          0.053532
BsmtFinSF2        0.049900
ScreenPorch       0.041383
LowQualFinSF      0.038469
MoSold            0.011200
EnclosedPorch     0.010700
YrSold            0.007450
MiscVal           0.003368
KitchenAbvGr      0.006069
BsmtHalfBath      0.00723

In [55]:
df_all.groupby("Neighborhood")["LotFrontage"].median()

Neighborhood
Blmngtn    43.0
Blueste    24.0
BrDale     21.0
BrkSide    51.0
ClearCr    80.5
CollgCr    70.0
Crawfor    70.0
Edwards    65.0
Gilbert    64.0
IDOTRR     60.0
MeadowV    21.0
Mitchel    74.0
NAmes      73.0
NPkVill    24.0
NWAmes     80.0
NoRidge    89.0
NridgHt    92.0
OldTown    60.0
SWISU      60.0
Sawyer     72.0
SawyerW    67.0
Somerst    72.5
StoneBr    60.0
Timber     82.0
Veenker    80.0
Name: LotFrontage, dtype: float64

In [57]:
df_all.groupby("MSSubClass")["LotFrontage"].median()

MSSubClass
20     75.0
30     60.0
40     55.0
45     55.0
50     60.0
60     75.0
70     60.0
75     65.0
80     78.0
85     72.0
90     70.0
120    43.0
150     NaN
160    24.0
180    21.0
190    60.0
Name: LotFrontage, dtype: float64

In [58]:
# LotFrontage : Since the area of each street connected to the house property most likely have a similar area to other houses in its neighborhood , we can fill in missing values by the median LotFrontage of the neighborhood.
# Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
df_all["LotFrontage"] = df_all.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

In [59]:
df_all['LotFrontage']

0        65.0
1        80.0
2        68.0
3        60.0
4        84.0
5        85.0
6        75.0
7        80.0
8        51.0
9        50.0
10       70.0
11       85.0
12       72.0
13       91.0
14       73.0
15       51.0
16       73.0
17       72.0
18       66.0
19       70.0
20      101.0
21       57.0
22       75.0
23       44.0
24       72.0
25      110.0
26       60.0
27       98.0
28       47.0
29       60.0
        ...  
2889     50.0
2890     75.0
2891     69.0
2892     50.0
2893     60.0
2894     41.0
2895     44.0
2896     69.0
2897     65.0
2898     70.0
2899    140.0
2900     82.0
2901     82.0
2902     95.0
2903     88.0
2904    125.0
2905     78.0
2906     41.0
2907     58.0
2908     74.0
2909     21.0
2910     21.0
2911     80.0
2912     21.0
2913     21.0
2914     21.0
2915     21.0
2916    160.0
2917     62.0
2918     74.0
Name: LotFrontage, Length: 2919, dtype: float64

In [60]:
display_only_missing(df_all)

Empty DataFrame
Columns: [Missing Ratio]
Index: []


In [61]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Columns: 114 entries, 1stFlrSF to Utilities_missed
dtypes: bool(34), float64(11), int64(26), object(43)
memory usage: 1.9+ MB


##### Seems no missed values
Missing Values = DONE

# Pre-Evaluation - benchmarking before Feature Generation

## Making Training, Validation, Test Dataset

In [143]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
#def original_train_test(df_all):
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [135]:
#df_all.shape, y_train_full.shape, X_test.shape, X_train_full.shape

In [136]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)

In [94]:
#X_train.shape, X_valid.shape, y_train.shape

## Evaluation

In [71]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

### Experimenting with Random Forest

In [93]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

[0.05150031238369064, 0.14697048141792002, 0.9827975582916019, 0.8766027278831514, 0.8735696527697776]


### XGBoost

In [91]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

CPU times: user 18.9 s, sys: 145 ms, total: 19 s
Wall time: 20 s
[0.07613898691186147, 0.13331673102187008, 0.9624002847448923, 0.8984652654719681]


# Dealing with categorical values

In [110]:
def show_object_columns(df):
    for col in df:
        if is_string_dtype(df[col]):
            print(col)
show_object_columns(df_all)

Alley
BldgType
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
BsmtQual
CentralAir
Condition1
Condition2
Electrical
ExterCond
ExterQual
Exterior1st
Exterior2nd
Fence
FireplaceQu
Foundation
Functional
GarageCond
GarageFinish
GarageQual
GarageType
Heating
HeatingQC
HouseStyle
KitchenQual
LandContour
LandSlope
LotConfig
LotShape
MSZoning
MasVnrType
MiscFeature
Neighborhood
PavedDrive
PoolQC
RoofMatl
RoofStyle
SaleCondition
SaleType
Street
Utilities


In [None]:
# Transforming some numerical variables that are really categorical

# MSSubClass=The building class
#df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)


# Changing OverallCond into a categorical variable
#df_all['OverallCond'] = df_all['OverallCond'].astype(str)


# Year and month sold are transformed into categorical features.
#df_all['YrSold'] = df_all['YrSold'].astype(str)
#df_all['MoSold'] = df_all['MoSold'].astype(str)

In [112]:
df_all.info(114)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 114 columns):
1stFlrSF               int64
2ndFlrSF               int64
3SsnPorch              int64
Alley                  object
BedroomAbvGr           int64
BldgType               object
BsmtCond               object
BsmtExposure           object
BsmtFinSF1             float64
BsmtFinSF2             float64
BsmtFinType1           object
BsmtFinType2           object
BsmtFullBath           float64
BsmtHalfBath           float64
BsmtQual               object
BsmtUnfSF              float64
CentralAir             object
Condition1             object
Condition2             object
Electrical             object
EnclosedPorch          int64
ExterCond              object
ExterQual              object
Exterior1st            object
Exterior2nd            object
Fence                  object
FireplaceQu            object
Fireplaces             int64
Foundation             object
FullBath               

In [117]:
# convert object columns to categorical
def conv_obj_to_categories(df):
    """
    Convert Object columns to Categorical
    """
    for col in df:
        if is_string_dtype(df[col]):
            df[col]=df[col].astype('category')


In [118]:



conv_obj_to_categories(df_all)






In [119]:
def show_categorical_columns(df):
    """
    Print only categorical columns Number, Name and Codes of unique values in corresponding column 
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            print(sum(np.unique(df[col].cat.categories,return_counts=True)[1]), col ,df[col].cat.categories)

In [120]:
show_categorical_columns(df_all)

3 Alley Index(['Grvl', 'None', 'Pave'], dtype='object')
5 BldgType Index(['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], dtype='object')
5 BsmtCond Index(['Fa', 'Gd', 'None', 'Po', 'TA'], dtype='object')
5 BsmtExposure Index(['Av', 'Gd', 'Mn', 'No', 'None'], dtype='object')
7 BsmtFinType1 Index(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'None', 'Rec', 'Unf'], dtype='object')
7 BsmtFinType2 Index(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'None', 'Rec', 'Unf'], dtype='object')
5 BsmtQual Index(['Ex', 'Fa', 'Gd', 'None', 'TA'], dtype='object')
2 CentralAir Index(['N', 'Y'], dtype='object')
9 Condition1 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
       'RRNn'],
      dtype='object')
8 Condition2 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn'], dtype='object')
5 Electrical Index(['FuseA', 'FuseF', 'FuseP', 'Mix', 'SBrkr'], dtype='object')
5 ExterCond Index(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype='object')
4 ExterQual Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object

In [121]:
def unique_categories(df,n=float("inf")):
    """
    Print only categorical columns Names and Number of unique values in corresponding column 
    df - DataFrame
    n - show only columns with less then N unique values, 
        as default - not show column if more than 10000 unique value - not pseudo categorical
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            if sum(np.unique(df[col].cat.categories,return_counts=True)[1])<n:
                print(col, sum(np.unique(df[col].cat.categories,return_counts=True)[1]))

In [124]:
unique_categories(df_all)

Alley 3
BldgType 5
BsmtCond 5
BsmtExposure 5
BsmtFinType1 7
BsmtFinType2 7
BsmtQual 5
CentralAir 2
Condition1 9
Condition2 8
Electrical 5
ExterCond 5
ExterQual 4
Exterior1st 15
Exterior2nd 16
Fence 5
FireplaceQu 6
Foundation 6
Functional 7
GarageCond 6
GarageFinish 4
GarageQual 6
GarageType 7
Heating 6
HeatingQC 5
HouseStyle 8
KitchenQual 4
LandContour 4
LandSlope 3
LotConfig 5
LotShape 4
MSZoning 6
MasVnrType 4
MiscFeature 5
Neighborhood 25
PavedDrive 3
PoolQC 4
RoofMatl 8
RoofStyle 6
SaleCondition 6
SaleType 9
Street 2
Utilities 2


## Check numeric columns (if they are actually Categorical, like Year)

### Experimenting - heavily convert NUMERICAL to CATEGORICAL

In [125]:
df_allcats=df_all.copy()

In [126]:
### Experimenting with Numerical Categories
def conv_num_cat (df):
    for col in df:
        if is_numeric_dtype(df[col]): 
            df[col]=df[col].astype('category')
        else:
            df.drop(columns=col, inplace=True)

In [127]:
conv_num_cat(df_allcats)

In [128]:
unique_categories(df_allcats,20)

BedroomAbvGr 8
BsmtFullBath 4
BsmtHalfBath 3
Fireplaces 5
FullBath 5
GarageCars 6
HalfBath 3
KitchenAbvGr 4
MSSubClass 16
MoSold 12
OverallCond 9
OverallQual 10
PoolArea 14
TotRmsAbvGrd 14
YrSold 5
Alley_missed 2
BsmtCond_missed 2
BsmtExposure_missed 2
BsmtFinSF1_missed 2
BsmtFinSF2_missed 2
BsmtFinType1_missed 2
BsmtFinType2_missed 2
BsmtFullBath_missed 2
BsmtHalfBath_missed 2
BsmtQual_missed 2
BsmtUnfSF_missed 2
Electrical_missed 2
Exterior1st_missed 2
Exterior2nd_missed 2
Fence_missed 2
FireplaceQu_missed 2
Functional_missed 2
GarageArea_missed 2
GarageCars_missed 2
GarageCond_missed 2
GarageFinish_missed 2
GarageQual_missed 2
GarageType_missed 2
GarageYrBlt_missed 2
KitchenQual_missed 2
LotFrontage_missed 2
MSZoning_missed 2
MasVnrArea_missed 2
MasVnrType_missed 2
MiscFeature_missed 2
PoolQC_missed 2
SaleType_missed 2
TotalBsmtSF_missed 2
Utilities_missed 2


In [None]:
#conv_to_cat_longlist=['BedroomAbvGr', 'BsmtFullBath','BsmtHalfBath', 'Fireplaces', 'FullBath',\
#             'GarageCars','HalfBath','KitchenAbvGr','MSSubClass','MoSold','OverallCond',\
#             'OverallQual','PoolArea','TotRmsAbvGrd','YrSold']

In [None]:
conv_to_cat_shortlist=['HalfBath','MSSubClass', 'MoSold','OverallCond', 'OverallQual','YrSold']

In [None]:
#for cat in conv_to_cat_longlist:
#    df_all[cat]=df_all[cat].astype('category')

for cat in conv_to_cat_shortlist:
    df_all[cat]=df_all[cat].astype('category')

In [109]:
df_all.info(114)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 114 columns):
1stFlrSF               int64
2ndFlrSF               int64
3SsnPorch              int64
Alley                  object
BedroomAbvGr           int64
BldgType               object
BsmtCond               object
BsmtExposure           object
BsmtFinSF1             float64
BsmtFinSF2             float64
BsmtFinType1           object
BsmtFinType2           object
BsmtFullBath           float64
BsmtHalfBath           float64
BsmtQual               object
BsmtUnfSF              float64
CentralAir             object
Condition1             object
Condition2             object
Electrical             object
EnclosedPorch          int64
ExterCond              object
ExterQual              object
Exterior1st            object
Exterior2nd            object
Fence                  object
FireplaceQu            object
Fireplaces             int64
Foundation             object
FullBath               

# Feature Importance

In [147]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [148]:
fi[:50]

Unnamed: 0,feature,importance
71,Sizes_Total,0.323823
28,OverallQual,0.25435
15,GrLivArea,0.050013
73,Age_Build,0.036942
34,YearBuilt,0.033387
144,ExterQual_TA,0.026398
13,GarageCars,0.015666
32,TotalBsmtSF,0.014533
0,1stFlrSF,0.014353
74,Age_Remod,0.014237


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
def find_features_to_drop(X_train, X_valid, y_train, y_valid):
    """ Using RandomForest identifies important feature 
    and one by one drop least important features from DataFrame to improve model score
    input - X_train, X_valid, y_train, y_valid, same as used in training and evaluation model using train/valid split
    """
    m_feature_to_drop = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=False)
    # to try - not use actual feature importance each iteration, but use only first one
    #        m_feature_to_drop.fit(X_train, y_train)
    #        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_feature_to_drop.feature_importances_}).sort_values('importance',ascending=False)
    
    # Number of features in DataFrame
    num_of_features=X_train.shape[1]
    
    list_of_original_columns=X_train.columns
    
    best_grade=1
    list_of_feature_to_drop=pd.DataFrame()
    #grades={}
    
    for iteration in range(0, num_of_features):
            
        # Iteratively fit model with features without 1 least important (dropped in previos iteration)
        m_feature_to_drop.fit(X_train, y_train)
        # Evaluating performance withot this feature
        grade=math.sqrt(mean_squared_error(y_valid, m_feature_to_drop.predict(X_valid)))

        # Updating based on new model list of feature importance
        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_feature_to_drop.feature_importances_}).sort_values('importance',ascending=False)

        # Finding best score
        if grade<best_grade:
            best_grade=grade
            best_num_of_features=(num_of_features-iteration)
            list_of_feature_to_drop=list_of_original_columns.difference(fi.feature)

        # Dropping last 1 (least important feature)
        X_train=X_train.drop(columns=fi.feature[-1:])
        X_valid=X_valid.drop(columns=fi.feature[-1:])

        print ((num_of_features-iteration),grade, fi.feature[-1:])
        #grades.update({(num_of_features-iteration):grade})
    print(best_grade,best_num_of_features) 
    return list_of_feature_to_drop

In [None]:
#features_to_drop=find_features_to_drop(X_train, X_valid, y_train, y_valid)

In [None]:
features_to_drop
#fi.feature==fi.feature

In [None]:
x=list(grades.keys())
y=list(grades.values())

ax = plt.axes()
plt.plot(x,y)
plt.show()

In [None]:

ax = plt.axes()
plt.xlim(150,300)
plt.ylim(0.133,0.1350)
plt.plot(x,y)
plt.show()

In [None]:
df_all.shape

In [None]:
#df_all=df_all.drop(columns=features_to_drop)
#df_all=df_all.drop(columns=fi.feature[150:])

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Features generation

In [None]:
df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [130]:
df_all['Sizes_Total']=df_all['GrLivArea']+df_all['GarageCars']+df_all['GarageArea']+df_all['TotalBsmtSF']+df_all['1stFlrSF']+df_all['2ndFlrSF']+df_all['OpenPorchSF']+df_all['MasVnrArea']
df_all['Quantity_Total']=df_all['Fireplaces']+df_all['FullBath']+df_all['KitchenAbvGr']+df_all['TotRmsAbvGrd']+df_all['BedroomAbvGr']+df_all['BsmtFullBath']
df_all['Age_Build']=df_all['YrSold']-df_all['YearBuilt']
df_all['Age_Remod']=df_all['YrSold']-df_all['YearRemodAdd']

                                


In [133]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Columns: 118 entries, 1stFlrSF to Age_Remod
dtypes: bool(34), category(43), float64(13), int64(28)
memory usage: 1.1 MB


In [137]:
m_rf.fit(X_train, y_train)
print_score(m_rf)

[0.051799008991690605, 0.12815084836318094, 0.9831386065332787, 0.8975207928980642, 0.8752826011537651]


In [138]:
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

CPU times: user 18.4 s, sys: 124 ms, total: 18.5 s
Wall time: 18.9 s
[0.08109415488093843, 0.12747493368378532, 0.9586734010839155, 0.8985989680480135]


In [None]:
# Real 0.14114 and after full stackNet 0.123

# Self made and experiment Evaluation techniques

In [85]:
# Evaluation of simple Random Forest
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
#print_score(m)

CPU times: user 509 ms, sys: 12.4 ms, total: 522 ms
Wall time: 243 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [86]:

math.sqrt(mean_squared_error(y_valid, m.predict(X_valid)))

0.16168199443998715

In [87]:
# if you need to evaluate LOG Root mean squared error but wouldn't like to convert y to log(y)

In [88]:
math.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(m.predict(X_valid))))

0.16168199443998718

# Dealing with Outliers

### -> To delete outliers

# Features engineering

In [None]:
"""#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(df_train.shape))
print("The test data size before dropping Id feature is : {} ".format(df_test.shape))

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))
"""

## Correlation

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(df_train.corr())
#plt.plot()

In [None]:
"""
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(df_train.corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):    
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)
    
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()
"""

In [None]:
"""
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='SalePrice').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('SalePrice'), inplace=True)
df_corr
"""

# Scewed data

In [None]:
"""numeric_feats = df_all.dtypes[df_all.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = df_all[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
"""

In [None]:
"""
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    df_all[feat] = boxcox1p(df_all[feat], lam)
"""

# Normalization

In [None]:
#Normalization, the Sigmoid, Log, Cube Root and the Hyperbolic Tangent. 
#It all depends on what one is trying to accomplish.

In [None]:
#df_all.info()

# Label Encoding

In [None]:
"""from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[c].values)) 
    df_all[c] = lbl.transform(list(df_all[c].values))
    """

# Dummies

In [142]:
df_all=pd.get_dummies(df_all)

# Machine Learning

### Experimenting with Random Forest

In [None]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

### XGBoost

In [None]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

### GBDT (Gradient Boosting Decision Tree)

In [None]:
m_gbdt=GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05)
%time m_gbdt.fit(X_train, y_train)
print_score(m_gbdt)

# Stacking

# Testing stacking from Kaggle

In [155]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin,clone

In [156]:
from sklearn.model_selection import KFold, cross_val_score
n_folds=2
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train_full.values)
    rmse= np.sqrt(-cross_val_score(model, X_train_full.values, y_train_full, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [157]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.1256 (0.0065)



In [158]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [159]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.1436 (0.0077)



In [160]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1433 (0.0077)



In [161]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [162]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, lasso),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 0.1265 (0.0026)


In [163]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [164]:
stacked_averaged_models.fit(X_train_full.values, y_train_full)
stacked_train_pred = stacked_averaged_models.predict(X_train_full.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(X_test.values))
print(rmsle(y_train_full, stacked_train_pred))

0.06830790545302497


In [165]:
m_xgb.fit(X_train_full, y_train_full)
xgb_train_pred = m_xgb.predict(X_train_full)
xgb_pred = np.expm1(m_xgb.predict(X_test))
print(rmsle(y_train_full, xgb_train_pred))

0.08608955697515783


In [166]:
m_rf.fit(X_train_full, y_train_full)
rf_train_pred = m_rf.predict(X_train_full)
rf_pred = np.expm1(m_rf.predict(X_test.values))
print(rmsle(y_train_full, rf_train_pred))

0.050188742113954375


In [167]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train_full,stacked_train_pred*0.7 +
               xgb_train_pred*0.15+rf_train_pred*0.15))

RMSLE score on train data:
0.06438486755683918


In [168]:
y_pred = stacked_pred*0.7 +xgb_pred*0.15+rf_pred*0.15

In [169]:
y_pred

array([122143.79116434, 156383.39824532, 185453.28965963, ...,
       164321.96233872, 114447.48270412, 221742.57284308])

# Predictions for submission

In [139]:
### XGBoost

In [144]:
m_final_xgb = XGBRegressor(n_estimators=2000, learning_rate=0.05)
m_final_xgb.fit(X_train_full, y_train_full)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=2000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

## Predicting

In [149]:
y_pred = np.expm1(m_final_xgb.predict(X_test)); y_pred

array([121743.  , 161451.34, 196516.75, ..., 155372.27, 113494.2 ,
       214608.03], dtype=float32)

# Submission

In [170]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_pred
sub.to_csv('submittions/submission_26Aug19.csv',index=False)

In [171]:
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,122143.791164
1,1462,156383.398245
2,1463,185453.28966
3,1464,191479.759974
4,1465,187387.874236
