# Initial Setup and Data Load

In [1]:
%load_ext autoreload
%autoreload 2
import os

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from scipy.stats import norm, skew

import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error

import string
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_raw = pd.read_feather('tmp/house-after-cleaning')
test_ID = pd.read_feather('tmp/house-testDF-idxs')

#df_all = pd.read_feather('tmp/house-after-cleaning')
#y_train_full= pd.read_feather('tmp/house-y-after-cleaning')

# DataFrame Y separation, setting split values

In [4]:
#Dividing Target column (Y)
y_train_full = df_raw.SalePrice.values
df_all=df_raw.drop(['SalePrice'], axis=1)


In [5]:
ntrain = 1460
ntest = 1459
y_train_full=y_train_full[:ntrain]

# Pre-Evaluation - benchmarking before Feature Generation

## Evaluation

In [6]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)


## Making Training, Validation, Test Dataset

In [7]:
def div_train_test(df):
    
    """Dividing working DataFrame back to Train and Test"""
    # split Validational/Test set from Training set after Categorical Value Engeneering
    #def original_train_test(df_all):
#    X_test=df.iloc[ntrain:] # Test set
#    X_train_full=df.iloc[:ntrain] # Train set

X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [8]:
df_all.shape, y_train_full.shape, X_test.shape, X_train_full.shape

((2919, 114), (1460,), (1459, 114), (1460, 114))

In [9]:
def quick_get_dumm(df):
    X_train_full=df.iloc[:ntrain] # Train set
    X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)
    return X_train, X_valid, y_train, y_valid
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)

## ML

### Experimenting with Random Forest

In [10]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

[0.05300868383340684, 0.13077168659081337, 0.983181655155502, 0.8745094817914816, 0.872490969887689]


### XGBoost

In [11]:
m_xgb = XGBRegressor(n_estimators=160, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
#m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

CPU times: user 2.71 s, sys: 16.4 ms, total: 2.72 s
Wall time: 3.02 s
[0.09210898354530202, 0.12431164248045938, 0.9492199422757744, 0.886601562090951]


# Feature Importance

In [12]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [13]:
fi[:70]

Unnamed: 0,feature,importance
28,OverallQual,0.319735
15,GrLivArea,0.173052
34,YearBuilt,0.045797
32,TotalBsmtSF,0.043377
13,GarageCars,0.041237
140,ExterQual_TA,0.041082
12,GarageArea,0.035203
0,1stFlrSF,0.025352
4,BsmtFinSF1,0.019092
14,GarageYrBlt,0.018384


# Dealing with categorical values

In [14]:
def show_object_columns(df):
    for col in df:
        if is_string_dtype(df[col]):
            print(col)

In [15]:
show_object_columns(df_all)

Alley
BldgType
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
BsmtQual
CentralAir
Condition1
Condition2
Electrical
ExterCond
ExterQual
Exterior1st
Exterior2nd
Fence
FireplaceQu
Foundation
Functional
GarageCond
GarageFinish
GarageQual
GarageType
Heating
HeatingQC
HouseStyle
KitchenQual
LandContour
LandSlope
LotConfig
LotShape
MSZoning
MasVnrType
MiscFeature
Neighborhood
PavedDrive
PoolQC
RoofMatl
RoofStyle
SaleCondition
SaleType
Street
Utilities


## Label Encoding Based on Feature Importance

##### Encoding quality columns with self defined dictionary

In [16]:
""""
Encode Quality columns with:
Ex	Excellent
Gd	Good
TA	Average/Typical
Fa	Fair
Po	Poor
NA	No "Garage/Basement/Fireplace/..."

To decode we use same Disctionary as used in other dataset columns:
OverallCond: Rates the overall condition of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
"""

qual_cleanup = {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3,"Po": 2, "None": 0}

for col in ('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual',
            'GarageCond','PoolQC'):
    df_all[col].replace(qual_cleanup, inplace=True)
    df_all[col].astype(int)


In [17]:
np.unique(df_all['BsmtCond'])

array([0, 2, 3, 5, 7])

In [18]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05471613599358513, 0.12559430034916128, 0.9808758693396465, 0.905903239961192, 0.8612525591438183]
[0.0911804624331161, 0.11914544343873631, 0.9468926730606617, 0.9153182773689691]


In [19]:
""""
BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinshed
       NA	No Basement
"""

qual_cleanup = {"GLQ": 10, "ALQ": 8, "BLQ": 6, "Rec": 4, "LwQ": 3,"Unf": 2, "None": 0}

for col in ('BsmtFinType1','BsmtFinType2'):
    df_all[col].replace(qual_cleanup, inplace=True)
    df_all[col].astype(int)
    
"""
BsmtExposure: Refers to walkout or garden level walls
       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Mimimum Exposure
       No	No Exposure
       NA	No Basement
"""
qual_cleanup = {"Gd": 10, "Av": 7, "Mn": 4, "No": 2, "None": 0}

df_all['BsmtExposure'].replace(qual_cleanup, inplace=True)
df_all['BsmtExposure'].astype(int)

0        2
1       10
2        4
3        2
4        7
5        2
6        7
7        4
8        2
9        2
10       2
11       2
12       2
13       7
14       2
15       2
16       2
17       0
18       2
19       2
20       7
21       2
22       2
23       2
24       4
25       2
26       4
27       2
28      10
29       2
        ..
2889     2
2890     2
2891     0
2892     4
2893     2
2894    10
2895    10
2896    10
2897     4
2898     2
2899     7
2900    10
2901     7
2902     7
2903    10
2904     0
2905    10
2906     4
2907     2
2908     2
2909     7
2910     2
2911     7
2912     2
2913     2
2914     2
2915     2
2916     2
2917     7
2918     7
Name: BsmtExposure, Length: 2919, dtype: int64

In [20]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05210550319229601, 0.1476910237396092, 0.9829321934850312, 0.8633503310040344, 0.8770324143140903]
[0.08891709880453061, 0.13790104932954927, 0.9502971730952092, 0.8808660547849153]


In [21]:
show_object_columns(df_all)

Alley
BldgType
CentralAir
Condition1
Condition2
Electrical
ExterCond
ExterQual
Exterior1st
Exterior2nd
Fence
Foundation
Functional
GarageFinish
GarageType
Heating
HeatingQC
HouseStyle
KitchenQual
LandContour
LandSlope
LotConfig
LotShape
MSZoning
MasVnrType
MiscFeature
Neighborhood
PavedDrive
RoofMatl
RoofStyle
SaleCondition
SaleType
Street
Utilities


##### Working on Functional (seems decrease score, not used now)

In [22]:
np.unique(df_all['Functional'])

array(['Maj1', 'Maj2', 'Min1', 'Min2', 'Mod', 'Sev', 'Typ'], dtype=object)

In [23]:
""""
Functional: Home functionality (Assume typical unless deductions are warranted)
       Typ	Typical Functionality
       Min1	Minor Deductions 1
       Min2	Minor Deductions 2
       Mod	Moderate Deductions
       Maj1	Major Deductions 1
       Maj2	Major Deductions 2
       Sev	Severely Damaged
       Sal	Salvage only

"""

qual_cleanup = {"Typ": 10, "Min1": 9, "Min2": 8, "Mod": 6, "Maj1": 4,"Maj2": 3, "Sev": 1, "Sal": 0}

#df_all['Functional'].replace(qual_cleanup, inplace=True)
#df_all['Functional'].astype(int)

In [24]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05236742834159967, 0.13469698935107138, 0.9825558002142957, 0.8907955572943907, 0.8739428765248387]
[0.08919003261599993, 0.12970196417252006, 0.9493987549062706, 0.8987447301878095]


##### Working with GarageFinish (seems decrease score, not used now)

In [25]:
np.unique(df_all['GarageFinish'])

array(['Fin', 'None', 'RFn', 'Unf'], dtype=object)

In [26]:
qual_cleanup = {"Fin": 10, "RFn": 7, "Unf": 4, "None": 0}

df_all['GarageFinish'].replace(qual_cleanup, inplace=True)
df_all['GarageFinish'].astype(int)

0        7
1        7
2        7
3        4
4        7
5        4
6        7
7        7
8        4
9        7
10       4
11      10
12       4
13       7
14       7
15       4
16      10
17       4
18       4
19       4
20       7
21       4
22       7
23       4
24       4
25       7
26       4
27       7
28       7
29       4
        ..
2889     4
2890     4
2891     0
2892     0
2893     0
2894    10
2895    10
2896    10
2897     4
2898     7
2899     4
2900     4
2901     4
2902    10
2903    10
2904     4
2905    10
2906     7
2907     7
2908     4
2909     0
2910     4
2911     7
2912     4
2913     0
2914     0
2915     4
2916     4
2917     0
2918    10
Name: GarageFinish, Length: 2919, dtype: int64

In [27]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05524542896575795, 0.11844884292349218, 0.9808655541042294, 0.9118745416707257, 0.8591195863691776]
[0.09013366760742259, 0.11183790070475517, 0.9490672323072421, 0.9214370547781192]


##### Other Categorical values

In [28]:
show_object_columns(df_all)

Alley
BldgType
CentralAir
Condition1
Condition2
Electrical
ExterCond
ExterQual
Exterior1st
Exterior2nd
Fence
Foundation
Functional
GarageType
Heating
HeatingQC
HouseStyle
KitchenQual
LandContour
LandSlope
LotConfig
LotShape
MSZoning
MasVnrType
MiscFeature
Neighborhood
PavedDrive
RoofMatl
RoofStyle
SaleCondition
SaleType
Street
Utilities


In [29]:
# Transforming some numerical variables that are really categorical

# MSSubClass=The building class
#df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)


# Changing OverallCond into a categorical variable
#df_all['OverallCond'] = df_all['OverallCond'].astype(str)


# Year and month sold are transformed into categorical features.
#df_all['YrSold'] = df_all['YrSold'].astype(str)
#df_all['MoSold'] = df_all['MoSold'].astype(str)

In [30]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Columns: 114 entries, 1stFlrSF to Utilities_missed
dtypes: bool(34), float64(11), int64(36), object(33)
memory usage: 1.9+ MB


In [31]:
# convert object columns to categorical
def conv_obj_to_categories(df):
    """
    Convert Object columns to Categorical
    """
    for col in df:
        if is_string_dtype(df[col]):
            df[col]=df[col].astype('category')


In [32]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05054058177448705, 0.15438322724597497, 0.9834026427264911, 0.8646498862060439, 0.8764393418911318]
[0.08774907841048088, 0.14621423859906027, 0.9499684915545432, 0.8785946769703554]


In [33]:
conv_obj_to_categories(df_all)

In [34]:
def show_categorical_columns(df):
    """
    Print only categorical columns Number, Name and Codes of unique values in corresponding column 
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            print(sum(np.unique(df[col].cat.categories,return_counts=True)[1]), col ,df[col].cat.categories)

In [35]:
show_categorical_columns(df_all)

3 Alley Index(['Grvl', 'None', 'Pave'], dtype='object')
5 BldgType Index(['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], dtype='object')
2 CentralAir Index(['N', 'Y'], dtype='object')
9 Condition1 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
       'RRNn'],
      dtype='object')
8 Condition2 Index(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn'], dtype='object')
5 Electrical Index(['FuseA', 'FuseF', 'FuseP', 'Mix', 'SBrkr'], dtype='object')
5 ExterCond Int64Index([2, 3, 5, 7, 9], dtype='int64')
4 ExterQual Int64Index([3, 5, 7, 9], dtype='int64')
15 Exterior1st Index(['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
       'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco',
       'VinylSd', 'Wd Sdng', 'WdShing'],
      dtype='object')
16 Exterior2nd Index(['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd',
       'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'Stone', 'Stucco',
       'Vinyl

In [36]:
def unique_categories(df,n=float("inf")):
    """
    Print only categorical columns Names and Number of unique values in corresponding column 
    df - DataFrame
    n - show only columns with less then N unique values, 
        as default - not show column if more than 10000 unique value - not pseudo categorical
    """
    for col in df:
        if is_categorical_dtype(df[col]):
            if sum(np.unique(df[col].cat.categories,return_counts=True)[1])<n:
                print(col, sum(np.unique(df[col].cat.categories,return_counts=True)[1]))

In [37]:
unique_categories(df_all)

Alley 3
BldgType 5
CentralAir 2
Condition1 9
Condition2 8
Electrical 5
ExterCond 5
ExterQual 4
Exterior1st 15
Exterior2nd 16
Fence 5
Foundation 6
Functional 7
GarageType 7
Heating 6
HeatingQC 5
HouseStyle 8
KitchenQual 4
LandContour 4
LandSlope 3
LotConfig 5
LotShape 4
MSZoning 5
MasVnrType 4
MiscFeature 5
Neighborhood 25
PavedDrive 3
RoofMatl 8
RoofStyle 6
SaleCondition 6
SaleType 9
Street 2
Utilities 2


In [38]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.051014354972352016, 0.14413904735747127, 0.9829443852970575, 0.8845892165130642, 0.8727289157118515]
[0.0882624102647997, 0.1424134848962164, 0.9489454864198504, 0.8873359595327927]


In [39]:
# Converting to int for work in feature generation

In [40]:
for col in ('ExterCond','ExterQual','KitchenQual'):
    df_all[col]=df_all[col].astype('int')

# Features generation

In [41]:
df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [42]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05075270603292728, 0.1449647899559738, 0.9832914826404053, 0.880129064545173, 0.8771717891466775]
[0.08749674830451251, 0.14162322464633664, 0.9503403865000999, 0.8855916319727375]


In [43]:
df_all['Age_Build']=df_all['YrSold']-df_all['YearBuilt']
df_all['Age_Remod']=df_all['YrSold']-df_all['YearRemodAdd']

In [44]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.05210438679433521, 0.13376646398553838, 0.9831124264018846, 0.8848578233557087, 0.8768650805254719]
[0.08415968659036217, 0.13230336414279478, 0.9559417852478386, 0.8873628337590789]


In [45]:
df_all['Sizes_Total']=df_all['GrLivArea']+df_all['GarageCars']+df_all['GarageArea']+df_all['TotalBsmtSF']+df_all['1stFlrSF']+df_all['2ndFlrSF']+df_all['OpenPorchSF']+df_all['MasVnrArea']
df_all['Quantity_Total']=df_all['Fireplaces']+df_all['FullBath']+df_all['KitchenAbvGr']+df_all['TotRmsAbvGrd']+df_all['BedroomAbvGr']+df_all['BsmtFullBath']

In [46]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.051839556607311675, 0.12969924791666412, 0.9838614941083882, 0.8780359751797615, 0.8786628864015271]
[0.08420262460711303, 0.12392561460589781, 0.9574213364272105, 0.8886528783076213]


In [47]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)
fi[:20]

Unnamed: 0,feature,importance
40,OverallQual,0.313229
87,Sizes_Total,0.238258
84,TotalSF,0.138954
26,GrLivArea,0.025212
85,Age_Build,0.02478
16,ExterQual,0.017146
47,YearBuilt,0.01554
86,Age_Remod,0.011154
39,OverallCond,0.010664
6,BsmtFinSF1,0.009733


In [48]:
df_all['Garage_Age_Build']=df_all['YrSold']-df_all['GarageYrBlt']

In [49]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.052200387978894466, 0.1380007212543305, 0.9831531400697483, 0.8751690428999925, 0.8765040379370037]
[0.08628850340935736, 0.1337261518799143, 0.9539661417060249, 0.8827825463520781]


In [50]:
df_all['Quality_Aggregated']=df_all['ExterQual']+df_all['ExterCond']+df_all['BsmtQual']+df_all['BsmtCond']+df_all['KitchenQual']+df_all['OverallQual']

In [51]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

[0.050255724507692015, 0.1538985237631344, 0.9841409306054447, 0.8512640126992099, 0.8859387316307349]
[0.08791223264314388, 0.1448495879264726, 0.9514705642198004, 0.8682405780785871]


In [None]:
# Continue Feature generation here
#df_all['Basement']=df_all['TotalBsmtSF']+df_all['BsmtFinSF1']+df_all['BsmtFinSF2']-df_all['BsmtUnfSF'])*
#(df_all['BsmtQual']+df_all['BsmtCond']+df_all['BsmtFinType1']+df_all['BsmtExposure']+df_all['BsmtFinType2'])*
#df_all['BsmtFullBath']*0.5*df_all['BsmtHalfBath']



In [None]:
#Garage=
#House=

## Check numeric columns (if they are actually Categorical, like Year)

### Experimenting - heavily convert NUMERICAL to CATEGORICAL

In [51]:
df_allcats=df_all.copy()

In [52]:
### Experimenting with Numerical Categories
def conv_num_cat (df):
    for col in df:
        if is_numeric_dtype(df[col]): 
            df[col]=df[col].astype('category')
        else:
            df.drop(columns=col, inplace=True)

In [53]:
conv_num_cat(df_allcats)

In [54]:
unique_categories(df_allcats,20)

BedroomAbvGr 8
BsmtCond 5
BsmtExposure 5
BsmtFinType1 7
BsmtFinType2 7
BsmtFullBath 4
BsmtHalfBath 3
BsmtQual 5
FireplaceQu 6
Fireplaces 5
FullBath 5
GarageCars 6
GarageCond 6
GarageFinish 4
GarageQual 6
HalfBath 3
KitchenAbvGr 4
MSSubClass 16
MoSold 12
OverallCond 9
OverallQual 10
PoolArea 14
PoolQC 4
TotRmsAbvGrd 14
YrSold 5
Alley_missed 2
BsmtCond_missed 2
BsmtExposure_missed 2
BsmtFinSF1_missed 2
BsmtFinSF2_missed 2
BsmtFinType1_missed 2
BsmtFinType2_missed 2
BsmtFullBath_missed 2
BsmtHalfBath_missed 2
BsmtQual_missed 2
BsmtUnfSF_missed 2
Electrical_missed 2
Exterior1st_missed 2
Exterior2nd_missed 2
Fence_missed 2
FireplaceQu_missed 2
Functional_missed 2
GarageArea_missed 2
GarageCars_missed 2
GarageCond_missed 2
GarageFinish_missed 2
GarageQual_missed 2
GarageType_missed 2
GarageYrBlt_missed 2
KitchenQual_missed 2
LotFrontage_missed 2
MSZoning_missed 2
MasVnrArea_missed 2
MasVnrType_missed 2
MiscFeature_missed 2
PoolQC_missed 2
SaleType_missed 2
TotalBsmtSF_missed 2
Utilities_miss

In [None]:
conv_to_cat_shortlist=['MSSubClass', 'MoSold','YrSold']#'OverallCond', 'OverallQual']

In [None]:
#for cat in conv_to_cat_longlist:
#    df_all[cat]=df_all[cat].astype('category')

for cat in conv_to_cat_shortlist:
    df_all[cat]=df_all[cat].astype('category')

In [None]:
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

In [None]:
#conv_to_cat_longlist=['BedroomAbvGr', 'BsmtFullBath','BsmtHalfBath', 'Fireplaces', 'FullBath',\
#             'GarageCars','HalfBath','KitchenAbvGr','MSSubClass','MoSold','OverallCond',\
#             'OverallQual','PoolArea','TotRmsAbvGrd','YrSold']

# Feature Importance Dropping

In [None]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [None]:
fi[:50]

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
def find_features_to_drop(X_train, X_valid, y_train, y_valid):
    """ Using RandomForest identifies important feature 
    and one by one drop least important features from DataFrame to improve model score
    input - X_train, X_valid, y_train, y_valid, same as used in training and evaluation model using train/valid split
    """
    m_feature_to_drop = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=False)
    # to try - not use actual feature importance each iteration, but use only first one
    #        m_feature_to_drop.fit(X_train, y_train)
    #        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_feature_to_drop.feature_importances_}).sort_values('importance',ascending=False)
    
    # Number of features in DataFrame
    num_of_features=X_train.shape[1]
    
    list_of_original_columns=X_train.columns
    
    best_grade=1
    list_of_feature_to_drop=pd.DataFrame()
    #grades={}
    
    for iteration in range(0, num_of_features):
            
        # Iteratively fit model with features without 1 least important (dropped in previos iteration)
        m_feature_to_drop.fit(X_train, y_train)
        # Evaluating performance withot this feature
        grade=math.sqrt(mean_squared_error(y_valid, m_feature_to_drop.predict(X_valid)))

        # Updating based on new model list of feature importance
        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_feature_to_drop.feature_importances_}).sort_values('importance',ascending=False)

        # Finding best score
        if grade<best_grade:
            best_grade=grade
            best_num_of_features=(num_of_features-iteration)
            list_of_feature_to_drop=list_of_original_columns.difference(fi.feature)

        # Dropping last 1 (least important feature)
        X_train=X_train.drop(columns=fi.feature[-1:])
        X_valid=X_valid.drop(columns=fi.feature[-1:])

        print ((num_of_features-iteration),grade, fi.feature[-1:])
        #grades.update({(num_of_features-iteration):grade})
    print(best_grade,best_num_of_features) 
    return list_of_feature_to_drop

In [None]:
#features_to_drop=find_features_to_drop(X_train, X_valid, y_train, y_valid)

In [None]:
features_to_drop
#fi.feature==fi.feature

In [None]:
x=list(grades.keys())
y=list(grades.values())

ax = plt.axes()
plt.plot(x,y)
plt.show()

In [None]:

ax = plt.axes()
plt.xlim(150,300)
plt.ylim(0.133,0.1350)
plt.plot(x,y)
plt.show()

In [None]:
df_all.shape

In [None]:
#df_all=df_all.drop(columns=features_to_drop)
#df_all=df_all.drop(columns=fi.feature[150:])

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Self made and experiment Evaluation techniques

In [None]:
# Evaluation of simple Random Forest
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
#print_score(m)

In [None]:

math.sqrt(mean_squared_error(y_valid, m.predict(X_valid)))

In [None]:
# if you need to evaluate LOG Root mean squared error but wouldn't like to convert y to log(y)

In [None]:
math.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(m.predict(X_valid))))

# Dealing with Outliers

### -> To delete outliers

# Features engineering

In [None]:
"""#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(df_train.shape))
print("The test data size before dropping Id feature is : {} ".format(df_test.shape))

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))
"""

## Correlation

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(df_train.corr())
#plt.plot()

In [None]:
"""
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(df_train.corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):    
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)
    
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()
"""

In [None]:
"""
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='SalePrice').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('SalePrice'), inplace=True)
df_corr
"""

# Scewed data

In [52]:
"""numeric_feats = df_all.dtypes[df_all.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = df_all[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
"""

TypeError: ("unsupported operand type(s) for /: 'str' and 'int'", 'occurred at index Alley')

In [None]:
"""
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    df_all[feat] = boxcox1p(df_all[feat], lam)
"""

# Normalization

In [None]:
#Normalization, the Sigmoid, Log, Cube Root and the Hyperbolic Tangent. 
#It all depends on what one is trying to accomplish.

In [None]:
#df_all.info()

# Label Encoding

In [None]:
"""from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[c].values)) 
    df_all[c] = lbl.transform(list(df_all[c].values))
    """

# Dummies

In [53]:
df_all=pd.get_dummies(df_all)

In [64]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
#def original_train_test(df_all):
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [65]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)

# Machine Learning

### Experimenting with Random Forest

In [66]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

[0.051198178735887695, 0.1320467711462653, 0.9839837861260845, 0.8812274929317997, 0.880431902660727]


### XGBoost

In [None]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

### GBDT (Gradient Boosting Decision Tree)

In [None]:
m_gbdt=GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05)
%time m_gbdt.fit(X_train, y_train)
print_score(m_gbdt)

# Stacking

# Testing stacking from Kaggle

In [67]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin,clone

In [68]:
from sklearn.model_selection import KFold, cross_val_score
n_folds=2
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train_full.values)
    rmse= np.sqrt(-cross_val_score(model, X_train_full.values, y_train_full, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [69]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.1272 (0.0054)



In [70]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [71]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.1424 (0.0071)



In [72]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1422 (0.0072)



In [73]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [74]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, lasso),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 0.1266 (0.0041)


In [75]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [76]:
stacked_averaged_models.fit(X_train_full.values, y_train_full)
stacked_train_pred = stacked_averaged_models.predict(X_train_full.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(X_test.values))
print(rmsle(y_train_full, stacked_train_pred))

0.06934422741866346


In [77]:
m_xgb.fit(X_train_full, y_train_full)
xgb_train_pred = m_xgb.predict(X_train_full)
xgb_pred = np.expm1(m_xgb.predict(X_test))
print(rmsle(y_train_full, xgb_train_pred))

0.09009169305707966


In [78]:
m_rf.fit(X_train_full, y_train_full)
rf_train_pred = m_rf.predict(X_train_full)
rf_pred = np.expm1(m_rf.predict(X_test.values))
print(rmsle(y_train_full, rf_train_pred))

0.04929687680477529


In [79]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train_full,stacked_train_pred*0.7 +
               xgb_train_pred*0.15+rf_train_pred*0.15))

RMSLE score on train data:
0.06553333960297675


In [80]:
y_pred = stacked_pred*0.7 +xgb_pred*0.15+rf_pred*0.15

In [81]:
y_pred

array([119442.33155019, 156428.48520138, 182178.60797627, ...,
       160496.6202494 , 113304.43784513, 212712.1796712 ])

# Predictions for submission

In [82]:
### XGBoost

In [None]:
m_final_xgb = XGBRegressor(n_estimators=2000, learning_rate=0.05)
m_final_xgb.fit(X_train_full, y_train_full)

## Predicting

In [None]:
y_pred = np.expm1(m_final_xgb.predict(X_test)); y_pred

# Submission

In [83]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_pred
sub.to_csv('submittions/submission_27Aug19.csv',index=False)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
sub.head()