# Initial Setup and Data Load

In [1]:
%load_ext autoreload
%autoreload 2
import os

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from scipy.stats import norm, skew

import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error

import string
import warnings
warnings.filterwarnings('ignore')

In [3]:
PATH = "../../../data/house_pricing/"

In [4]:
df_train=pd.read_csv(f'{PATH}train.csv')#, index_col='Id')
df_test=pd.read_csv(f'{PATH}test.csv')#, index_col='Id')

### Y (target value) to Log, as stated at Kaggle Evaluation page

In [5]:
# for the purpose of evaluation of current competition
#df_train.SalePrice = np.log1p(df_train.SalePrice)
df_train.SalePrice = np.log1p(df_train.SalePrice)

In [6]:
print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['SalePrice'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
#print(df_train.columns)
#print(df_test.columns)

Number of Training Examples = 1460
Number of Test Examples = 1459

Training X Shape = (1460, 81)
Training y Shape = 1460

Test X Shape = (1459, 80)
Test y Shape = 1459



In [7]:
#print(df_train.info())
#df_train.sample(3)
#print(df_test.info())
#df_test.sample(3)

# Dealing with Outliers

In [None]:
fig, ax = plt.subplots()
ax.scatter(x = df_train['GrLivArea'], y = df_train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

In [None]:
# Deleting outliers: very large houses (GrLivArea > 4000) that sold unusually cheaply.
# SalePrice was already replaced by log1p(SalePrice) in an earlier cell, so the 300000
# price threshold has to be moved to the same log scale. Compared against the raw number,
# the price condition is true for every row and only the area filter would be applied.
outlier_mask = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < np.log1p(300000))
df_train = df_train.drop(df_train[outlier_mask].index)

#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

# DataFrame concatenation and Y separation

In [8]:
def concat_df(train_data, test_data):
    """Stack the training and test frames row-wise (axis 0) into a single frame.

    Columns are combined with sort=True and the result gets a fresh 0..n-1 index.
    """
    stacked = pd.concat([train_data, test_data], axis=0, sort=True)
    stacked = stacked.reset_index(drop=True)
    return stacked

df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

dfs = [df_train, df_test]

df_all.shape

(2919, 81)

In [9]:
#remember where to divide train and test
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

In [10]:
#Dividing Target column (Y)
y_train_full = df_train.SalePrice.values
df_all.drop(['SalePrice'], axis=1, inplace=True)

# Dealing with Missing Values

### Create columns to mark originally missed values

In [11]:
def mark_missing(df):
    """Add a boolean '<column>_missed' flag column for every column of `df` that has NA values.

    df - DataFrame to annotate. It is modified in place; nothing is returned.
    The flag is True on the rows where the original column was NA.
    """
    # Iterate over a snapshot of the column names: new flag columns are added inside the loop.
    for col in list(df.columns):
        is_missing = df[col].isnull()
        if is_missing.any():
            df[col + '_missed'] = is_missing

In [12]:
mark_missing(df_all)

In [13]:
df_all.shape

(2919, 114)

### Replace Missing

In [14]:
def display_missing(df):
    """Print one line per column: the column name and its number of NA values."""
    na_counts = df.isnull().sum()
    for name, count in na_counts.items():
        print(name, count)
    # Blank separator after the listing (print adds its own newline on top of this one)
    print('\n')
    
for df in dfs:
    print(format(df.name))
    display_missing(df)
    
    
    
#Check remaining missing values if any 
def display_only_missing(df):
    """Print the percentage of NA values per column, largest first, skipping columns with none."""
    missing_pct = df.isnull().sum() / len(df) * 100
    # Keep only the columns that actually have something missing
    missing_pct = missing_pct[missing_pct != 0].sort_values(ascending=False)
    print(pd.DataFrame({'Missing Ratio': missing_pct}))

Training Set
Id 0
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinSF1 0
BsmtFinType2 38
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
M

In [15]:
display_only_missing(df_all)

              Missing Ratio
PoolQC            99.657417
MiscFeature       96.402878
Alley             93.216855
Fence             80.438506
FireplaceQu       48.646797
LotFrontage       16.649538
GarageQual         5.447071
GarageCond         5.447071
GarageFinish       5.447071
GarageYrBlt        5.447071
GarageType         5.378554
BsmtExposure       2.809181
BsmtCond           2.809181
BsmtQual           2.774923
BsmtFinType2       2.740665
BsmtFinType1       2.706406
MasVnrType         0.822199
MasVnrArea         0.787941
MSZoning           0.137033
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Utilities          0.068517
Functional         0.068517
Electrical         0.034258
BsmtUnfSF          0.034258
Exterior1st        0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageArea         0.034258
GarageCars         0.034258
BsmtFinSF2         0.034258
BsmtFinSF1         0.034258
KitchenQual        0.034258
SaleType           0.034258


### Replace non-missing but "NA", "None", etc values by Data description

##### Replace NA in Object columns

In [16]:
display_only_missing(df_all)

              Missing Ratio
PoolQC            99.657417
MiscFeature       96.402878
Alley             93.216855
Fence             80.438506
FireplaceQu       48.646797
LotFrontage       16.649538
GarageQual         5.447071
GarageCond         5.447071
GarageFinish       5.447071
GarageYrBlt        5.447071
GarageType         5.378554
BsmtExposure       2.809181
BsmtCond           2.809181
BsmtQual           2.774923
BsmtFinType2       2.740665
BsmtFinType1       2.706406
MasVnrType         0.822199
MasVnrArea         0.787941
MSZoning           0.137033
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Utilities          0.068517
Functional         0.068517
Electrical         0.034258
BsmtUnfSF          0.034258
Exterior1st        0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageArea         0.034258
GarageCars         0.034258
BsmtFinSF2         0.034258
BsmtFinSF1         0.034258
KitchenQual        0.034258
SaleType           0.034258


In [17]:
# fill NA values (not missed) with None - based on data description -  - for non-Numerical (object) Columns
for col in ('Alley','MasVnrType','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
            'BsmtFinType2','FireplaceQu','GarageType', 'GarageFinish', 'GarageQual', 
            'GarageCond','PoolQC','Fence','MiscFeature'):
    df_all[col] = df_all[col].fillna('None')

##### Replace NA in Numerical columns

In [18]:
display_only_missing(df_all)

              Missing Ratio
LotFrontage       16.649538
GarageYrBlt        5.447071
MasVnrArea         0.787941
MSZoning           0.137033
Utilities          0.068517
BsmtFullBath       0.068517
BsmtHalfBath       0.068517
Functional         0.068517
Exterior1st        0.034258
BsmtFinSF2         0.034258
BsmtUnfSF          0.034258
Electrical         0.034258
GarageArea         0.034258
Exterior2nd        0.034258
TotalBsmtSF        0.034258
GarageCars         0.034258
KitchenQual        0.034258
SaleType           0.034258
BsmtFinSF1         0.034258


In [19]:
#fill NA numerical value with '0' - based on data description of correspondent Object columns - for Numerical Columns
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars','BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','MasVnrArea'):
    df_all[col] = df_all[col].fillna(0)

##### Replace missing values with the most frequent value in the column (only for columns with 2 or fewer missing values, where a deeper analysis is not worth the effort)

In [20]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538
MSZoning          0.137033
Utilities         0.068517
Functional        0.068517
SaleType          0.034258
KitchenQual       0.034258
Exterior2nd       0.034258
Exterior1st       0.034258
Electrical        0.034258


In [21]:
# Fill missing value in corresponding columns with most frequent value in column.
# The filled column is assigned back (as in the 'None' and 0 fills above) instead of calling
# fillna(inplace=True) on df_all[col]: the in-place form operates on an intermediate Series
# and is not guaranteed to write through to df_all.
for col in ('Utilities','Functional','SaleType','KitchenQual','Exterior2nd','Exterior1st','Electrical'):
    df_all[col] = df_all[col].fillna(df_all[col].mode()[0])
    
# Functional : data description says NA means typical
# BTW we just used df_all.Functional.mode() = use most frequent value (as 'Typ' is most frequent value)
#df_all["Functional"] = df_all["Functional"].fillna("Typ")

### Replacing real missing values

##### Dealing with missing values left

In [22]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538
MSZoning          0.137033


In [23]:
# Dealing with MSZoning

In [24]:
df_all.MSZoning.isnull().sum()

4

In [25]:
df_all["MSZoning"] = df_all["MSZoning"].fillna("None")

In [26]:
display_only_missing(df_all)

             Missing Ratio
LotFrontage      16.649538


In [27]:
# Dealing with LotFrontage

In [28]:
df_all['LotFrontage'].isnull().sum()

486

In [29]:
def filling_na_with_predictions(df, feature):
    """
    Fill the NA values of one column with predictions of a model trained on all other columns.

    df - DataFrame without target column y. Train+Test DataFrame (df_all)
    feature - feature (column), containing real NA values we will fill

    Returns the `feature` column as a Series with its NA values replaced by predictions.
    `df` itself is not modified, the caller assigns the result back.
    If the column has no NA values a message is printed and the column is returned unchanged,
    so `df[col] = filling_na_with_predictions(df, col)` never wipes the column.

    Assumption:
    All other columns do not have NA values. In case of having we have to impute with some Statistical method (Median, etc)
    We do not do it inside this function
    """
    na_mask = df[feature].isnull()

    if not na_mask.any():
        print ('There were no NA values')
        return df[feature]

    ## Indexes of rows with NA values - the predictions are aligned back on them at the end
    na_rows_idxs = df.index[na_mask]

    ## All other columns become the model input. Only they are dummy-encoded,
    ## the feature column itself must stay as it is (it is the target here)
    predictors = pd.get_dummies(df.drop([feature], axis=1))

    ## Feature_Train_X / Feature_Train_y = rows where the feature is known
    ## Feature_Predict_X = rows where the feature is NA and has to be predicted
    Feature_Train_X = predictors[~na_mask]
    Feature_Train_y = df.loc[~na_mask, feature].values
    Feature_Predict_X = predictors[na_mask]

    ## If feature is NOT Numerical its values are label encoded for the regressor.
    ## The check is made on "not numeric" because the column still holds NA values here,
    ## and a string check on such an object column is not reliable across pandas versions
    is_categorical_target = not is_numeric_dtype(df[feature])
    if is_categorical_target:
        le = LabelEncoder()
        Feature_Train_y = le.fit_transform(Feature_Train_y)

    ## Making predictions, what might be in NA fields based on Train DF
    m_xgb = XGBRegressor(n_estimators=400, learning_rate=0.05)
    m_xgb.fit(Feature_Train_X, Feature_Train_y)
    fillna_values = m_xgb.predict(Feature_Predict_X)

    ## Return Encoded values back to Object/Category if feature NOT numerical.
    ## A regressor can predict slightly outside of the encoded range, so the rounded codes
    ## are clipped to valid labels, otherwise inverse_transform raises on unseen labels
    if is_categorical_target:
        codes = np.around(fillna_values).astype(int)
        codes = np.clip(codes, 0, len(le.classes_) - 1)
        fillna_values = le.inverse_transform(codes)

    ## Replacing NA values with predicted Series of values and returning the filled column
    return df[feature].fillna(pd.Series(index=na_rows_idxs, data=fillna_values))

In [30]:
df_all['LotFrontage']=filling_na_with_predictions(df_all, "LotFrontage")



In [31]:
df_all['LotFrontage'].isnull().sum()

0

In [32]:
display_only_missing(df_all)

Empty DataFrame
Columns: [Missing Ratio]
Index: []


In [33]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Columns: 114 entries, 1stFlrSF to Utilities_missed
dtypes: bool(34), float64(11), int64(26), object(43)
memory usage: 1.9+ MB


##### Once again dealing with missed MSZoning values

In [34]:
# returning original NA back

def return_original_na(df, feature):
    """Put NaN back into `feature` on the rows flagged True in its '<feature>_missed' column.

    df - DataFrame holding both `feature` and `feature + '_missed'`; it is modified in place.
    Returns the updated `feature` column.
    """
    was_missing = df[feature + '_missed'] == True
    # Single .loc assignment on the frame: writing through df[feature].loc[...] may only
    # change a temporary Series and leave df untouched.
    df.loc[was_missing, feature] = np.nan
    return df[feature]
    
df_all['MSZoning']=return_original_na(df_all, 'MSZoning')

In [35]:
display_only_missing(df_all)

          Missing Ratio
MSZoning       0.137033


In [36]:
df_all[df_all['MSZoning'].isnull()].index

Int64Index([1915, 2216, 2250, 2904], dtype='int64')

In [37]:
df_all['MSZoning']=filling_na_with_predictions(df_all, 'MSZoning')



In [38]:
df_all['MSZoning'].loc[df_all.index[df_all['MSZoning'+'_missed'] == True].tolist()]

1915    RH
2216    RL
2250    RL
2904    RL
Name: MSZoning, dtype: object

##### Missing values that were replaced with the most common value - now replacing them with predictions

In [39]:
#for col in ('Utilities','Functional','SaleType','KitchenQual','Exterior2nd','Exterior1st','Electrical'):
#    print ('Filling with most common:\n',df_all[col].loc[df_all.index[df_all[col+'_missed'] == True].tolist()])
#    df_all[col]=return_original_na(df_all, col)
#    df_all[col]=filling_na_with_predictions(df_all, col)
#    print ('Filling with predictions:\n',df_all[col].loc[df_all.index[df_all[col+'_missed'] == True].tolist()])

##### Seems no missed values
Missing Values = DONE

# Pre-Evaluation - benchmarking before Feature Generation

## Making Training, Validation, Test Dataset

In [40]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
#def original_train_test(df_all):
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [41]:
df_all.shape, y_train_full.shape, X_test.shape, X_train_full.shape

((2919, 114), (1460,), (1459, 114), (1460, 114))

In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)

In [43]:
X_train.shape, X_valid.shape

((1095, 337), (365, 337))

## Evaluation

In [44]:
def rmse(x, y):
    """Root of the mean squared difference between x and y."""
    squared_errors = (x - y) ** 2
    return math.sqrt(squared_errors.mean())

def print_score(m):
    """Print a list with the scores of fitted model `m` on the global train/validation split.

    Order: RMSE on X_train, RMSE on X_valid, m.score on X_train, m.score on X_valid,
    and m.oob_score_ at the end when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

### Experimenting with Random Forest

In [45]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

[0.05446730035614242, 0.11611444151610519, 0.9819695172626995, 0.9061902014709694, 0.8700809297354569]


### XGBoost

In [46]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
#m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

CPU times: user 17.1 s, sys: 57.8 ms, total: 17.2 s
Wall time: 17.4 s
[0.03857017532293831, 0.10831367586529637, 0.990958533449573, 0.9183714057230711]


In [48]:
df_raw=pd.concat([df_all, pd.Series(y_train_full, name='SalePrice')], axis=1)
df_idxs_submission=pd.DataFrame(pd.Series(test_ID, name='Id'))
#y_df=pd.DataFrame(pd.Series(y_train_full, name='SalePrice'))

In [49]:
os.makedirs('tmp', exist_ok=True)
df_raw.reset_index(drop=True, inplace=True)
df_raw.to_feather('tmp/house-after-cleaning')
#df_all.to_feather('tmp/house-after-cleaning')
#y_df.to_feather('tmp/house-y-after-cleaning')

#Save the 'Id' column for submission
df_idxs_submission.to_feather('tmp/house-testDF-idxs')


# Feature Importance

In [61]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [62]:
fi[:50]

Unnamed: 0,feature,importance
28,OverallQual,0.347077
15,GrLivArea,0.133022
34,YearBuilt,0.060721
32,TotalBsmtSF,0.054491
12,GarageArea,0.03972
13,GarageCars,0.038199
0,1stFlrSF,0.030378
11,FullBath,0.030027
4,BsmtFinSF1,0.02218
140,ExterQual_TA,0.021423


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
def find_features_to_drop(X_train, X_valid, y_train, y_valid, model=None):
    """ Using RandomForest identifies important feature
    and one by one drop least important features from DataFrame to improve model score
    input - X_train, X_valid, y_train, y_valid, same as used in training and evaluation model using train/valid split
    model - optional estimator with fit / predict / feature_importances_;
            by default the same RandomForestRegressor configuration as before is used

    Each iteration fits the model, measures RMSE on the validation set, then removes the
    least important feature. The passed DataFrames are not modified.
    Returns the Index of columns that were already removed when the best validation RMSE was reached.
    """
    if model is None:
        model = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=False)

    # Number of features in DataFrame
    num_of_features = X_train.shape[1]
    list_of_original_columns = X_train.columns

    # Start from +inf so the very first fit always becomes the current best. A fixed start
    # value of 1 left best_num_of_features undefined whenever no RMSE went below 1.
    best_grade = float('inf')
    best_num_of_features = num_of_features
    list_of_feature_to_drop = list_of_original_columns[:0]

    for iteration in range(num_of_features):

        # Fit on the features that survived the previous iterations and evaluate (RMSE)
        model.fit(X_train, y_train)
        residuals = np.asarray(y_valid) - model.predict(X_valid)
        grade = math.sqrt(np.mean(residuals ** 2))

        # Feature importance of the model that was just fitted, most important first
        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance': model.feature_importances_}).sort_values('importance', ascending=False)

        # Finding best score
        if grade < best_grade:
            best_grade = grade
            best_num_of_features = num_of_features - iteration
            list_of_feature_to_drop = list_of_original_columns.difference(fi.feature)

        # Dropping last 1 (least important feature)
        least_important = fi.feature.iloc[-1:]
        X_train = X_train.drop(columns=least_important)
        X_valid = X_valid.drop(columns=least_important)

        print ((num_of_features-iteration),grade, least_important)
    print(best_grade,best_num_of_features)
    return list_of_feature_to_drop

In [None]:
#features_to_drop=find_features_to_drop(X_train, X_valid, y_train, y_valid)

In [None]:
find_features_to_drop
#fi.feature==fi.feature

In [None]:
x=list(grades.keys())
y=list(grades.values())

ax = plt.axes()
plt.plot(x,y)
plt.show()

In [None]:

ax = plt.axes()
plt.xlim(150,300)
plt.ylim(0.133,0.1350)
plt.plot(x,y)
plt.show()

In [None]:
df_all.shape

In [None]:
#df_all=df_all.drop(columns=features_to_drop)
#df_all=df_all.drop(columns=fi.feature[150:])

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# Split Test set from Training set again, then Training into Train/Validation.
# df_all still holds object (string) columns at this point, which the models fitted in the
# cells below cannot consume, so it is one-hot encoded first (same as the earlier benchmark
# split). Encoding the combined frame keeps the Train and Test columns identical.
df_all_encoded = pd.get_dummies(df_all)
X_test = df_all_encoded.iloc[ntrain:] # Test set
X_train_full = df_all_encoded.iloc[:ntrain] # Train set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Features generation

In [None]:
df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [None]:
df_all['Sizes_Total']=df_all['GrLivArea']+df_all['GarageCars']+df_all['GarageArea']+df_all['TotalBsmtSF']+df_all['1stFlrSF']+df_all['2ndFlrSF']+df_all['OpenPorchSF']+df_all['MasVnrArea']
df_all['Quantity_Total']=df_all['Fireplaces']+df_all['FullBath']+df_all['KitchenAbvGr']+df_all['TotRmsAbvGrd']+df_all['BedroomAbvGr']+df_all['BsmtFullBath']
df_all['Age_Build']=df_all['YrSold']-df_all['YearBuilt']
df_all['Age_Remod']=df_all['YrSold']-df_all['YearRemodAdd']

                                


In [None]:
df_all.info()

In [None]:
m_rf.fit(X_train, y_train)
print_score(m_rf)

In [None]:
#m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

In [None]:
# Real 0.14114 and after full stackNet 0.123

# Dealing with categorical values

In [None]:
def show_object_columns(df):
    """Print the name of every column that pandas reports as string dtype, in column order."""
    string_columns = [name for name in df.columns if is_string_dtype(df[name])]
    for name in string_columns:
        print(name)
show_object_columns(df_all)

In [None]:
# Transforming some numerical variables that are really categorical

# MSSubClass=The building class
#df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)


# Changing OverallCond into a categorical variable
#df_all['OverallCond'] = df_all['OverallCond'].astype(str)


# Year and month sold are transformed into categorical features.
#df_all['YrSold'] = df_all['YrSold'].astype(str)
#df_all['MoSold'] = df_all['MoSold'].astype(str)

In [None]:
# convert object columns to categorical
def conv_obj_to_categories(df):
    """
    Turn every string-dtype column of df into a 'category' column.
    df is changed in place, nothing is returned.
    """
    for name in df.columns:
        column = df[name]
        if not is_string_dtype(column):
            continue
        df[name] = column.astype('category')


In [None]:



conv_obj_to_categories(df_all)






In [None]:
def show_categorical_columns(df):
    """
    Print, for every categorical column: number of categories, column name and the categories themselves
    """
    for col in df:
        # dtype check instead of is_categorical_dtype, which newer pandas versions deprecate
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            categories = df[col].cat.categories
            # categories are unique by definition, so their count is just the length
            print(len(categories), col, categories)

In [None]:
show_categorical_columns(df_all)

In [None]:
def unique_categories(df,n=float("inf")):
    """
    Print only categorical columns Names and Number of unique values in corresponding column
    df - DataFrame
    n - show only columns with less than N unique values,
        as default there is no limit - every categorical column is shown
    """
    for col in df:
        # dtype check instead of is_categorical_dtype, which newer pandas versions deprecate
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            # categories are unique by definition, so their count is just the length
            n_categories = len(df[col].cat.categories)
            if n_categories < n:
                print(col, n_categories)

In [None]:
unique_categories(df_all)

## Check numeric columns (if they are actually Categorical, like Year)

### Experimenting - heavily convert NUMERICAL to CATEGORICAL

In [None]:
df_allcats=df_all.copy()

In [None]:
### Experimenting with Numerical Categories
def conv_num_cat (df):
    """
    Keep only the numeric columns of df and convert each of them to 'category'.
    Non-numeric columns are dropped. df is changed in place, nothing is returned.
    """
    non_numeric = [name for name in df.columns if not is_numeric_dtype(df[name])]
    df.drop(columns=non_numeric, inplace=True)
    for name in df.columns:
        df[name] = df[name].astype('category')

In [None]:
conv_num_cat(df_allcats)

In [None]:
unique_categories(df_allcats,20)

In [None]:
#conv_to_cat_longlist=['BedroomAbvGr', 'BsmtFullBath','BsmtHalfBath', 'Fireplaces', 'FullBath',\
#             'GarageCars','HalfBath','KitchenAbvGr','MSSubClass','MoSold','OverallCond',\
#             'OverallQual','PoolArea','TotRmsAbvGrd','YrSold']

In [None]:
conv_to_cat_shortlist=['HalfBath','MSSubClass', 'MoSold','OverallCond', 'OverallQual','YrSold']

In [None]:
#for cat in conv_to_cat_longlist:
#    df_all[cat]=df_all[cat].astype('category')

for cat in conv_to_cat_shortlist:
    df_all[cat]=df_all[cat].astype('category')

In [None]:
df_all.info(114)

# Self made and experiment Evaluation techniques

In [None]:
# Evaluation of simple Random Forest
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
#print_score(m)

In [None]:

math.sqrt(mean_squared_error(y_valid, m.predict(X_valid)))

In [None]:
# if you need to evaluate LOG Root mean squared error but wouldn't like to convert y to log(y)

In [None]:
math.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(m.predict(X_valid))))

# Features engineering

In [None]:
"""#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(df_train.shape))
print("The test data size before dropping Id feature is : {} ".format(df_test.shape))

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))
"""

## Correlation

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(df_train.corr())
#plt.plot()

In [None]:
"""
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(df_train.corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):    
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)
    
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()
"""

In [None]:
"""
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='SalePrice').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('SalePrice'), inplace=True)
df_corr
"""

# Skewed data

In [None]:
"""numeric_feats = df_all.dtypes[df_all.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = df_all[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
"""

In [None]:
"""
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    df_all[feat] = boxcox1p(df_all[feat], lam)
"""

# Normalization

In [None]:
#Normalization, the Sigmoid, Log, Cube Root and the Hyperbolic Tangent. 
#It all depends on what one is trying to accomplish.

In [None]:
#df_all.info()

# Label Encoding

In [None]:
"""from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[c].values)) 
    df_all[c] = lbl.transform(list(df_all[c].values))
    """

# Dummies

In [None]:
# One-hot encode the combined frame
df_all = pd.get_dummies(df_all)

# X_train_full / X_test / X_train / X_valid were sliced from df_all before the feature
# generation, the category conversion and the encoding above, so they contain none of that
# work. Rebuild them from the encoded frame so the models below are trained on it.
X_train_full = df_all.iloc[:ntrain] # Train set
X_test = df_all.iloc[ntrain:] # Test set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

# Machine Learning

### Experimenting with Random Forest

In [None]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

### XGBoost

In [None]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

### GBDT (Gradient Boosting Decision Tree)

In [None]:
m_gbdt=GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05)
%time m_gbdt.fit(X_train, y_train)
print_score(m_gbdt)

# Stacking

# Testing stacking from Kaggle

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin,clone

In [None]:
from sklearn.model_selection import KFold, cross_val_score
n_folds=2
def rmsle_cv(model):
    """Cross-validated RMSE of `model` on the full training set (X_train_full, y_train_full).

    Uses n_folds shuffled folds with a fixed seed and returns an array with one RMSE per fold.
    The target was log-transformed earlier in the notebook, hence the 'rmsle' name.
    """
    # Hand the KFold splitter itself to cross_val_score. KFold(...).get_n_splits() only returns
    # the number of folds, and with a plain integer the shuffle / random_state settings are lost.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X_train_full.values, y_train_full, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.sqrt(-neg_mse)
    return(rmse)

In [None]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacking ensemble: a meta-model is trained on out-of-fold predictions of the base models."""

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model clone. Returns self.

        X and y are indexed with integer arrays (X[rows]), i.e. array inputs are expected.
        """
        # One list of fitted fold-clones per base model; the originals stay untouched
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column i holds, for every row, the prediction of base model i made by the
        # clone that did not see that row during its training
        oof_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for column, template in enumerate(self.base_models):
            fold_clones = self.base_models_[column]
            for fit_rows, held_out_rows in splitter.split(X, y):
                fold_model = clone(template)
                fold_clones.append(fold_model)
                fold_model.fit(X[fit_rows], y[fit_rows])
                oof_predictions[held_out_rows, column] = fold_model.predict(X[held_out_rows])

        # The out-of-fold predictions are the features of the meta-model
        self.meta_model_.fit(oof_predictions, y)
        return self

    def predict(self, X):
        """Average the fold-clones of each base model, then let the meta-model predict from the averages."""
        averaged = []
        for fold_clones in self.base_models_:
            per_fold = np.column_stack([fold_model.predict(X) for fold_model in fold_clones])
            averaged.append(per_fold.mean(axis=1))
        meta_features = np.column_stack(averaged)
        return self.meta_model_.predict(meta_features)

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, lasso),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

In [None]:
def rmsle(y, y_pred):
    """Square root of the mean squared error between y and y_pred."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
stacked_averaged_models.fit(X_train_full.values, y_train_full)
stacked_train_pred = stacked_averaged_models.predict(X_train_full.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(X_test.values))
print(rmsle(y_train_full, stacked_train_pred))

In [None]:
m_xgb.fit(X_train_full, y_train_full)
xgb_train_pred = m_xgb.predict(X_train_full)
xgb_pred = np.expm1(m_xgb.predict(X_test))
print(rmsle(y_train_full, xgb_train_pred))

In [None]:
m_rf.fit(X_train_full, y_train_full)
rf_train_pred = m_rf.predict(X_train_full)
rf_pred = np.expm1(m_rf.predict(X_test.values))
print(rmsle(y_train_full, rf_train_pred))

In [None]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train_full,stacked_train_pred*0.7 +
               xgb_train_pred*0.15+rf_train_pred*0.15))

In [None]:
y_pred = stacked_pred*0.7 +xgb_pred*0.15+rf_pred*0.15

In [None]:
y_pred

# Predictions for submission

In [None]:
### XGBoost

In [None]:
m_final_xgb = XGBRegressor(n_estimators=2000, learning_rate=0.05)
m_final_xgb.fit(X_train_full, y_train_full)

## Predicting

In [None]:
y_pred = np.expm1(m_final_xgb.predict(X_test)); y_pred

# Submission

In [None]:
# Submission frame: the test Ids next to the predicted SalePrice
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_pred
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('submittions', exist_ok=True)
sub.to_csv('submittions/submission_27Aug19.csv',index=False)

In [None]:
sub.head()