# Initial Setup and Data Load

In [None]:
%load_ext autoreload
%autoreload 2
import os

%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from scipy.stats import norm, skew

import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import category_encoders as ce
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error
from scipy.special import boxcox1p


import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
PATH = "../../../data/house_pricing/"

In [None]:
# Load the raw train/test CSVs; 'Id' is kept as an ordinary column (no index_col).
df_train = pd.read_csv(PATH + 'train.csv')
df_test = pd.read_csv(PATH + 'test.csv')

### Y (target value) to Log, as stated at Kaggle Evaluation page

In [None]:
# Work with log(1 + SalePrice) from here on, for the purpose of the competition evaluation.
# NOTE: not idempotent -- re-running this cell applies the log a second time.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

In [None]:
# Size report for the two raw frames (one print call; items are joined with newlines).
print(
    'Number of Training Examples = {}'.format(df_train.shape[0]),
    'Number of Test Examples = {}\n'.format(df_test.shape[0]),
    'Training X Shape = {}'.format(df_train.shape),
    'Training y Shape = {}\n'.format(df_train['SalePrice'].shape[0]),
    'Test X Shape = {}'.format(df_test.shape),
    'Test y Shape = {}\n'.format(df_test.shape[0]),
    sep='\n',
)

In [None]:
#print(df_train.info())
#df_train.sample(3)
#print(df_test.info())
#df_test.sample(3)

# Dealing with Outliers

In [None]:
# Living area against sale price, to eyeball outliers.
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
ax.set_xlabel('GrLivArea', fontsize=13)
ax.set_ylabel('SalePrice', fontsize=13)
plt.show()

In [None]:
# Deleting outliers: very large houses (GrLivArea > 4000) sold for less than 300000.
# SalePrice was log1p-transformed earlier in this notebook, so the price threshold has to
# be put on the same scale. Compared with the raw 300000 the condition is always true and
# every house with GrLivArea > 4000 would be dropped, whatever its price.
price_cutoff = np.log1p(300000)
df_train = df_train.drop(
    df_train[(df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < price_cutoff)].index
)

# Check the graphic again
fig, ax = plt.subplots()
ax.scatter(df_train['GrLivArea'], df_train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

# DataFrame concatenation and Y separation

In [None]:
def concat_df(train_data, test_data):
    """Stack the training and test frames on axis 0 and rebuild the row index.

    ``sort=True`` is passed to ``pd.concat``; the old indexes are discarded.
    """
    stacked = pd.concat([train_data, test_data], sort=True)
    return stacked.reset_index(drop=True)

# Combined train+test frame; the cleaning and feature engineering below operate on it.
df_all = concat_df(df_train, df_test)

# Ad-hoc `name` attributes; df.name is printed by the per-frame missing-value report.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set' 

# The two raw frames, iterated by the missing-value report.
dfs = [df_train, df_test]

df_all.shape

In [None]:
# Row counts of each part, so df_all can be cut back into train and test later.
ntrain = len(df_train)
ntest = len(df_test)

# Keep the 'Id' columns of both frames.
train_ID = df_train['Id']
test_ID = df_test['Id']

In [None]:
# Split off the target column (Y) and remove the non-feature columns from df_all.
y_train_full = df_train['SalePrice'].values
df_all.drop(columns=['SalePrice', 'Id'], inplace=True)

# Dealing with Missing Values

### Create columns to mark originally missing values

In [None]:
def mark_missing (df):
    """Add a boolean ``<col>_missed`` column for every column of ``df`` that contains NaNs.

    ``df`` is modified in place and nothing is returned. The function works on the
    frame it is given rather than on the notebook-level ``df_all``.
    """
    # Snapshot the column list so the indicator columns added below are not visited.
    for col in list(df.columns):
        missing_mask = df[col].isnull()
        if missing_mask.any():
            df[col+'_missed'] = missing_mask

In [None]:
mark_missing(df_all)

In [None]:
df_all.shape

### Replace Missing

In [None]:
def display_missing(df):
    """Print every column name with its NaN count, then a blank separator."""
    na_counts = df.isnull().sum()
    for name, count in na_counts.items():
        print(name, count)
    print('\n')
    
# Per-column NaN counts for the raw training and test frames.
# NOTE: this loop rebinds the notebook-level name `df`.
for df in dfs:
    print(format(df.name))
    display_missing(df)
    
    
    
#Check remaining missing values if any 
def display_only_missing(df):
    """Print the percentage of NaNs for each column whose ratio is non-zero, highest first."""
    na_pct = df.isnull().sum() / len(df) * 100
    na_pct = na_pct[na_pct != 0].sort_values(ascending=False)
    print(pd.DataFrame({'Missing Ratio': na_pct}))

In [None]:
display_only_missing(df_all)

### Replace "NA"/"None" entries that are not truly missing, according to the data description

##### Replace NA in Object columns

In [None]:
display_only_missing(df_all)

In [None]:
# Based on the data description, NA in these non-numerical (object) columns is not a
# missing value, so it is stored as the explicit string 'None'.
none_category_cols = [
    'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
df_all[none_category_cols] = df_all[none_category_cols].fillna('None')

##### Replace NA in Numerical columns

In [None]:
display_only_missing(df_all)

In [None]:
# Numerical companions of the object columns above: based on the data description of the
# corresponding object columns, their NA is filled with 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
]
df_all[zero_fill_cols] = df_all[zero_fill_cols].fillna(0)

##### Replace missing values with the most frequent value in the column (only for columns with at most 2 missing values, where a deeper analysis is not worth the effort)

In [None]:
display_only_missing(df_all)

In [None]:
# Fill missing value in corresponding columns with most frequent value in column.
# The result is assigned back to df_all[col]: fillna(inplace=True) on df_all[col] acts on
# an intermediate Series and is not guaranteed to reach df_all (it is a no-op under
# pandas Copy-on-Write).
for col in ('Utilities','Functional','SaleType','KitchenQual','Exterior2nd','Exterior1st','Electrical'):
    df_all[col] = df_all[col].fillna(df_all[col].mode()[0])

# Functional : data description says NA means typical
# BTW we just used df_all.Functional.mode() = use most frequent value (as 'Typ' is most frequent value)
#df_all["Functional"] = df_all["Functional"].fillna("Typ")

### Replacing real missing values

##### Dealing with missing values left

In [None]:
display_only_missing(df_all)

In [None]:
# Dealing with MSZoning

In [None]:
df_all.MSZoning.isnull().sum()

In [None]:
# Placeholder category for the missing MSZoning values. The original NaNs are restored
# later from the MSZoning_missed indicator and then imputed with predictions.
# NOTE(review): presumably done so LotFrontage can be imputed first with no NaN left in
# the other columns -- confirm.
df_all["MSZoning"] = df_all["MSZoning"].fillna("None")

In [None]:
display_only_missing(df_all)

In [None]:
# Dealing with LotFrontage

In [None]:
df_all['LotFrontage'].isnull().sum()

In [None]:
def filling_na_with_predictions(df, feature):
    """Impute the NaNs of one column with predictions made from all the other columns.

    The rows where ``feature`` is known are used to fit an ``XGBRegressor`` on the
    dummy-encoded remaining columns; the fitted model then predicts ``feature`` for
    the rows where it is NaN. A non-numerical ``feature`` is label-encoded for
    fitting, and the rounded predictions are decoded back to the original labels.

    Parameters
    ----------
    df : DataFrame without the target column y (train + test, i.e. df_all).
        It is not modified.
    feature : name of the column whose NaN values are to be filled.

    Returns
    -------
    pandas.Series
        The ``feature`` column with its NaNs replaced by predictions. When the column
        has no NaNs it is returned unchanged, so
        ``df[feature] = filling_na_with_predictions(df, feature)`` is safe to re-run.

    Assumption
    ----------
    All other columns do not have NA values. If they do, they have to be imputed with
    some statistical method (median, etc.) beforehand; that is not done here.
    """
    na_mask = df[feature].isnull()
    if not na_mask.any():
        print ('There were no NA values')
        # Hand the column back untouched; returning None here would make the caller's
        # `df[feature] = ...` assignment overwrite the whole column.
        return df[feature]

    # Index labels of the rows to fill.
    na_rows_idxs = df.index[na_mask]

    # Dummy-encode every column except `feature` itself (it may be non-numerical and
    # is the prediction target here).
    design = pd.get_dummies(df.drop([feature], axis=1))

    # Rows with a known value train the model; rows with NaN are predicted.
    Feature_Train_X = design.loc[~na_mask]
    Feature_Train_y = df.loc[~na_mask, feature].values
    Feature_Predict_X = design.loc[na_mask]

    # Decide "non-numerical" from the dtype: a string check on a column that still
    # contains NaN is not reliable across pandas versions.
    needs_label_encoding = not is_numeric_dtype(df[feature])
    if needs_label_encoding:
        le = LabelEncoder()
        Feature_Train_y = le.fit_transform(Feature_Train_y)

    # Making predictions of what might be in the NA fields.
    imputer = XGBRegressor(n_estimators=160, learning_rate=0.05)
    imputer.fit(Feature_Train_X, Feature_Train_y)
    fillna_values = imputer.predict(Feature_Predict_X)

    if needs_label_encoding:
        # A regressor can predict slightly outside [0, n_classes - 1]; clip the rounded
        # codes so inverse_transform never sees an unknown label.
        codes = np.clip(np.around(fillna_values), 0, len(le.classes_) - 1).astype(int)
        fillna_values = le.inverse_transform(codes)

    # Replacing NA values with the predicted values, aligned on the original index.
    return df[feature].fillna(pd.Series(index=na_rows_idxs, data=fillna_values))

In [None]:
df_all['LotFrontage']=filling_na_with_predictions(df_all, "LotFrontage")

In [None]:
df_all['LotFrontage'].isnull().sum()

In [None]:
display_only_missing(df_all)

In [None]:
df_all.info()

##### Once again dealing with missed MSZoning values

In [None]:
# returning original NA back

def return_original_na(df, feature):
    """Put NaN back into ``feature`` for the rows flagged by ``<feature>_missed``.

    ``df`` is modified in place; the updated column is returned.
    """
    # One .loc call on the frame itself: the chained form df[feature].loc[...] = ...
    # may write to a temporary Series and leave df unchanged.
    df.loc[df[feature+'_missed'] == True, feature] = np.nan
    return df[feature]
    
df_all['MSZoning']=return_original_na(df_all, 'MSZoning')

In [None]:
display_only_missing(df_all)

In [None]:
df_all[df_all['MSZoning'].isnull()].index

In [None]:
df_all['MSZoning']=filling_na_with_predictions(df_all, 'MSZoning')

In [None]:
df_all['MSZoning'].loc[df_all.index[df_all['MSZoning'+'_missed'] == True].tolist()]

##### Dealing with Missing values we replaced with most common - now replacing them with predictions

##### Seems no missed values
Missing Values = DONE

# Pre-Evaluation - benchmarking before Feature Generation

## Making Training, Validation, Test Dataset

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
#def original_train_test(df_all):
#X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [None]:
#df_all.shape, y_train_full.shape, X_test.shape, X_train_full.shape

In [None]:
#X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full, random_state=42)

In [None]:
#X_train.shape, X_valid.shape

## Splitting function (train/valid)

In [None]:
def quick_get_dumm(df):
    """Dummy-encode the first ``ntrain`` rows of ``df`` and split them into train/validation.

    Relies on the notebook-level ``ntrain`` and ``y_train_full``.
    Returns ``X_train, X_valid, y_train, y_valid`` (split with random_state=42).
    """
    train_rows = df.iloc[:ntrain]  # Full Train set
    encoded = pd.get_dummies(train_rows)
    # Creating train and validation sets
    return tuple(train_test_split(encoded, y_train_full, random_state=42))

In [None]:
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape, X_train_full.shape, y_train_full.shape

## Evaluation

In [None]:
def rmse(x, y):
    """Root of the mean squared difference between ``x`` and ``y``."""
    squared_diff = (x - y) ** 2
    return math.sqrt(squared_diff.mean())

def print_score(m):
    """Print [train RMSE, valid RMSE, train score, valid score] for the fitted model ``m``.

    The model's ``oob_score_`` is appended when it has one. Reads the notebook-level
    X_train / y_train / X_valid / y_valid.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    res = [rmse(m.predict(features), target) for features, target in splits]
    res += [m.score(features, target) for features, target in splits]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

### Experimenting with Random Forest

In [None]:
# Random forest baseline. oob_score=True gives the fitted model an `oob_score_`, which
# print_score appends to its output.
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True, random_state=42)
m_rf.fit(X_train, y_train)
print_score(m_rf)

### XGBoost

In [None]:
# XGBoost baseline, seeded like the random forest above.
m_xgb = XGBRegressor(n_estimators=160, learning_rate=0.05, random_state=42)
# using early_stop to find out where validation scores don't improve
#m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
# %time reports how long the fit takes.
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

# Feature Importance


In [None]:
# Random-forest feature importances, most important first.
fi = (
    pd.DataFrame({'feature': list(X_train.columns), 'importance': m_rf.feature_importances_})
    .sort_values('importance', ascending=False)
)

In [None]:
fi[:50]

# Label Encoding

In [None]:
# Deprecated, unnecessary.
# NOTE: an identical `select_encoding` is defined again further down in this notebook; that
# later definition shadows this one once its cell runs.
def select_encoding (df_all,encoding='onehot'):
    """Encode the categorical columns of ``df_all`` with the chosen scheme and return the frame.

    encoding='label'  : LabelEncoder on ordinal_features, categorical_features and all
                        object columns; the passed frame is modified in place.
    encoding='binary' : BinaryEncoder on categorical_features and all object columns.
    encoding='onehot' : pd.get_dummies on the whole frame (returns a new frame).

    Reads the notebook-level lists ``ordinal_features`` and ``categorical_features``,
    which are created in later cells, so this can only be called after them.
    """
    if encoding=='label':
        # Label Encoding
        cols=[]
        cols.extend(ordinal_features)
        cols.extend(categorical_features)
        cols.extend(df_all.select_dtypes(object).columns)
        # process columns, apply LabelEncoder to categorical features
        for c in cols:
            if c in df_all.columns:
                lbl = LabelEncoder() 
                lbl.fit(list(df_all[c].values)) 
                df_all[c] = lbl.transform(list(df_all[c].values))
    if encoding=='binary':
        # Binary Encoding
        cols=[]
        #cols.extend(ordinal_features)
        cols.extend(categorical_features)
        cols.extend(df_all.select_dtypes(object).columns)
        # process columns, apply BinaryEncoder to categorical features
        for c in cols:
            if c in df_all.columns:
                bnr = ce.binary.BinaryEncoder() 
                bnr.fit(list(df_all[c].values)) 
                # NOTE(review): the transform result is assigned to the single column c;
                # BinaryEncoder presumably returns several bit columns, in which case this
                # assignment cannot work -- verify before using this branch.
                df_all[c] = bnr.transform(list(df_all[c].values))
    if encoding=='onehot':
        df_all=pd.get_dummies(df_all)
    return df_all

In [None]:
def encoding_check_score(df):
    """Refit the notebook-level m_rf and m_xgb on a fresh split of ``df`` and print their scores.

    Each model gets a line [train RMSE, valid RMSE, train score, valid score]; the
    random-forest line also carries ``oob_score_`` when the model has one.
    """
    X_train, X_valid, y_train, y_valid = quick_get_dumm(df)

    def _scores(model):
        # Same layout as print_score, but on this function's local split.
        return [rmse(model.predict(X_train), y_train), rmse(model.predict(X_valid), y_valid),
                model.score(X_train, y_train), model.score(X_valid, y_valid)]

    # Random Forest
    m_rf.fit(X_train, y_train)
    print ('Random Forest Score: ')
    rf_res = _scores(m_rf)
    if hasattr(m_rf, 'oob_score_'):
        rf_res.append(m_rf.oob_score_)
    print(rf_res)

    # XGBoost
    m_xgb.fit(X_train, y_train)
    print ('XGBoost Score: ')
    print(_scores(m_xgb))

In [None]:
def encoding_measure (df, feature):
    """Print model scores with ``feature`` encoded four ways: as-is, one-hot, label, binary.

    ``df`` itself is never modified: every variant is built on its own copy and passed
    to ``encoding_check_score``, which refits the models and prints the scores.
    """
    # As Is encoding
    print (feature, 'Ordinal Encoding')
    encoding_check_score(df.copy())
    print ('\n\n')

    # OneHot encoding: turn the feature into strings so get_dummies expands it
    df_onehot = df.copy()
    df_onehot[feature] = df_onehot[feature].astype(str)
    df_onehot = pd.get_dummies(df_onehot)
    print (feature, 'OneHot Encoding')
    encoding_check_score(df_onehot)
    print ('\n\n')

    # Label Encoding
    df_le = df.copy()
    df_le[feature] = df_le[feature].astype(str)
    df_le[feature] = LabelEncoder().fit_transform(list(df_le[feature].values))
    print (feature, 'Label Encoding')
    encoding_check_score(df_le)
    print ('\n\n')

    # Binary Encoding
    df_be = df.copy()
    df_be[feature] = df_be[feature].astype(str)
    # BinaryEncoder expands one column into several bit columns, so let it replace
    # `feature` inside the frame; its multi-column output does not fit back into the
    # single `feature` column.
    df_be = ce.binary.BinaryEncoder(cols=[feature]).fit_transform(df_be)
    print (feature, 'Binary Encoding')
    encoding_check_score(df_be)
    print ('\n\n')

# Dealing with Ordinal values

## Ordinal Data Encoding

### Encoding quality columns with dictionary

In [None]:
ordinal_features=[]

In [None]:
""""
Encode Quality columns with:
Ex	Excellent
Gd	Good
TA	Average/Typical
Fa	Fair
Po	Poor
NA	No "Garage/Basement/Fireplace/..."

To decode we use same Disctionary as used in other dataset columns:
OverallCond: Rates the overall condition of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
"""

qual_cleanup = {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3,"Po": 2, "None": 0}

# Checking/Evaluation effectiveness (error) of different encoding approaches (AsIs, OneHot, Label, Binary)
for col in ('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual',
            'FireplaceQu','GarageQual','GarageCond','PoolQC'):
    df_all_tmp=df_all.copy()
    # Assign the mapped column back: replace(inplace=True) on df_all_tmp[col] acts on an
    # intermediate Series and is not guaranteed to update the frame.
    df_all_tmp[col] = df_all_tmp[col].replace(qual_cleanup).astype(float)
    # Guarded so re-running the cell does not add the same column twice.
    if col not in ordinal_features:
        ordinal_features.append(col)
    encoding_measure (df_all_tmp, feature=col)
    

In [None]:
# Chosing Ordinal Encoding for Ordinal Data as most effective
# Apply the quality mapping to df_all. The result is assigned back instead of using
# replace(inplace=True) on df_all[col], which is not guaranteed to update the frame.
for col in ('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual',
            'FireplaceQu','GarageQual','GarageCond','PoolQC'):
    df_all[col] = df_all[col].replace(qual_cleanup).astype(float)

In [None]:
np.unique(df_all['BsmtCond'])

In [None]:
df_all['BsmtCond'].value_counts()

In [None]:
# Re-split the current df_all and refit both models to see the effect of the quality encoding.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

In [None]:
""""
BsmtFinType1: Rating of basement finished area
BsmtFinType2: Rating of basement finished area (if multiple types)
       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinshed
       NA	No Basement
"""

qual_cleanup = {"GLQ": 10, "ALQ": 8, "BLQ": 6, "Rec": 4, "LwQ": 3,"Unf": 2, "None": 0}

# Checking/Evaluation effectiveness (error) of different encoding approaches (AsIs, OneHot, Label, Binary)
for col in ('BsmtFinType1','BsmtFinType2'):
    df_all_tmp=df_all.copy()
    # Assign the mapped column back: replace(inplace=True) on df_all_tmp[col] acts on an
    # intermediate Series and is not guaranteed to update the frame.
    df_all_tmp[col] = df_all_tmp[col].replace(qual_cleanup).astype(float)
    # Guarded so re-running the cell does not add the same column twice.
    if col not in ordinal_features:
        ordinal_features.append(col)
    encoding_measure (df_all_tmp, feature=col)

In [None]:
# Chosing Ordinal Encoding for Ordinal Data as most effective
# Apply the basement-finish mapping to df_all (assigned back; a chained
# replace(inplace=True) is not guaranteed to update the frame).
for col in ('BsmtFinType1','BsmtFinType2'):
    df_all[col] = df_all[col].replace(qual_cleanup).astype(float)

In [None]:
"""
BsmtExposure: Refers to walkout or garden level walls
       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Mimimum Exposure
       No	No Exposure
       NA	No Basement
"""
qual_cleanup = {"Gd": 10, "Av": 7, "Mn": 4, "No": 2, "None": 0}

# Checking/Evaluation effectiveness (error) of different encoding approaches (AsIs, OneHot, Label, Binary)
df_all_tmp=df_all.copy()
# Assign the mapped column back: a chained replace(inplace=True) is not guaranteed to
# update the frame.
df_all_tmp['BsmtExposure'] = df_all_tmp['BsmtExposure'].replace(qual_cleanup).astype(float)
# Guarded so re-running the cell does not add the same column twice.
if 'BsmtExposure' not in ordinal_features:
    ordinal_features.append('BsmtExposure')
encoding_measure (df_all_tmp, feature='BsmtExposure')

In [None]:
# Chosing Ordinal Encoding for Ordinal Data as most effective
# Apply the exposure mapping to df_all (assigned back; a chained replace(inplace=True)
# is not guaranteed to update the frame).
df_all['BsmtExposure'] = df_all['BsmtExposure'].replace(qual_cleanup).astype(float)

In [None]:
# Re-split and refit both models after the basement columns were encoded.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

#### Working on Functional (seems to decrease the score; not used for now)

In [None]:
np.unique(df_all['Functional'])

In [None]:
""""
Functional: Home functionality (Assume typical unless deductions are warranted)
       Typ	Typical Functionality
       Min1	Minor Deductions 1
       Min2	Minor Deductions 2
       Mod	Moderate Deductions
       Maj1	Major Deductions 1
       Maj2	Major Deductions 2
       Sev	Severely Damaged
       Sal	Salvage only

"""

qual_cleanup = {"Typ": 10, "Min1": 9, "Min2": 8, "Mod": 6, "Maj1": 4,"Maj2": 3, "Sev": 1, "Sal": 0}

# Checking/Evaluation effectiveness (error) of different encoding approaches (AsIs, OneHot, Label, Binary)
df_all_tmp=df_all.copy()
# Assign the mapped column back: a chained replace(inplace=True) is not guaranteed to
# update the frame.
df_all_tmp['Functional'] = df_all_tmp['Functional'].replace(qual_cleanup).astype(float)
# Guarded so re-running the cell does not add the same column twice.
if 'Functional' not in ordinal_features:
    ordinal_features.append('Functional')
encoding_measure (df_all_tmp, feature='Functional')

In [None]:
# Chosing Ordinal Encoding for Ordinal Data as most effective
# Apply the functionality mapping to df_all (assigned back; a chained
# replace(inplace=True) is not guaranteed to update the frame).
df_all['Functional'] = df_all['Functional'].replace(qual_cleanup).astype(float)

In [None]:
# Re-split and refit both models after Functional was encoded.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

In [None]:
df_all['Functional'].value_counts()

#### Working with GarageFinish (seems to decrease the score; not used for now)

In [None]:
np.unique(df_all['GarageFinish'])

In [None]:
qual_cleanup = {"Fin": 10, "RFn": 7, "Unf": 4, "None": 0}

df_all_tmp=df_all.copy()
# Assign the mapped column back: a chained replace(inplace=True) is not guaranteed to
# update the frame.
df_all_tmp['GarageFinish'] = df_all_tmp['GarageFinish'].replace(qual_cleanup).astype(float)
# Guarded so re-running the cell does not add the same column twice.
if 'GarageFinish' not in ordinal_features:
    ordinal_features.append('GarageFinish')
encoding_measure (df_all_tmp, feature='GarageFinish')

In [None]:
# Chosing Ordinal Encoding for Ordinal Data as most effective
# Apply the garage-finish mapping to df_all (assigned back; a chained
# replace(inplace=True) is not guaranteed to update the frame).
df_all['GarageFinish'] = df_all['GarageFinish'].replace(qual_cleanup).astype(float)

In [None]:
ordinal_features

In [None]:
# Re-split and refit both models with all ordinal columns encoded.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

# Dealing with Categorical values

In [None]:
def show_object_columns(df):
    for col in df:
        if is_string_dtype(df[col]):
            print(col)

In [None]:
show_object_columns(df_all)

In [None]:
categorical_features=[]

In [None]:
#CentralAir: two-valued Y/N column mapped to 1/0
CentralAir_cleanup = {"Y": 1, "N": 0}

# Assigned back (a chained replace(inplace=True) is not guaranteed to update df_all) and
# cast explicitly so the column is numeric whatever dtype replace() leaves behind.
df_all['CentralAir'] = df_all['CentralAir'].replace(CentralAir_cleanup).astype(int)
# Guarded so re-running the cell does not add the same column twice.
if 'CentralAir' not in categorical_features:
    categorical_features.append('CentralAir')
#df_all['CentralAir']=df_all['CentralAir'].astype(str)
encoding_measure (df_all, feature='CentralAir')

In [None]:
df_all.MSSubClass

In [None]:
# Transforming some numerical variables that are really categorical

# MSSubClass=The building class
"""
MSSubClass: Identifies the type of dwelling involved in the sale.	
        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES
"""
#df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)
#categorical_features.append('MSSubClass')
encoding_measure (df_all, feature='MSSubClass')

In [None]:
df_all.MSSubClass

In [None]:
# Changing OverallCond into a categorical variable
"""
OverallCond: Rates the overall condition of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
"""
#df_all['OverallCond'] = df_all['OverallCond'].astype(str)
#categorical_features.append('OverallCond')
encoding_measure (df_all, feature='OverallCond')

In [None]:
# Changing OverallQual into a categorical variable
"""
OverallQual: Rates the overall material and finish of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
"""
#df_all['OverallQual'] = df_all['OverallQual'].astype(str)
#categorical_features.append('OverallQual')
encoding_measure (df_all, feature='OverallQual')

In [None]:
# Year and month sold are transformed into categorical features.
#df_all['YrSold'] = df_all['YrSold'].astype(str)
#df_all['MoSold'] = df_all['MoSold'].astype(str)
#categorical_features.append('YrSold')
encoding_measure (df_all, feature='YrSold')
#categorical_features.append('MoSold')
encoding_measure (df_all, feature='MoSold')

In [None]:
#df_all['YearBuilt']=df_all['YearBuilt'].astype(str)
#categorical_features.append('YearBuilt')
encoding_measure (df_all, feature='YearBuilt')

#df_all['YearRemodAdd']=df_all['YearRemodAdd'].astype(str)
#categorical_features.append('YearRemodAdd')
encoding_measure (df_all, feature='YearRemodAdd')

#df_all['GarageYrBlt']=df_all['GarageYrBlt'].astype(str)
#categorical_features.append('GarageYrBlt')
encoding_measure (df_all, feature='GarageYrBlt')

In [None]:
# Full per-column listing; request it with an explicit boolean rather than a truthy object.
df_all.info(verbose=True)

In [None]:
# Re-split and refit both models after the categorical-encoding experiments.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

## Check numeric columns (if they are actually Categorical, like Year)

### Experimenting - heavily convert NUMERICAL to CATEGORICAL

# Using lists of quantitative and qualitative features

In [None]:
quantitative=['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

In [None]:
qualitative=['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']



In [None]:
df_all.select_dtypes(object).columns

In [None]:
ordinal_features

In [None]:
#categorical_features

# Features generation

In [None]:
#df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [None]:
# Random-forest reference score before any generated feature is added.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
# Years between construction / last remodel and the sale.
df_all['Age_Build'] = df_all['YrSold'].astype(int).sub(df_all['YearBuilt'].astype(int))
df_all['Age_Remod'] = df_all['YrSold'].astype(int).sub(df_all['YearRemodAdd'].astype(int))

In [None]:
# Random-forest score with the Age_Build / Age_Remod features added.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
#df_all['Sizes_Total']=df_all['GrLivArea']+df_all['GarageCars']+df_all['GarageArea']+df_all['TotalBsmtSF']+df_all['1stFlrSF']+df_all['2ndFlrSF']+df_all['OpenPorchSF']+df_all['MasVnrArea']
#df_all['Quantity_Total']=df_all['Fireplaces']+df_all['FullBath']+df_all['KitchenAbvGr']+df_all['TotRmsAbvGrd']+df_all['BedroomAbvGr']+df_all['BsmtFullBath']

In [None]:
# Random-forest feature importances, sorted descending; show the 30 least important.
fi = (
    pd.DataFrame({'feature': list(X_train.columns), 'importance': m_rf.feature_importances_})
    .sort_values('importance', ascending=False)
)
#fi[:20]
fi.tail(30)

In [None]:
# Years between garage construction and the sale.
# NOTE(review): missing GarageYrBlt values were filled with 0 earlier in this notebook, so
# those rows get an "age" equal to YrSold here -- confirm this is intended.
df_all['Garage_Age_Build']=df_all['YrSold'].astype(float)-df_all['GarageYrBlt'].astype(float)

In [None]:
# Random-forest score with Garage_Age_Build added.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
#df_all['Quality_Aggregated']=df_all['ExterQual'].astype(int)+df_all['ExterCond'].astype(int)+df_all['BsmtQual'].astype(int)+df_all['BsmtCond'].astype(int)+df_all['KitchenQual'].astype(int)+df_all['OverallQual'].astype(int)

In [None]:
# Random-forest score re-check; the Quality_Aggregated feature in the cell above is
# commented out, so df_all is unchanged since the previous fit.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
# Continue Feature generation here
#df_all['Basement']=df_all['TotalBsmtSF']+df_all['BsmtFinSF1']+df_all['BsmtFinSF2']-df_all['BsmtUnfSF'])*
#(df_all['BsmtQual']+df_all['BsmtCond']+df_all['BsmtFinType1']+df_all['BsmtExposure']+df_all['BsmtFinType2'])*
#df_all['BsmtFullBath']*0.5*df_all['BsmtHalfBath']



In [None]:
#Garage=
#House=

In [None]:
# Aggregated features: each is a plain row-wise sum of existing columns.
# skipna=False keeps the NaN behaviour of the `+` operator.
df_all['YrBltAndRemod'] = df_all[['YearBuilt', 'YearRemodAdd']].sum(axis=1, skipna=False)
df_all['TotalSF'] = df_all[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1, skipna=False)

df_all['Total_sqr_footage'] = df_all[
    ['BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF']
].sum(axis=1, skipna=False)

# Half baths count as 0.5.
df_all['Total_Bathrooms'] = (
    df_all['FullBath']
    + df_all['HalfBath'] * 0.5
    + df_all['BsmtFullBath']
    + df_all['BsmtHalfBath'] * 0.5
)

df_all['Total_porch_sf'] = df_all[
    ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
].sum(axis=1, skipna=False)

In [None]:
# 0/1 presence flags: 1 when the corresponding size / count column is positive.
df_all['haspool'] = (df_all['PoolArea'] > 0).astype('int64')
df_all['has2ndfloor'] = (df_all['2ndFlrSF'] > 0).astype('int64')
df_all['hasgarage'] = (df_all['GarageArea'] > 0).astype('int64')
df_all['hasbsmt'] = (df_all['TotalBsmtSF'] > 0).astype('int64')
df_all['hasfireplace'] = (df_all['Fireplaces'] > 0).astype('int64')

In [None]:
# Random-forest score with the aggregate and presence-flag features added.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

# Data examining

In [None]:
# Full per-column listing; request it with an explicit boolean rather than a truthy object.
df_all.info(verbose=True)

In [None]:
df_all.select_dtypes(object).columns

## Housing Crisis Data 2008-2009

# Feature Importance Dropping

In [None]:
# Drop features and missing-value indicators (presumably chosen from the
# feature-importance table above -- confirm).
df_all = df_all.drop(['Utilities', 'Street', 'PoolQC'], axis=1)
# All labels go into ONE list: drop() treats additional positional arguments as its
# other parameters (axis, index, ...), which then clash with the axis=1 keyword.
df_all = df_all.drop(['Utilities_missed', 'TotalBsmtSF_missed', 'SaleType_missed', 'MSZoning_missed',
                      'KitchenQual_missed', 'GarageCars_missed', 'GarageArea_missed', 'Exterior2nd_missed',
                      'Exterior1st_missed', 'BsmtFinSF2_missed', 'BsmtFullBath_missed', 'BsmtUnfSF_missed',
                      'BsmtHalfBath_missed', 'Functional_missed'], axis=1)


# Skewed data

# Normalization

# Label Encoding

In [None]:
def select_encoding (df_all,encoding='onehot'):
    """Encode the categorical columns of ``df_all`` with the chosen scheme and return the frame.

    Parameters
    ----------
    df_all : DataFrame to encode.
    encoding : 'label', 'binary' or 'onehot' (default). Any other value returns the
        frame unchanged.

        * 'label'  -- LabelEncoder on ordinal_features, categorical_features and all
          object columns. The passed frame is modified in place and returned.
        * 'binary' -- BinaryEncoder on categorical_features and all object columns.
          Each encoded column is replaced by its bit columns; a new frame is returned.
        * 'onehot' -- pd.get_dummies on the whole frame; a new frame is returned.

    Reads the notebook-level lists ``ordinal_features`` and ``categorical_features``.
    """
    if encoding=='label':
        # Label Encoding
        cols = list(ordinal_features) + list(categorical_features) \
            + list(df_all.select_dtypes(object).columns)
        # dict.fromkeys de-duplicates while keeping order, so no column is encoded twice.
        for c in dict.fromkeys(cols):
            if c in df_all.columns:
                df_all[c] = LabelEncoder().fit_transform(list(df_all[c].values))
    if encoding=='binary':
        # Binary Encoding
        cols = list(categorical_features) + list(df_all.select_dtypes(object).columns)
        cols = [c for c in dict.fromkeys(cols) if c in df_all.columns]
        if cols:
            # One encoder over the whole frame: BinaryEncoder turns each column into
            # several bit columns, which cannot be assigned back into a single column.
            df_all = ce.binary.BinaryEncoder(cols=cols).fit_transform(df_all)
    if encoding=='onehot':
        df_all=pd.get_dummies(df_all)
    return df_all

In [None]:
df_all=select_encoding(df_all,'label')

In [None]:
# Full per-column listing; request it with an explicit boolean rather than a truthy object.
df_all.info(verbose=True)

# Dummies

In [None]:
df_all.shape

In [None]:
df_all

In [None]:
# Divide the working DataFrame back into train and test after the categorical
# value engineering: the first `ntrain` rows are the training set, the
# remaining rows are the test set.
X_train_full = df_all.iloc[:ntrain]  # Train set
X_test = df_all.iloc[ntrain:]  # Test set

In [None]:
# Hold-out split of the training data. random_state is fixed so that the split
# (and every score computed from it) is reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.get_dummies(X_train_full), y_train_full, random_state=42
)

In [None]:
# Fit m_xgb on the hold-out training split, then report its score.
# NOTE(review): m_xgb and print_score are defined outside this section -- this
# cell depends on those earlier cells having been run.
m_xgb.fit(X_train, y_train)
print_score(m_xgb)

# Machine Learning

In [None]:
# Short aliases used by the modelling cells below.
# Note: y_train is rebound here from the hold-out split target to the full target.
train = X_train_full
y_train = y_train_full
test = X_test

In [None]:
# Shapes of the training features, the training target and the test features.
train.shape, y_train.shape, test.shape

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# (rows, columns) of the training feature frame.
train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
#Validation function
n_folds = 4

def rmsle_cv(model):
    """Cross-validated RMSE of ``model`` on the notebook-level training data.

    Uses the globals ``train`` (feature DataFrame) and ``y_train`` (target).
    Because the target was log1p-transformed at load time, this RMSE is the
    competition's RMSLE on the original price scale.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor; cross_val_score clones it
        for every fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` entries).
    """
    # Pass the KFold object itself. Calling .get_n_splits() returns the plain
    # integer n_folds, which made cross_val_score fall back to an unshuffled
    # KFold and silently dropped shuffle=True / random_state=42.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [None]:
# Lasso regression preceded by a RobustScaler step.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)


In [None]:
# Elastic Net (L1-dominated: l1_ratio=0.9) preceded by a RobustScaler step.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=3),
)


In [None]:
# Kernel ridge regression with a degree-2 polynomial kernel.
KRR = KernelRidge(
    alpha=0.6,
    kernel='polynomial',
    degree=2,
    coef0=2.5,
)


In [None]:
# Gradient boosting with the Huber loss.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

In [None]:
# XGBoost regressor. `silent` and `nthread` are deprecated in the scikit-learn
# wrapper; `verbosity=0` (no messages) and `n_jobs=-1` (all cores) are their
# supported replacements.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state=7, n_jobs=-1)

In [None]:
# LightGBM regressor with small trees (5 leaves) and fixed
# feature/bagging sampling seeds.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [None]:
# Cross-validated RMSE of the Lasso pipeline: mean (std) over the folds.
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Cross-validated RMSE of the Elastic Net pipeline: mean (std) over the folds.
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Cross-validated RMSE of kernel ridge regression: mean (std) over the folds.
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Cross-validated RMSE of gradient boosting: mean (std) over the folds.
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Cross-validated RMSE of the XGBoost model: mean (std) over the folds.
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
# Cross-validated RMSE of the LightGBM model: mean (std) over the folds.
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
## Stacking models

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the plain mean of its base models.

    Parameters
    ----------
    models : sequence of estimators
        Prototype regressors. They are cloned in ``fit``, so the objects
        passed in are never fitted themselves.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on (X, y) and return self."""
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)

        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for X."""
        per_model = [estimator.predict(X) for estimator in self.models_]
        return np.column_stack(per_model).mean(axis=1)

In [None]:
# Simple average of four base models, scored with the same CV routine.
averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta-model.

    Parameters
    ----------
    base_models : sequence of estimators
        Prototype first-level regressors (cloned once per fold in ``fit``).
    meta_model : estimator
        Prototype second-level regressor trained on the out-of-fold
        predictions of the base models.
    n_folds : int, default 5
        Number of KFold splits used to build the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model.

        X and y may be numpy arrays or pandas objects; they are converted to
        arrays first because the fold indices produced by KFold are positional.
        Indexing a pandas Series/DataFrame with them would do label-based
        lookup and fail (or select the wrong rows) whenever the index is not
        0..n-1, e.g. for the subsets handed over by cross_val_score.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column i holds base model i's prediction for each row, always made by
        # a clone that did not see that row during training.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        # Train the cloned meta-model using the out-of-fold predictions as features.
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on averaged base-model predictions.

        For each base model, the predictions of its per-fold clones are
        averaged into one meta-feature column.
        """
        X = np.asarray(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
# Stack ENet, GBoost and KRR under a Lasso meta-model and score the stack.
stacked_averaged_models = StackingAveragedModels(
    base_models=(ENet, GBoost, KRR),
    meta_model=lasso,
)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(np.mean(score), np.std(score)))

In [None]:
## Ensembling StackedRegressor, XGBoost and LightGBM

In [None]:
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between y and y_pred.

    No log is applied here: the value is an RMSLE only when both arguments
    are already on the log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
# Fit the stack on the full training data (fit returns the estimator itself),
# predict train in log space and test back on the price scale via expm1.
stacked_train_pred = stacked_averaged_models.fit(train.values, y_train).predict(train.values)
stacked_test_log_pred = stacked_averaged_models.predict(test.values)
stacked_pred = np.expm1(stacked_test_log_pred)
print(rmsle(y_train, stacked_train_pred))

In [None]:
# Fit XGBoost on the full training data, then predict train (log space)
# and test (converted back with expm1).
xgb_train_pred = model_xgb.fit(train, y_train).predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

In [None]:
# Fit LightGBM on the full training data, then predict train (log space)
# and test (converted back with expm1). The model is fitted on a DataFrame,
# so the test set is passed as a DataFrame too instead of a bare ndarray.
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test))
print(rmsle(y_train, lgb_train_pred))

In [None]:
# RMSE on the entire train data when averaging the three models
# (0.70 stacked + 0.15 XGBoost + 0.15 LightGBM).
print('RMSLE score on train data:')
print(rmsle(
    y_train,
    stacked_train_pred*0.70 + xgb_train_pred*0.15 + lgb_train_pred*0.15,
))

In [None]:
# Weighted blend of the three models' test predictions (0.70 / 0.15 / 0.15).
# Each *_pred input was already passed through np.expm1 when it was computed.
y_pred = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15

# Predictions for submission

## Predicting

# Submission

In [None]:
# Build the submission frame (Id, SalePrice) and write it to CSV.
submission_path = 'submittions/submission_29Aug19.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout would otherwise fail here).
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = y_pred
sub.to_csv(submission_path, index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()