# Initial Setup and Data Load

In [None]:
%load_ext autoreload
%autoreload 2
import os

%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from scipy.stats import norm, skew

import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import category_encoders as ce
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_log_error

import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
df_raw = pd.read_feather('tmp/house-after-cleaning')
test_ID = pd.read_feather('tmp/house-testDF-idxs')

#df_all = pd.read_feather('tmp/house-after-cleaning')
#y_train_full= pd.read_feather('tmp/house-y-after-cleaning')

# DataFrame Y separation, setting split values

In [None]:
#Dividing Target column (Y)
y_train_full = df_raw.SalePrice.values
df_all=df_raw.drop(['SalePrice'], axis=1)


In [None]:
ntrain = 1460
ntest = 1459
y_train_full=y_train_full[:ntrain]

# Pre-Evaluation - benchmarking before Feature Generation

## Evaluation

In [None]:
def rmse(x, y):
    """Return the root-mean-squared difference between two array-likes.

    ``x`` and ``y`` must support element-wise subtraction and the result must
    expose ``.mean()`` (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print RMSE and ``m.score`` on the current train/validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. The printed list is
    ``[rmse_train, rmse_valid, score_train, score_valid]`` followed by
    ``m.oob_score_`` when the fitted model exposes it.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    res = [rmse(m.predict(features), target) for features, target in splits]
    res += [m.score(features, target) for features, target in splits]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)


## Making Training, Validation, Test Dataset

In [None]:
def div_train_test(df, n_train=None):
    """Divide the working DataFrame back into its train and test parts.

    The original body contained only a docstring and commented-out code, so
    the function returned ``None``; it now performs the positional split that
    the notebook otherwise repeats inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame with the training rows first.
    n_train : int, optional
        Number of leading rows that belong to the training set. Defaults to
        the notebook-level ``ntrain``.

    Returns
    -------
    (X_train_full, X_test) : tuple of pandas.DataFrame
        ``df.iloc[:n_train]`` and ``df.iloc[n_train:]``.
    """
    if n_train is None:
        n_train = ntrain
    return df.iloc[:n_train], df.iloc[n_train:]

X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [None]:
df_all.shape, y_train_full.shape, X_test.shape, X_train_full.shape

In [None]:
def quick_get_dumm(df, random_state=42):
    """One-hot encode the training rows of ``df`` and split them for benchmarking.

    Only the first ``ntrain`` rows (notebook-level constant) are used; they are
    paired with the notebook-level ``y_train_full`` targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined train+test feature frame with the training rows first.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. With a fixed seed every call
        produces the same train/validation partition, so the scores printed
        after each feature-engineering step are comparable with each other.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    X_train, X_valid, y_train, y_valid
    """
    X_train_full = df.iloc[:ntrain]  # Train set
    X_train, X_valid, y_train, y_valid = train_test_split(
        pd.get_dummies(X_train_full), y_train_full, random_state=random_state)
    return X_train, X_valid, y_train, y_valid
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)

## ML

### Experimenting with Random Forest

In [None]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

### XGBoost

In [None]:
m_xgb = XGBRegressor(n_estimators=160, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
#m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

# Feature Importance

In [None]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [None]:
fi[:50]

# Dealing with Ordinal values

## Ordinal Data Encoding

### Encoding quality columns with dictionary

In [None]:
""""
Encode Quality columns with:
Ex	Excellent
Gd	Good
TA	Average/Typical
Fa	Fair
Po	Poor
NA	No "Garage/Basement/Fireplace/..."

To decode we use same Disctionary as used in other dataset columns:
OverallCond: Rates the overall condition of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
"""

# Ordinal scale for the quality/condition columns; "None" marks an absent feature.
qual_cleanup = {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3,"Po": 2, "None": 0}

for col in ('ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageQual',
            'GarageCond','PoolQC'):
    # Assign the result back instead of `df_all[col].replace(..., inplace=True)`:
    # the in-place call acts on the object returned by `df_all[col]`, which is not
    # guaranteed to write through to `df_all` (it does not under pandas copy-on-write).
    df_all[col] = df_all[col].replace(qual_cleanup).astype(str)


In [None]:
np.unique(df_all['BsmtCond'])

In [None]:
df_all['BsmtCond'].value_counts()

In [None]:
# Re-split and re-benchmark both models after encoding the quality columns.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
""""
BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinshed
       NA	No Basement
"""

# Ordinal scale for the basement finish ratings; "None" marks no basement.
qual_cleanup = {"GLQ": 10, "ALQ": 8, "BLQ": 6, "Rec": 4, "LwQ": 3,"Unf": 2, "None": 0}

for col in ('BsmtFinType1','BsmtFinType2'):
    # Assign back rather than calling replace(..., inplace=True) on `df_all[col]`,
    # which is not guaranteed to modify `df_all` itself.
    df_all[col] = df_all[col].replace(qual_cleanup).astype(str)
"""
BsmtExposure: Refers to walkout or garden level walls
       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Mimimum Exposure
       No	No Exposure
       NA	No Basement
"""
qual_cleanup = {"Gd": 10, "Av": 7, "Mn": 4, "No": 2, "None": 0}

# The string cast previously targeted `df_all[col]`, where `col` was left over from
# the BsmtFinType loop above, so BsmtExposure itself was never cast. Name the column
# explicitly and assign the replaced values back.
df_all['BsmtExposure'] = df_all['BsmtExposure'].replace(qual_cleanup).astype(str)

In [None]:
# Re-split and re-benchmark both models after encoding the basement columns.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
# `show_object_columns` is only defined further down (section "Dealing with
# Categorical values"), so calling it here raises NameError on a fresh kernel.
# List the string-typed columns inline instead.
for obj_col in df_all:
    if is_string_dtype(df_all[obj_col]):
        print(obj_col)

#### Working on Functional (seems to decrease the score, not used for now)

In [None]:
np.unique(df_all['Functional'])

In [None]:
""""
Functional: Home functionality (Assume typical unless deductions are warranted)
       Typ	Typical Functionality
       Min1	Minor Deductions 1
       Min2	Minor Deductions 2
       Mod	Moderate Deductions
       Maj1	Major Deductions 1
       Maj2	Major Deductions 2
       Sev	Severely Damaged
       Sal	Salvage only

"""

qual_cleanup = {"Typ": 10, "Min1": 9, "Min2": 8, "Mod": 6, "Maj1": 4,"Maj2": 3, "Sev": 1, "Sal": 0}

# The string cast previously used a stale `col` left over from an earlier loop, so
# 'Functional' was never cast. Name the column explicitly and assign the result back.
df_all['Functional'] = df_all['Functional'].replace(qual_cleanup).astype(str)

In [None]:
# Re-split and re-benchmark both models after encoding 'Functional'.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
df_all['Functional'].value_counts()

#### Working with GarageFinish (seems to decrease the score, not used for now)

In [None]:
np.unique(df_all['GarageFinish'])

In [None]:
qual_cleanup = {"Fin": 10, "RFn": 7, "Unf": 4, "None": 0}

# The string cast previously used a stale `col` left over from an earlier loop, so
# 'GarageFinish' was never cast. Name the column explicitly and assign the result back.
df_all['GarageFinish'] = df_all['GarageFinish'].replace(qual_cleanup).astype(str)

In [None]:
# Re-split and re-benchmark both models after encoding 'GarageFinish'.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

# Dealing with Categorical values

In [None]:
def show_object_columns(df):
    """Print, one per line, the name of every column of ``df`` with a string dtype."""
    string_columns = [name for name in df if is_string_dtype(df[name])]
    for name in string_columns:
        print(name)

In [None]:
show_object_columns(df_all)

In [None]:
show_object_columns(df_all)

In [None]:
# Transforming some numerical variables that are really categorical

# MSSubClass=The building class
"""
MSSubClass: Identifies the type of dwelling involved in the sale.	
        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES
"""
df_all['MSSubClass'] = df_all['MSSubClass'].astype(str)


# Changing OverallCond into a categorical variable
"""
OverallCond: Rates the overall condition of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
"""
df_all['OverallCond'] = df_all['OverallCond'].astype(str)

# Changing OverallQual into a categorical variable
"""
OverallQual: Rates the overall material and finish of the house
       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
"""

# Sale year/month and the construction/remodel years are treated as categorical
# labels, so each is cast to str.
for year_like in ('YrSold', 'MoSold', 'YearBuilt', 'YearRemodAdd'):
    df_all[year_like] = df_all[year_like].astype(str)

In [None]:
df_all.info()

In [None]:
# convert object columns to categorical
def conv_obj_to_categories(df):
    """
    Cast every string-typed column of ``df`` to the pandas 'category' dtype.

    The frame is modified in place; nothing is returned.
    """
    string_columns = [name for name in df if is_string_dtype(df[name])]
    for name in string_columns:
        df[name] = df[name].astype('category')


In [None]:
# Re-split and re-benchmark both models after casting numeric codes/years to str.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
#conv_obj_to_categories(df_all)

In [None]:
def show_categorical_columns(df):
    """
    Print, for every categorical column: number of categories, column name, categories.

    Categories of a categorical dtype are unique, so their count is simply
    ``len(categories)`` (the previous ``sum(np.unique(..., return_counts=True)[1])``
    computed the same number indirectly). The dtype test uses
    ``isinstance(dtype, pd.CategoricalDtype)`` instead of the deprecated
    ``is_categorical_dtype`` helper.
    """
    for col in df:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            categories = df[col].cat.categories
            print(len(categories), col, categories)

In [None]:
show_categorical_columns(df_all)

In [None]:
def unique_categories(df,n=float("inf")):
    """
    Print the name and the number of categories of each categorical column.

    df - DataFrame
    n - only columns with fewer than n categories are printed;
        the default (infinity) prints every categorical column
    """
    for col in df:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            # Categories are unique, so their count is just the length of the index.
            n_categories = len(df[col].cat.categories)
            if n_categories < n:
                print(col, n_categories)

In [None]:
unique_categories(df_all)

In [None]:
# Re-split and re-benchmark both models on the current df_all.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
# Converting to int for work in feature generation

In [None]:
#for col in ('ExterCond','ExterQual','KitchenQual'):
#    df_all[col]=df_all[col].astype('int')

## Check numeric columns (if they are actually Categorical, like Year)

### Experimenting - heavily convert NUMERICAL to CATEGORICAL

# Features generation

In [None]:
df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [None]:
# Re-split and re-benchmark both models after adding TotalSF.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
# YrSold, YearBuilt and YearRemodAdd were cast to str earlier in the notebook, so they
# are cast back to int here before subtracting to obtain ages at sale time.
df_all['Age_Build']=df_all['YrSold'].astype(int)-df_all['YearBuilt'].astype(int)
df_all['Age_Remod']=df_all['YrSold'].astype(int)-df_all['YearRemodAdd'].astype(int)

In [None]:
# Re-split and re-benchmark both models after adding Age_Build / Age_Remod.
X_train, X_valid, y_train, y_valid = quick_get_dumm(df_all)
for model in (m_rf, m_xgb):
    model.fit(X_train, y_train)
    print_score(model)

In [None]:
# Row-wise sum of the listed size columns.
# NOTE(review): GarageCars is added alongside area columns — confirm this is intended.
df_all['Sizes_Total'] = (
    df_all['GrLivArea']
    + df_all['GarageCars']
    + df_all['GarageArea']
    + df_all['TotalBsmtSF']
    + df_all['1stFlrSF']
    + df_all['2ndFlrSF']
    + df_all['OpenPorchSF']
    + df_all['MasVnrArea']
)
# Row-wise sum of the listed count columns.
df_all['Quantity_Total'] = (
    df_all['Fireplaces']
    + df_all['FullBath']
    + df_all['KitchenAbvGr']
    + df_all['TotRmsAbvGrd']
    + df_all['BedroomAbvGr']
    + df_all['BsmtFullBath']
)

In [None]:
# Re-split and re-benchmark after adding Sizes_Total / Quantity_Total.
# Only the Random Forest is refit here; the XGBoost lines are disabled.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)
fi[:20]

In [None]:
df_all['Garage_Age_Build']=df_all['YrSold'].astype(int)-df_all['GarageYrBlt']

In [None]:
# Re-split and re-benchmark after adding Garage_Age_Build.
# Only the Random Forest is refit here; the XGBoost lines are disabled.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
#df_all['Quality_Aggregated']=df_all['ExterQual'].astype(int)+df_all['ExterCond'].astype(int)+df_all['BsmtQual'].astype(int)+df_all['BsmtCond'].astype(int)+df_all['KitchenQual'].astype(int)+df_all['OverallQual'].astype(int)

In [None]:
# The Quality_Aggregated feature in the preceding cell is commented out, so this
# benchmark runs on the same df_all columns as the previous one.
# Only the Random Forest is refit here; the XGBoost lines are disabled.
X_train, X_valid, y_train, y_valid=quick_get_dumm(df_all)
m_rf.fit(X_train, y_train)
print_score(m_rf)
#m_xgb.fit(X_train, y_train)
#print_score(m_xgb)

In [None]:
# Continue Feature generation here
#df_all['Basement']=df_all['TotalBsmtSF']+df_all['BsmtFinSF1']+df_all['BsmtFinSF2']-df_all['BsmtUnfSF'])*
#(df_all['BsmtQual']+df_all['BsmtCond']+df_all['BsmtFinType1']+df_all['BsmtExposure']+df_all['BsmtFinType2'])*
#df_all['BsmtFullBath']*0.5*df_all['BsmtHalfBath']



In [None]:
#Garage=
#House=

## Housing Crisis Data 2008-2009

# Feature Importance Dropping

In [None]:
fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_rf.feature_importances_}).sort_values('importance',ascending=False)

In [None]:
fi[:50]

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
def find_features_to_drop(X_train, X_valid, y_train, y_valid, grades=None):
    """Backward feature elimination driven by Random Forest importances.

    A forest is refit once per iteration on the remaining features, the
    validation RMSE is recorded, and the least important feature is dropped
    from both frames. The feature set with the lowest validation RMSE wins.

    Parameters
    ----------
    X_train, X_valid : pandas.DataFrame
        Feature frames with identical columns (the same split used elsewhere
        for training/evaluation). The caller's frames are not modified;
        ``drop`` returns new frames.
    y_train, y_valid : array-like
        Matching targets.
    grades : dict, optional
        When given, it is filled with ``{number_of_features: validation_rmse}``
        for every iteration, e.g. for plotting the elimination curve.

    Returns
    -------
    pandas.Index
        Columns that are absent from the best-scoring feature set, i.e. the
        features to drop. Empty if the full feature set scored best.
    """
    m_feature_to_drop = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=False)

    num_of_features = X_train.shape[1]
    list_of_original_columns = X_train.columns

    # Start from +inf rather than 1: with an initial best of 1, a run whose RMSE never
    # fell below 1 left `best_num_of_features` unbound and crashed at the final print.
    best_grade = float("inf")
    best_num_of_features = num_of_features
    list_of_feature_to_drop = list_of_original_columns[:0]  # empty Index, same type as the result

    for iteration in range(num_of_features):
        remaining = num_of_features - iteration

        # Fit on the features that survived the previous iterations and score them.
        m_feature_to_drop.fit(X_train, y_train)
        grade = math.sqrt(mean_squared_error(y_valid, m_feature_to_drop.predict(X_valid)))
        if grades is not None:
            grades[remaining] = grade

        # Importances of the current model, most important first.
        fi = pd.DataFrame({'feature': list(X_train.columns), 'importance':m_feature_to_drop.feature_importances_}).sort_values('importance',ascending=False)

        if grade < best_grade:
            best_grade = grade
            best_num_of_features = remaining
            # Everything already eliminated at this point is what should be dropped.
            list_of_feature_to_drop = list_of_original_columns.difference(fi.feature)

        # Drop the single least important feature for the next round.
        least_important = fi.feature[-1:]
        X_train = X_train.drop(columns=least_important)
        X_valid = X_valid.drop(columns=least_important)

        print(remaining, grade, least_important)
    print(best_grade, best_num_of_features)
    return list_of_feature_to_drop

In [None]:
#features_to_drop=find_features_to_drop(X_train, X_valid, y_train, y_valid)

In [None]:
# NOTE(review): `features_to_drop` is only assigned by the find_features_to_drop(...)
# call, which is commented out in the preceding cell; on a fresh kernel this cell
# raises NameError.
features_to_drop
#fi.feature==fi.feature

In [None]:
# NOTE(review): `grades` is not assigned by any visible cell of this notebook, so this
# cell raises NameError on a fresh kernel.
# Presumably it is meant to map number-of-features -> validation RMSE — verify.
x=list(grades.keys())
y=list(grades.values())

ax = plt.axes()
plt.plot(x,y)
plt.show()

In [None]:

ax = plt.axes()
plt.xlim(150,300)
plt.ylim(0.133,0.1350)
plt.plot(x,y)
plt.show()

In [None]:
df_all.shape

In [None]:
#df_all=df_all.drop(columns=features_to_drop)
#df_all=df_all.drop(columns=fi.feature[150:])

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set
# df_all still contains string-typed columns at this point (the encoded quality
# columns, MSSubClass, YrSold, ... were cast to str above), which the
# RandomForestRegressor fitted in the next section cannot consume. One-hot encode
# before splitting, as quick_get_dumm does for every other benchmark.
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)


In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

# Self-made and experimental evaluation techniques

In [None]:
# Evaluation of simple Random Forest
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
#print_score(m)

In [None]:

math.sqrt(mean_squared_error(y_valid, m.predict(X_valid)))

In [None]:
# if you need to evaluate LOG Root mean squared error but wouldn't like to convert y to log(y)

In [None]:
math.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(m.predict(X_valid))))

# Dealing with Outliers

### -> To delete outliers

# Features engineering

In [None]:
"""#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(df_train.shape))
print("The test data size before dropping Id feature is : {} ".format(df_test.shape))

#Save the 'Id' column
train_ID = df_train['Id']
test_ID = df_test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
print("The test data size after dropping Id feature is : {} ".format(df_test.shape))
"""

## Correlation

In [None]:
plt.figure(figsize=(10, 10))
# `df_train` is never defined in this notebook (NameError). Use the training rows of
# the loaded frame instead; it still contains SalePrice. Assumes the first `ntrain`
# rows of df_raw are the training rows, as the splits above do — TODO confirm.
# Only numeric columns are correlated, since corr() cannot handle string columns.
sns.heatmap(df_raw.iloc[:ntrain].select_dtypes(include='number').corr())
#plt.plot()

In [None]:
"""
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

sns.heatmap(df_train.corr(), ax=axs[0], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})
sns.heatmap(df_test.corr(), ax=axs[1], annot=True, square=True, cmap='coolwarm', annot_kws={'size': 14})

for i in range(2):    
    axs[i].tick_params(axis='x', labelsize=14)
    axs[i].tick_params(axis='y', labelsize=14)
    
axs[0].set_title('Training Set Correlations', size=15)
axs[1].set_title('Test Set Correlations', size=15)

plt.show()
"""

In [None]:
"""
df_corr=df_train.corr().sort_values(kind="quicksort", ascending=False, by='SalePrice').abs()
df_corr.drop(axis=1, columns=df_corr.columns.drop('SalePrice'), inplace=True)
df_corr
"""

# Skewed data

In [None]:
"""numeric_feats = df_all.dtypes[df_all.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = df_all[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
"""

In [None]:
"""
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    df_all[feat] = boxcox1p(df_all[feat], lam)
"""

# Normalization

In [None]:
#Normalization, the Sigmoid, Log, Cube Root and the Hyperbolic Tangent. 
#It all depends on what one is trying to accomplish.

In [None]:
#df_all.info()

# Label Encoding

In [None]:
#from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
# Replace each listed column with the integer codes produced by a LabelEncoder
# fitted on that column's own values.
for c in cols:
    column_values = list(df_all[c].values)
    lbl = LabelEncoder().fit(column_values)
    df_all[c] = lbl.transform(column_values)


# Binary Encoding

In [None]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply BinaryEncoder to categorical features
#for c in cols:
#    bnr = ce.binary.BinaryEncoder() 
#    bnr.fit(list(df_all[c].values)) 
#    df_all[c] = bnr.transform(list(df_all[c].values))

# Dummies

In [None]:
df_all=pd.get_dummies(df_all)

In [None]:
"""Dividing working DataFrame back to Train and Test"""
# split Validational/Test set from Training set after Categorical Value Engeneering
#def original_train_test(df_all):
X_test=df_all.iloc[ntrain:] # Test set
X_train_full=df_all.iloc[:ntrain] # Train set

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.get_dummies(X_train_full), y_train_full)

# Machine Learning

### Experimenting with Random Forest

In [None]:
m_rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=1, max_features=0.5, n_jobs=-1, oob_score=True)
m_rf.fit(X_train, y_train)
print_score(m_rf)

### XGBoost

In [None]:
m_xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# using early_stop to find out where validation scores don't improve
m_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
%time m_xgb.fit(X_train, y_train)
print_score(m_xgb)

### GBDT (Gradient Boosting Decision Tree)

In [None]:
m_gbdt=GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05)
%time m_gbdt.fit(X_train, y_train)
print_score(m_gbdt)

# Stacking

# Testing stacking from Kaggle

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin,clone

In [None]:
from sklearn.model_selection import KFold, cross_val_score
n_folds=2
def rmsle_cv(model):
    """Cross-validated RMSE of ``model`` on the full training set.

    Uses the notebook-level ``X_train_full`` / ``y_train_full`` and ``n_folds``.
    Returns an array with one RMSE per fold.

    The KFold object itself is passed as ``cv``. Previously
    ``KFold(...).get_n_splits(...)`` was passed, which is just the integer
    ``n_folds``; cross_val_score then built its own unshuffled splitter and the
    ``shuffle=True, random_state=42`` settings were silently ignored.
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    fold_rmse = np.sqrt(-cross_val_score(model, X_train_full.values, y_train_full,
                                         scoring="neg_mean_squared_error", cv=kf))
    return fold_rmse

In [None]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Each base model is cloned and fitted once per fold; the out-of-fold
    predictions of the base models become the features on which a clone of
    the meta-model is trained. At prediction time the per-fold clones of each
    base model are averaged to form the meta-features.

    Parameters
    ----------
    base_models : sequence of estimators
        First-level regressors (cloned, never fitted directly).
    meta_model : estimator
        Second-level regressor trained on the out-of-fold predictions.
    n_folds : int, default 5
        Number of KFold splits used to build the out-of-fold predictions.
    """
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _take_rows(data, index):
        """Select rows by position from a pandas object (``.iloc``) or an array-like."""
        return data.iloc[index] if hasattr(data, "iloc") else data[index]

    # We again fit the data on clones of the original models
    def fit(self, X, y):
        """Fit the fold clones of every base model, then the meta-model.

        ``X`` and ``y`` may be NumPy arrays or pandas objects; rows are always
        selected by position (plain ``X[train_index]`` on a DataFrame would be
        a column lookup). Returns ``self``.
        """
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(self._take_rows(X, train_index), self._take_rows(y, train_index))
                y_pred = instance.predict(self._take_rows(X, holdout_index))
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    #Do the predictions of all base models on the test data and use the averaged predictions as
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        """Predict with the meta-model on the fold-averaged base-model predictions."""
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, lasso),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

In [None]:
def rmsle(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``.

    No logarithm is applied here; the value is an RMSLE only if the inputs are
    already on a log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
stacked_averaged_models.fit(X_train_full.values, y_train_full)
stacked_train_pred = stacked_averaged_models.predict(X_train_full.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(X_test.values))
print(rmsle(y_train_full, stacked_train_pred))

In [None]:
m_xgb.fit(X_train_full, y_train_full)
xgb_train_pred = m_xgb.predict(X_train_full)
xgb_pred = np.expm1(m_xgb.predict(X_test))
print(rmsle(y_train_full, xgb_train_pred))

In [None]:
m_rf.fit(X_train_full, y_train_full)
rf_train_pred = m_rf.predict(X_train_full)
rf_pred = np.expm1(m_rf.predict(X_test.values))
print(rmsle(y_train_full, rf_train_pred))

In [None]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train_full,stacked_train_pred*0.7 +
               xgb_train_pred*0.15+rf_train_pred*0.15))

In [None]:
y_pred = stacked_pred*0.7 +xgb_pred*0.15+rf_pred*0.15

In [None]:
y_pred

# Predictions for submission

In [None]:
### XGBoost

In [None]:
m_final_xgb = XGBRegressor(n_estimators=2000, learning_rate=0.05)
m_final_xgb.fit(X_train_full, y_train_full)

## Predicting

In [None]:
# NOTE(review): this rebinds `y_pred`, replacing the stacked/XGB/RF blend computed in
# the Stacking section, so the submission below contains the XGBoost-only prediction.
# expm1 presumably undoes a log1p transform of SalePrice applied upstream — TODO confirm.
y_pred = np.expm1(m_final_xgb.predict(X_test)); y_pred

# Submission

In [None]:
# Build the submission frame: the saved test ids next to the predicted prices.
sub = pd.DataFrame()
sub['Id'] = test_ID.T.squeeze()
sub['SalePrice'] = y_pred
# to_csv does not create missing directories; make sure the output folder exists.
os.makedirs('submittions', exist_ok=True)
sub.to_csv('submittions/submission_29Aug19.csv',index=False)

In [None]:
sub.head()