In [144]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from empiricaldist import Pmf, Cdf
from matplotlib.ticker import MaxNLocator
%matplotlib inline  
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from feature_engine.categorical_encoders import RareLabelCategoricalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
import scipy.stats as stats
from feature_engine import variable_transformers as vt
from sklearn.preprocessing import StandardScaler

In [145]:
train_data = pd.read_csv("train.csv")

In [146]:
# transform type

train_data[["MoSold", "MSSubClass"]] = train_data[["MoSold", "MSSubClass"]].astype(str)

In [147]:
# separate object and numerical columns. Separate area and surface area columns and year columns as wel

cat_cols = [x for x in train_data.columns if train_data[x].dtype==np.object]
num_cols = [x for x in train_data.columns if train_data[x].dtype!=np.object]
area_cols = [x for x in train_data.columns if ("SF" in x)|("Area" in x)|(x=="LotFrontage")]
year_cols = [x for x in num_cols if 'Yr' in x or 'Year' in x]

In [148]:
# split train and test

X_train, X_test, y_train, y_test = train_test_split(train_data.drop(['Id', 'SalePrice'], axis=1),
                                                    train_data['SalePrice'],
                                                    test_size=0.1,
                                                    random_state=0)

X_train.shape, X_test.shape


((1314, 79), (146, 79))

In [149]:
# function to calculate elapsed time

def elapsed_years(df, cols):
    # capture difference between year col and
    # year the house was sold
    for col in cols:
        if col=='YrSold':
            continue
        df[col] = df['YrSold'] - df[col]
   
    return df

X_train = elapsed_years(X_train, year_cols)
X_test = elapsed_years(X_test, year_cols)

In [150]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1314 entries, 930 to 684
Data columns (total 79 columns):
MSSubClass       1314 non-null object
MSZoning         1314 non-null object
LotFrontage      1081 non-null float64
LotArea          1314 non-null int64
Street           1314 non-null object
Alley            81 non-null object
LotShape         1314 non-null object
LandContour      1314 non-null object
Utilities        1314 non-null object
LotConfig        1314 non-null object
LandSlope        1314 non-null object
Neighborhood     1314 non-null object
Condition1       1314 non-null object
Condition2       1314 non-null object
BldgType         1314 non-null object
HouseStyle       1314 non-null object
OverallQual      1314 non-null int64
OverallCond      1314 non-null int64
YearBuilt        1314 non-null int64
YearRemodAdd     1314 non-null int64
RoofStyle        1314 non-null object
RoofMatl         1314 non-null object
Exterior1st      1314 non-null object
Exterior2nd      1314 no

In [151]:
# lets drop YrSold

X_train.drop(['YrSold', 'MiscVal'], axis=1, inplace=True)
X_test.drop(['YrSold', 'MiscVal'], axis=1, inplace=True)

In [152]:
# lets check missing values
for col in X_train.columns:
    if X_train[col].isnull().sum() > 0:
        print(col, X_train[col].isnull().mean())

LotFrontage 0.17732115677321156
Alley 0.9383561643835616
MasVnrType 0.0045662100456621
MasVnrArea 0.0045662100456621
BsmtQual 0.0243531202435312
BsmtCond 0.0243531202435312
BsmtExposure 0.02511415525114155
BsmtFinType1 0.0243531202435312
BsmtFinType2 0.02511415525114155
Electrical 0.00076103500761035
FireplaceQu 0.4726027397260274
GarageType 0.0563165905631659
GarageYrBlt 0.0563165905631659
GarageFinish 0.0563165905631659
GarageQual 0.0563165905631659
GarageCond 0.0563165905631659
PoolQC 0.9954337899543378
Fence 0.8143074581430746
MiscFeature 0.9611872146118722


In [153]:
# impute numerical cols with median value

# define an imputation function
def impute_na_median(df, cols):
    for col in cols:
        df[col]= df[col].fillna(df[col].median())
    
    return df

# apply on train set and test set
X_train_imp = impute_na_median(X_train, cols=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])
X_test_imp = impute_na_median(X_test, cols=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

In [154]:
# impute cat cols with a 'NA' label

# define an imputation function
def impute_na_label(df, cols):
    for col in cols:
        df[col]= df[col].fillna('NA')
    
    return df

# apply on train set and test set
X_train_imp = impute_na_label(X_train_imp, cols=cat_cols)
X_test_imp = impute_na_label(X_test_imp, cols=cat_cols)

In [156]:
# transform the area columns to enhance skew

yjt = vt.YeoJohnsonTransformer(variables = area_cols)
yjt.fit(X_train_imp)
print(yjt.lambda_dict_)
X_train_imp_tr = yjt.transform(X_train_imp)
X_test_imp_tr = yjt.transform(X_test_imp)

{'LotFrontage': 0.422432655023388, 'LotArea': -12.55283001172003, 'MasVnrArea': -0.25336100411247126, 'BsmtFinSF1': 0.2241603622191745, 'BsmtFinSF2': -1.5253670841536835, 'BsmtUnfSF': 0.4741451871581765, 'TotalBsmtSF': 0.7320141644679369, '1stFlrSF': -12.55283001172003, '2ndFlrSF': -0.12038141633304861, 'LowQualFinSF': -9.769635009367672, 'GrLivArea': 0.03602357773149543, 'GarageArea': 0.8072365473023602, 'WoodDeckSF': -0.08553433814930124, 'OpenPorchSF': 0.015647864654396017, 'PoolArea': -34.48045639184592}


  loglike = -n_samples / 2 * np.log(trans.var(axis=0))
  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)


In [157]:
# Apply rare label encoding on some of the cat cols

rare_encoder = RareLabelCategoricalEncoder(
    tol=0.05,  # minimal percentage to be considered non-rare
    n_categories=3, # minimal number of categories the variable should have to re-cgroup rare categories
    variables=cat_cols # apply to all categorical columns
) 

rare_encoder.fit(X_train_imp_tr)
X_train_imp_tr_enc = rare_encoder.transform(X_train_imp_tr)
X_test_imp_tr_enc = rare_encoder.transform(X_test_imp_tr)

In [158]:
rare_encoder.encoder_dict_

{'MSSubClass': Index(['20', '60', '50', '120'], dtype='object'),
 'MSZoning': Index(['RL', 'RM'], dtype='object'),
 'Street': array(['Pave', 'Grvl'], dtype=object),
 'Alley': array(['NA', 'Grvl', 'Pave'], dtype=object),
 'LotShape': Index(['Reg', 'IR1'], dtype='object'),
 'LandContour': Index(['Lvl'], dtype='object'),
 'Utilities': array(['AllPub', 'NoSeWa'], dtype=object),
 'LotConfig': Index(['Inside', 'Corner', 'CulDSac'], dtype='object'),
 'LandSlope': array(['Gtl', 'Mod', 'Sev'], dtype=object),
 'Neighborhood': Index(['NAmes', 'CollgCr', 'OldTown', 'Edwards', 'Somerst', 'NridgHt',
        'Gilbert', 'Sawyer'],
       dtype='object'),
 'Condition1': Index(['Norm', 'Feedr'], dtype='object'),
 'Condition2': Index(['Norm'], dtype='object'),
 'BldgType': Index(['1Fam', 'TwnhsE'], dtype='object'),
 'HouseStyle': Index(['1Story', '2Story', '1.5Fin'], dtype='object'),
 'RoofStyle': Index(['Gable', 'Hip'], dtype='object'),
 'RoofMatl': Index(['CompShg'], dtype='object'),
 'Exterior1st': In

In [159]:
# Apply hot one encoding next

ohe_enc = OneHotCategoricalEncoder(
    top_categories=None,
    drop_last=False) # automatically detects all cat vars in data

ohe_enc.fit(X_train_imp_tr_enc)

X_train_imp_tr_enc = ohe_enc.transform(X_train_imp_tr_enc)
X_test_imp_tr_enc = ohe_enc.transform(X_test_imp_tr_enc)

In [160]:
X_train_imp_tr_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1314 entries, 930 to 684
Columns: 192 entries, LotFrontage to SaleCondition_Abnorml
dtypes: float64(16), int64(176)
memory usage: 1.9 MB


In [161]:
X_test_imp_tr_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 529 to 310
Columns: 192 entries, LotFrontage to SaleCondition_Abnorml
dtypes: float64(16), int64(176)
memory usage: 220.1 KB


In [162]:
# lets skip scaling since we will use light gbm

lightgbm_regr = LGBMRegressor()

lightgbm_regr.fit(X_train_imp_tr_enc, y_train)

y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc)
y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc)

In [163]:
# check model performance:

print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
#print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
print()
print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
print('test r2: {}'.format(r2_score(y_test, y_pred_test)))

train mse: 147693255.35459948
train rmse: 12152.911394172159
train r2: 0.9763457347548624

test mse: 1236729646.478677
test rmse: 35167.16716596145
test rmsle: 0.1300077902257026
test r2: 0.8200365397127115


In [164]:
# let's add scaling to see if there is a difference

# scaling the features since we will be using LR

scaler = StandardScaler()
scaler.fit(X_train_imp_tr_enc)

# transform 
X_train_imp_tr_enc_sc = scaler.transform(X_train_imp_tr_enc)
X_test_imp_tr_enc_sc = scaler.transform(X_test_imp_tr_enc)

In [165]:
lightgbm_regr = LGBMRegressor()

lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)

In [166]:
print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
print()
print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
print('test r2: {}'.format(r2_score(y_test, y_pred_test)))

train mse: 145963260.1377637
train rmse: 12081.525571622307
train r2: 0.9766228074325117
train rmsle: 0.0580950897471909

test mse: 1177202724.236507
test rmse: 34310.38799309193
test rmsle: 0.12405642445651678
test r2: 0.8286986356990536


In [167]:
# there is clearly a lot of overfitting, so we can try to introduce regularization

# tune the lamda_l1 value 

lambdas=[1e-3, 1e-2, 0.1, 1.0, 10]

for lambda_value in lambdas:
    lightgbm_regr = LGBMRegressor(lambda_l1=lambda_value)    
    lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

    y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
    y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)
   
    print("For lambda = {}".format(lambda_value))
    #print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
    #print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
    #print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
    print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
    #print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
    #print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
    #print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
    print()

For lambda = 0.001
train rmsle: 0.05781990984584178
test rmsle: 0.124548629208967

For lambda = 0.01
train rmsle: 0.05781991142680559
test rmsle: 0.12454862894909223

For lambda = 0.1
train rmsle: 0.057147165752472896
test rmsle: 0.12370177208429929

For lambda = 1.0
train rmsle: 0.05743526025068686
test rmsle: 0.12266215282760119

For lambda = 10
train rmsle: 0.057906482825402984
test rmsle: 0.12359304609634947



In [168]:
# lambda1 = 1.0 is the best value. we can also tune other params for the model for additional regularization

feature_fractions = list(np.arange(0.1,1.1,step=0.10))

for frac in feature_fractions:
    lightgbm_regr = LGBMRegressor(lambda_l1=1.0, feature_fraction=frac)    
    lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

    y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
    y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)
   
    print("For feature_fraction = {}".format(frac))
    #print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
    #print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
    #print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
    print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
    #print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
    #print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
    #print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
    print()


For feature_fraction = 0.1
train rmsle: 0.08135332635939375
test rmsle: 0.12742341337047783

For feature_fraction = 0.2
train rmsle: 0.06767824128092653
test rmsle: 0.12393654726914535

For feature_fraction = 0.30000000000000004
train rmsle: 0.06504597610445007
test rmsle: 0.11828561041520136

For feature_fraction = 0.4
train rmsle: 0.06053329967555061
test rmsle: 0.12176495516785761

For feature_fraction = 0.5
train rmsle: 0.05927436081153298
test rmsle: 0.12262472571001555

For feature_fraction = 0.6
train rmsle: 0.057775411210222206
test rmsle: 0.1193798070088095

For feature_fraction = 0.7000000000000001
train rmsle: 0.056804724123292995
test rmsle: 0.1206026952829246

For feature_fraction = 0.8
train rmsle: 0.056705375786455674
test rmsle: 0.12792893114679826

For feature_fraction = 0.9
train rmsle: 0.05723319965709363
test rmsle: 0.12250496372183027

For feature_fraction = 1.0
train rmsle: 0.05743526025068686
test rmsle: 0.12266215282760119



In [181]:
number_leaves = list(np.arange(2,31,step=3))+[31]

for num in number_leaves:
    lightgbm_regr = LGBMRegressor(lambda_l1=1.0, feature_fraction=0.30, num_leaves=num)    
    lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

    y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
    y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)
   
    print("For num_leaves = {}".format(num))
    #print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
    #print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
    #print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
    print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
    #print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
    #print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
    #print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
    print()


For num_leaves = 2
train rmsle: 0.15435206893721345
test rmsle: 0.14247334934599887

For num_leaves = 5
train rmsle: 0.1177221563670797
test rmsle: 0.127509485079394

For num_leaves = 8
train rmsle: 0.10451851480369113
test rmsle: 0.12585950273848398

For num_leaves = 11
train rmsle: 0.09606545689771369
test rmsle: 0.12651946703124253

For num_leaves = 14
train rmsle: 0.08903601052772049
test rmsle: 0.1208761868175641

For num_leaves = 17
train rmsle: 0.08293062229116341
test rmsle: 0.11943604804103249

For num_leaves = 20
train rmsle: 0.07749923245037495
test rmsle: 0.12285435528853468

For num_leaves = 23
train rmsle: 0.07252039249567102
test rmsle: 0.12016821423174813

For num_leaves = 26
train rmsle: 0.06972377050966279
test rmsle: 0.12239255047670805

For num_leaves = 29
train rmsle: 0.06614760063791958
test rmsle: 0.12046852526536023

For num_leaves = 31
train rmsle: 0.06504597610445007
test rmsle: 0.11828561041520136



In [182]:
bagging_fractions = list(np.arange(0.1,1.1,step=0.10))

for frac in bagging_fractions:
    lightgbm_regr = LGBMRegressor(lambda_l1=1.0, feature_fraction=0.30, num_leaves=31, bagging_fraction=frac, 
                                 bagging_freq=5)    
    lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

    y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
    y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)
   
    print("For bagging_fraction = {}".format(frac))
    #print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
    #print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
    #print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
    print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
    #print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
    #print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
    #print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
    print()


For bagging_fraction = 0.1
train rmsle: 0.15371949783556577
test rmsle: 0.14693591409768206

For bagging_fraction = 0.2
train rmsle: 0.12828640393671362
test rmsle: 0.12705526106332884

For bagging_fraction = 0.30000000000000004
train rmsle: 0.11040022227172241
test rmsle: 0.12259481164071746

For bagging_fraction = 0.4
train rmsle: 0.09892089650076027
test rmsle: 0.12078098637928553

For bagging_fraction = 0.5
train rmsle: 0.0883559672264315
test rmsle: 0.11693532175847905

For bagging_fraction = 0.6
train rmsle: 0.081259027161325
test rmsle: 0.11956489522614754

For bagging_fraction = 0.7000000000000001
train rmsle: 0.07343466148600492
test rmsle: 0.1191041865144283

For bagging_fraction = 0.8
train rmsle: 0.06906120450935284
test rmsle: 0.11856358406523869

For bagging_fraction = 0.9
train rmsle: 0.06681459627417624
test rmsle: 0.12007506385212889

For bagging_fraction = 1.0
train rmsle: 0.06504597610445007
test rmsle: 0.11828561041520136



In [183]:
max_depths = list(np.arange(3,15,step=2))+[-1]

for depth in max_depths:
    lightgbm_regr = LGBMRegressor(lambda_l1=1.0, feature_fraction=0.30, num_leaves=31, bagging_fraction=0.5, 
                                 bagging_freq=5, max_depth=depth)    
    lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

    y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
    y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)
   
    print("For max_depth = {}".format(depth))
    #print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
    #print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
    #print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
    print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
    #print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
    #print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
    #print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
    print()


For max_depth = 3
train rmsle: 0.1182789710043338
test rmsle: 0.12194658508904342

For max_depth = 5
train rmsle: 0.10820958457701117
test rmsle: 0.11807684830463812

For max_depth = 7
train rmsle: 0.09915424231983065
test rmsle: 0.11645675834939422

For max_depth = 9
train rmsle: 0.09273582349463858
test rmsle: 0.1142312526149283

For max_depth = 11
train rmsle: 0.08966120595668073
test rmsle: 0.11847751816095213

For max_depth = 13
train rmsle: 0.08881680704168372
test rmsle: 0.11860233830580175

For max_depth = -1
train rmsle: 0.0883559672264315
test rmsle: 0.11693532175847905



In [184]:
# best model so far

lightgbm_regr = LGBMRegressor(lambda_l1=1.0, feature_fraction=0.30, num_leaves=31, bagging_fraction=0.50, 
                                 bagging_freq=5, max_depth=9)

lightgbm_regr.fit(X_train_imp_tr_enc_sc, y_train)

y_pred_train = lightgbm_regr.predict(X_train_imp_tr_enc_sc)
y_pred_test = lightgbm_regr.predict(X_test_imp_tr_enc_sc)

print('train mse: {}'.format(mean_squared_error(y_train, y_pred_train)))
print('train rmse: {}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))))
print('train r2: {}'.format(r2_score(y_train, y_pred_train)))
print('train rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_train), np.log(y_pred_train)))))
print()   
print('test mse: {}'.format(mean_squared_error(y_test, y_pred_test)))
print('test rmse: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
print('test rmsle: {}'.format(np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred_test)))))
print('test r2: {}'.format(r2_score(y_test, y_pred_test)))
print()


train mse: 384040401.9021788
train rmse: 19596.948790619903
train r2: 0.9384928342893317
train rmsle: 0.09273582349463858

test mse: 977558046.7978072
test rmse: 31265.924691232263
test rmsle: 0.1142312526149283
test r2: 0.857750051327447



In [185]:
np.sqrt(mean_squared_log_error(y_test, y_pred_test))

0.11423057858674021

In [None]:
# train with all data and evaluate again on test data set

