In [244]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
from scipy.stats import skew
from scipy.special import boxcox1p,inv_boxcox
from scipy.stats import boxcox_normmax

sns.set()

def log_transform(df,col_name):
    df[col_name+'_Log'] = np.log(df[col_name])
    
    print("Skewness_orig: %f" % df[col_name].skew())
    print("Kurtosis_orig: %f" % df[col_name].kurt())
    print("Skewness_new: %f" % df[col_name+'_Log'].skew())
    print("Kurtosis_new: %f" % df[col_name+'_Log'].kurt())

    f, axes = plt.subplots(2, 1)
    f.tight_layout()
    sns.distplot(df[col_name], ax=axes[0])
    sns.distplot(df[col_name+'_Log'], ax=axes[1])
    
    plt.show()
    
def add_sqrt_transform(df,col_name,add):
    df[col_name+'_sqrt'] = np.sqrt(df[col_name]+add)
    
    print("Skewness_orig: %f" % df[col_name].skew())
    print("Kurtosis_orig: %f" % df[col_name].kurt())
    print("Skewness_new: %f" % df[col_name+'_sqrt'].skew())
    print("Kurtosis_new: %f" % df[col_name+'_sqrt'].kurt())

    f, axes = plt.subplots(2, 1)
    f.tight_layout()
    sns.distplot(df[col_name], ax=axes[0])
    sns.distplot(df[col_name+'_sqrt'], ax=axes[1])
    plt.show()
    
def print_missing_data(df,top=20):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data.head(top))
    
def get_corr_pairs(df):
    corrs = df.corr() >= 0.8
    names = list(corrs.index)
    corr_pairs={}
    for item in sorted(corrs):
        relevant=[ names[ind] for ind, row in enumerate(corrs[item]) if row and names[ind] != item ]
        if len(relevant)>0:
            corr_pairs[item] = relevant
    return corr_pairs
    
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def prediction_stats(y_test,y_pred):
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('Mean Absolute Percentage Error:',mean_absolute_percentage_error(y_test,y_pred))


In [245]:
df_train = pd.read_csv("/Users/gandharkamat/projects/8010/Final/train.csv")
df_test_data = pd.read_csv("/Users/gandharkamat/projects/8010/Final/test.csv")
sals_price=pd.read_csv("/Users/gandharkamat/projects/8010/Final/test_actual_price.csv")

df_test = pd.merge(df_test_data, sals_price,  how='inner', on='Id')

df_train['source']='train'
df_test['source']='test'
df = pd.concat([df_train,df_test], ignore_index = True, sort = False)

print(df_train.shape, df_test.shape, df.shape)


(1460, 82) (1459, 82) (2919, 82)


In [246]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)
df['Functional'] = df['Functional'].fillna('Typ') 
df['Electrical'] = df['Electrical'].fillna("SBrkr") 
df['KitchenQual'] = df['KitchenQual'].fillna("TA") 
df["PoolQC"] = df["PoolQC"].fillna("None")
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0]) 
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    df[col] = df[col].fillna(0)

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    df[col] = df[col].fillna('None')

for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    df[col] = df[col].fillna('None')

df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

objects = []
for i in df.columns:
    if df[i].dtype == object:
        objects.append(i)
df.update(df[objects].fillna('None'))

df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics.append(i)
df.update(df[numerics].fillna(0))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,source
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,train
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,train
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,train
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,train
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,train


In [247]:
try:
    df = df.drop(['Utilities', 'Street', 'PoolQC',], axis=1)
except Exception:
    pass

df['YrBltAndRemod']=df['YearBuilt']+df['YearRemodAdd']
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                                 df['1stFlrSF'] + df['2ndFlrSF'])

df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                               df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                              df['EnclosedPorch'] + df['ScreenPorch'] +
                              df['WoodDeckSF'])

df['haspool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
df['has2ndfloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasgarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
df['hasbsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasfireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

df.drop(df[(df['source'] == 'train') & (df['GrLivArea']>5000)].index,inplace=True)
df.drop(df[(df['source'] == 'train') & (df['LotArea']>50000)].index,inplace=True)
df.drop(df[(df['source'] == 'train') & (df['LotFrontage']>250)].index,inplace=True)

df.shape

numerics2 = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_df = df[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_df[skew_df > 0.5]
skew_index = high_skew.index

# y = np.log(df[df['source'] == 'train']['SalePrice'])
# y_test = np.log(df[df['source'] == 'test']['SalePrice'])

for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

from sklearn.preprocessing import RobustScaler 


final_df = pd.get_dummies(df).reset_index(drop=True)

# scaler = RobustScaler()
# final_df = pd.DataFrame(scaler.fit_transform(final_df_1), index=final_df_1.index, columns=final_df_1.columns)
# # final_df

# final_df['Id'] = df['Id']
# final_df.shape




In [248]:
    
X = final_df[final_df['source_train'] == 1]
y = X['SalePrice']
X.drop('SalePrice',axis=1,inplace=True)
X.drop('Id',axis=1,inplace=True)

# print(final_df_1[final_df_1['source_test'] == 1])

X_test = final_df[final_df['source_train'] == 0]
print(X_test.shape)
y_test = X_test['SalePrice']
X_test.drop('SalePrice',axis=1,inplace=True)
X_test.drop('Id',axis=1,inplace=True)

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)

overfit = list(overfit)
X = X.drop(overfit, axis=1)
X_test = X_test.drop(overfit, axis=1)

print(X.shape, y.shape, X_test.shape, y_test.shape)

# outliers = [30, 88, 462, 631, 1322]
# X = train_df.drop(X.index[outliers])
# y = train_df.drop(y.index[outliers])

# overfit = []
# for i in X.columns:
#     counts = X[i].value_counts()
#     zeros = counts.iloc[0]
#     if zeros / len(X) * 100 > 99.94:
#         overfit.append(i)

# overfit = list(overfit)
# X = X.drop(overfit, axis=1)
# X_sub = X_sub.drop(overfit, axis=1)
y_test


(1459, 337)
(1448, 332) (1448,) (1459, 332) (1459,)


1448    15.465436
1449    16.336388
1450    16.513605
1451    16.565789
1452    16.528663
          ...    
2902    15.207205
2903    14.789481
2904    15.853303
2905    15.866708
2906    16.495567
Name: SalePrice, Length: 1459, dtype: float64

In [249]:
from sklearn.svm import SVR

def svr_thing():
    svr = SVR()
    parameters = {
        'C':[20 ], 'epsilon' : [0.008], 'gamma': [0.0003]
                 }
        
    svr_regressor = GridSearchCV(svr, parameters, scoring = 'neg_mean_squared_error', cv=5,n_jobs=-1,verbose=2)
    svr_regressor.fit(X,y)
    
    print(svr_regressor.best_params_)
    print(svr_regressor.best_score_)

    y_pred = svr_regressor.predict(X_test)

    prediction_stats(np.expm1(y_test),np.expm1(y_pred))
    output_all['svr'] = np.expm1(y_pred)

svr_thing()
# output_all

# SVR()
# svr_model_full_data = svr.fit(X, y)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.2s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


{'C': 20, 'epsilon': 0.008, 'gamma': 0.0003}
-0.30099683773066505
Mean Absolute Error: 6077166.368282289
Mean Squared Error: 166165570131699.1
Root Mean Squared Error: 12890522.492579542
Mean Absolute Percentage Error: 48.35050807140407


In [221]:
from sklearn.linear_model import Ridge
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

output_all = pd.DataFrame()
output_all['actual'] = y_test

def ridge_thing():
    ridge = Ridge()
    parameters = {'alpha': [ 0.1 + 0.1*i for i in range(100)],
                 'fit_intercept':[True,False],
                 'normalize': [True,False]
                 }
        
    ridge_regressor = GridSearchCV(ridge, parameters, scoring = 'neg_mean_squared_error', cv=5,n_jobs=-1,verbose=2)
    ridge_regressor.fit(X,y)
    
    print(ridge_regressor.best_params_)
    print(ridge_regressor.best_score_)

    y_pred = ridge_regressor.predict(X_test)

    prediction_stats(np.exp(y_test),np.exp(y_pred))
    output_all['ridge'] = y_pred

ridge_thing()
output_all

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1220 tasks      | elapsed:    6.9s


{'alpha': 10.0, 'fit_intercept': True, 'normalize': False}
-0.01416407272898457
Mean Absolute Error: 13378.249460001012
Mean Squared Error: 569323043.1326401
Root Mean Squared Error: 23860.49125924779
Mean Absolute Percentage Error: 8.173035439232242


[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:   10.2s finished


Unnamed: 0,actual,ridge
1460,11.561716,11.693216
1461,12.055250,11.969641
1462,12.154253,12.123119
1463,12.183316,12.213653
1464,12.162643,12.178081
...,...,...
2914,11.413105,11.362736
2915,11.170435,11.287837
2916,11.782953,12.054027
2917,11.790557,11.669669


In [222]:
@ignore_warnings(category=ConvergenceWarning)

def lasso_thing():
    lasso = Lasso() # 1
    parameters = {'alpha': [1e-4,2e-4,4e-4,8e-4,1e-3,1e-2,1],
                 'fit_intercept':[True,False],
                 'normalize': [True,False]}
    
    lasso_regressor = GridSearchCV(lasso, parameters, scoring = 'neg_mean_squared_error', cv=5,n_jobs=-1,verbose=2)
    lasso_regressor.fit(X,y) # 2
    
    print(lasso_regressor.best_params_)
    print(lasso_regressor.best_score_)

    y_pred = lasso_regressor.predict(X_test) # 3

    prediction_stats(np.exp(y_test),np.exp(y_pred)) # 4 
    output_all['lasso'] = np.exp(y_pred)

lasso_thing()


Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.5s


{'alpha': 0.0008, 'fit_intercept': True, 'normalize': False}
-0.013517991235689527
Mean Absolute Error: 13371.133431211361
Mean Squared Error: 614044536.8507667
Root Mean Squared Error: 24779.922050942103
Mean Absolute Percentage Error: 8.150958545233527


[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    8.4s finished


In [223]:
from sklearn.tree import DecisionTreeRegressor

param_grid = {"max_depth": range(2,10,2),
              'min_samples_split': [2,5,8,10],
            "max_features": [1, 3, 10, 50, 100, 'sqrt', 'auto','log2']}

regressor_dtree = DecisionTreeRegressor()
dtree=GridSearchCV(regressor_dtree,param_grid,cv=5,n_jobs=-1,verbose=2)
dtree.fit(X,y)
print(dtree.best_estimator_)

y_pred = dtree.predict(X_test)

prediction_stats(np.exp(y_test),np.exp(y_pred))
output_all['dtree'] = np.exp(y_pred)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 602 tasks      | elapsed:    4.0s


DecisionTreeRegressor(criterion='mse', max_depth=6, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
Mean Absolute Error: 21769.624129466363
Mean Squared Error: 1036904849.011683
Root Mean Squared Error: 32201.00695648636
Mean Absolute Percentage Error: 13.351049246396594


[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:    4.2s finished


In [224]:
from sklearn.ensemble import RandomForestRegressor

rfp = RandomForestRegressor()

param_grid = {"max_depth": range(6,12,2),
              "n_estimators": [1500],
            "max_features": ['auto'],
             "bootstrap": [True]}

rf = GridSearchCV(rfp, param_grid=param_grid,cv=5,n_jobs=-1,verbose=2)
rf.fit(X, y)
print(rf.best_estimator_)

y_pred = rf.predict(X_test)
output_all['rf'] = np.exp(y_pred)

prediction_stats(np.exp(y_test),np.exp(y_pred))


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   51.7s remaining:   45.2s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.7min finished


KeyboardInterrupt: 

In [None]:
import xgboost as xgb

param_grid = {
    "max_depth": [4],
    "n_estimators": [410],
    'objective':['reg:squarederror']
    }

model_xgb = xgb.XGBRegressor()

xgbr=GridSearchCV(model_xgb,param_grid,cv=5,n_jobs=-1,verbose=2)
xgbr.fit(X,y)
print(xgbr.best_estimator_)

y_pred = xgbr.predict(X_test)

prediction_stats(np.exp(y_test),np.exp(y_pred))
output_all['xgb'] = np.exp(y_pred)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbp = GradientBoostingRegressor()

param_grid = {"max_depth": range(1,5,1),
              "n_estimators": range(470,510,10),
            "criterion": ['friedman_mse','mse']
             }

gb = GridSearchCV(gbp, param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
gb.fit(X, y)
print(gb.best_estimator_)

y_pred = gb.predict(X_test)
output_all['gb'] = np.exp(y_pred)

prediction_stats(np.exp(y_test),np.exp(y_pred))


In [None]:
#linear reg to find linear combination of individual
from sklearn import linear_model

X_final = output_all.loc[:,['lasso','ridge','xgb','dtree','rf','gb']]
X_final
Y_final = output_all['actual']

regr = linear_model.LinearRegression()
regr.fit(X_final, Y_final)

print(regr.coef_,regr.intercept_)

output_all['lr'] = regr.intercept_ +  \
    regr.coef_[0]*output_all['lasso']+ \
    regr.coef_[1]*output_all['ridge']+ \
    regr.coef_[2]*output_all['xgb']+ \
    regr.coef_[3]*output_all['dtree']+ \
    regr.coef_[4]*output_all['rf'] + \
    regr.coef_[5]*output_all['gb']

output_all['lr']

print(output_all['actual'],output_all['lr'])
prediction_stats(output_all['actual'],output_all['lr'])

output_all['guess'] = 0.1*output_all['xgb'] + 0.4*output_all['dtree']  + 0.3*output_all['rf'] + 0.2*output_all['gb']
prediction_stats(output_all['actual'],output_all['guess'])

In [None]:
col='GrLivArea'
log_transform(df,col)
num_feats.remove(col)
num_feats.append(col+'_Log')


scale_variables = ['LotFrontage', 'LotArea_Log', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea_Log', 'GarageYrBlt', 'GarageArea', 'PoolArea']

robust = RobustScaler()
df[scale_variables] = robust.fit_transform(df[scale_variables])

df

In [None]:
#dummies 
df_dummies = pd.concat([df.loc[:,'source'],df.loc[:,num_feats],pd.get_dummies(df.loc[:,cat_feats])],axis=1)

#prep for models 
train = df_dummies.loc[df_dummies['source']=="train"]
test = df_dummies.loc[df_dummies['source']=="test"]

train.drop(['source'],axis=1,inplace=True)
test.drop(['source'],axis=1,inplace=True)

target = 'SalePrice_Log'
X = train.drop(target,axis=1)
y = train[target]

X_test = test.drop(target,axis=1)
y_test = test[target]
print(X.shape,y.shape,X_test.shape,y_test.shape)
output_all = pd.DataFrame()
output_all['actual'] = np.exp(y_test)

In [None]:
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

@ignore_warnings(category=ConvergenceWarning)

def lasso_thing():
    lasso = Lasso()
    parameters = {'alpha': [1e-4,2e-4,4e-4,8e-4,1e-3,1e-2,1],
                 'fit_intercept':[True,False],
                 'normalize': [True,False]}
    
    lasso_regressor = GridSearchCV(lasso, parameters, scoring = 'neg_mean_squared_log_error', cv=5,n_jobs=-1,verbose=2)
    lasso_regressor.fit(X,y)
    
    print(lasso_regressor.best_params_)
    print(lasso_regressor.best_score_)

    y_pred = lasso_regressor.predict(X_test)

    prediction_stats(np.exp(y_test),np.exp(y_pred))
    output_all['lasso'] = np.exp(y_pred)

lasso_thing()


In [None]:
from sklearn.linear_model import Ridge

def ridge_thing():
    ridge = Ridge()
    parameters = {'alpha': [ 10 + 0.01*i for i in range(500)],
                 'fit_intercept':[True,False],
                 'normalize': [True,False]
                 }
        
    ridge_regressor = GridSearchCV(ridge, parameters, scoring = 'neg_mean_squared_log_error', cv=5,n_jobs=-1,verbose=2)
    ridge_regressor.fit(X,y)
    
    print(ridge_regressor.best_params_)
    print(ridge_regressor.best_score_)

    y_pred = ridge_regressor.predict(X_test)

    prediction_stats(np.exp(y_test),np.exp(y_pred))
    output_all['ridge'] = np.exp(y_pred)

ridge_thing()

In [None]:
np.sqrt(metrics.mean_squared_log_error( np.exp(y_test),np.exp(y_pred) ))

In [None]:
from sklearn.tree import DecisionTreeRegressor

param_grid = {"max_depth": range(2,10,2),
              'min_samples_split': [2,5,8,10],
            "max_features": [1, 3, 10, 50, 100, 'sqrt', 'auto','log2']}

regressor_dtree = DecisionTreeRegressor()
dtree=GridSearchCV(regressor_dtree,param_grid,cv=5,n_jobs=-1,verbose=2)
dtree.fit(X,y)
print(dtree.best_estimator_)

y_pred = dtree.predict(X_test)

prediction_stats(np.exp(y_test),np.exp(y_pred))
output_all['dtree'] = np.exp(y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rfp = RandomForestRegressor()

param_grid = {"max_depth": [17,19,21],
              "n_estimators": [1500],
            "max_features": ['auto'],
             "bootstrap": [True]}

rf = GridSearchCV(rfp, param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
rf.fit(X, y)
print(rf.best_estimator_)

y_pred = rf.predict(X_test)
output_all['rf'] = np.exp(y_pred)

prediction_stats(np.exp(y_test),np.exp(y_pred))


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbp = GradientBoostingRegressor()

param_grid = {"max_depth": range(1,5,1),
              "n_estimators": range(470,510,10),
            "criterion": ['friedman_mse','mse']
             }

gb = GridSearchCV(gbp, param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
gb.fit(X, y)
print(gb.best_estimator_)

y_pred = gb.predict(X_test)
output_all['gb'] = np.exp(y_pred)

prediction_stats(np.exp(y_test),np.exp(y_pred))


In [None]:
import xgboost as xgb

param_grid = {
    "max_depth": range(3,5,1),
    "n_estimators": range(350,450,20),
    'objective':['reg:squarederror']
    }

model_xgb = xgb.XGBRegressor()

xgbr=GridSearchCV(model_xgb,param_grid,cv=5,n_jobs=-1,verbose=2)
xgbr.fit(X,y)
print(xgbr.best_estimator_)

y_pred = xgbr.predict(X_test)

# model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
# model_xgb.fit(X, y)

# y_pred = model_xgb.predict(X_test)

prediction_stats(np.exp(y_test),np.exp(y_pred))
output_all['xgb'] = np.exp(y_pred)


In [None]:
# graph multiple models 
output_all
[ 10 + 0.01*i for i in range(500)]
# sns.scatterplot(output_all['actual'],output_all['ridge'])

In [None]:
#linear reg to find linear combination of individual

from sklearn import linear_model

X_final = output_all.loc[:,['lasso','ridge','xgb','dtree','rf','gb']] # here we have 2 variables for multiple regression. If you just want to use one variable for simple linear regression, then use X = df['Interest_Rate'] for example.Alternatively, you may add additional variables within the brackets
Y_final = output_all['actual']

regr = linear_model.LinearRegression()
regr.fit(X_final, Y_final)

print(regr.coef_,regr.intercept_)

output_all['lr'] = regr.intercept_ +  \
    regr.coef_[0]*output_all['lasso']+ \
    regr.coef_[1]*output_all['ridge']+ \
    regr.coef_[2]*output_all['xgb']+ \
    regr.coef_[3]*output_all['dtree']+ \
    regr.coef_[4]*output_all['rf'] + \
    regr.coef_[5]*output_all['gb']

output_all['lr']

print(output_all['actual'],output_all['lr'])
prediction_stats(output_all['actual'],output_all['lr'])

In [None]:
np.sqrt(metrics.mean_squared_log_error( output_all['actual'],output_all['lr']))