In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Read the FD001 train and test tables; both CSVs already contain a `rul` column.
train_csv_path = 'data/train_fd001_with_rul.csv'
test_csv_path = 'data/test_fd001_with_rul.csv'

df_train_fd001 = pd.read_csv(train_csv_path)
df_test_fd001 = pd.read_csv(test_csv_path)

In [4]:
# Columns that are removed before modelling: the row identifiers and the
# regression target.
index_names = ['unit_number', 'time_cycles']
target = ["rul"]
# setting_names = ['setting_1', 'setting_2', 'setting_3']
# drop_sensors = ['sensor_1','sensor_5','sensor_6','sensor_10','sensor_16','sensor_18','sensor_19']
drop_labels = [*index_names, *target]

In [5]:
# Features: everything except the identifier and target columns.
x_train = df_train_fd001.drop(columns = drop_labels)

# Target, capped so that no training label is larger than 125.
y_train = df_train_fd001["rul"].clip(upper = 125)

In [6]:
# Collapse the test table to one row per unit. Note that GroupBy.last() keeps,
# column by column, the last non-null value seen for each unit.
test = (
    df_test_fd001
    .groupby('unit_number')
    .last()
    .reset_index()
)
x_test = test.drop(columns = drop_labels)

# Test target, capped at 125.
y_test = test["rul"].clip(upper = 125)

In [7]:
# Fit the min/max ranges on the training features only, then apply that single
# fitted transformation to both splits.
scaler = MinMaxScaler().fit(x_train)

x_train_scaled, x_test_scaled = (scaler.transform(split) for split in (x_train, x_test))

### Experiment Set 1: Comparing machine learning algorithms

In [8]:
def evaluate(y_true, y_pred, label = ''):
    """Print and return MAE, RMSE and R2 for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    label : str, optional
        Name of the evaluated split; echoed in the printed line and stored
        under the ``'label'`` key of the result.

    Returns
    -------
    dict
        Mapping with the keys ``'label'``, ``'mae'``, ``'rmse'`` and ``'r2'``.
    """
    metrics = {
        'label': label,
        'mae': mean_absolute_error(y_true, y_pred),
        # RMSE is obtained as the square root of the mean squared error.
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred),
    }

    print('{} set - MAE: {}, RMSE: {}, R2: {}'.format(
        metrics['label'], metrics['mae'], metrics['rmse'], metrics['r2']))

    return metrics

In [24]:
### Decision tree regression (raw features)

# Seed the tree: scikit-learn breaks ties between equally good splits at
# random, so an unseeded tree gives different test metrics on every run and
# the comparison against the scaled-feature run would mostly measure noise.
dt_regressor = DecisionTreeRegressor(random_state = 42)
dt_regressor.fit(x_train, y_train)

# In-sample scores of the fitted tree.
y_pred_train = dt_regressor.predict(x_train)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = dt_regressor.predict(x_test)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 0.0, RMSE: 0.0, R2: 1.0
test set - MAE: 20.28, RMSE: 26.21449980449751, R2: 0.5720693021061825


{'label': 'test',
 'mae': 20.28,
 'rmse': 26.21449980449751,
 'r2': 0.5720693021061825}

In [9]:
### Decision tree regression (MinMax-scaled features)

# Seed the tree so that the result is reproducible. Min-max scaling keeps the
# ordering of every feature, so any gap between this run and the raw-feature
# run should come from the scaling itself rather than from random tie-breaking.
dt_regressor = DecisionTreeRegressor(random_state = 42)
dt_regressor.fit(x_train_scaled, y_train)

# In-sample scores of the fitted tree.
y_pred_train = dt_regressor.predict(x_train_scaled)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = dt_regressor.predict(x_test_scaled)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 0.0, RMSE: 0.0, R2: 1.0
test set - MAE: 20.19, RMSE: 27.265546024240923, R2: 0.5370664142589597


In [10]:
### Linear regression (raw features)

# Ordinary least squares with default settings on the unscaled feature frame.
lm = LinearRegression().fit(x_train, y_train)

# Score on the rows the model was fitted on ...
y_pred_train = lm.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# ... and on the test split (this call's return value is the cell output).
y_pred_test = lm.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.556207830698224, RMSE: 21.468325520200214, R2: 0.7346046578123867
test set - MAE: 16.56873644302483, RMSE: 20.83045241899899, R2: 0.7297985369402016


In [11]:
### Linear regression (MinMax-scaled features)

# Same default model as the raw-feature run, fitted on the scaled matrix.
lm = LinearRegression().fit(x_train_scaled, y_train)

# Score on the rows the model was fitted on ...
y_pred_train = lm.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# ... and on the test split (this call's return value is the cell output).
y_pred_test = lm.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.55620783069864, RMSE: 21.468325520200416, R2: 0.7346046578123817
test set - MAE: 16.56873644302445, RMSE: 20.830452418998043, R2: 0.7297985369402261


In [12]:
### Lasso regression (raw features)

# Lasso with its default regularisation settings on the unscaled features.
lasso = Lasso().fit(x_train, y_train)

# Training-split scores.
y_pred_train = lasso.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = lasso.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 18.199130447391227, RMSE: 22.275622979259197, R2: 0.7142694544426023
test set - MAE: 18.346068176190283, RMSE: 22.544121918687434, R2: 0.6835122243369125


In [13]:
### Lasso regression (MinMax-scaled features)

# Lasso with its default regularisation settings on the scaled features.
lasso = Lasso().fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = lasso.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = lasso.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 20.683910107419383, RMSE: 24.467312702040964, R2: 0.6552776054180337
test set - MAE: 20.550098420754153, RMSE: 24.071326877399574, R2: 0.6391802078075437


In [14]:
### Ridge regression (raw features)

# Ridge with its default penalty on the unscaled features.
ridge = Ridge().fit(x_train, y_train)

# Training-split scores.
y_pred_train = ridge.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = ridge.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.588114545133624, RMSE: 21.490134634490907, R2: 0.7340651674545624
test set - MAE: 16.605640060629668, RMSE: 20.811634561438304, R2: 0.7302865067517381


In [15]:
### Ridge regression (MinMax-scaled features)

# Ridge with its default penalty on the scaled features.
ridge = Ridge().fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = ridge.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = ridge.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.556410899296328, RMSE: 21.4683732129695, R2: 0.7346034786377118
test set - MAE: 16.56478227313438, RMSE: 20.82460206085889, R2: 0.7299502910463482


In [16]:
### SVM regression (raw features)

# Support-vector regression with a linear kernel on the unscaled features.
regressor = SVR(kernel = 'linear').fit(x_train, y_train)

# Training-split scores.
y_pred_train = regressor.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = regressor.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 40.46440028808841, RMSE: 50.53294493441891, R2: -0.47043614659774335
test set - MAE: 42.20551250130502, RMSE: 52.95307904008474, R2: -0.7461145330019205


In [17]:
### SVM regression (MinMax-scaled features)

# Support-vector regression with a linear kernel on the scaled features.
regressor = SVR(kernel = 'linear').fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = regressor.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = regressor.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.52291774871658, RMSE: 21.55202918033055, R2: 0.7325311036271246
test set - MAE: 16.148181298376162, RMSE: 20.389718132213673, R2: 0.7411115141746607


In [18]:
### KNN regression (raw features)

# k-nearest-neighbours with default settings; distances are computed on the
# unscaled feature values here.
neigh = KNeighborsRegressor().fit(x_train, y_train)

# Training-split scores.
y_pred_train = neigh.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = neigh.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 12.811128883718677, RMSE: 18.12939954108852, R2: 0.8107378395480288
test set - MAE: 15.825999999999999, RMSE: 21.319390235182617, R2: 0.7169651917110222


In [19]:
### KNN regression (MinMax-scaled features)

# k-nearest-neighbours with default settings; distances are computed on the
# scaled feature values here.
neigh = KNeighborsRegressor().fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = neigh.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = neigh.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 11.71491444912995, RMSE: 16.684766401932727, R2: 0.839698637327521
test set - MAE: 14.468000000000002, RMSE: 20.170235496889966, R2: 0.7466550633847437


In [20]:
### Random forest regression (raw features)

# A random forest is stochastic; seeding it makes the reported metrics
# reproducible and keeps the raw-vs-scaled comparison free of run-to-run noise.
regr = RandomForestRegressor(random_state = 42)
regr.fit(x_train, y_train)

# In-sample scores of the fitted forest.
y_pred_train = regr.predict(x_train)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = regr.predict(x_test)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 4.936506713198585, RMSE: 6.85165327568097, R2: 0.9729674068513569
test set - MAE: 12.260700000000002, RMSE: 17.329780177486384, R2: 0.8129850183779173


In [21]:
### Random forest regression (MinMax-scaled features)

# A random forest is stochastic; seeding it makes the reported metrics
# reproducible and keeps the raw-vs-scaled comparison free of run-to-run noise.
regr = RandomForestRegressor(random_state = 42)
regr.fit(x_train_scaled, y_train)

# In-sample scores of the fitted forest.
y_pred_train = regr.predict(x_train_scaled)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = regr.predict(x_test_scaled)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 4.947358344239252, RMSE: 6.866586097810948, R2: 0.9728494461837641
test set - MAE: 12.1334, RMSE: 17.09894423641413, R2: 0.8179339864590323


In [22]:
### Bayesian ridge regression (raw features)

# BayesianRidge with default hyper-parameters on the unscaled features.
clf = BayesianRidge().fit(x_train, y_train)

# Training-split scores.
y_pred_train = clf.predict(x_train)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = clf.predict(x_test)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.58611651558171, RMSE: 21.488842239738755, R2: 0.7340971525912936
test set - MAE: 16.60429421216977, RMSE: 20.815597461114727, R2: 0.7301837806275014


In [23]:
### Bayesian ridge regression (MinMax-scaled features)

# BayesianRidge with default hyper-parameters on the scaled features.
clf = BayesianRidge().fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = clf.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = clf.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.556325024506652, RMSE: 21.468346882919988, R2: 0.7346041296324743
test set - MAE: 16.56599158648626, RMSE: 20.82644714688442, R2: 0.7299024354363169


### Find Best Parameter For Selected Algorithms
Linear regression is skipped in the hyper-parameter search because it has no hyper-parameters to tune.

In [8]:
### KNN Regressor

# 3 x 2 x 2 = 12 candidate combinations: neighbourhood size, neighbour
# weighting scheme and the `p` parameter of the distance metric.
param_grid = dict(
    n_neighbors = [3, 5, 7],
    weights = ['uniform', 'distance'],
    p = [1, 3],
)

knn = KNeighborsRegressor()

# Exhaustive 5-fold search. No `scoring` is given, so candidates are ranked
# with the estimator's own `score` method.
grid_search = GridSearchCV(knn, param_grid, cv = 5)
grid_search.fit(x_train_scaled, y_train)

In [9]:
# Report the hyper-parameter combination selected by the most recently fitted
# `grid_search` object (the name is reused by every search in this notebook).
best_params = grid_search.best_params_

print("Best parameters:", best_params)

Best parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}


In [25]:
### Decision Tree Regressor

param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # `1.0` (100 % of the features) and `None` (all features) describe the
    # same setting, so listing both only doubled the number of fits.
    'max_features': [None]
}

# Seeded so that the ranking of the candidates is reproducible between runs.
tree = DecisionTreeRegressor(random_state = 42)

# Exhaustive 5-fold search over the 27 remaining combinations.
grid_search = GridSearchCV(estimator = tree, param_grid = param_grid, cv = 5)
grid_search.fit(x_train_scaled, y_train)

In [26]:
# Report the hyper-parameter combination selected by the most recently fitted
# `grid_search` object (the name is reused by every search in this notebook).
best_params = grid_search.best_params_

print("Best parameters:", best_params)

Best parameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [21]:
### Bayesian ridge regression

# Three candidate values for each of the four hyper-parameters
# (3 ** 4 = 81 combinations).
param_grid = dict(
    alpha_1 = [1e-6, 1e-5, 1e-4],
    alpha_2 = [1e-6, 1e-5, 1e-4],
    lambda_1 = [1e-6, 1e-5, 1e-4],
    lambda_2 = [1e-6, 1e-5, 1e-4],
)

bayesian_ridge = BayesianRidge()

# Exhaustive 5-fold search ranked by the estimator's own `score` method.
grid_search = GridSearchCV(bayesian_ridge, param_grid, cv = 5)
grid_search.fit(x_train_scaled, y_train)

In [22]:
# Report the hyper-parameter combination selected by the most recently fitted
# `grid_search` object (the name is reused by every search in this notebook).
best_params = grid_search.best_params_

print("Best parameters:", best_params)

Best parameters: {'alpha_1': 1e-06, 'alpha_2': 0.0001, 'lambda_1': 0.0001, 'lambda_2': 1e-06}


In [47]:
# KNN with the hyper-parameters selected by the grid search.
neigh = KNeighborsRegressor(n_neighbors = 7, p = 1, weights = 'distance')

# 5-fold cross-validation on the training data; scikit-learn reports the error
# metrics negated, hence the sign flip before averaging over the folds.
scores = cross_validate(neigh, x_train_scaled, y_train,  scoring = ('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'), cv = 5)
mae = np.mean(-scores['test_neg_mean_absolute_error'])
rmse = np.mean(-scores['test_neg_root_mean_squared_error'])
r2 = np.mean(scores['test_r2'])

# These numbers are cross-validation averages, not training-set scores, so
# they must not be printed under the same "train set" label that `evaluate`
# uses a few lines below.
print('5-fold CV - MAE: {}, RMSE: {}, R2: {}'.format(mae, rmse, r2))

neigh.fit(x_train_scaled, y_train)

# NOTE: with weights='distance' each training row is presumably its own
# zero-distance neighbour, which would explain the perfect in-sample scores;
# rely on the cross-validation and test figures instead.
y_pred_train = neigh.predict(x_train_scaled)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = neigh.predict(x_test_scaled)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 0.0, RMSE: 0.0, R2: 1.0
test set - MAE: 13.582238158999472, RMSE: 18.333549142719992, R2: 0.790693177258691


{'label': 'test',
 'mae': 13.582238158999472,
 'rmse': 18.333549142719992,
 'r2': 0.790693177258691}

In [48]:
# Decision tree with the hyper-parameters selected by the grid search. The
# seed makes the fitted tree, and therefore the metrics below, reproducible.
dt_regressor = DecisionTreeRegressor(max_depth = 10, max_features = None, min_samples_leaf = 4, min_samples_split = 10, random_state = 42)

dt_regressor.fit(x_train_scaled, y_train)

# In-sample scores of the tuned tree.
y_pred_train = dt_regressor.predict(x_train_scaled)
evaluate(y_train, y_pred_train, 'train')

# Test-split scores; the dict returned here is what the cell displays.
y_pred_test = dt_regressor.predict(x_test_scaled)
evaluate(y_test, y_pred_test, 'test')

train set - MAE: 11.797953066481881, RMSE: 16.779325930854867, R2: 0.8378764991701514
test set - MAE: 14.472401108423746, RMSE: 19.39517427688668, R2: 0.7657510440738063


{'label': 'test',
 'mae': 14.472401108423746,
 'rmse': 19.39517427688668,
 'r2': 0.7657510440738063}

In [49]:
# Bayesian ridge with the hyper-parameters selected by the grid search.
clf = BayesianRidge(
    alpha_1 = 1e-06,
    alpha_2 = 0.0001,
    lambda_1 = 0.0001,
    lambda_2 = 1e-06,
).fit(x_train_scaled, y_train)

# Training-split scores.
y_pred_train = clf.predict(x_train_scaled)
evaluate(y_true = y_train, y_pred = y_pred_train, label = 'train')

# Test-split scores; the returned dict is the cell output.
y_pred_test = clf.predict(x_test_scaled)
evaluate(y_true = y_test, y_pred = y_pred_test, label = 'test')

train set - MAE: 17.556325026265185, RMSE: 21.468346883397693, R2: 0.7346041296206633
test set - MAE: 16.56599155802789, RMSE: 20.826447104130434, R2: 0.7299024365452673


{'label': 'test',
 'mae': 16.56599155802789,
 'rmse': 20.826447104130434,
 'r2': 0.7299024365452673}

### Experiment Set 2: Selecting features

### Experiment Set 3: Ensemble learning

In [41]:
# Result table: the ensemble cells below append one row per evaluated split.
metric_df = pd.DataFrame(columns = ['ensemble type','model', 'train/ test', 'MAE', 'RMSE', 'R2'])

# Base learners shared by all ensembles.
linear_reg = LinearRegression()
# Grid-searched settings. The seed matters for the ensembles that are built
# without a `random_state` of their own (voting, stacking): without it their
# tree member, and so their metrics, would change from run to run.
tree_reg = DecisionTreeRegressor(max_depth = 10, max_features = None, min_samples_leaf = 4, min_samples_split = 10, random_state = 42)
knn_reg = KNeighborsRegressor(n_neighbors = 7, p = 1, weights = 'distance')
# Use the grid-searched hyper-parameters here as well, consistent with the
# tree and KNN base learners above.
br_reg = BayesianRidge(alpha_1 = 1e-06, alpha_2 = 0.0001, lambda_1 = 0.0001, lambda_2 = 1e-06)

### Bagging

Linear Regressor

In [42]:
# Bagging ensemble of 10 linear regressors (seeded).
bagging_reg = BaggingRegressor(
    estimator = linear_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = bagging_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['bagging', 'linear regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = bagging_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['bagging', 'linear regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 17.562470534187053, RMSE: 21.469586457353124, R2: 0.7345734810282857
test set - MAE: 16.571424022552705, RMSE: 20.832435972921644, R2: 0.7297470752936468


Decision Tree Regressor

In [43]:
# Bagging ensemble of 10 decision trees (seeded).
bagging_reg = BaggingRegressor(
    estimator = tree_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = bagging_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['bagging', 'decision tree regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = bagging_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['bagging', 'decision tree regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 11.435702268253877, RMSE: 15.861465457681469, R2: 0.85512829463455
test set - MAE: 12.541042247829475, RMSE: 17.61517650147877, R2: 0.8067745669065163


In [45]:
# Bagging ensemble of 10 KNN regressors (seeded).
bagging_reg = BaggingRegressor(
    estimator = knn_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = bagging_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['bagging', 'knn regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = bagging_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['bagging', 'knn regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 5.170746478500697, RMSE: 7.764651412186009, R2: 0.9652831047275834
test set - MAE: 13.851890032128512, RMSE: 18.731877058463326, R2: 0.7814992718058092


In [50]:
# Bagging ensemble of 10 Bayesian ridge regressors (seeded).
bagging_reg = BaggingRegressor(
    estimator = br_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = bagging_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['bagging', 'bayesian ridge', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = bagging_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['bagging', 'bayesian ridge', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 17.56268310697441, RMSE: 21.46971263700095, R2: 0.7345703611244002
test set - MAE: 16.569187686976576, RMSE: 20.829045143721473, R2: 0.7298350445480795


### Boosting

In [54]:
# AdaBoost with 10 rounds of linear regressors (seeded).
boosting_reg = AdaBoostRegressor(
    estimator = linear_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = boosting_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['boosting', 'linear regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = boosting_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['boosting', 'linear regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 17.63353327564782, RMSE: 21.503338283140252, R2: 0.7337382836229128
test set - MAE: 16.896787061848226, RMSE: 21.086276591929856, R2: 0.7231209545548627


In [55]:
# AdaBoost with 10 rounds of decision trees (seeded).
boosting_reg = AdaBoostRegressor(
    estimator = tree_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = boosting_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['boosting', 'decision tree regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = boosting_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['boosting', 'decision tree regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 12.477362833006673, RMSE: 16.170369362451705, R2: 0.849430560672633
test set - MAE: 13.317677136581256, RMSE: 17.825588424403016, R2: 0.8021308715219588


In [57]:
# AdaBoost with 10 rounds of KNN regressors (seeded).
boosting_reg = AdaBoostRegressor(
    estimator = knn_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = boosting_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['boosting', 'knn regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = boosting_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['boosting', 'knn regressor', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 1.1011059326108315, RMSE: 2.7212694630990044, R2: 0.9957357756145416
test set - MAE: 16.2302559813445, RMSE: 22.07665552248048, R2: 0.6965012872741569


In [58]:
# AdaBoost with 10 rounds of Bayesian ridge regressors (seeded).
boosting_reg = AdaBoostRegressor(
    estimator = br_reg,
    n_estimators = 10,
    random_state = 42,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = boosting_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['boosting', 'bayesian ridge', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = boosting_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['boosting', 'bayesian ridge', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 17.62605100585966, RMSE: 21.49857204946637, R2: 0.7338563048247568
test set - MAE: 16.903581310967088, RMSE: 21.098437600942365, R2: 0.7228014956396722


### Voting

In [65]:
# Voting ensemble that combines the predictions of the four base regressors.
# The variable keeps its original (misspelt) name so that any other cell
# referring to it still works; only the label stored in the table is fixed.
vooting_reg = VotingRegressor(estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)])
vooting_reg.fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = vooting_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')

# The ensemble-type label was misspelt 'vooting', which ends up in the result
# table; store the correct spelling.
metric_df.loc[len(metric_df)] = ['voting', 'all', result['label'], result['mae'], result['rmse'], result['r2']]

# Record the test-split metrics as a new row of the result table.
y_pred_test = vooting_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['voting', 'all', result['label'], result['mae'], result['rmse'], result['r2']]

train set - MAE: 11.23346486556008, RMSE: 14.100627806180537, R2: 0.8855083401724068
test set - MAE: 14.108728370578246, RMSE: 18.38136820053915, R2: 0.7895998910720892


### Stacking

In [67]:
# Stacking with scikit-learn's default meta-learner. `final_estimator = None`
# does not mean "no meta-learner": StackingRegressor then falls back to a
# RidgeCV model, so the rows are labelled accordingly instead of 'no meta'.
stacking_reg = StackingRegressor(estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)], final_estimator = None)
stacking_reg.fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = stacking_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')

metric_df.loc[len(metric_df)] = ['stacking - ridge cv (default)', 'all', result['label'], result['mae'], result['rmse'], result['r2']]

# Record the test-split metrics as a new row of the result table.
y_pred_test = stacking_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['stacking - ridge cv (default)', 'all', result['label'], result['mae'], result['rmse'], result['r2']]

train set - MAE: 6.881395091845863, RMSE: 9.06088111185157, R2: 0.9527243164394503
test set - MAE: 13.09938689427927, RMSE: 17.737807185222838, R2: 0.8040748668615917


In [69]:
# Stack the four base regressors; a linear regression combines their outputs.
stacking_reg = StackingRegressor(
    estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)],
    final_estimator = linear_reg,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = stacking_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['stacking - linear regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = stacking_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['stacking - linear regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 6.8730686654451265, RMSE: 9.04962985074604, R2: 0.952841651785853
test set - MAE: 13.118408017756444, RMSE: 17.75326741537575, R2: 0.8037331822694946


In [71]:
# Stack the four base regressors; a decision tree combines their outputs.
stacking_reg = StackingRegressor(
    estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)],
    final_estimator = tree_reg,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = stacking_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['stacking - decision tree regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = stacking_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['stacking - decision tree regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 10.688809092327704, RMSE: 15.287362565393899, R2: 0.8654257139721766
test set - MAE: 13.580713198065977, RMSE: 18.622427425950594, R2: 0.7840451947406639


In [73]:
# Stack the four base regressors; a KNN regressor combines their outputs.
stacking_reg = StackingRegressor(
    estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)],
    final_estimator = knn_reg,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = stacking_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['stacking - knn regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = stacking_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['stacking - knn regression', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 11.103271330518524, RMSE: 16.002933054643236, R2: 0.8525325638105726
test set - MAE: 13.679582370473897, RMSE: 18.304564574504685, R2: 0.7913544646353377


In [75]:
# Stack the four base regressors; a Bayesian ridge combines their outputs.
stacking_reg = StackingRegressor(
    estimators = [('lr', linear_reg), ('dtr', tree_reg), ('knnr', knn_reg), ('brr', br_reg)],
    final_estimator = br_reg,
).fit(x_train_scaled, y_train)

# Record the training-split metrics as a new row of the result table.
y_pred_train = stacking_reg.predict(x_train_scaled)
result = evaluate(y_train, y_pred_train, 'train')
metric_df.loc[len(metric_df)] = ['stacking - bayesian ridge', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

# Record the test-split metrics as a new row of the result table.
y_pred_test = stacking_reg.predict(x_test_scaled)
result = evaluate(y_test, y_pred_test, 'test')
metric_df.loc[len(metric_df)] = ['stacking - bayesian ridge', 'all', *(result[key] for key in ('label', 'mae', 'rmse', 'r2'))]

train set - MAE: 6.912629383580071, RMSE: 9.105407980673084, R2: 0.9522585316199785
test set - MAE: 13.076112622050294, RMSE: 17.71824954526338, R2: 0.804506681311985


In [79]:
# Display every metric row collected so far.
metric_df

Unnamed: 0,ensemble type,model,train/ test,MAE,RMSE,R2
0,bagging,linear regressor,train,17.562471,21.469586,0.734573
1,bagging,linear regressor,test,16.571424,20.832436,0.729747
2,bagging,decision tree regressor,train,11.435702,15.861465,0.855128
3,bagging,decision tree regressor,test,12.541042,17.615177,0.806775
4,bagging,knn regressor,train,5.170746,7.764651,0.965283
5,bagging,knn regressor,test,13.85189,18.731877,0.781499
6,bagging,bayesian ridge,train,17.562683,21.469713,0.73457
7,bagging,bayesian ridge,test,16.569188,20.829045,0.729835
8,boosting,linear regressor,train,17.633533,21.503338,0.733738
9,boosting,linear regressor,test,16.896787,21.086277,0.723121


In [83]:
# Lowest MAE among the rows recorded for the test split.
metric_df.loc[metric_df['train/ test'] == 'test', 'MAE'].min()

12.541042247829475

In [84]:
# Lowest RMSE among the rows recorded for the test split.
metric_df.loc[metric_df['train/ test'] == 'test', 'RMSE'].min()

17.61517650147877

In [85]:
# Highest R2 among the rows recorded for the test split.
metric_df.loc[metric_df['train/ test'] == 'test', 'R2'].max()

0.8067745669065163

In [86]:
# Look up the test row(s) with the lowest MAE directly from the table. The
# previous version compared against a float literal pasted from an earlier
# output, which returns nothing as soon as any run changes the last digit, and
# it did not restrict the match to test rows.
test_rows = metric_df[metric_df['train/ test'] == 'test']
test_rows[test_rows['MAE'] == test_rows['MAE'].min()]

Unnamed: 0,ensemble type,model,train/ test,MAE,RMSE,R2
3,bagging,decision tree regressor,test,12.541042,17.615177,0.806775


In [87]:
# Look up the test row(s) with the lowest RMSE directly from the table. The
# previous version compared against a float literal pasted from an earlier
# output, which returns nothing as soon as any run changes the last digit, and
# it did not restrict the match to test rows.
test_rows = metric_df[metric_df['train/ test'] == 'test']
test_rows[test_rows['RMSE'] == test_rows['RMSE'].min()]

Unnamed: 0,ensemble type,model,train/ test,MAE,RMSE,R2
3,bagging,decision tree regressor,test,12.541042,17.615177,0.806775


In [88]:
# Look up the test row(s) with the highest R2 directly from the table. The
# previous version compared against a float literal pasted from an earlier
# output, which returns nothing as soon as any run changes the last digit, and
# it did not restrict the match to test rows.
test_rows = metric_df[metric_df['train/ test'] == 'test']
test_rows[test_rows['R2'] == test_rows['R2'].max()]

Unnamed: 0,ensemble type,model,train/ test,MAE,RMSE,R2
3,bagging,decision tree regressor,test,12.541042,17.615177,0.806775


### Experiment Set 4: Varying training sample size