# ITI105 Machine Learning Project

Team members:
* Lye Suh Jeng (7487427Y)
* Lee Li Neng (6203055B)
* Lim Chan Boon (9704541M)
---
Project Problem: 4 (a) as in suggested project:
* The success of the song can often been measured by whether the song is on the Hit Chart such as Billboard Hot 100. It is important for music labels to know what makes a song successful so that they can focus their budget on making songs that has the highest chance of being successful.
---
We want to solve the problem statement by using the follow steps:
1.   Load dataset
2.   Discover & visualize data to gain insights
3.   Prepare data
4.   Feature scaling
6.   Feature reduction
5.   Split data into train and test datasets
6.   Train, fine tune and evaluate models
7.   Compare performance of models
8.   Deploy the model

# (1) Gather and Load dataset

In [1]:
# Suppress warnings about too few trees from the early models
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

df = pd.read_csv('https://raw.githubusercontent.com/dy018/project105/main/song_data.csv')

## 2.1 Split features into numeric and categorical features

In [2]:
# song_popularity is target. The rest of columns are features
# Get list of features from df
features = df.columns[1:].tolist()

# put features into 2 types: categorical features and numeric features
category_features = ['audio_mode', 'time_signature', 'key']
numeric_features = [feature for feature in features if feature not in category_features]
print(">>> Categorical features are:", category_features)
print(">>> Numeric features are:", numeric_features)


>>> Categorical features are: ['audio_mode', 'time_signature', 'key']
>>> Numeric features are: ['song_popularity', 'song_duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence']


# (3) Prepare data


In [3]:
# Backup original df
df2 = df.copy()

## 3.1 Prepare data by removing any duplicat records if any


In [4]:
# Remove duplicate rows (if any)
print(">>>> Original df's shape:", df.shape)

counter = 0
row,col = df.shape

df2.drop_duplicates(inplace=True)

if df2.shape==(row,col):
    print('>>> The dataset doesn\'t have any duplicates')
else:
    print('>>> Number of duplicates dropped/fixed:', {row-df2.shape[0]})
    print(">>> New df2's shape after removing dupliate rows", df2.shape)

>>>> Original df's shape: (18835, 15)
>>> Number of duplicates dropped/fixed: {3909}
>>> New df2's shape after removing dupliate rows (14926, 15)


## (3)(b) Convert categorical features to numeric using dummy encoding

In [5]:
# backup df:
df3 = df2.copy()

# Convert categorical features to numeric using dummy encoding
for feature in category_features:
    dummies = pd.get_dummies(df3[feature], prefix=feature)
    df3 = pd.concat([df3, dummies], axis=1)
    df3.drop(feature, axis=1, inplace=True)

print(">>> df's shape after converting categorical features to numeric using dummy encoding:\n", df3.shape)
print(">>> Columns in new df:\n", df3.columns)
print(">>> First 5 records in new df3:\n")
df3.head()

>>> df's shape after converting categorical features to numeric using dummy encoding:
 (14926, 31)
>>> Columns in new df:
 Index(['song_name', 'song_popularity', 'song_duration_ms', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'audio_valence', 'audio_mode_0', 'audio_mode_1',
       'time_signature_0', 'time_signature_1', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'key_0', 'key_1', 'key_2',
       'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11'],
      dtype='object')
>>> First 5 records in new df3:



Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,0.0589,-4.095,0.0294,...,False,False,False,False,False,False,True,False,False,False
1,In The End,66,216933,0.0103,0.542,0.853,0.0,0.108,-6.407,0.0498,...,False,True,False,False,False,False,False,False,False,False
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0.255,-7.828,0.0792,...,False,False,False,False,False,False,False,False,False,False
3,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0.102,-4.938,0.107,...,False,False,False,False,False,False,False,False,False,False
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.0,0.113,-5.065,0.0313,...,False,False,False,False,False,False,False,False,True,False


# 4) Combine features

In [6]:
df4 = df3.copy()
# combine loudness and energy (correlation  = 0.76)
df4['loudness_energy'] = df4['loudness'] * df4['energy']

# combine danceability and audio_valence (correlation = 0.33)
df4['danceability_av'] = df4['danceability'] * df4['audio_valence']

#drop loudness, energy, danceability, audio_valence
df4.drop(['song_name','loudness', 'energy', 'danceability', 'audio_valence'], axis=1, inplace=True)

print(">>> df's shape after combining features:\n", df4.shape)

>>> df's shape after combining features:
 (14926, 28)


In [7]:
df4.head()

Unnamed: 0,song_popularity,song_duration_ms,acousticness,instrumentalness,liveness,speechiness,tempo,audio_mode_0,audio_mode_1,time_signature_0,...,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,loudness_energy,danceability_av
0,73,262333,0.00552,2.9e-05,0.0589,0.0294,167.06,False,True,False,...,False,False,False,False,True,False,False,False,-2.79279,0.235104
1,66,216933,0.0103,0.0,0.108,0.0498,105.256,True,False,False,...,False,False,False,False,False,False,False,False,-5.465171,0.20054
2,76,231733,0.00817,0.447,0.255,0.0792,123.881,False,True,False,...,False,False,False,False,False,False,False,False,-3.624364,0.238788
3,74,216933,0.0264,0.00355,0.102,0.107,122.444,False,True,False,...,False,False,False,False,False,False,False,False,-4.78986,0.089298
4,56,223826,0.000954,0.0,0.113,0.0313,172.011,False,True,False,...,False,False,False,False,False,False,True,False,-3.87979,0.256578


# Remove instrumentalness

In [8]:
df4 = df4.drop(['instrumentalness'], axis=1)

# 5) Feature scaling

In [9]:
# Apply scaling on columns of df
def scaler(temp_df):
  # Initialize the MinMaxScaler
  min_max_scaler = MinMaxScaler()

  # Fit the scaler to the data and transform it
  min_max_scaled_data = min_max_scaler.fit_transform(temp_df)

  temp_df = pd.DataFrame(min_max_scaled_data, columns=temp_df.columns)
  return temp_df

Y = df4['song_popularity']

X_scaled = scaler(df4.drop(['song_popularity'], axis=1))

print (">>> Display first 5 records of training data after scaling:\n")
X_scaled.head()

>>> Display first 5 records of training data after scaling:



Unnamed: 0,song_duration_ms,acousticness,liveness,speechiness,tempo,audio_mode_0,audio_mode_1,time_signature_0,time_signature_1,time_signature_3,...,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,loudness_energy,danceability_av
0,0.140059,0.005541,0.049226,0.031243,0.689425,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.714949,0.252207
1,0.114658,0.01034,0.09958,0.052922,0.434371,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.539481,0.215128
2,0.122938,0.008202,0.250333,0.084166,0.511233,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660348,0.256159
3,0.114658,0.026505,0.093426,0.113709,0.505303,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583822,0.095794
4,0.118514,0.000957,0.104707,0.033262,0.709856,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.643577,0.275243


In [10]:
print (">>> Display df's statistic after scaling:\n")
X_scaled.describe()

>>> Display df's statistic after scaling:



Unnamed: 0,song_duration_ms,acousticness,liveness,speechiness,tempo,audio_mode_0,audio_mode_1,time_signature_0,time_signature_1,time_signature_3,...,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,loudness_energy,danceability_av
count,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,...,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0,14926.0
mean,0.115786,0.271538,0.173864,0.105652,0.499777,0.368083,0.631917,0.000201,0.004489,0.045826,...,0.072625,0.084215,0.070213,0.110813,0.070146,0.094466,0.070012,0.081804,0.620629,0.367743
std,0.034879,0.299315,0.148803,0.110038,0.119862,0.4823,0.4823,0.014176,0.06685,0.209115,...,0.259529,0.27772,0.255514,0.313911,0.255402,0.292486,0.255176,0.274074,0.098197,0.212455
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0962,0.023694,0.084196,0.039532,0.404943,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.568219,0.194664
50%,0.111811,0.139557,0.113937,0.057492,0.495283,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.631436,0.345284
75%,0.130204,0.459839,0.218542,0.120085,0.577509,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.682854,0.523929
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split data into training and testing datasets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, train_size=0.8, test_size=0.2, random_state=40)
X_train.reset_index(drop=True,inplace=True)

print(">>> Size of trainig set: ", X_train.shape)
print(">>> Size of testing set: ", X_test.shape)

>>> Size of trainig set:  (11940, 26)
>>> Size of testing set:  (2986, 26)


# 6.  Train, fine tune, and evaluate model's performance

## Model Evaluation Functions

In [12]:
# calculates r2_score
def cal_r2(y_true, y_pred):
  r2 = r2_score(y_true, y_pred)
  return r2

# Calculates adjusted_r2
def cal_adj_r2(x_df, r2):
  number_variables = x_df[1] - 1
  adjusted_r2 = 1 - ((1-r2) * (x_df[0]-1)) / (x_df[0] - number_variables -1)
  return adjusted_r2

def cal_mse(y_true, y_pred):
  mse = mean_squared_error(y_true, y_pred)
  return mse

def cal_performance(x_train_shape, x_test_shape, y_train, y_train_pred, y_test, y_test_pred):
  r2_train = cal_r2(y_train, y_train_pred)
  r2_test = cal_r2(y_test, y_test_pred)
  adj_r2_train = cal_adj_r2(x_train_shape, r2_train)
  adj_r2_test = cal_adj_r2(x_test_shape, r2_test)
  mse_train = cal_mse(y_train, y_train_pred)
  mse_test = cal_mse(y_test, y_test_pred)

  performance_dict = {
    'r2_train': r2_train,
    'r2_test': r2_test,
    'adj_r2_train': adj_r2_train,
    'adj_r2_test': adj_r2_test,
    'mse_train': mse_train,
    'mse_test': mse_test
    }
  return performance_dict

## 6a) Polynomial Regression (PR)

In [13]:
# specify degree of 3 for polynomial regression model
# include bias=False means don't force y-intercept to equal zero
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_train_features = poly.fit_transform(X_train)
poly_test_features = poly.fit_transform(X_test)

# Create LinearRegression
pr = LinearRegression()

# train the model
pr.fit(poly_train_features, y_train)

pr_pred_train = pr.predict(poly_train_features)
pr_pred_test = pr.predict(poly_test_features)

In [14]:
pr_performance = cal_performance(X_train.shape, X_test.shape, y_train, pr_pred_train, y_test, pr_pred_test)
print(pr_performance)

{'r2_train': 0.11534059569001276, 'r2_test': -1.1681942334011783e+21, 'adj_r2_train': 0.11348425146408103, 'adj_r2_test': -1.1780607387508505e+21, 'mse_train': 367.3253733674885, 'mse_test': 4.855003923638608e+23}


## 6b) Multiple Linear Regression (mlr)

In [15]:
mlr = LinearRegression()
mlr_param = {'copy_X': [True, False], 'fit_intercept': [True, False], 'n_jobs': [1,5,10,15,None], 'positive': [True, False]}

random_search = RandomizedSearchCV(mlr, mlr_param, n_iter=100, cv=5)
random_search.fit(X_train, y_train)

# Parameter which gives the best results
print(f"Best Hyperparameters: {random_search.best_params_}")

# Accuracy of the model after using best parameters
print(f"Best Score: {random_search.best_score_}")

# Train the Elastic Net model with the best parameters
best_mlr = random_search.best_estimator_
best_mlr.fit(X_train, y_train)

# Predict on the test set
mlr_pred_train = best_mlr.predict(X_train)
mlr_pred_test = best_mlr.predict(X_test)

Best Hyperparameters: {'positive': False, 'n_jobs': 1, 'fit_intercept': True, 'copy_X': True}
Best Score: 0.0158185375147875


In [16]:
# Evalute performance
mlr_performance = cal_performance(X_train.shape, X_test.shape, y_train, mlr_pred_train, y_test, mlr_pred_test)
print(mlr_performance)

{'r2_train': 0.02170581443832431, 'r2_test': 0.015319341808314735, 'adj_r2_train': 0.019652989640687735, 'adj_r2_test': 0.007002782195209334, 'mse_train': 406.20409981960415, 'mse_test': 409.23232818338425}


## 6c) Elastic Net Regression (enr)

In [17]:
# Create an ElasticNet regression model instance
# l1_ratio corresponds to the mix of L1 and L2 regularization
# alpha corresponds to the strength of the regularization
enr = ElasticNet()

# Define the hyperparameters grid to search
param_grid = {
    'alpha': [0.1, 1, 10, 100],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0],
    'fit_intercept': [True, False],
    'max_iter': [100, 500, 1000]
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(enr, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (negative MSE): ", grid_search.best_score_)

# Train the Elastic Net model with the best parameters
best_elastic_net = grid_search.best_estimator_
best_elastic_net.fit(X_train, y_train)

# Predict on the test set
enr_pred_train = best_elastic_net.predict(X_train)
enr_pred_test = best_elastic_net.predict(X_test)

Best parameters found:  {'alpha': 0.1, 'fit_intercept': False, 'l1_ratio': 1.0, 'max_iter': 100}
Best cross-validation score (negative MSE):  -408.85305681933676


In [18]:
# Evaluate the model
enr_performance = cal_performance(X_train.shape, X_test.shape, y_train, enr_pred_train, y_test, enr_pred_test)
print(enr_performance)

{'r2_train': 0.01630523757622282, 'r2_test': 0.017815696346046406, 'adj_r2_train': 0.014241080361131697, 'adj_r2_test': 0.009520220808428514, 'mse_train': 408.44650961325635, 'mse_test': 408.19484565445634}


## 6d) Decision Tree Regressor (dtr)

In [19]:
# Create a Decision Tree Regressor object
dtr = DecisionTreeRegressor(random_state=40)

# Fit the regressor to the training data
dtr.fit(X_train, y_train)

# Predict on the test data
dtr_pred_train = dtr.predict(X_train)
dtr_pred_test = dtr.predict(X_test)



In [20]:
# Evaluate the model
dtr_performance = cal_performance(X_train.shape, X_test.shape, y_train, dtr_pred_train, y_test, dtr_pred_test)
print(dtr_performance)

{'r2_train': 0.985016468830553, 'r2_test': -0.9594657284717041, 'adj_r2_train': 0.9849850278133265, 'adj_r2_test': -0.9760152700973097, 'mse_train': 6.221412618648799, 'mse_test': 814.3520596115204}


## 6e) Adaboost (ada)

In [21]:
# Define the AdaBoost Regressor model
adaboost = AdaBoostRegressor() # base estimator is DecisionTreeRegressor

# Define the hyperparameters grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 1, 0.2, 0.5, 0.7, 1.0],
    'loss': ['linear', 'square', 'exponential']
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(adaboost, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (negative MSE): ", grid_search.best_score_)

# Train the AdaBoost Regressor model with the best parameters
best_adaboost = grid_search.best_estimator_
best_adaboost.fit(X_train, y_train)

# Predict on the test data
ada_pred_train = best_adaboost.predict(X_train)
ada_pred_test = best_adaboost.predict(X_test)

Best parameters found:  {'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 100}
Best cross-validation score (negative MSE):  -405.5350281909025


In [22]:
# Evaluate the model
ada_performance = cal_performance(X_train.shape, X_test.shape, y_train, ada_pred_train, y_test, ada_pred_test)
print(ada_performance)

{'r2_train': 0.030921479834622567, 'r2_test': 0.021674720849512052, 'adj_r2_train': 0.028887992928114725, 'adj_r2_test': 0.01341183842425453, 'mse_train': 402.37760149037945, 'mse_test': 406.5910388070971}


## 6f) Gradient Boosting Regressor (gbr)

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a Gradient Boosting Regressor model
gbr = GradientBoostingRegressor()

# Define the hyperparameters grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 1, 0.2, 0.5, 0.7, 1.0],
    'max_depth': [3,5,7,9,11,13,15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(gbr, param_distributions=param_grid, n_iter=100, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)


# Print the best parameters and the best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score (negative MSE): ", random_search.best_score_)

# Train the AdaBoost Regressor model with the best parameters
best_gbr = random_search.best_estimator_
best_gbr.fit(X_train, y_train)

# Predict on the test data
gbr_pred_train = best_gbr.predict(X_train)
gbr_pred_test = best_gbr.predict(X_test)


Best parameters found:  {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 3, 'learning_rate': 0.05}
Best cross-validation score (negative MSE):  -402.5265135225333


In [24]:
# Evaluate the model
gbr_performance = cal_performance(X_train.shape, X_test.shape, y_train, gbr_pred_train, y_test, gbr_pred_test)
print(gbr_performance)

{'r2_train': 0.06682418488509945, 'r2_test': 0.030075373837147867, 'adj_r2_train': 0.06486603519751577, 'adj_r2_test': 0.021883442872934533, 'mse_train': 387.4701981741217, 'mse_test': 403.0997355588905}


## 6g) RBF SVC (RBF)

In [25]:
# Define the SVR model with RBF kernel
svr = SVR(kernel='rbf')

# Define the hyperparameters grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1e-3, 1e-2, 1e-1, 1]
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (negative MSE): ", grid_search.best_score_)

# Train the SVR model with the best parameters
best_svr = grid_search.best_estimator_
best_svr.fit(X_train, y_train)

# Predict on the test set
rbf_pred_train = best_svr.predict(X_train)
rbf_pred_test = best_svr.predict(X_test)

Best parameters found:  {'C': 10, 'gamma': 0.1}
Best cross-validation score (negative MSE):  -416.36154670135164


In [26]:
# Evaluate the model
rbf_performance = cal_performance(X_train.shape, X_test.shape, y_train, rbf_pred_train, y_test, rbf_pred_test)
print(rbf_performance)

{'r2_train': 0.010569824454949361, 'r2_test': -0.00615785159674731, 'adj_r2_train': 0.008493632211485691, 'adj_r2_test': -0.014655806424422657, 'mse_train': 410.8279490191165, 'mse_test': 418.1582289684562}


Observation from comparison table above:
* DTR has best performance as it got high r2

## 6h) Perform stacking on base models (STA)

In [27]:
# Define the base estimators
#base_estimators = [pr, best_mlr, best_elastic_net, best_adaboost, best_gbr, best_svr]

base_estimators = [
    ('pr', pr),
    ('enr', best_elastic_net),
    ('ada', best_adaboost),
    ('gbr', best_gbr),
    ('rbf',best_svr)
]

# Create the Stacking Regressor with a Linear Regression meta-regressor
stacking_regressor = StackingRegressor(
    estimators=base_estimators,
    cv=5
)

# Fit the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Predict on the test data
stack_pred_train = stacking_regressor.predict(X_train)
stack_pred_test = stacking_regressor.predict(X_test)

In [28]:
# Evaluate the model
stacking_performance = cal_performance(X_train.shape, X_test.shape, y_train, stack_pred_train, y_test, stack_pred_test)
print(stacking_performance)

{'r2_train': 0.06424392916781363, 'r2_test': 0.03139682758459972, 'adj_r2_train': 0.062280365144747996, 'adj_r2_test': 0.0232160575473076, 'mse_train': 388.5415634816266, 'mse_test': 402.55054066087195}


## Compare performance

In [29]:
# convert dictionary to dataframe
pr_df  = pd.DataFrame.from_dict(pr_performance, orient='index', columns=['PR'])
mlr_df = pd.DataFrame.from_dict(mlr_performance, orient='index', columns=['MLR'])
enr_df = pd.DataFrame.from_dict(enr_performance, orient='index', columns=['ENR'])
dtr_df = pd.DataFrame.from_dict(dtr_performance, orient='index', columns=['DTR'])
ada_df = pd.DataFrame.from_dict(ada_performance, orient='index', columns=['ADA'])
gbr_df = pd.DataFrame.from_dict(gbr_performance, orient='index', columns=['GBR'])
rbf_df = pd.DataFrame.from_dict(rbf_performance, orient='index', columns=['RBF'])
stack_df = pd.DataFrame.from_dict(stacking_performance, orient='index', columns=['STA'])

df_performance = pd.concat([pr_df, mlr_df, enr_df, dtr_df, ada_df, gbr_df, rbf_df, stack_df], axis=1)
print(">>> Comparision of model performance:\n")
df_performance

>>> Comparision of model performance:



Unnamed: 0,PR,MLR,ENR,DTR,ADA,GBR,RBF,STA
r2_train,0.1153406,0.021706,0.016305,0.985016,0.030921,0.066824,0.01057,0.064244
r2_test,-1.168194e+21,0.015319,0.017816,-0.959466,0.021675,0.030075,-0.006158,0.031397
adj_r2_train,0.1134843,0.019653,0.014241,0.984985,0.028888,0.064866,0.008494,0.06228
adj_r2_test,-1.178061e+21,0.007003,0.00952,-0.976015,0.013412,0.021883,-0.014656,0.023216
mse_train,367.3254,406.2041,408.44651,6.221413,402.377601,387.470198,410.827949,388.541563
mse_test,4.855004e+23,409.232328,408.194846,814.35206,406.591039,403.099736,418.158229,402.550541


validation

In [30]:
print(y_test[0:10].values)
print(stack_pred_test[:10])

[55 51 72 56 73 58 60 16 38 95]
[48.48669754 51.32279257 43.46202183 47.82278925 50.93506974 51.60493726
 49.15882143 45.47050826 46.26702392 54.11374921]
