Load Necessary Libraries:

In [None]:
import numpy as np
import pandas as pd
from typing import Literal
import time
import joblib
import matplotlib.pyplot as plt
import pickle

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone
from sklearn.inspection import permutation_importance

Read Data (Train-test split):

In [None]:
# Build the modelling table, derive two extra features and create the
# train/test split used by every model section below.
#
# `model_price` is loaded from disk only when it is not already defined, so a
# fresh kernel ("Restart & Run All") no longer fails with a NameError while a
# frame prepared earlier in the session is still reused as-is.
try:
    model_price
except NameError:
    model_price = pd.read_csv('model_price_all.csv')

# Derived features (column meanings are presumed from their names - TODO confirm):
#   Moneyness          = S / cv
#   Conversion_Premium = Pr - cv
model_price['Moneyness'] = model_price['S'] / model_price['cv']
model_price['Conversion_Premium'] = model_price['Pr'] - model_price['cv']
#model_price = model_price.drop(columns=['CDS', 'cfq', 'ttm_days', 'r', 'd', 'tfc']) #only if desired drop irrelevant features

# `df` is an alias of `model_price` (no copy is made).
df = model_price

# Every column except the target is used as a feature.
X = df.drop(['Estimated_Price'], axis=1)
y = df['Estimated_Price']

# 80/20 split; the seed is fixed so the split is repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=3)

Create Dictionary:

In [None]:
# Empty container that the Random Forest section fills with its results.
res_dict = dict()

Random Forest Model (RFR):

In [None]:
# Random Forest: tune with a grid search, time fit/predict, evaluate on the
# test split and record everything under res_dict['RFR'].
model_name = 'RFR'
res_dict[model_name] = {}

# Base estimator; the seed is fixed so forest construction is repeatable.
rfr_ = RandomForestRegressor(random_state=3)

# Hyperparameter grid.
# max_features='auto' is rejected by scikit-learn >= 1.3. For regressors it
# meant "consider all features", which is spelled 1.0 and is accepted by
# older releases as well.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 20, None],
    'max_features': ['sqrt', 'log2', 1.0],
}

# Grid search scored by negative RMSE, using GridSearchCV's default
# cross-validation and all available cores.
rfr_tuned = GridSearchCV(rfr_, param_grid, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Train model (perf_counter is a monotonic clock intended for intervals).
start = time.perf_counter()
rfr_tuned.fit(X_tr, y_tr)
end = time.perf_counter()

# Compute runtime
rfr_runtime = end - start
print(f"Training time: {rfr_runtime:.2f} seconds")

# Make predictions
start_pred = time.perf_counter()
y_pred = rfr_tuned.predict(X_te)
end_pred = time.perf_counter()

# NOTE(review): Gaussian noise (mean 30, std 25) is added to the predictions
# BEFORE the metrics are computed, so every number below describes the
# perturbed predictions, not the raw model output. The MLP section of this
# notebook evaluates raw predictions, so the models are not compared on equal
# footing - confirm this perturbation is intended.
# The generator is seeded so that the reported metrics are reproducible.
noise_rng = np.random.default_rng(3)
noise = noise_rng.normal(loc=30, scale=25, size=y_pred.shape)
y_pred = y_pred + noise

# Prediction time covers only the predict() call, not the noise step.
prediction_time = end_pred - start_pred
print(f"Prediction Time: {prediction_time:.4f} seconds")

# Compute MSE once and derive RMSE from it.
rfr_te_mse = mean_squared_error(y_te, y_pred)
rfr_te_rmse = np.sqrt(rfr_te_mse)
print(f"RMSE: {rfr_te_rmse:.4f}")
print(f"MSE: {rfr_te_mse:.4f}")

# Compute R^2
rfr_te_r2 = r2_score(y_te, y_pred)
print(f"R^2: {rfr_te_r2:.4f}")

# Best hyperparameter combination found by the search.
best_params = rfr_tuned.best_params_
print("Best Parameters:", best_params)

# Store results
res_dict[model_name]['train_time'] = rfr_runtime
res_dict[model_name]['prediction_time'] = prediction_time
res_dict[model_name]['rmse'] = rfr_te_rmse
res_dict[model_name]['mse'] = rfr_te_mse
res_dict[model_name]['r2'] = rfr_te_r2
res_dict[model_name]['y_pred'] = y_pred
# Despite the key name this is the signed absolute residual (true - predicted);
# the key is kept unchanged for existing readers of the pickle.
res_dict[model_name]['Relative Difference'] = y_te - y_pred
# Stored as well so the chosen configuration travels with the metrics
# (the MLP section already stores its best parameters).
res_dict[model_name]['best_params'] = best_params

In [None]:
# Persist the Random Forest results dictionary as a pickle in the working directory.
rfr_stats_path = 'RFR_stats_HL.pkl'
with open(rfr_stats_path, 'wb') as fh:
    pickle.dump(res_dict, fh)

print(f"Results have been saved to {rfr_stats_path}.")

Visualize Results:

In [None]:
# Diagnostic plots and percentage-error statistics for the most recent
# predictions held in `y_pred`.

# NumPy view of the test features (kept for later use; the plots below do not need it).
if isinstance(X_te, pd.DataFrame):
    X_te_np = X_te.to_numpy()
else:
    X_te_np = X_te

# Flatten targets and predictions to 1-D arrays.
# NOTE: this rebinds the notebook-level y_te / y_pred to NumPy arrays.
y_te = np.ravel(y_te)
y_pred = np.ravel(y_pred)

# Signed percentage error relative to the true value. A true value of zero
# yields inf/nan, which would break the tick computation below, so such
# entries are dropped.
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_errors = ((y_te - y_pred) / y_te) * 100
percentage_errors = percentage_errors[np.isfinite(percentage_errors)]

# -----------------------------------
# Plot 1: True vs Predicted Values
# -----------------------------------
true_min, true_max = y_te.min(), y_te.max()
plt.figure(figsize=(8, 6))
plt.scatter(y_te, y_pred, alpha=0.5, color='royalblue')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([true_min, true_max], [true_min, true_max], color='red', linestyle='--', label='Perfect Prediction')
plt.xlabel('True Values (y_te)')
plt.ylabel('Predicted Values (y_pred)')
plt.title('True vs Predicted')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

if percentage_errors.size:
    # -----------------------------------
    # Plot 2: Histogram of Percentage Errors
    # -----------------------------------
    plt.figure(figsize=(8, 5))
    plt.hist(percentage_errors, bins=40, color='orange', edgecolor='black', density=True)
    plt.title('Distribution of Percentage Errors')
    plt.xlabel('Percentage Error (%)')
    plt.ylabel('Probability Density')
    plt.grid(True)
    min_tick = int(np.floor(percentage_errors.min()))
    max_tick = int(np.ceil(percentage_errors.max()))
    # One tick per 1 % is unreadable (and slow to draw) when the errors span
    # a wide range; cap the axis at roughly 20 ticks. Ranges of 20 % or less
    # still get a tick at every integer.
    tick_step = max(1, int(np.ceil((max_tick - min_tick) / 20)))
    plt.xticks(np.arange(min_tick, max_tick + 1, tick_step))
    plt.tight_layout()
    plt.show()

    # -----------------------------------
    # Print stats
    # -----------------------------------
    print("Mean Percentage Error:", np.mean(percentage_errors))
    print("Mean Absolute Percentage Error (MAPE):", np.mean(np.abs(percentage_errors)))
else:
    print("No finite percentage errors to plot.")

Gradient Boosting (GBR) model:

In [None]:
# Gradient Boosting: tune with a grid search, time fit/predict, evaluate on
# the test split and collect the results in res_dict2['GBR'].
gbr_ = GradientBoostingRegressor(random_state=3)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 20, None],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
}

# Grid search scored by negative RMSE, using GridSearchCV's default
# cross-validation and all available cores.
gbr_tuned = GridSearchCV(gbr_, param_grid, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Train model (perf_counter is a monotonic clock intended for intervals).
start = time.perf_counter()
gbr_tuned.fit(X_tr, y_tr)
end = time.perf_counter()

# Compute runtime
gbr_runtime = end - start
print(f"Training time: {gbr_runtime:.2f} seconds")

# Make predictions
start_pred = time.perf_counter()
y_pred = gbr_tuned.predict(X_te)
end_pred = time.perf_counter()

# NOTE(review): Gaussian noise (mean 20, std 25) is added to the predictions
# BEFORE the metrics are computed, so every number below describes the
# perturbed predictions, not the raw model output. The MLP section of this
# notebook evaluates raw predictions, so the models are not compared on equal
# footing - confirm this perturbation is intended.
# The generator is seeded so that the reported metrics are reproducible.
noise_rng = np.random.default_rng(3)
noise = noise_rng.normal(loc=20, scale=25, size=y_pred.shape)
y_pred = y_pred + noise

# Prediction time covers only the predict() call, not the noise step.
# Four decimals, matching the other model sections: a single predict() call
# is usually far below 0.01 s and would otherwise print as 0.00.
gbr_pred_time = end_pred - start_pred
print(f"Prediction time: {gbr_pred_time:.4f} seconds")

# Compute evaluation metrics (MSE once, RMSE derived from it).
gbr_mse = mean_squared_error(y_te, y_pred)
gbr_rmse = np.sqrt(gbr_mse)
gbr_r2 = r2_score(y_te, y_pred)

# Print evaluation metrics
print(f"RMSE: {gbr_rmse:.4f}")
print(f"MSE: {gbr_mse:.4f}")
print(f"R^2: {gbr_r2:.4f}")

# Best hyperparameter combination found by the search.
gbr_best_params = gbr_tuned.best_params_
print("Best Parameters:", gbr_best_params)

# Store results in a dictionary of their own (separate from res_dict).
res_dict2 = {}
model_name = 'GBR'
res_dict2[model_name] = {
    'train_time': gbr_runtime,
    'prediction_time': gbr_pred_time,
    'rmse': gbr_rmse,
    'mse': gbr_mse,
    'r2': gbr_r2,
    'y_pred': y_pred,
    # Despite the key name this is the signed absolute residual
    # (true - predicted); the key is kept for existing readers of the pickle.
    'Relative Difference': y_te - y_pred,
    'best_params': gbr_best_params,
}

In [None]:
# Persist the Gradient Boosting results dictionary as a pickle in the working directory.
gbr_stats_path = 'GBR_stats_HL.pkl'
with open(gbr_stats_path, 'wb') as fh:
    pickle.dump(res_dict2, fh)

print(f"Results have been saved to {gbr_stats_path}.")

Visualize Results:

In [None]:
# Diagnostic plots and percentage-error statistics for the most recent
# predictions held in `y_pred`.

# NumPy view of the test features (kept for later use; the plots below do not need it).
if isinstance(X_te, pd.DataFrame):
    X_te_np = X_te.to_numpy()
else:
    X_te_np = X_te

# Flatten targets and predictions to 1-D arrays.
# NOTE: this rebinds the notebook-level y_te / y_pred to NumPy arrays.
y_te = np.ravel(y_te)
y_pred = np.ravel(y_pred)

# Signed percentage error relative to the true value. A true value of zero
# yields inf/nan, which would break the tick computation below, so such
# entries are dropped.
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_errors = ((y_te - y_pred) / y_te) * 100
percentage_errors = percentage_errors[np.isfinite(percentage_errors)]

# -----------------------------------
# Plot 1: True vs Predicted Values
# -----------------------------------
true_min, true_max = y_te.min(), y_te.max()
plt.figure(figsize=(8, 6))
plt.scatter(y_te, y_pred, alpha=0.5, color='royalblue')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([true_min, true_max], [true_min, true_max], color='red', linestyle='--', label='Perfect Prediction')
plt.xlabel('True Values (y_te)')
plt.ylabel('Predicted Values (y_pred)')
plt.title('True vs Predicted')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

if percentage_errors.size:
    # -----------------------------------
    # Plot 2: Histogram of Percentage Errors
    # -----------------------------------
    plt.figure(figsize=(8, 5))
    plt.hist(percentage_errors, bins=40, color='orange', edgecolor='black', density=True)
    plt.title('Distribution of Percentage Errors')
    plt.xlabel('Percentage Error (%)')
    plt.ylabel('Probability Density')
    plt.grid(True)
    min_tick = int(np.floor(percentage_errors.min()))
    max_tick = int(np.ceil(percentage_errors.max()))
    # One tick per 1 % is unreadable (and slow to draw) when the errors span
    # a wide range; cap the axis at roughly 20 ticks. Ranges of 20 % or less
    # still get a tick at every integer.
    tick_step = max(1, int(np.ceil((max_tick - min_tick) / 20)))
    plt.xticks(np.arange(min_tick, max_tick + 1, tick_step))
    plt.tight_layout()
    plt.show()

    # -----------------------------------
    # Print stats
    # -----------------------------------
    print("Mean Percentage Error:", np.mean(percentage_errors))
    print("Mean Absolute Percentage Error (MAPE):", np.mean(np.abs(percentage_errors)))
else:
    print("No finite percentage errors to plot.")

Neural Network Model (MLP):

In [None]:
# Neural network (MLP): standardise features, tune with a grid search, time
# fit/predict, evaluate on the test split and collect results in res_dict3.

# Standardise features: statistics are learned on the training split only and
# then applied to the test split.
# NOTE(review): the scaler is fitted on the whole training split before the
# grid search, so each cross-validation fold's validation rows have already
# influenced the scaling. Moving the scaler into a Pipeline would avoid that,
# but it would also change the best_params_ keys and what best_estimator_
# expects as input, so it is left as-is here.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_te_scaled = scaler.transform(X_te)

mlp = MLPRegressor(max_iter=2000, random_state=3)

# Define the hyperparameter grid to search over
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
}

# Use GridSearchCV to tune the hyperparameters, using negative RMSE as the scoring metric
grid_search = GridSearchCV(mlp, param_grid, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Train model (perf_counter is a monotonic clock intended for intervals).
start_time = time.perf_counter()
grid_search.fit(X_tr_scaled, y_tr)
train_time = time.perf_counter() - start_time
print(f"Training time: {train_time:.2f} seconds")

# Make predictions
start_pred = time.perf_counter()
y_pred = grid_search.predict(X_te_scaled)
pred_time = time.perf_counter() - start_pred
print(f"Prediction time: {pred_time:.4f} seconds")

# Compute MSE, RMSE and R^2 (MSE once, RMSE derived from it).
mse = mean_squared_error(y_te, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_te, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R^2: {r2:.4f}")

# Store results. 'mse', 'y_pred' and 'Relative Difference' are included so the
# MLP record has the same fields as the RFR and GBR records.
res_dict3 = {
    'MLP': {
        'train_time': train_time,
        'prediction_time': pred_time,
        'rmse': rmse,
        'mse': mse,
        'r2': r2,
        'y_pred': y_pred,
        # Signed absolute residual (true - predicted); key name matches the
        # one used for the other models.
        'Relative Difference': y_te - y_pred,
        'best_params': grid_search.best_params_,
    }
}

In [None]:
# Persist the MLP results dictionary as a pickle in the working directory.
mlp_stats_path = 'MLP_stats_HL.pkl'
with open(mlp_stats_path, 'wb') as fh:
    pickle.dump(res_dict3, fh)

print(f"Results have been saved to {mlp_stats_path}.")

Visualize Results:

In [None]:
# Diagnostic plots and percentage-error statistics for the most recent
# predictions held in `y_pred`.

# NumPy view of the test features (kept for later use; the plots below do not need it).
if isinstance(X_te, pd.DataFrame):
    X_te_np = X_te.to_numpy()
else:
    X_te_np = X_te

# Flatten targets and predictions to 1-D arrays.
# NOTE: this rebinds the notebook-level y_te / y_pred to NumPy arrays.
y_te = np.ravel(y_te)
y_pred = np.ravel(y_pred)

# Signed percentage error relative to the true value. A true value of zero
# yields inf/nan, which would break the tick computation below, so such
# entries are dropped.
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_errors = ((y_te - y_pred) / y_te) * 100
percentage_errors = percentage_errors[np.isfinite(percentage_errors)]

# -----------------------------------
# Plot 1: True vs Predicted Values
# -----------------------------------
true_min, true_max = y_te.min(), y_te.max()
plt.figure(figsize=(8, 6))
plt.scatter(y_te, y_pred, alpha=0.5, color='royalblue')
# Diagonal reference line: points on it are predicted exactly.
plt.plot([true_min, true_max], [true_min, true_max], color='red', linestyle='--', label='Perfect Prediction')
plt.xlabel('True Values (y_te)')
plt.ylabel('Predicted Values (y_pred)')
plt.title('True vs Predicted')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

if percentage_errors.size:
    # -----------------------------------
    # Plot 2: Histogram of Percentage Errors
    # -----------------------------------
    plt.figure(figsize=(8, 5))
    plt.hist(percentage_errors, bins=40, color='orange', edgecolor='black', density=True)
    plt.title('Distribution of Percentage Errors')
    plt.xlabel('Percentage Error (%)')
    plt.ylabel('Probability Density')
    plt.grid(True)
    min_tick = int(np.floor(percentage_errors.min()))
    max_tick = int(np.ceil(percentage_errors.max()))
    # One tick per 1 % is unreadable (and slow to draw) when the errors span
    # a wide range; cap the axis at roughly 20 ticks. Ranges of 20 % or less
    # still get a tick at every integer.
    tick_step = max(1, int(np.ceil((max_tick - min_tick) / 20)))
    plt.xticks(np.arange(min_tick, max_tick + 1, tick_step))
    plt.tight_layout()
    plt.show()

    # -----------------------------------
    # Print stats
    # -----------------------------------
    print("Mean Percentage Error:", np.mean(percentage_errors))
    print("Mean Absolute Percentage Error (MAPE):", np.mean(np.abs(percentage_errors)))
else:
    print("No finite percentage errors to plot.")

Neural Network Feature Importance:

In [None]:
import matplotlib.pyplot as plt

# Permutation importance of the tuned MLP on the scaled test split, scored by
# negative RMSE with 10 shuffles per feature and a fixed seed.
result = permutation_importance(
    grid_search.best_estimator_,
    X_te_scaled,
    y_te,
    scoring='neg_root_mean_squared_error',
    n_repeats=10,
    random_state=3,
)

# Mean importance per feature across the repeats.
feature_importances = result.importances_mean

# Report each feature's mean importance, identified by column position.
for feature_idx, mean_score in enumerate(feature_importances):
    print(f"Feature {feature_idx}: {mean_score:.4f}")

# Bar chart of the same values, one bar per feature position.
bar_positions = range(len(feature_importances))
plt.bar(bar_positions, feature_importances)
plt.xlabel("Feature Index")
plt.ylabel("Importance Score")
plt.title("Feature Importance (Permutation)")
plt.show()

In [None]:
# Legend for the positional feature indices: print each index next to its
# column name in X.
feature_names = X.columns
for position, name in enumerate(feature_names):
    print(f"Feature {position}: {name}")