In [94]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from utils import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_regression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb

sns.set_style('ticks')

### Create an MLP and a SVR model for benchmarking.

In [None]:
X_features.shape

(500, 203)

In [35]:
input_path = '../Data'

feature_file = 'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

mixture_file = 'Mixure_Definitions_Training_set.csv'
training_task_file = 'TrainingData_mixturedist.csv'

# Deepnose features
features = np.load(os.path.join(input_path, feature_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))
features_CIDs = np.load(os.path.join(input_path, CID_file))

In [39]:
epsilon = 1e-8
scaler = StandardScaler(with_mean=True, with_std=True)
features = scaler.fit_transform(np.log(features + epsilon))

# Map CID to 96 dim features:
CID2features =  {CID: features[i] for i, CID in enumerate(features_CIDs)}

In [45]:
X, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features, method = 'avg')

In [46]:
# Convert the input pairs to a suitable format for training
X_pairs = np.array([(np.concatenate((x1, x2))) for x1, x2 in X])
y_true = np.array(y)

In [47]:
distances = [get_euclidean_distance(m[0], m[1]) for m in X]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X]
angles = [get_cosine_angle(m[0], m[1]) for m in X]

In [48]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [49]:
datasets = training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [50]:
# Combine features above
X_features = np.hstack((X_pairs, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr))

In [65]:
n_folds = 10
seed = 314159

In [88]:
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50),  # Hidden layers, as input is ~200, we use 100 and 50
                         activation='relu', 
                         solver='adam',  
                         alpha=0.3,  # Strength of the L2 regularization term. The L2 regularization term is divided by the sample size when added to the loss.
                         max_iter=1000,  # Set the maximum number of iterations for weight optimization
                         random_state=seed)  

# Custom scoring function for RMSE
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# 10-fold cross-validation with RMSE as the scoring metric
cv_scores = cross_val_score(mlp_model, X_features, y_true, cv=n_folds, scoring=rmse_scorer)

In [89]:
print("Mean cross-validation RMSE:", cv_scores.mean())
print()
print("Std Cross-validation RMSE:", cv_scores.std())

Mean cross-validation RMSE: -0.16496821102103204

Std Cross-validation RMSE: 0.024555685887717907


### MLP 

In [90]:
# Fit the model on the entire dataset
mlp_model.fit(X_features, y_true)

# Make predictions on the dataset
y_pred = mlp_model.predict(X_features)

# Calculate the correlation between y_true and y_pred
correlation = r2_score(y_true, y_pred)
print("Correlation between y_true and y_pred:", correlation)

Correlation between y_true and y_pred: 0.8241040354845336


Over-fitting to the data; to see that, we do a train-test split:

In [93]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_true, test_size=0.1, random_state=seed)

# Create an MLP Regressor with L1 regularization
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), 
                         activation='relu', 
                         solver='adam', 
                         alpha=0.3,
                         max_iter=1000, 
                         random_state=seed)
                        

# Fit the model on the training set (450 samples)
mlp_model.fit(X_train, y_train)

# Make predictions on the test set (50 samples)
y_pred = mlp_model.predict(X_test)

# Calculate the correlation between y_true and y_pred for the test set
correlation = r2_score(y_test, y_pred)
print("Correlation between y_true and y_pred for the test set:", correlation)

Correlation between y_true and y_pred for the test set: -0.1757289855573687


### SVR

We used the Radial Basis Function (RBF) kernel as it can handle non-linear relationships in the data effectively, and regress based on similarity of data in all dimensions.

In [97]:
# Initialize the SVR model with the RBF kernel
svr = SVR(kernel='rbf')

# Define the hyperparameter grid for GridSearchCV 
param_grid = {'C': [0.1, 1, 10, 100], # regularization parameter
              'gamma': [0.01, 0.1, 1, 'auto'], # Kernel Coefficient, 1/n_features for 'auto'
              'epsilon': [0.01, 0.1, 0.2]} # width of the insensitive region around the true values, the tube

# Perform GridSearchCV or RandomizedSearchCV to tune the hyperparameters
grid_search = GridSearchCV(svr, param_grid, cv=10, scoring='r2')
grid_search.fit(X_features, y_true)

# Get the best hyperparameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_true, test_size=0.2, random_state=seed)

# Fit the best estimator on the training data
best_estimator.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_estimator.predict(X_test)

# Evaluate the performance
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", np.sqrt(mse))

r2 = r2_score(y_test, y_pred)
print("Correlation:", np.sqrt(r2))

Mean Squared Error: 0.13136123218098572
Correlation: 0.5192976367021426
