In [48]:
from utils import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

sns.set_style('ticks')

In [76]:
input_path = 'Data'

feature_file_dg = 'Dragon_Descriptors.csv'
features_file_dn =  'deepnose_features.npy'

CID_file = 'molecules_train_cid.npy'

mixture_file = 'Mixure_Definitions_Training_set.csv'

training_task_file = 'TrainingData_mixturedist.csv'

# Dragon features
features_dg = pd.read_csv(os.path.join(input_path, feature_file_dg)).values
features_dn = np.load(os.path.join(input_path, features_file_dn))

# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))
features_CIDs = np.load(os.path.join(input_path, CID_file))

In [77]:
scaler = StandardScaler(with_mean=True, with_std=True)
features = scaler.fit_transform(features_dg)
CID2features_dg =  {CID: features_dg[i] for i, CID in enumerate(features_CIDs)}

scaler = StandardScaler(with_mean=True, with_std=True)
features = scaler.fit_transform(features_dn)
CID2features_dn =  {CID: features_dn[i] for i, CID in enumerate(features_CIDs)}

 <span style="color:orange;">NOTE: Reduce the dimension! </span>

In [78]:
X_dg, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features_dg,'avg')
X_dn, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features_dn,'log')

In [59]:
# Convert the input pairs to a suitable format for training
X_pairs_dg = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_dg])
y_true = np.array(y)
X_pair1 = X_pairs_dg[:, :96] 
X_pair2 = X_pairs_dg[:, 96:] 
distances = [get_euclidean_distance(m[0], m[1]) for m in X_dg]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X_dg]
angles = [get_cosine_angle(m[0], m[1]) for m in X_dg]
# add those first:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]
datasets = training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

X_dg_features = np.hstack((X_pairs_dg, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr))

In [62]:
# Convert the input pairs to a suitable format for training
X_pairs_dn = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_dn])
X_pair1 = X_pairs_dn[:, :96] 
X_pair2 = X_pairs_dn[:, 96:] 
distances = [get_euclidean_distance(m[0], m[1]) for m in X_pairs_dn]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X_pairs_dn]
angles = [get_cosine_angle(m[0], m[1]) for m in X_pairs_dn]

X_dn_features = np.hstack((X_pairs_dn, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        data_arr))

In [68]:
X_features = np.hstack((X_dg_features, X_dn_features))

In [71]:
X_features.shape

(500, 9950)

----
## 2. Training
### 2.1 Example attempt, standard intialized RF and XGBoost

In [72]:
n_folds = 10
seed = 314159

In [73]:
rf_pred_list = []
xgb_pred_list = []
y_true_list = []
test_indices_list = []  # Keep track of the test indices in each fold

# Perform k-fold cross-validation:
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for train_index, test_index in kf.split(X_features):
    X_train, X_test = X_features[train_index], X_features[test_index]
    y_train, y_test = y_true[train_index], y_true[test_index]
    
    # Train the Random Forest regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=seed)
    rf.fit(X_train, y_train)
    
    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions 
    rf_pred = rf.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    
    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    y_true_list.extend(y_test)
    test_indices_list.extend(test_index)  # Store the test indices

# Store the predictions and actual values
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'y_true': y_true_list
})

# Merge the results with the training_set df
training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
training_set.drop('test_index', axis=1, inplace=True)

In [74]:
training_set.head()

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values,rf_pred,xgb_pred,y_true
150,Snitz 1,1,2,0.604167,0.553904,0.550512,0.604167
300,Snitz 1,1,3,0.651042,0.611426,0.619293,0.651042
0,Snitz 1,1,5,0.505208,0.52998,0.514722,0.505208
1,Snitz 1,1,6,0.411458,0.543901,0.531595,0.411458
50,Snitz 1,1,7,0.5625,0.572152,0.54554,0.5625


In [75]:
# Calculate the correlation and R^2 for Random Forest
rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
rf_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(rf_pred_list)))

print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")
print()
# Calculate the correlation and R^2 for XGBoost
xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
xgb_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(xgb_pred_list)))

print(f"XGBoost - R: {xgb_corr:.3f}")
print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


Random Forest - R: 0.585
Random Forest - RMSE: 0.127

XGBoost - R: 0.551
XGBoost - RMSE: 0.132
