In [8]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))


from utils import *
from optimize import *
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

sns.set_style('ticks')

In [9]:
input_path = '../Data/'

features_file_1 = 'featureSelection/selection_cleanDragonDescriptors.csv'
features_file_3 = 'featureSelection/selection_cleanMordredDescriptors.csv'

features_file_2 =  'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

# Read all copies, before and after correction; before was also downloaded from Dropbox.
mixture_file = 'Mixure_Definitions_Training_set.csv' 
training_task_file = 'TrainingData_mixturedist.csv'

# Mordred features
features_1 = pd.read_csv(os.path.join(input_path, features_file_1), index_col= 0)
features_3 = pd.read_csv(os.path.join(input_path, features_file_3), index_col= 0)

features_2 = np.load(os.path.join(input_path, features_file_2))

features_CIDs = np.load(os.path.join(input_path, CID_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))

In [10]:
shared = set(features_1.index.tolist()) & set(features_CIDs)
len(shared) # this is expected!!!

154

In [11]:
# normalized_features_1_names = features_1_normalized.columns.tolist()
# features_1_names = features_1.columns.tolist()
# mordred_features_combined = list(set(normalized_features_1_names + features_1_names))
# np.save('../Data/featureSelection/combined_dragon_feature_names.npy', mordred_features_combined)

In [12]:
scaler = StandardScaler(with_mean=True, with_std=True)

# standardize Mordred
features_1_np = scaler.fit_transform(features_1)
features_1 = pd.DataFrame(features_1_np, columns=features_1.columns, index=features_1.index)

features_3_np = scaler.fit_transform(features_3)
features_3 = pd.DataFrame(features_3_np, columns=features_3.columns, index=features_3.index)

# log standardize deepnose
epsilon = 1e-8 
features_2 = scaler.fit_transform(np.log(features_2 + epsilon))

In [13]:
# Double check the number of unique non-NaN values in each feature column
num_unique_values = np.count_nonzero(~np.isnan(features_1), axis=0)

# Print if the number of unique non-NaN values for each feature
for i, count in enumerate(num_unique_values):
    if count == 0:
        print(f"Feature {i}: {count} unique non-NaN values")

In [14]:
# Map CID to features:
CID2features_dragon = {CID: np.array(features_1.loc[CID].tolist()) if CID in features_1.index else np.full(len(features_1.columns), np.nan) for CID in features_CIDs}
CID2features_deepnose=  {CID: features_2[i] for i, CID in enumerate(features_CIDs)}
CID2features_mordred =  {CID: features_3.loc[CID].tolist() for CID in features_CIDs}

In [15]:
X_m, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features_mordred, method = 'avg')
X_dr, _, _, _  = format_Xy(training_set,  mixtures_IDs, CID2features_dragon, method = 'avg')
X_d, _, _, _ = format_Xy(training_set,  mixtures_IDs, CID2features_deepnose, method = 'avg')

In [16]:
# Convert the input pairs to a suitable format for training
X_pairs_m = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_m])
X_pairs_d = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_d])
X_pairs_dr = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_dr])

y_true = np.array(y)

In [17]:
distances_m = [get_euclidean_distance(m[0], m[1]) for m in X_m]
similarities_m = [get_cosine_similarity(m[0], m[1]) for m in X_m]
angles_m = [get_cosine_angle(m[0], m[1]) for m in X_m] 

distances_d = [get_euclidean_distance(m[0], m[1]) for m in X_d]
similarities_d = [get_cosine_similarity(m[0], m[1]) for m in X_d]
angles_d = [get_cosine_angle(m[0], m[1]) for m in X_d] 

distances_dr = [get_euclidean_distance(m[0], m[1]) for m in X_dr]
similarities_dr = [get_cosine_similarity(m[0], m[1]) for m in X_dr]
angles_dr = [get_cosine_angle(m[0], m[1]) for m in X_dr] 


In [18]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [19]:
training_set['Sum num monos'] = np.array(num_mixtures).sum(axis = 1)
training_set['Shared'] = shared_monos
training_set['Diff'] = diff_monos
training_set['Num mixture1'] = np.array(num_mixtures)[:, 0]
training_set['Num mixture2'] = np.array(num_mixtures)[:, 1]

In [20]:
datasets = training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [21]:
### add all information above
# X_features = np.hstack( (X_pairs_d, X_pairs_m,
#                         np.array(distances_m).reshape(500, 1), 
#                         np.array(similarities_m).reshape(500, 1), 
#                         np.array(angles_m).reshape(500, 1), 
#                         np.array(distances_d).reshape(500, 1), 
#                         np.array(similarities_d).reshape(500, 1), 
#                         np.array(angles_d).reshape(500, 1), 
#                         np.array(shared_monos).reshape(500, 1), 
#                         np.array(diff_monos).reshape(500, 1), 
#                         np.array(num_mixtures).reshape(500,2), 
#                         data_arr))
X_features = np.hstack( (X_pairs_m, X_pairs_d, X_pairs_dr,
                        np.array(distances_m).reshape(500, 1), 
                        np.array(similarities_m).reshape(500, 1), 
                        np.array(angles_m).reshape(500, 1), 
                        np.array(distances_d).reshape(500, 1), 
                        np.array(similarities_d).reshape(500, 1), 
                        np.array(angles_d).reshape(500, 1), 
                        np.array(distances_dr).reshape(500, 1), 
                        np.array(similarities_dr).reshape(500, 1), 
                        np.array(angles_dr).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures).reshape(500,2), 
                        data_arr))


In [22]:
# dist_corr = np.corrcoef(distances_d, y_true)[0, 1]
dist_corr = np.corrcoef(distances_m, y_true)[0, 1]

print('R (Deepnose embedding Eucledian distance v.s Experimental Value): ', dist_corr)

R (Deepnose embedding Eucledian distance v.s Experimental Value):  0.4777691715913296


In [23]:
# sim_corr = np.corrcoef(similarities_d, y_true)[0, 1]
sim_corr = np.corrcoef(similarities_m, y_true)[0, 1]

print('R (Cosyne similarity v.s. Experimental Value): ', sim_corr)

R (Cosyne similarity v.s. Experimental Value):  -0.4930930966753656


In [24]:
# sim_corr = np.corrcoef(angles_d, y_true)[0, 1]
sim_corr = np.corrcoef(angles_m, y_true)[0, 1]

print('R (Vector angle v.s. Experimental Value): ', sim_corr)

R (Vector angle v.s. Experimental Value):  0.5028727737317864


### First training:

In [25]:
n_folds = 10
seed = 314159

In [26]:
rf_pred_list = []
xgb_pred_list = []
kf_rf_importances = []
y_true_list = []
test_indices_list = []  # Keep track of the test indices in each fold

# Perform k-fold cross-validation:
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for train_index, test_index in kf.split(X_features):
    X_train, X_test = X_features[train_index], X_features[test_index]
    y_train, y_test = y_true[train_index], y_true[test_index]
    
    # Train the Random Forest regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=seed)
    rf.fit(X_train, y_train)
    
    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions 
    rf_pred = rf.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    
    # Get the feature importances
    importances = rf.feature_importances_
    kf_rf_importances.append(importances)
    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    y_true_list.extend(y_test)
    test_indices_list.extend(test_index)  # Store the test indices

# Store the predictions and actual values
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'y_true': y_true_list
})

# Merge the results with the training_set df
training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
training_set.drop('test_index', axis=1, inplace=True)

In [27]:
training_set.head()

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values,Sum num monos,Shared,Diff,Num mixture1,Num mixture2,rf_pred,xgb_pred,y_true
150,Snitz 1,1,2,0.604167,20,0,10,10,10,0.577419,0.555801,0.604167
300,Snitz 1,1,3,0.651042,11,0,10,10,1,0.633379,0.718815,0.651042
0,Snitz 1,1,5,0.505208,40,0,10,10,30,0.535816,0.41101,0.505208
1,Snitz 1,1,6,0.411458,50,0,10,10,40,0.536816,0.452299,0.411458
50,Snitz 1,1,7,0.5625,14,0,10,10,4,0.588263,0.594626,0.5625


In [28]:
# Calculate the correlation and R^2 for Random Forest
rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
rf_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(rf_pred_list)))

print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")
print()
# Calculate the correlation and R^2 for XGBoost
xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
xgb_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(xgb_pred_list)))

print(f"XGBoost - R: {xgb_corr:.3f}")
print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


Random Forest - R: 0.610
Random Forest - RMSE: 0.124

XGBoost - R: 0.579
XGBoost - RMSE: 0.129


In [29]:
for seed in [0, 1, 2]:
    print("Starting round :", seed)
    rf_best, rgb_best = para_search(seed, X_features, y_true)
    _ =  avg_rf_best(rf_best, X_features, y_true)
    _ =  avg_rgb_best(rgb_best, X_features, y_true)

Starting round : 0
