In [1]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from utils import *
from collections import Counter
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

sns.set_style('ticks')

In [2]:
# Folder that holds every input file, relative to the notebook's working directory
input_path = '../Data/'

# Per-molecule feature matrices, plus the CIDs that label their rows
feature_file = 'Mordred_reduced_features_50.npy'
features_file_2 =  'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

# Mixture definition table. The original note mentions copies from before and
# after a correction; only this one file is loaded here.
mixture_file = 'Mixure_Definitions_Training_set_UPD2.csv'

# Table of mixture pairs used for training
training_task_file = 'TrainingData_mixturedist.csv'


def _data_path(file_name):
    """Return the path of `file_name` inside `input_path`."""
    return os.path.join(input_path, file_name)


# Mordred features
features = np.load(_data_path(feature_file))
# deepnose features
features_2 = np.load(_data_path(features_file_2))

# CIDs, later enumerated to index the rows of both feature matrices
features_CIDs = np.load(_data_path(CID_file))

# Training dataframe
training_set = pd.read_csv(_data_path(training_task_file))

# Mapping helper file (mixture definitions)
mixtures_IDs = pd.read_csv(_data_path(mixture_file))


---
## 1. Prepare training data

In [3]:
# Offset added before the log so that zero entries do not evaluate log(0)
epsilon = 1e-8

# A single scaler object is fitted twice below; once the cell has run it holds
# the statistics of the deepnose features (the second fit).
scaler = StandardScaler(with_mean=True, with_std=True)

# Mordred: standardize each column
features = scaler.fit_transform(features)

# deepnose: log-transform first, then standardize each column
log_features_2 = np.log(features_2 + epsilon)
features_2 = scaler.fit_transform(log_features_2)

# NOTE: `features` and `features_2` are rebound in place. Re-running this cell
# without reloading the data would take the log of already standardized values.

In [4]:
# `features` is already a NumPy array at this point; `features_array` is simply
# a second name bound to the same object.
features_array = features

# Fill missing values with the mean of their column (strategy may change later)
imputer = SimpleImputer(strategy='mean')
imputed_features = imputer.fit(features_array).transform(features_array)


In [5]:
# For every feature column, count how many entries are not NaN.
# (Despite the variable name, this is a count of present values,
# not a count of distinct values.)
num_unique_values = np.count_nonzero(~np.isnan(features), axis=0)

# Report only the columns that contain no non-NaN entry at all
for i in np.flatnonzero(num_unique_values == 0):
    count = num_unique_values[i]
    print(f"Feature {i}: {count} unique non-NaN values")

In [6]:
# Build CID -> feature-vector lookups. Row `row_idx` of each feature matrix is
# paired with the CID at the same position of `features_CIDs`.
CID2features_mordred = {}
CID2features_deepnose = {}
for row_idx, cid in enumerate(features_CIDs):
    CID2features_mordred[cid] = imputed_features[row_idx]
    CID2features_deepnose[cid] = features_2[row_idx]

In [7]:
# Build the per-pair mixture representations with the project helper `format_Xy`
# (method 'avg'), once per feature family. The targets, per-pair mixture sizes
# and CID lists are kept from the Mordred call only.
X_m, y, num_mixtures, all_pairs_CIDs = format_Xy(
    training_set,
    mixtures_IDs,
    CID2features_mordred,
    method='avg',
)
X_d, _, _, _ = format_Xy(
    training_set,
    mixtures_IDs,
    CID2features_deepnose,
    method='avg',
)

In [8]:
def _flatten_pairs(pairs):
    """Join the two vectors of every pair end to end and stack the results as rows."""
    return np.array([np.concatenate((first, second)) for first, second in pairs])


# One row per training pair: [mixture-1 vector | mixture-2 vector]
X_pairs_m = _flatten_pairs(X_m)
X_pairs_d = _flatten_pairs(X_d)

# Training targets as an array
y_true = np.array(y)

In [9]:
def _pairwise_metrics(pairs):
    """Return three lists (Euclidean distance, cosine similarity, cosine angle), one entry per pair."""
    distances = [get_euclidean_distance(first, second) for first, second in pairs]
    similarities = [get_cosine_similarity(first, second) for first, second in pairs]
    angles = [get_cosine_angle(first, second) for first, second in pairs]
    return distances, similarities, angles


# Mordred-based pair metrics
distances_m, similarities_m, angles_m = _pairwise_metrics(X_m)

# deepnose-based pair metrics
distances_d, similarities_d, angles_d = _pairwise_metrics(X_d)

In [10]:
# Per pair: how many CIDs the two mixtures have in common, and how many CIDs of
# the first mixture are absent from the second.
# NOTE(review): the difference is one-directional (first minus second) -
# confirm that a symmetric difference was not intended.
shared_monos = []
diff_monos = []
for pair in all_pairs_CIDs:
    cids_first, cids_second = set(pair[0]), set(pair[1])
    shared_monos.append(len(cids_first & cids_second))
    diff_monos.append(len(cids_first - cids_second))

In [11]:
# Attach per-pair bookkeeping columns to the training table (modified in place).
# Column 0 / 1 of the array are the sizes of mixture 1 / mixture 2.
num_mixtures_arr = np.array(num_mixtures)
new_training_columns = {
    'Sum num monos': num_mixtures_arr.sum(axis=1),
    'Shared': shared_monos,
    'Diff': diff_monos,
    'Num mixture1': num_mixtures_arr[:, 0],
    'Num mixture2': num_mixtures_arr[:, 1],
}
for column_name, column_values in new_training_columns.items():
    training_set[column_name] = column_values

In [12]:
# One-hot encode the `Dataset` label of every training pair. The fitted encoder
# is kept in `encoder`; `data_arr` is the dense 0/1 matrix.
encoder = OneHotEncoder()
datasets = training_set['Dataset'].to_numpy()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1)).toarray()

In [13]:
### Assemble the full training design matrix from everything computed above.
# The number of pairs is taken from the data instead of being hard-coded as 500,
# so the cell keeps working if the training table changes size. Reshaping every
# extra feature to (n_train, k) still fails loudly if any list has the wrong length.
# Column order (must match the test matrix): deepnose pair vectors, Mordred pair
# vectors, 3 Mordred pair metrics, 3 deepnose pair metrics, shared count,
# diff count, the two mixture sizes, one-hot dataset columns.
# NOTE: `num_mixtures` and `data_arr` are rebound further down for the test set,
# so this cell must run before those cells.
n_train = len(X_pairs_d)

X_features = np.hstack((
    X_pairs_d,
    X_pairs_m,
    np.array(distances_m).reshape(n_train, 1),
    np.array(similarities_m).reshape(n_train, 1),
    np.array(angles_m).reshape(n_train, 1),
    np.array(distances_d).reshape(n_train, 1),
    np.array(similarities_d).reshape(n_train, 1),
    np.array(angles_d).reshape(n_train, 1),
    np.array(shared_monos).reshape(n_train, 1),
    np.array(diff_monos).reshape(n_train, 1),
    np.array(num_mixtures).reshape(n_train, 2),
    data_arr,
))

----

## 2. Train 10 models on all data, using previously found best hyperparameters

In [14]:
# Hyperparameters passed as keyword arguments to every RandomForestRegressor below
rf_best = dict(
    n_estimators=300,
    min_samples_split=10,
    max_features='sqrt',
    max_depth=None,
    bootstrap=True,
)

In [15]:
# Fit an ensemble of random forests on the full training set. The models share
# the same hyperparameters and differ only in their seed (0 .. num_models - 1).
num_models = 10
rf_models = []
for i in range(num_models):
    rf = RandomForestRegressor(random_state=i, **rf_best).fit(X_features, y_true)
    rf_models.append(rf)

---
## 3. First testing:

In [16]:
# Load the leaderboard (test) table.
# NOTE(review): this path is relative to the working directory ('Data/...'),
# whereas the training inputs are read from input_path = '../Data/' - confirm
# that both locations are intended.
test_task_file = 'Data/LeaderboardData_mixturedist.csv'
test_set = pd.read_csv(test_task_file)

In [17]:
# Preview the first rows of the leaderboard table
test_set.head()

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values
0,Bushdid,65,66,0.653846
1,Snitz 2,1,8,0.64042
2,Snitz 1,24,17,0.745192
3,Snitz 2,12,14,0.608784
4,Snitz 1,49,45,0.676136


#### 3.1 Prepare the test features the same way as the training features:

In [18]:
# Same `format_Xy` call as for training, now on the leaderboard table.
# NOTE: `num_mixtures` and `all_pairs_CIDs` are rebound here; from this point on
# they describe the test pairs, not the training pairs.
X_m_test, y_test_true, num_mixtures, all_pairs_CIDs = format_Xy(
    test_set,
    mixtures_IDs,
    CID2features_mordred,
    method='avg',
)
X_d_test, _, _, _ = format_Xy(
    test_set,
    mixtures_IDs,
    CID2features_deepnose,
    method='avg',
)

In [19]:
# One row per test pair: [mixture-1 vector | mixture-2 vector]
X_pairs_m_test = np.array([np.concatenate((first, second)) for first, second in X_m_test])
X_pairs_d_test = np.array([np.concatenate((first, second)) for first, second in X_d_test])

# Mordred-based pair metrics
distances_m_test = [get_euclidean_distance(first, second) for first, second in X_m_test]
similarities_m_test = [get_cosine_similarity(first, second) for first, second in X_m_test]
angles_m_test = [get_cosine_angle(first, second) for first, second in X_m_test]

# deepnose-based pair metrics
distances_d_test = [get_euclidean_distance(first, second) for first, second in X_d_test]
similarities_d_test = [get_cosine_similarity(first, second) for first, second in X_d_test]
angles_d_test = [get_cosine_angle(first, second) for first, second in X_d_test]

In [20]:
# Per test pair: number of CIDs common to both mixtures, and number of CIDs of
# the first mixture that are absent from the second (one-directional difference).
# `all_pairs_CIDs` holds the test pairs at this point.
shared_monos_test = []
diff_monos_test = []
for pair in all_pairs_CIDs:
    cids_first, cids_second = set(pair[0]), set(pair[1])
    shared_monos_test.append(len(cids_first & cids_second))
    diff_monos_test.append(len(cids_first - cids_second))

In [21]:
# Attach the same bookkeeping columns to the leaderboard table (modified in place).
# `num_mixtures` refers to the test pairs at this point.
num_mixtures_test_arr = np.array(num_mixtures)
new_test_columns = {
    'Sum num monos': num_mixtures_test_arr.sum(axis=1),
    'Shared': shared_monos_test,
    'Diff': diff_monos_test,
    'Num mixture1': num_mixtures_test_arr[:, 0],
    'Num mixture2': num_mixtures_test_arr[:, 1],
}
for column_name, column_values in new_test_columns.items():
    test_set[column_name] = column_values

#### 3.2 The `Dataset` feature is unavailable for the test pairs, so we impute it with a KNN imputer

An alternative strategy is simply to use `Bushdid`, since it is the same type of experimental paradigm.

In [22]:
# Strategy 1: start with an all-NaN placeholder for the one-hot `Dataset` block
# and let the imputer fill it in later.
# The block width is read from the encoder fitted on the training `Dataset`
# column instead of being hard-coded as 4, so the test matrix always has the
# same number of dataset columns as the training matrix.
# NOTE: this rebinds `data_arr`, which previously held the training one-hot matrix.
n_dataset_columns = len(encoder.categories_[0])
data_arr = np.full((len(test_set), n_dataset_columns), np.nan)

In [23]:
### Assemble the test design matrix with the same column order as the training one.
# The number of test pairs is taken from the data instead of being hard-coded
# as 46. Reshaping every extra feature to (n_test, k) still fails loudly if any
# list has the wrong length.
# `num_mixtures` and `data_arr` hold the test-set values at this point; the
# `data_arr` block is all NaN and is filled in by the imputer afterwards.
n_test = len(X_pairs_d_test)

X_test = np.hstack((
    X_pairs_d_test,
    X_pairs_m_test,
    np.array(distances_m_test).reshape(n_test, 1),
    np.array(similarities_m_test).reshape(n_test, 1),
    np.array(angles_m_test).reshape(n_test, 1),
    np.array(distances_d_test).reshape(n_test, 1),
    np.array(similarities_d_test).reshape(n_test, 1),
    np.array(angles_d_test).reshape(n_test, 1),
    np.array(shared_monos_test).reshape(n_test, 1),
    np.array(diff_monos_test).reshape(n_test, 1),
    np.array(num_mixtures).reshape(n_test, 2),
    data_arr,
))

In [24]:
# Fit a 5-nearest-neighbour imputer on the training feature matrix.
# NOTE: this rebinds `imputer`, which earlier held the mean imputer used for
# the Mordred features.
imputer = KNNImputer(n_neighbors=5).fit(X_features)

# Only the test matrix is transformed: its NaN entries (the placeholder
# `Dataset` columns) are filled in from the fitted training data.
X_test_imputed = imputer.transform(X_test)

In [25]:
# Predict with every forest of the ensemble, keeping the individual predictions
y_pred_list = []
for model in rf_models:
    y_pred = model.predict(X_test_imputed)
    y_pred_list.append(y_pred)

# Ensemble prediction: element-wise mean over the models
y_pred_avg = np.mean(np.asarray(y_pred_list), axis=0)

In [26]:
# Ground-truth values of the leaderboard pairs, as a plain list.
# NOTE(review): this rebinds `y_true`, which held the training targets used to
# fit the models; re-running the training cell after this one would pick up
# these test values instead.
y_true = test_set['Experimental Values'].tolist()

In [27]:
# Score the averaged Random Forest prediction against the leaderboard values:
# Pearson correlation (R) and root-mean-squared error (RMSE).
# The targets are read straight from `test_set`, so the result does not depend
# on what `y_true` currently holds.
y_test_values = test_set['Experimental Values'].to_numpy()

rf_corr = np.corrcoef(y_pred_avg, y_test_values)[0, 1]
# RMSE is computed with NumPy because `mean_squared_error` is not imported in
# this notebook's import cell (only `r2_score` is).
rf_rmse = np.sqrt(np.mean((y_test_values - y_pred_avg) ** 2))

print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")


Random Forest - R: 0.709
Random Forest - RMSE: 0.121
