In [142]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from utils import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

sns.set_style('ticks')

In [143]:
input_path = '../Data'

feature_file = 'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

mixture_file = 'Mixure_Definitions_Training_set.csv'
intensity_file = 'Mixure_Definitions_Intensity_Training_set.csv'
training_task_file = 'TrainingData_mixturedist.csv'

features = np.load(os.path.join(input_path, feature_file))
training_set = pd.read_csv(os.path.join(input_path, training_task_file))
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))
molecule_intensities = pd.read_csv(os.path.join(input_path, intensity_file))

features_CIDs = np.load(os.path.join(input_path, CID_file))

In [144]:
extended_features = np.load(os.path.join(input_path, 'Extra/deepnose_features_extRavia.npy'))
extended_training_set = pd.read_csv(os.path.join(input_path, 'Extra/extended_training_set.csv'))
extended_mixture_IDs = pd.read_csv(os.path.join(input_path, 'Extra/extended_mixture_IDs.csv'))
extended_molecule_intensities = pd.read_csv(os.path.join(input_path, 'Extra/extended_molecule_intensites.csv'))
extended_features_CIDs = np.load(os.path.join(input_path, 'Extra/extended_ravia_cid.npy'))

In [145]:
scaler = StandardScaler(with_mean=True, with_std=True)
epsilon = 1e-8
features = scaler.fit_transform(np.log(features + epsilon))
CID2features =  {CID: features[i] for i, CID in enumerate(features_CIDs)}

In [146]:
extended_features = scaler.fit_transform(np.log(extended_features + epsilon))
extended_CID2features =  {CID: extended_features[i] for i, CID in enumerate(extended_features_CIDs)}

In [147]:
# overlapped_CIDs = list(set(extended_features_CIDs) & set(features_CIDs))
# overlapped_CIDs[2]
# extended_CID2features[7685]
# CID2features[7685]

In [148]:
for key, value in extended_CID2features.items():
        if key not in CID2features:
            CID2features[key] = value

#### Prepare for `X_features` and `y_true`

In [149]:
# Define the scaling constant
scaling_constant = 2 # Adjust this value as needed

# Get the list of column names containing "CID"
cid_columns = [col for col in molecule_intensities.columns if 'CID' in col]

# Create a mask to identify rows where "Dataset" is in ['Snitz 1', 'Snitz 2', 'Bushdid']
mask = molecule_intensities['Dataset'].isin(['Snitz 1', 'Snitz 2', 'Bushdid'])

# Scale the values of "CID" columns for the selected rows
molecule_intensities.loc[mask, cid_columns] *= scaling_constant

In [150]:
X = []
y = []
num_monos = []
CIDs_all = []

for _, row in training_set.iterrows():
    mixture1 = combine_molecules_intensity_weighed(label=row['Mixture 1'], dataset=row['Dataset'],
                                            mixtures_IDs=mixtures_IDs, CID2features=CID2features,
                                            mixtures_intensities= molecule_intensities)
    mixture2 = combine_molecules_intensity_weighed(label=row['Mixture 2'], dataset=row['Dataset'],
                                            mixtures_IDs=mixtures_IDs, CID2features=CID2features,
                                            mixtures_intensities= molecule_intensities)
    X.append((mixture1, mixture2))
    y.append(row['Experimental Values'])

In [151]:
_, _, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features, method = 'avg')

- each `x` in `X` contains a two vector tuple `(mixture_1, mixture_2)`, index ordered same way as `training_set`
- `method` specifies the ways to create the mixture embeeding from 

In [152]:
# Convert the input pairs to a suitable format for training
X_pairs = np.array([(np.concatenate((x1, x2))) for x1, x2 in X])
y_true = np.array(y)

In [153]:
distances = [get_euclidean_distance(m[0], m[1]) for m in X]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X]
angles = [get_cosine_angle(m[0], m[1]) for m in X]

In [154]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [155]:
datasets = training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [156]:
### add all information above
X_features = np.hstack((X_pairs, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr,
                        np.zeros((500, 2)) # this is given the addition of the new dataset// changed to 2 after filtiering out exp1
                        ))

#### Prepare `X_features_aug` and `y_true_aug`:

In [157]:
extended_training_set['Dataset'].unique()

array(['Exp1', 'Exp2', 'Exp6'], dtype=object)

In [158]:
extended_training_set

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values
0,Exp1,1,3,38.477273
1,Exp1,1,8,44.840909
2,Exp1,1,11,31.022727
3,Exp1,2,5,41.000000
4,Exp1,2,7,65.181818
...,...,...,...,...
310,Exp6,294,294,78.000000
311,Exp6,295,295,68.000000
312,Exp6,296,296,76.000000
313,Exp6,297,297,70.588234


In [159]:
extended_training_set = extended_training_set[extended_training_set['Dataset']!= 'Exp1']

In [161]:
print(f'total extended data to use: {len(extended_training_set)}')

total extended data to use: 220


In [162]:
X_extended = []
y_extended = []
num_monos = []
CIDs_all = []

for _, row in extended_training_set.iterrows():
    mixture1 = combine_molecules_intensity_weighed(label=row['Mixture 1'], dataset=row['Dataset'],
                                            mixtures_IDs=extended_mixture_IDs, CID2features=CID2features,
                                            mixtures_intensities= extended_molecule_intensities)
    mixture2 = combine_molecules_intensity_weighed(label=row['Mixture 2'], dataset=row['Dataset'],
                                            mixtures_IDs=extended_mixture_IDs, CID2features=CID2features,
                                            mixtures_intensities= extended_molecule_intensities)
    X_extended.append((mixture1, mixture2))
    y_extended.append(row['Experimental Values']/100)

In [174]:
len(y_extended)

220

In [163]:
print(f'We will have a total of {len(y_extended)} extra Ravia samples.')

We will have a total of 220 extra Ravia samples.


In [164]:
X_extended = np.array([(np.concatenate((x1, x2))) for (x1, x2) in X_extended])

In [165]:
_, _, extra_num_mixtures, extend_pairs_CIDs = format_Xy(extended_training_set,  extended_mixture_IDs, CID2features, method = 'avg')

- each `x` in `X` contains a two vector tuple `(mixture_1, mixture_2)`, index ordered same way as `training_set`
- `method` specifies the ways to create the mixture embeeding from 

In [166]:
y_true_extended = np.array(y_extended)

In [167]:
distances_e = [get_euclidean_distance(m[0], m[1]) for m in X_extended]
similarities_e = [get_cosine_similarity(m[0], m[1]) for m in X_extended]
angles_e = [get_cosine_angle(m[0], m[1]) for m in X_extended]

In [168]:
shared_monos_e = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in extend_pairs_CIDs]
diff_monos_e = [ len( set(pair[0]).difference(set(pair[1]))) for pair in extend_pairs_CIDs]

In [169]:
datasets_e = extended_training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr_e = encoder.fit_transform(datasets_e.reshape(-1, 1))
data_arr_e = data_arr_e.toarray()

In [171]:
### add all information above
X_features_extended = np.hstack((X_extended, np.array(distances_e).reshape(220, 1), 
                        np.array(similarities_e).reshape(220, 1), 
                        np.array(angles_e).reshape(220, 1), 
                        np.array(shared_monos_e).reshape(220, 1), 
                        np.array(diff_monos_e).reshape(220, 1), 
                        np.array(extra_num_mixtures), 
                        np.zeros((220, 4)), # this is given the addition of the new dataset
                        data_arr_e
                        ))

In [172]:
aug_num = 50
indices = np.random.choice(len(y_true_extended), size=aug_num, replace=False)
X_features_aug = X_features_extended[indices]
y_true_aug = np.array(y_true_extended)[indices]

In [173]:
seed = 314159
n_folds = 10

rf_pred_list = []
xgb_pred_list = []
y_true_list = []
test_indices_list = []  # Keep track of the test indices in each fold

# Stack the original X and augmented X_pool
stacked_X = np.vstack((X_features, X_features_aug))
# Stack the original y_true and augmented y_pool
stacked_y = np.concatenate((y_true, y_true_aug))

# Get the number of original samples
n_original_samples = X_features.shape[0]

# Perform k-fold cross-validation:
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for train_index, test_index in kf.split(stacked_X):
    X_train, X_test = stacked_X[train_index], stacked_X[test_index]
    y_train, y_test = stacked_y[train_index], stacked_y[test_index]

    # Get the original test indices
    original_test_index = test_index[test_index < n_original_samples]
    
    # Train the Random Forest regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=seed)
    rf.fit(X_train, y_train)
    
    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions on the original X_features
    rf_pred = rf.predict(stacked_X[original_test_index])
    xgb_pred = xgb_model.predict(stacked_X[original_test_index])
    
    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    y_true_list.extend(y_true[original_test_index])  # Use the original y_true for evaluation
    test_indices_list.extend(original_test_index)  # Store the original test indices

# Store the predictions and actual values
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'y_true': y_true_list
})

# Merge the results with the training_set df
training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
training_set.drop('test_index', axis=1, inplace=True)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 206 and the array at index 1 has size 205

In [122]:
# Calculate the correlation and R^2 for Random Forest
rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
rf_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(rf_pred_list)))

print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")
print()
# Calculate the correlation and R^2 for XGBoost
xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
xgb_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(xgb_pred_list)))

print(f"XGBoost - R: {xgb_corr:.3f}")
print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


Random Forest - R: 0.610
Random Forest - RMSE: 0.124

XGBoost - R: 0.543
XGBoost - RMSE: 0.133


In [None]:
_, _, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features, method = 'avg')

- each `x` in `X` contains a two vector tuple `(mixture_1, mixture_2)`, index ordered same way as `training_set`
- `method` specifies the ways to create the mixture embeeding from 

In [None]:
# Convert the input pairs to a suitable format for training
X_pairs = np.array([(np.concatenate((x1, x2))) for x1, x2 in X])
y_true = np.array(y)

In [None]:
distances = [get_euclidean_distance(m[0], m[1]) for m in X]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X]
angles = [get_cosine_angle(m[0], m[1]) for m in X]

In [None]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [None]:
### this we will need to augment:
datasets = training_set['Dataset'].to_numpy()
encoder = OneHotEncoder()
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [None]:
### add all information above
X_features = np.hstack((X_pairs, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr))