In [29]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from utils import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

sns.set_style('ticks')

## 0. Read and inspect data files

In [30]:
input_path = '../Data'

feature_file = 'deepnose_features_UPD.npy'
CID_file = 'molecules_train_cid.npy'

mixture_file = 'Mixure_Definitions_Training_set.csv'
training_task_file = 'TrainingData_mixturedist.csv'

# Deepnose features
features = np.load(os.path.join(input_path, feature_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))
features_CIDs = np.load(os.path.join(input_path, CID_file))

#### Try out log standardization:

In [31]:
# Exponentiation
# features = np.exp(features)
# Standard transform features:
epsilon = 1e-8
scaler = StandardScaler(with_mean=True, with_std=True)
features = scaler.fit_transform(np.log(features + epsilon))

# Map CID to 96 dim features:
CID2features =  {CID: features[i] for i, CID in enumerate(features_CIDs)}

- each `x` in `X` contains a two vector tuple `(mixture_1, mixture_2)`, index ordered same way as `training_set`
- `method` specifies the ways to create the mixture embeeding from 

In [32]:
X, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features, method = 'avg')

In [33]:
# Convert the input pairs to a suitable format for training
X_pairs_1 = np.array([(np.concatenate((x1, x2))) for x1, x2 in X])
X_pairs_2 = np.array([(np.concatenate((x1, x2))) for x2, x1 in X])

y_true = np.array(y)

In [34]:
distances = [get_euclidean_distance(m[0], m[1]) for m in X]
similarities = [get_cosine_similarity(m[0], m[1]) for m in X]
angles = [get_cosine_angle(m[0], m[1]) for m in X]

In [35]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [36]:
datasets = training_set['Dataset'].to_numpy()
# Returns the uniques in the order of appearance
desired_order = training_set['Dataset'].unique().tolist() 
encoder = OneHotEncoder(categories=[desired_order])
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [37]:
### add all information above
X_features_1 = np.hstack((X_pairs_1, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr))
X_features_2 = np.hstack((X_pairs_2, np.array(distances).reshape(500, 1), 
                        np.array(similarities).reshape(500, 1), 
                        np.array(angles).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures), 
                        data_arr))

In [38]:
X_features = np.empty((1000, X_features_1.shape[1]), dtype=X_features_1.dtype)
X_features[0::2] = X_features_1
X_features[1::2] = X_features_2

y_true= np.repeat(y_true, 2)

----
## 2. Training
### 2.1 Example attempt, standard intialized RF and XGBoost

In [39]:
n_folds = 10
seed = 314159

1. this is normal version, that there is no pairing:

In [40]:
# rf_pred_list = []
# xgb_pred_list = []
# y_true_list = []
# test_indices_list = []  # Keep track of the test indices in each fold

# # Perform k-fold cross-validation:
# kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
# for train_index, test_index in kf.split(X_features):
#     X_train, X_test = X_features[train_index], X_features[test_index]
#     y_train, y_test = y_true[train_index], y_true[test_index]
    
#     # Train the Random Forest regressor
#     rf = RandomForestRegressor(n_estimators=100, random_state=seed)
#     rf.fit(X_train, y_train)
    
#     # Train the XGBoost regressor
#     xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
#     xgb_model.fit(X_train, y_train)
    
#     # Make predictions 
#     rf_pred = rf.predict(X_test)
#     xgb_pred = xgb_model.predict(X_test)
    
#     rf_pred_list.extend(rf_pred)
#     xgb_pred_list.extend(xgb_pred)
#     y_true_list.extend(y_test)
#     test_indices_list.extend(test_index)  # Store the test indices

# # Store the predictions and actual values
# results_df = pd.DataFrame({
#     'test_index': test_indices_list,
#     'rf_pred': rf_pred_list,
#     'xgb_pred': xgb_pred_list,
#     'y_true': y_true_list
# })

# # Merge the results with the training_set df
# training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
# training_set.drop('test_index', axis=1, inplace=True)

2. pairing, that indicies were selected such that the two that are the same samples always belong to either train or test.

In [41]:
rf_pred_list = []
xgb_pred_list = []
y_true_list = []
test_indices_list = []

# Create indices for the original samples (before duplication)
original_indices = np.arange(X_features.shape[0] // 2)

# Perform k-fold cross-validation on the original indices:
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

for train_index, test_index in kf.split(original_indices):
    # Convert original indices to the coupled indices
    train_index_coupled = np.concatenate([2*train_index, 2*train_index+1])
    test_index_coupled = np.concatenate([2*test_index, 2*test_index+1])
    
    X_train, X_test = X_features[train_index_coupled], X_features[test_index_coupled]
    y_train, y_test = y_true[train_index_coupled], y_true[test_index_coupled]
    
    # Train the Random Forest regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=seed)
    rf.fit(X_train, y_train)
    
    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions 
    rf_pred = rf.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    
    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    y_true_list.extend(y_test)
    test_indices_list.extend(test_index_coupled)  # Store the coupled test indices

# Store the predictions and actual values
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'y_true': y_true_list
})

# Merge the results with the training_set df
training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
training_set.drop('test_index', axis=1, inplace=True)

In [42]:
training_set.head()

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values,rf_pred,xgb_pred,y_true
300,Snitz 1,1,2,0.604167,0.548903,0.566519,0.604167
350,Snitz 1,1,3,0.651042,0.571137,0.569616,0.604167
600,Snitz 1,1,5,0.505208,0.675994,0.694503,0.651042
650,Snitz 1,1,6,0.411458,0.692739,0.679863,0.651042
0,Snitz 1,1,7,0.5625,0.541557,0.560929,0.505208


In [43]:
# Calculate the correlation and R^2 for Random Forest
rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
rf_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(rf_pred_list)))

print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")
print()
# Calculate the correlation and R^2 for XGBoost
xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
xgb_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(xgb_pred_list)))

print(f"XGBoost - R: {xgb_corr:.3f}")
print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


Random Forest - R: 0.613
Random Forest - RMSE: 0.124

XGBoost - R: 0.541
XGBoost - RMSE: 0.135


Result's pretty the same range; which is more reassuring