In [1]:
import os
import sys

# Resolve the notebook's working directory, then expose the sibling ``src``
# folder to the import system (presumably where the project-local modules
# imported right after this live).
current_dir = os.getcwd()
src_dir = os.path.join(current_dir, '..', 'src')
sys.path.append(src_dir)

from utils import *
from optimize_symmetric import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Apply seaborn's 'ticks' style to the plotting session.
sns.set_style('ticks')

## 0. Read and inspect data files

In [2]:
# Folder and file names of the inputs
input_path = '../Data'

feature_file = 'deepnose_features_UPD.npy'
CID_file = 'molecules_train_cid.npy'
mixture_file = 'Mixure_Definitions_Training_set.csv'
training_task_file = 'TrainingData_mixturedist.csv'


def _data_path(file_name):
    """Return the path of ``file_name`` inside ``input_path``."""
    return os.path.join(input_path, file_name)


# NumPy inputs: the Deepnose feature matrix and the CIDs used to map rows
# of that matrix to molecules
features = np.load(_data_path(feature_file))
features_CIDs = np.load(_data_path(CID_file))

# CSV inputs: the training dataframe and the mixture-definition helper table
training_set = pd.read_csv(_data_path(training_task_file))
mixtures_IDs = pd.read_csv(_data_path(mixture_file))

#### Try out log standardization:

In [3]:
# Standardize the log-transformed features (zero mean, unit variance per
# column). `epsilon` is added before the logarithm so zero entries stay finite.
# Alternative pre-transform, currently disabled: features = np.exp(features)
# NOTE(review): `features` is overwritten here, so re-running this cell
# without reloading the data applies the log transform a second time.
epsilon = 1e-8
log_features = np.log(features + epsilon)
scaler = StandardScaler(with_mean=True, with_std=True)
features = scaler.fit_transform(log_features)

# Look-up table from CID to its standardized feature row
# (96-dimensional according to the original note -- TODO confirm).
CID2features = {cid: features[row] for row, cid in enumerate(features_CIDs)}

- Each `x` in `X` is a tuple of two vectors `(mixture_1, mixture_2)`, in the same row order as `training_set`.
- `method` specifies how each mixture embedding is created from the per-molecule (CID) features; `'avg'` is used below.

In [4]:
# Build the paired mixture embeddings, the targets and the per-pair helper
# outputs; method='avg' selects how a mixture embedding is formed.
X, y, num_mixtures, all_pairs_CIDs = format_Xy(
    training_set,
    mixtures_IDs,
    CID2features,
    method='avg',
)

In [5]:
# Flatten every mixture pair into a single row, once per ordering:
# X_pairs_1 holds (first, second) and X_pairs_2 holds (second, first).
X_pairs_1 = np.array([np.concatenate((first, second)) for first, second in X])
X_pairs_2 = np.array([np.concatenate((second, first)) for first, second in X])

# Targets as an array, aligned row-for-row with X
y_true = np.array(y)

In [6]:
# Geometric relations between the two embeddings of every pair, computed
# with the project helpers.
distances = [get_euclidean_distance(mix_a, mix_b) for mix_a, mix_b in X]
similarities = [get_cosine_similarity(mix_a, mix_b) for mix_a, mix_b in X]
angles = [get_cosine_angle(mix_a, mix_b) for mix_a, mix_b in X]

In [7]:
# Per pair: how many components the two mixtures share, and how many
# components of the first mixture are absent from the second.
# NOTE(review): diff_monos is one-directional (first minus second) yet the
# same value is later attached to both orderings of the pair -- confirm a
# symmetric difference was not intended.
shared_monos = []
diff_monos = []
for pair in all_pairs_CIDs:
    cids_1, cids_2 = set(pair[0]), set(pair[1])
    shared_monos.append(len(cids_1 & cids_2))
    diff_monos.append(len(cids_1 - cids_2))

In [8]:
# One-hot encode the 'Dataset' label of every training row. The category
# order is fixed to the order of first appearance in the column.
dataset_column = training_set['Dataset']
datasets = dataset_column.to_numpy()
desired_order = dataset_column.unique().tolist()

encoder = OneHotEncoder(categories=[desired_order])
# fit_transform needs a 2-D input; the encoded result is converted to a
# dense array.
data_arr = encoder.fit_transform(datasets.reshape(-1, 1)).toarray()

In [9]:
### Add all the information computed above to the concatenated embeddings


def _as_column(values):
    """Return ``values`` as a 2-D column array of shape (n, 1)."""
    # -1 lets NumPy infer the row count, so the cell is not tied to a
    # fixed number of training pairs.
    return np.array(values).reshape(-1, 1)


# Columns that are identical for both orderings of a pair; built once and
# appended, in this order, after the concatenated embeddings.
shared_columns = (
    _as_column(distances),
    _as_column(similarities),
    _as_column(angles),
    _as_column(shared_monos),
    _as_column(diff_monos),
    np.array(num_mixtures),  # stacked as-is, so it must already be 2-D
    data_arr,
)

# One feature matrix per ordering of the pair
X_features_1 = np.hstack((X_pairs_1, *shared_columns))
X_features_2 = np.hstack((X_pairs_2, *shared_columns))

In [10]:
# Interleave the two orderings: even rows hold X_features_1, odd rows hold
# X_features_2, so rows 2k and 2k+1 both describe pair k. The output size is
# derived from the inputs instead of being hard-coded.
n_pairs, n_columns = X_features_1.shape
X_features = np.empty((2 * n_pairs, n_columns), dtype=X_features_1.dtype)
X_features[0::2] = X_features_1
X_features[1::2] = X_features_2

# Repeat every target so it lines up with the interleaved rows. Building it
# from `y` (rather than from `y_true` itself) keeps the cell idempotent:
# re-running it cannot double the targets a second time.
y_true = np.repeat(np.array(y), 2)

In [11]:
# Repeat each row of the training_set dataframe twice, so that rows 2k and
# 2k+1 both correspond to original row k (the same layout as the interleaved
# feature matrix).
training_set_repeated = training_set.loc[training_set.index.repeat(2)].reset_index(drop=True)

# Frame that later receives the model predictions. It is an independent copy,
# so in-place edits to it do not leak back into training_set_repeated.
training_set_final = training_set_repeated.copy()

In [12]:
# Display the duplicated frame: every training row appears on two consecutive rows.
training_set_repeated

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values
0,Snitz 1,1,2,0.604167
1,Snitz 1,1,2,0.604167
2,Snitz 1,1,3,0.651042
3,Snitz 1,1,3,0.651042
4,Snitz 1,1,5,0.505208
...,...,...,...,...
995,Bushdid,515,516,0.730769
996,Bushdid,517,518,0.538462
997,Bushdid,517,518,0.538462
998,Bushdid,519,520,0.807692


----
## 2. Training
### 2.1 Example attempt: standard-initialized RF and XGBoost

In [19]:
# Number of cross-validation folds, and the random seed used for the fold
# split and the models.
n_folds, seed = 10, 314159

Pairing: the indices are selected so that the two rows derived from the same sample always fall together in either the training set or the test set.

In [20]:
# Out-of-fold predictions and bookkeeping, accumulated fold by fold
rf_pred_list = []
xgb_pred_list = []
y_true_list = []
test_indices_list = []

# Create indices for the original samples (before duplication)
original_indices = np.arange(X_features.shape[0] // 2)

# Perform k-fold cross-validation on the original indices, so both
# orderings of a pair always land on the same side of the split.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

for train_index, test_index in kf.split(original_indices):
    # Original sample k owns the duplicated rows 2k and 2k+1
    train_index_coupled = np.concatenate([2*train_index, 2*train_index+1])
    test_index_coupled = np.concatenate([2*test_index, 2*test_index+1])

    X_train, X_test = X_features[train_index_coupled], X_features[test_index_coupled]
    y_train, y_test = y_true[train_index_coupled], y_true[test_index_coupled]

    # Train the Random Forest regressor
    rf = RandomForestRegressor(n_estimators=100, random_state=seed)
    rf.fit(X_train, y_train)

    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=seed)
    xgb_model.fit(X_train, y_train)

    # Predict the held-out rows of this fold
    rf_pred = rf.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)

    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    y_true_list.extend(y_test)
    test_indices_list.extend(test_index_coupled)  # Store the coupled test indices

# Store the predictions and actual values (rows are in fold order)
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'y_true': y_true_list
})

# Attach the predictions to the training rows by row position.
# No helper column is written into the existing frame, and prediction columns
# left over from an earlier run are dropped first, so re-running this cell
# does not create suffixed duplicates (rf_pred_x, rf_pred_y, ...).
prediction_columns = ['rf_pred', 'xgb_pred', 'y_true']
fold_results = results_df.set_index('test_index').sort_index()
training_set_final = (
    training_set_final
    .drop(columns=prediction_columns, errors='ignore')
    .reset_index(drop=True)  # index label == row position, matching test_index
    .join(fold_results[prediction_columns], how='inner')
    .sort_index()  # restore the original row order
    .reset_index(drop=True)
)

In [21]:
# Preview the first rows with the cross-validated predictions attached.
training_set_final.head()

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values,rf_pred,xgb_pred,y_true
0,Snitz 1,1,2,0.604167,0.548903,0.566519,0.604167
1,Snitz 1,1,2,0.604167,0.571137,0.569616,0.604167
2,Snitz 1,1,3,0.651042,0.675994,0.694503,0.651042
3,Snitz 1,1,3,0.651042,0.692739,0.679863,0.651042
4,Snitz 1,1,5,0.505208,0.541557,0.560929,0.505208


In [22]:
def _pearson_and_rmse(predictions, targets):
    """Return (Pearson correlation, RMSE) between predictions and targets."""
    correlation = np.corrcoef(predictions, targets)[0, 1]
    rmse = np.sqrt(mean_squared_error(np.array(targets), np.array(predictions)))
    return correlation, rmse


# Row-level scores: every duplicated row counts as its own observation.
rf_corr, rf_rmse = _pearson_and_rmse(rf_pred_list, y_true_list)
xgb_corr, xgb_rmse = _pearson_and_rmse(xgb_pred_list, y_true_list)

# Report the correlation (R) and RMSE of each model
print(f"Random Forest - R: {rf_corr:.3f}")
print(f"Random Forest - RMSE: {rf_rmse:.3f}")
print()
print(f"XGBoost - R: {xgb_corr:.3f}")
print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


Random Forest - R: 0.613
Random Forest - RMSE: 0.124

XGBoost - R: 0.541
XGBoost - RMSE: 0.135


In [23]:
# Convert the accumulated fold outputs to numpy arrays
rf_pred_array = np.array(rf_pred_list)
xgb_pred_array = np.array(xgb_pred_list)
y_true_array = np.array(y_true_list)
test_indices_array = np.array(test_indices_list)

# Create a sorting index that puts the rows back in duplicated-row order
sort_idx = np.argsort(test_indices_array)

# Sort all arrays based on this index
rf_pred_sorted = rf_pred_array[sort_idx]
xgb_pred_sorted = xgb_pred_array[sort_idx]
y_true_sorted = y_true_array[sort_idx]
test_indices_sorted = test_indices_array[sort_idx]

# After sorting, consecutive rows (2k, 2k+1) are the two orderings of the
# same original sample, so reshaping to two columns pairs them up.
rf_pred_paired = rf_pred_sorted.reshape(-1, 2)
xgb_pred_paired = xgb_pred_sorted.reshape(-1, 2)
y_true_paired = y_true_sorted.reshape(-1, 2)

# Average the pairs: one prediction (and one target) per original sample
rf_pred_avg = rf_pred_paired.mean(axis=1)
xgb_pred_avg = xgb_pred_paired.mean(axis=1)
y_true_avg = y_true_paired.mean(axis=1)

# Calculate correlations
rf_corr = np.corrcoef(rf_pred_avg, y_true_avg)[0, 1]
xgb_corr = np.corrcoef(xgb_pred_avg, y_true_avg)[0, 1]

# Calculate RMSE
rf_rmse = np.sqrt(mean_squared_error(y_true_avg, rf_pred_avg))
xgb_rmse = np.sqrt(mean_squared_error(y_true_avg, xgb_pred_avg))

# Print results
print("Random Forest Results:")
print(f"Correlation: {rf_corr:.4f}")
print(f"RMSE: {rf_rmse:.4f}")
print("\nXGBoost Results:")
print(f"Correlation: {xgb_corr:.4f}")
print(f"RMSE: {xgb_rmse:.4f}")

# Index of each averaged result in the original (pre-duplication) training
# set. Duplicated rows 2k and 2k+1 both come from original row k, so the
# first index of each pair is integer-divided by two. (Truncating the mean
# of the pair would give 2k, the duplicated-row index, not k.)
original_indices_avg = test_indices_sorted.reshape(-1, 2)[:, 0] // 2

Random Forest Results:
Correlation: 0.6154
RMSE: 0.1234

XGBoost Results:
Correlation: 0.5549
RMSE: 0.1323


The results are in pretty much the same range, which is reassuring.

In [24]:
# Run the random hyperparameter search once per seed and evaluate the best
# Random Forest and XGBoost configurations found in each round.
# The loop uses its own variable so the module-level `seed` (used by the
# cross-validation cell) is not overwritten when this cell runs.
seeds = list(range(3))
for search_seed in seeds:
    print(f"Random search for best hyperparams: round {search_seed + 1} \n")
    rf_best, rbg_best = para_search(search_seed, X_features, y_true)
    print()
    # Only the outputs of the last round remain bound after the loop
    rf_out = avg_rf_best(rf_best, X_features, y_true)
    print()
    rbg_out = avg_xgb_best(rbg_best, X_features, y_true)
    print()