In [84]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))


from utils import *
from optimize import *
import numpy as np
import pandas as pd
from scipy import stats
from scipy.optimize import minimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

sns.set_style('ticks')

In [85]:
input_path = '../Data/'

features_file_1 = 'featureSelection/selection_cleanMordredDescriptors.csv'

features_file_2 =  'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

# Read all copies, before and after correction; before was also downloaded from Dropbox.
mixture_file = 'Mixure_Definitions_Training_set.csv' 
training_task_file = 'TrainingData_mixturedist.csv'

# Mordred features
features_1 = pd.read_csv(os.path.join(input_path, features_file_1), index_col= 0)

features_2 = np.load(os.path.join(input_path, features_file_2))

features_CIDs = np.load(os.path.join(input_path, CID_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))

In [86]:
shared = set(features_1.index.tolist()) & set(features_CIDs)
len(shared) # this is expected!!!

162

In [87]:
# normalized_features_1_names = features_1_normalized.columns.tolist()
# features_1_names = features_1.columns.tolist()
# mordred_features_combined = list(set(normalized_features_1_names + features_1_names))
# np.save('../Data/featureSelection/combined_dragon_feature_names.npy', mordred_features_combined)

In [88]:
scaler = StandardScaler(with_mean=True, with_std=True)

# standardize Mordred
features_1_np = scaler.fit_transform(features_1)
features_1 = pd.DataFrame(features_1_np, columns=features_1.columns, index=features_1.index)


# log standardize deepnose
epsilon = 1e-8 
features_2 = scaler.fit_transform(np.log(features_2 + epsilon))

In [89]:
# Double check the number of unique non-NaN values in each feature column
num_unique_values = np.count_nonzero(~np.isnan(features_1), axis=0)

# Print if the number of unique non-NaN values for each feature
for i, count in enumerate(num_unique_values):
    if count == 0:
        print(f"Feature {i}: {count} unique non-NaN values")

In [90]:
# Map CID to features:
CID2features_deepnose=  {CID: features_2[i] for i, CID in enumerate(features_CIDs)}
CID2features_mordred =  {CID: features_1.loc[CID].tolist() for CID in features_CIDs}

In [91]:
X_m, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features_mordred, method = 'avg')
X_d, _, _, _ = format_Xy(training_set,  mixtures_IDs, CID2features_deepnose, method = 'avg')

In [92]:
# Convert the input pairs to a suitable format for training
X_pairs_m = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_m])
X_pairs_d = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_d])

y_true = np.array(y)

In [93]:
distances_m = [get_euclidean_distance(m[0], m[1]) for m in X_m]
similarities_m = [get_cosine_similarity(m[0], m[1]) for m in X_m]
angles_m = [get_cosine_angle(m[0], m[1]) for m in X_m] 

distances_d = [get_euclidean_distance(m[0], m[1]) for m in X_d]
similarities_d = [get_cosine_similarity(m[0], m[1]) for m in X_d]
angles_d = [get_cosine_angle(m[0], m[1]) for m in X_d] 


In [94]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [95]:
training_set['Sum num monos'] = np.array(num_mixtures).sum(axis = 1)
training_set['Shared'] = shared_monos
training_set['Diff'] = diff_monos
training_set['Num mixture1'] = np.array(num_mixtures)[:, 0]
training_set['Num mixture2'] = np.array(num_mixtures)[:, 1]

In [96]:
datasets = training_set['Dataset'].to_numpy()
# Returns the uniques in the order of appearance
desired_order = training_set['Dataset'].unique().tolist() 
encoder = OneHotEncoder(categories=[desired_order])
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [97]:
X_features = np.hstack( (X_pairs_m, X_pairs_d,
                        np.array(distances_m).reshape(500, 1), 
                        np.array(similarities_m).reshape(500, 1), 
                        np.array(angles_m).reshape(500, 1), 
                        np.array(distances_d).reshape(500, 1), 
                        np.array(similarities_d).reshape(500, 1), 
                        np.array(angles_d).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures).reshape(500,2), 
                        data_arr))


In [98]:
# dist_corr = np.corrcoef(distances_d, y_true)[0, 1]
dist_corr = np.corrcoef(distances_m, y_true)[0, 1]

print('R (Deepnose embedding Eucledian distance v.s Experimental Value): ', dist_corr)

R (Deepnose embedding Eucledian distance v.s Experimental Value):  0.4777691715913296


In [99]:
# sim_corr = np.corrcoef(similarities_d, y_true)[0, 1]
sim_corr = np.corrcoef(similarities_m, y_true)[0, 1]

print('R (Cosyne similarity v.s. Experimental Value): ', sim_corr)

R (Cosyne similarity v.s. Experimental Value):  -0.4930930966753656


In [100]:
# sim_corr = np.corrcoef(angles_d, y_true)[0, 1]
sim_corr = np.corrcoef(angles_m, y_true)[0, 1]

print('R (Vector angle v.s. Experimental Value): ', sim_corr)

R (Vector angle v.s. Experimental Value):  0.5028727737317864


### Training with optimizing threshold function

In [101]:
n_folds = 10
seed = 314159

best_rf = {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
best_rgb = {'subsample': 0.7, 'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.01, 'colsample_bytree': 0.5}

In [102]:
def combine_predictions(xgb_pred, rf_pred, threshold_low, threshold_high):
    combined = np.where(xgb_pred <= threshold_low, xgb_pred,
                        np.where(xgb_pred >= threshold_high, xgb_pred,
                                 rf_pred))
    return combined

def objective_function(thresholds, xgb_pred, rf_pred, true_values):
    combined_pred = combine_predictions(xgb_pred, rf_pred, thresholds[0], thresholds[1])
    rmse = np.sqrt(mean_squared_error(true_values, combined_pred))
    corr = np.corrcoef(combined_pred, true_values)[0, 1]
    return -corr * 0.1 + rmse * 0.9  # Adjust weights as needed

In [103]:
# rf_pred_list = []
# xgb_pred_list = []
# kf_rf_importances = []
# y_true_list = []
# test_indices_list = []  # Keep track of the test indices in each fold

# # Perform k-fold cross-validation:
# kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
# for train_index, test_index in kf.split(X_features):

#     X_train, X_test = X_features[train_index], X_features[test_index]
#     y_train, y_test = y_true[train_index], y_true[test_index]
    
#     # Train the Random Forest regressor
#     rf_model = RandomForestRegressor(**best_rf, random_state=seed)
#     rf_model.fit(X_train, y_train)
    
#     # Train the XGBoost regressor
#     xgb_model = xgb.XGBRegressor(**best_rgb, random_state=seed)
#     xgb_model.fit(X_train, y_train)
    
#     # Make predictions 
#     rf_pred = rf_model.predict(X_test)
#     xgb_pred = xgb_model.predict(X_test)
    
#     # Get the feature importances
#     importances = rf_model.feature_importances_
#     kf_rf_importances.append(importances)
#     rf_pred_list.extend(rf_pred)
#     xgb_pred_list.extend(xgb_pred)
#     y_true_list.extend(y_test)
#     test_indices_list.extend(test_index)  # Store the test indices

# # Store the predictions and actual values
# results_df = pd.DataFrame({
#     'test_index': test_indices_list,
#     'rf_pred': rf_pred_list,
#     'xgb_pred': xgb_pred_list,
#     'y_true': y_true_list
# })

# # Merge the results with the training_set df
# training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
# training_set.drop('test_index', axis=1, inplace=True)

In [104]:
# training_set.head()

In [105]:
# # Calculate the correlation and R^2 for Random Forest
# rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
# rf_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(rf_pred_list)))

# print(f"Random Forest - R: {rf_corr:.3f}")
# print(f"Random Forest - RMSE: {rf_rmse:.3f}")
# print()
# # Calculate the correlation and R^2 for XGBoost
# xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
# xgb_rmse = np.sqrt(mean_squared_error(np.array(y_true_list), np.array(xgb_pred_list)))

# print(f"XGBoost - R: {xgb_corr:.3f}")
# print(f"XGBoost - RMSE: {xgb_rmse:.3f}")


In [106]:
rf_pred_list = []
xgb_pred_list = []
combined_pred_list = []
kf_rf_importances = []
y_true_list = []
test_indices_list = []
optimal_thresholds_list = []

# Perform k-fold cross-validation:
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for train_index, test_index in kf.split(X_features):

    X_train, X_test = X_features[train_index], X_features[test_index]
    y_train, y_test = y_true[train_index], y_true[test_index]
    
    # Train the Random Forest regressor
    rf_model = RandomForestRegressor(**best_rf, random_state=seed)
    rf_model.fit(X_train, y_train)
    
    # Train the XGBoost regressor
    xgb_model = xgb.XGBRegressor(**best_rgb, random_state=seed)
    xgb_model.fit(X_train, y_train)
    
    # Make predictions 
    rf_pred = rf_model.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    
    # Optimize thresholds
    initial_thresholds = [0.4, 0.6]  # Starting guess
    # initial_thresholds = [0.3, 0.7]  # Starting guess

    result = minimize(lambda x: objective_function(x, xgb_pred, rf_pred, y_test),
                      initial_thresholds,
                      method='Nelder-Mead',
                      bounds=[(0, 1), (0, 1)])
    
    optimal_thresholds = result.x
    optimal_thresholds_list.append(optimal_thresholds)
    
    # Generate combined predictions
    combined_pred = combine_predictions(xgb_pred, rf_pred, optimal_thresholds[0], optimal_thresholds[1])
    
    # Get the feature importances
    importances = rf_model.feature_importances_
    kf_rf_importances.append(importances)
    
    rf_pred_list.extend(rf_pred)
    xgb_pred_list.extend(xgb_pred)
    combined_pred_list.extend(combined_pred)
    y_true_list.extend(y_test)
    test_indices_list.extend(test_index)

# Store the predictions and actual values
results_df = pd.DataFrame({
    'test_index': test_indices_list,
    'rf_pred': rf_pred_list,
    'xgb_pred': xgb_pred_list,
    'combined_pred': combined_pred_list,
    'y_true': y_true_list
})

# Merge the results with the training_set df
training_set = training_set.merge(results_df, left_index=True, right_on='test_index')
training_set.drop('test_index', axis=1, inplace=True)


In [107]:
# Calculate overall performance metrics
overall_rf_rmse = np.sqrt(mean_squared_error(y_true_list, rf_pred_list))
overall_xgb_rmse = np.sqrt(mean_squared_error(y_true_list, xgb_pred_list))
overall_combined_rmse = np.sqrt(mean_squared_error(y_true_list, combined_pred_list))

overall_rf_corr = np.corrcoef(rf_pred_list, y_true_list)[0, 1]
overall_xgb_corr = np.corrcoef(xgb_pred_list, y_true_list)[0, 1]
overall_combined_corr = np.corrcoef(combined_pred_list, y_true_list)[0, 1]

print(f"RF RMSE: {overall_rf_rmse} \nCorrelation: {overall_rf_corr}\n")
print(f"XGB RMSE: {overall_xgb_rmse} \nCorrelation: {overall_xgb_corr}\n")
print(f"Combined RMSE: {overall_combined_rmse} \nCorrelation: {overall_combined_corr}\n")

# Average optimal thresholds
avg_optimal_thresholds = np.mean(optimal_thresholds_list, axis=0)
print(f"Average optimal thresholds: {avg_optimal_thresholds}")

RF RMSE: 0.12197581633078773 
Correlation: 0.6474515913228636
XGB RMSE: 0.12131653217126667 
Correlation: 0.633644940861654
Combined RMSE: 0.1193005272566889 
Correlation: 0.6552910046267708
Average optimal thresholds: [0.40566357 0.61836108]


In [108]:
# Train final models on full data
# final_rf_model = RandomForestRegressor(**best_rf, random_state=seed)
# final_rf_model.fit(X_features, y_true)
# final_xgb_model = xgb.XGBRegressor(**best_rgb, random_state=seed)
# final_xgb_model.fit(X_features, y_true)

In [109]:
# def make_combined_predictions(X):
#     rf_pred = final_rf_model.predict(X)
#     xgb_pred = final_xgb_model.predict(X)
#     combined_pred = combine_predictions(xgb_pred, rf_pred, avg_optimal_thresholds[0], avg_optimal_thresholds[1])
#     return combined_pred

In [110]:
# Use this function for your leaderboard submissions
# leaderboard_predictions = make_combined_predictions(X_leaderboard)