In [117]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from train_test import *
from utils import *
from optimize import *
import numpy as np
import pandas as pd
from scipy import stats
from scipy.optimize import minimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge 
import xgboost as xgb
from scipy.stats import pearsonr


sns.set_style('ticks')

In [118]:
input_path = '../Data/'

features_file_1 = 'featureSelection/selection_cleanMordredDescriptors.csv'
features_file_2 =  'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

# Read all copies, before and after correction; before was also downloaded from Dropbox.
mixture_file = 'Mixure_Definitions_Training_set_UPD2.csv' 
training_task_file = 'TrainingData_mixturedist.csv'

# Mordred features
features_1 = pd.read_csv(os.path.join(input_path, features_file_1), index_col= 0)

features_2 = np.load(os.path.join(input_path, features_file_2))

features_CIDs = np.load(os.path.join(input_path, CID_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))


feature_file_3 = 'Fingerprints/Morgan_Fingerprints_Frequency_Size50.csv'
features_3 = pd.read_csv(os.path.join(input_path, feature_file_3), index_col= 0)
features_file_4 =  'leffingwell_features_96.npy'
features_4 = np.load(os.path.join(input_path, features_file_4))




In [119]:
scaler = StandardScaler(with_mean=True, with_std=True)

# standardize Mordred
features_1_np = scaler.fit_transform(features_1)
features_1 = pd.DataFrame(features_1_np, columns=features_1.columns, index=features_1.index)


# log standardize deepnose
scaler = StandardScaler(with_mean=True, with_std=True)
epsilon = 1e-8 
features_2 = scaler.fit_transform(np.log(features_2 + epsilon))

In [120]:
# Double check the number of unique non-NaN values in each feature column
num_unique_values = np.count_nonzero(~np.isnan(features_1), axis=0)

# Print if the number of unique non-NaN values for each feature
for i, count in enumerate(num_unique_values):
    if count == 0:
        print(f"Feature {i}: {count} unique non-NaN values")

In [121]:
# Map CID to features:
CID2features_deepnose=  {CID: features_2[i] for i, CID in enumerate(features_CIDs)}
CID2features_mordred =  {CID: features_1.loc[CID].tolist() for CID in features_CIDs}
CID2features_morgan =  {CID: features_3.loc[CID].tolist() for CID in features_CIDs}
CID2features_leffingwell = {CID: features_4[i] for i, CID in enumerate(features_CIDs)}



In [122]:
X_m, y, num_mixtures, all_pairs_CIDs = format_Xy(training_set,  mixtures_IDs, CID2features_mordred, method = 'avg')
X_d, _, _, _ = format_Xy(training_set,  mixtures_IDs, CID2features_deepnose, method = 'avg')
X_mg, y, _, _ = format_Xy(training_set,  mixtures_IDs, CID2features_morgan, method = 'sum')
X_lw, y, _, _ = format_Xy(training_set,  mixtures_IDs, CID2features_leffingwell, method = 'sum')


In [123]:
# Convert the input pairs to a suitable format for training
X_pairs_m = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_m])
X_pairs_d = np.array([(np.concatenate((x1, x2))) for x1, x2 in X_d])
X_pairs_mg= np.array([(np.concatenate((x1, x2))) for x1, x2 in X_mg])
X_pairs_lw= np.array([(np.concatenate((x1, x2))) for x1, x2 in X_lw])

y_true = np.array(y)

In [124]:
distances_m = [get_euclidean_distance(m[0], m[1]) for m in X_m]
similarities_m = [get_cosine_similarity(m[0], m[1]) for m in X_m]
angles_m = [get_cosine_angle(m[0], m[1]) for m in X_m] 

distances_d = [get_euclidean_distance(m[0], m[1]) for m in X_d]
similarities_d = [get_cosine_similarity(m[0], m[1]) for m in X_d]
angles_d = [get_cosine_angle(m[0], m[1]) for m in X_d] 

distances_mg = [get_euclidean_distance(m[0], m[1]) for m in X_mg]
similarities_mg = [get_cosine_similarity(m[0], m[1]) for m in X_mg]
angles_mg = [get_cosine_angle(m[0], m[1]) for m in X_mg] 

distances_lw = [get_euclidean_distance(m[0], m[1]) for m in X_lw]
similarities_lw = [get_cosine_similarity(m[0], m[1]) for m in X_lw]
angles_lw = [get_cosine_angle(m[0], m[1]) for m in X_lw] 


In [125]:
shared_monos = [ len( set(pair[0]).intersection(set(pair[1]))) for pair in all_pairs_CIDs]
diff_monos = [ len( set(pair[0]).difference(set(pair[1]))) for pair in all_pairs_CIDs]

In [126]:
training_set['Sum num monos'] = np.array(num_mixtures).sum(axis = 1)
training_set['Shared'] = shared_monos
training_set['Diff'] = diff_monos
training_set['Num mixture1'] = np.array(num_mixtures)[:, 0]
training_set['Num mixture2'] = np.array(num_mixtures)[:, 1]

In [127]:
datasets = training_set['Dataset'].to_numpy()
# Returns the uniques in the order of appearance
desired_order = training_set['Dataset'].unique().tolist() 
encoder = OneHotEncoder(categories=[desired_order])
data_arr = encoder.fit_transform(datasets.reshape(-1, 1))
data_arr = data_arr.toarray()

In [128]:
desired_order

['Snitz 1', 'Snitz 2', 'Ravia', 'Bushdid']

In [129]:
X_dense = np.hstack( (  X_pairs_m, 
                        np.array(distances_m).reshape(500, 1), 
                        np.array(similarities_m).reshape(500, 1), 
                        np.array(angles_m).reshape(500, 1), 
                        X_pairs_d,
                        np.array(distances_d).reshape(500, 1), 
                        np.array(similarities_d).reshape(500, 1), 
                        np.array(angles_d).reshape(500, 1), 
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures).reshape(500,2), 
                        data_arr))


In [130]:
X_sparse = np.hstack( (X_pairs_mg,
                        np.array(distances_mg).reshape(500, 1), 
                        np.array(similarities_mg).reshape(500, 1), 
                        np.array(angles_mg).reshape(500, 1), 
                        X_pairs_lw,
                        np.array(distances_lw).reshape(500, 1), 
                        np.array(similarities_lw).reshape(500, 1), 
                        np.array(angles_lw).reshape(500, 1),                        
                        np.array(shared_monos).reshape(500, 1), 
                        np.array(diff_monos).reshape(500, 1), 
                        np.array(num_mixtures).reshape(500,2), 
                        data_arr))


### Training with optimizing threshold function

In [131]:
n_folds = 10
seed = 314159

best_rf_dense = {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
best_rf_sparse = {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.5, 'max_depth': 30, 'bootstrap': True}


In [132]:
def stacking_ensemble_cv(X_dense, X_sparse, y, base_model_dense, base_model_sparse, meta_model, n_folds=10):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=314159)
    
    dense_preds = np.zeros(len(y))
    sparse_preds = np.zeros(len(y))
    meta_preds = np.zeros(len(y))
    
    for train_index, val_index in kf.split(X_dense):
        X_dense_train, X_dense_val = X_dense[train_index], X_dense[val_index]
        X_sparse_train, X_sparse_val = X_sparse[train_index], X_sparse[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # Train and predict with base models
        base_model_dense.fit(X_dense_train, y_train)
        base_model_sparse.fit(X_sparse_train, y_train)
        
        dense_preds[val_index] = base_model_dense.predict(X_dense_val)
        sparse_preds[val_index] = base_model_sparse.predict(X_sparse_val)
        
        # Train and predict with meta model
        meta_features_train = np.column_stack((
            base_model_dense.predict(X_dense_train),
            base_model_sparse.predict(X_sparse_train)
        ))
        meta_model.fit(meta_features_train, y_train)
        
        meta_features_val = np.column_stack((dense_preds[val_index], sparse_preds[val_index]))
        meta_preds[val_index] = meta_model.predict(meta_features_val)
    
    # Evaluate models
    dense_rmse = np.sqrt(mean_squared_error(y, dense_preds))
    dense_corr, _ = pearsonr(y, dense_preds)
    sparse_rmse = np.sqrt(mean_squared_error(y, sparse_preds))
    sparse_corr, _ = pearsonr(y, sparse_preds)
    stacked_rmse = np.sqrt(mean_squared_error(y, meta_preds))
    stacked_corr, _ = pearsonr(y, meta_preds)
    
    return {
        'performance': {
            'dense_model': {'RMSE': dense_rmse, 'Correlation': dense_corr},
            'sparse_model': {'RMSE': sparse_rmse, 'Correlation': sparse_corr},
            'stacked_model': {'RMSE': stacked_rmse, 'Correlation': stacked_corr}
        }
    }

# Usage for cross-validation
base_model_dense = RandomForestRegressor(**best_rf_dense, random_state=314159)
base_model_sparse = RandomForestRegressor(**best_rf_sparse, random_state=314159)
meta_model = Ridge()

cv_results = stacking_ensemble_cv(X_dense, X_sparse, y_true, base_model_dense, base_model_sparse, meta_model)

print("Cross-validation Performance:")
print("Dense Model Performance:", cv_results['performance']['dense_model'])
print("Sparse Model Performance:", cv_results['performance']['sparse_model'])
print("Stacked Model Performance:", cv_results['performance']['stacked_model'])

Cross-validation Performance:
Dense Model Performance: {'RMSE': 0.12248608448322859, 'Correlation': 0.6414345447059586}
Sparse Model Performance: {'RMSE': 0.12308167512831363, 'Correlation': 0.6183843395845805}
Stacked Model Performance: {'RMSE': 0.11969892217196872, 'Correlation': 0.6448669668837788}


In [133]:
def train_final_models(X_dense, X_sparse, y, base_model_dense_class, base_model_sparse_class, meta_model_class, n_models=10):
    final_models = []
    
    for seed in range(n_models):
        base_model_dense = base_model_dense_class(**best_rf_dense, random_state=seed)
        base_model_sparse = base_model_sparse_class(**best_rf_sparse, random_state=seed)
        meta_model = meta_model_class()
        
        # Train base models
        final_base_model_dense = base_model_dense.fit(X_dense, y)
        final_base_model_sparse = base_model_sparse.fit(X_sparse, y)
        
        # Train meta model
        final_meta_features = np.column_stack((
            final_base_model_dense.predict(X_dense),
            final_base_model_sparse.predict(X_sparse)
        ))
        final_meta_model = meta_model.fit(final_meta_features, y)
        
        final_models.append((final_base_model_dense, final_base_model_sparse, final_meta_model))
    
    return final_models


def predict_stacked_ensemble(X_dense_new, X_sparse_new, final_models):
    dense_predictions = []
    sparse_predictions = []
    meta_predictions = []
    
    for dense_model, sparse_model, meta_model in final_models:
        dense_pred = dense_model.predict(X_dense_new)
        sparse_pred = sparse_model.predict(X_sparse_new)
        
        dense_predictions.append(dense_pred)
        sparse_predictions.append(sparse_pred)
        
        meta_features = np.column_stack((dense_pred, sparse_pred))
        meta_pred = meta_model.predict(meta_features)
        meta_predictions.append(meta_pred)
    
    mean_dense_pred = np.mean(dense_predictions, axis=0)
    mean_sparse_pred = np.mean(sparse_predictions, axis=0)
    mean_meta_pred = np.mean(meta_predictions, axis=0)
    
    return {
        'dense_prediction': mean_dense_pred,
        'sparse_prediction': mean_sparse_pred,
        'meta_prediction': mean_meta_pred
    }



In [134]:
final_models = train_final_models(X_dense, X_sparse, y_true, RandomForestRegressor, RandomForestRegressor, Ridge)

In [135]:
features_list = [CID2features_mordred, CID2features_deepnose]
features_list_sparse = [CID2features_morgan, CID2features_leffingwell]
X_dense_new, y_test_true = stacking_X_test_features(features_list,  X_dense, "avg")
X_sparse_new, _ = stacking_X_test_features(features_list_sparse,  X_sparse, "sum")

In [136]:
# Example usage:
predictions = predict_stacked_ensemble(X_dense_new, X_sparse_new, final_models)

In [137]:
for model_type in ['dense', 'sparse', 'meta']:
    pred = predictions[f'{model_type}_prediction']
    corr, _ = pearsonr(pred, y_test_true)
    rmse = np.sqrt(mean_squared_error(y_test_true, pred))
    
    print(f"{model_type.capitalize()} Model Performance:")
    print(f"  Correlation: {corr:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print()

Dense Model Performance:
  Correlation: 0.724
  RMSE: 0.120

Sparse Model Performance:
  Correlation: 0.683
  RMSE: 0.120

Meta Model Performance:
  Correlation: 0.714
  RMSE: 0.116

