In [1]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()
# Add the ./src folder to the Python module search path
sys.path.append(os.path.join(current_dir, '..', 'src'))

from train_test import *
from utils import *
from optimize import *
import numpy as np
import pandas as pd
from scipy import stats
from scipy.optimize import minimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge 
import xgboost as xgb
from scipy.stats import pearsonr


sns.set_style('ticks')

In [2]:
input_path = '../Data/'

features_file_1 = 'featureSelection/selection_cleanMordredDescriptors.csv'
features_file_2 =  'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'

# Read all copies, before and after correction; before was also downloaded from Dropbox.
mixture_file = 'Mixure_Definitions_Training_set_UPD2.csv' 
training_task_file = 'TrainingData_mixturedist.csv'

# Mordred features
features_1 = pd.read_csv(os.path.join(input_path, features_file_1), index_col= 0)

features_2 = np.load(os.path.join(input_path, features_file_2))

features_CIDs = np.load(os.path.join(input_path, CID_file))
# Training dataframe
training_set = pd.read_csv(os.path.join(input_path, training_task_file))

# Mapping helper files
mixtures_IDs = pd.read_csv(os.path.join(input_path, mixture_file))


feature_file_3 = 'Fingerprints/Morgan_Fingerprints_Frequency_Size50.csv'
features_3 = pd.read_csv(os.path.join(input_path, feature_file_3), index_col= 0)
features_file_4 =  'leffingwell_features_96.npy'
features_4 = np.load(os.path.join(input_path, features_file_4))




In [3]:
scaler = StandardScaler(with_mean=True, with_std=True)

# standardize Mordred
features_1_np = scaler.fit_transform(features_1)
features_1 = pd.DataFrame(features_1_np, columns=features_1.columns, index=features_1.index)


# log standardize deepnose
scaler = StandardScaler(with_mean=True, with_std=True)
epsilon = 1e-8 
features_2 = scaler.fit_transform(np.log(features_2 + epsilon))

In [4]:
# Double check the number of unique non-NaN values in each feature column
num_unique_values = np.count_nonzero(~np.isnan(features_1), axis=0)

# Print if the number of unique non-NaN values for each feature
for i, count in enumerate(num_unique_values):
    if count == 0:
        print(f"Feature {i}: {count} unique non-NaN values")

In [6]:
# Map CID to features:

# Dense
CID2features_deepnose=  {CID: features_2[i] for i, CID in enumerate(features_CIDs)}
CID2features_mordred =  {CID: features_1.loc[CID].tolist() for CID in features_CIDs}

# Sparse
CID2features_morgan =  {CID: features_3.loc[CID].tolist() for CID in features_CIDs}
CID2features_leffingwell = {CID: features_4[i] for i, CID in enumerate(features_CIDs)}

In [7]:
# Make X_feature and y
features_list = [CID2features_mordred, CID2features_deepnose]
features_list_sparse = [CID2features_morgan, CID2features_leffingwell]

X_dense, y_true = stacking_X_features(features_list, "avg")
X_sparse, _ = stacking_X_features(features_list_sparse, "sum")

X_dense_new, y_test_true = stacking_X_test_features(features_list,  X_dense, "avg")
X_sparse_new, _ = stacking_X_test_features(features_list_sparse,  X_sparse, "sum")

### Training with sparse, dense and meta model

In [8]:
n_folds = 10
seed = 314159

best_rf_dense = {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
best_rf_sparse = {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.5, 'max_depth': 30, 'bootstrap': True}


###  Sequential residual training

Train a sparse model over the residual of the best dense model

In [9]:
def residual_ensemble_cv(X_dense, X_sparse, y, base_model_dense, base_model_sparse, n_folds=10):

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=314159)
    
    dense_preds = np.zeros(len(y))
    sparse_preds = np.zeros(len(y))
    combined_preds = np.zeros(len(y))
    
    for train_index, val_index in kf.split(X_dense):
        X_dense_train, X_dense_val = X_dense[train_index], X_dense[val_index]
        X_sparse_train, X_sparse_val = X_sparse[train_index], X_sparse[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # Train and predict with dense model
        base_model_dense.fit(X_dense_train, y_train)
        dense_preds[val_index] = base_model_dense.predict(X_dense_val)
        
        # Calculate residuals
        train_residuals = y_train - base_model_dense.predict(X_dense_train)
        
        # Train sparse model on residuals
        base_model_sparse.fit(X_sparse_train, train_residuals)
        sparse_preds[val_index] = base_model_sparse.predict(X_sparse_val)
        
        # Combined prediction
        combined_preds[val_index] = dense_preds[val_index] + sparse_preds[val_index]
    
    # Evaluate models
    dense_rmse = np.sqrt(mean_squared_error(y, dense_preds))
    dense_corr, _ = pearsonr(y, dense_preds)
    sparse_rmse = np.sqrt(mean_squared_error(y-dense_preds, sparse_preds))
    sparse_corr, _ = pearsonr(y, sparse_preds)
    combined_rmse = np.sqrt(mean_squared_error(y, combined_preds))
    combined_corr, _ = pearsonr(y, combined_preds)
    
    return {
        'performance': {
            'dense_model': {'RMSE': dense_rmse, 'Correlation': dense_corr},
            'sparse_model (residuals)': {'RMSE': sparse_rmse, 'Correlation': sparse_corr},
            'combined_model': {'RMSE': combined_rmse, 'Correlation': combined_corr}
        }
    }

base_model_dense = RandomForestRegressor(**best_rf_dense, random_state=314159)
base_model_sparse = RandomForestRegressor(**best_rf_sparse, random_state=314159)

cv_results_residual = residual_ensemble_cv(X_dense, X_sparse, y_true, base_model_dense, base_model_sparse)

print("Cross-validation Performance (Residual Approach):")
print("Dense Model Performance:", cv_results_residual['performance']['dense_model'])
print("Sparse Model Performance (on residuals):", cv_results_residual['performance']['sparse_model (residuals)'])
print("Combined Model Performance:", cv_results_residual['performance']['combined_model'])

Cross-validation Performance (Residual Approach):
Dense Model Performance: {'RMSE': 0.12248608448322859, 'Correlation': 0.6414345447059586}
Sparse Model Performance (on residuals): {'RMSE': 0.5876211745488226, 'Correlation': 0.5100612998906364}
Combined Model Performance: {'RMSE': 0.12050644047109231, 'Correlation': 0.6448345407631964}


In [10]:
def train_final_residual_models(X_dense, X_sparse, y, base_model_dense_class, base_model_sparse_class, n_models=10):
    final_models = []
    
    for seed in range(n_models):
        base_model_dense = base_model_dense_class(**best_rf_dense, random_state=seed)
        base_model_sparse = base_model_sparse_class(**best_rf_sparse, random_state=seed)
        
        # Train dense model
        final_base_model_dense = base_model_dense.fit(X_dense, y)
        
        # Calculate residuals
        dense_predictions = final_base_model_dense.predict(X_dense)
        residuals = y - dense_predictions
        
        # Train sparse model on residuals
        final_base_model_sparse = base_model_sparse.fit(X_sparse, residuals)
        
        final_models.append((final_base_model_dense, final_base_model_sparse))
    
    return final_models

def predict_residual_ensemble(X_dense_new, X_sparse_new, final_models):
    dense_predictions = []
    sparse_predictions = []
    combined_predictions = []
    
    for dense_model, sparse_model in final_models:
        dense_pred = dense_model.predict(X_dense_new)
        sparse_pred = sparse_model.predict(X_sparse_new)
        
        dense_predictions.append(dense_pred)
        sparse_predictions.append(sparse_pred)
        
        combined_pred = dense_pred + sparse_pred
        combined_predictions.append(combined_pred)
    
    mean_dense_pred = np.mean(dense_predictions, axis=0)
    mean_sparse_pred = np.mean(sparse_predictions, axis=0)
    mean_combined_pred = np.mean(combined_predictions, axis=0)
    
    return {
        'dense_prediction': mean_dense_pred,
        'sparse_prediction': mean_sparse_pred,
        'combined_prediction': mean_combined_pred
    }

final_models = train_final_residual_models(X_dense, X_sparse, y_true, RandomForestRegressor, RandomForestRegressor)



In [16]:
# Make predictions on new data
predictions = predict_residual_ensemble(X_dense_new, X_sparse_new, final_models)

# Access predictions
dense_preds = predictions['dense_prediction']
sparse_preds = predictions['sparse_prediction']
combined_preds = predictions['combined_prediction']



# Evaluate models
dense_rmse = np.sqrt(mean_squared_error(y_test_true, dense_preds))
dense_corr, _ = pearsonr(y_test_true, dense_preds)
sparse_rmse = np.sqrt(mean_squared_error(y_test_true-dense_preds, sparse_preds))
sparse_corr, _ = pearsonr(y_test_true-dense_preds, sparse_preds)
combined_rmse = np.sqrt(mean_squared_error(y_test_true, combined_preds))
combined_corr, _ = pearsonr(y_test_true, combined_preds)

print( {
    'performance': {
        'dense_model': {'RMSE': dense_rmse, 'Correlation': dense_corr},
        'sparse_model (residuals)': {'RMSE': sparse_rmse, 'Correlation': sparse_corr},
        'combined_model': {'RMSE': combined_rmse, 'Correlation': combined_corr}
    }
})

{'performance': {'dense_model': {'RMSE': 0.11981374609469903, 'Correlation': 0.7236704364028862}, 'sparse_model (residuals)': {'RMSE': 0.11682718231230123, 'Correlation': 0.5848251558597222}, 'combined_model': {'RMSE': 0.11682718231230123, 'Correlation': 0.7233837879784443}}}
