In [2]:
# Project-local helpers. The star imports are kept first so the explicit imports
# below still take precedence over any same-named symbols they re-export.
from utils import *
from optimize import *

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# mean_squared_error is used by the cross-validation cells; import it explicitly
# rather than relying on it leaking in through the star imports above.
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide seaborn theme: the 'ticks' axes style.
sns.set_style(style='ticks')

In [3]:
# Directory holding every input file, relative to the working directory.
input_path = 'Data'

# File names inside `input_path`.
feature_file = 'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'
# NOTE(review): "Mixure" is reproduced exactly; presumably it matches the file
# on disk, so do not "correct" it here without renaming the file as well.
mixture_file = 'Mixure_Definitions_Training_set.csv'
training_task_file = 'TrainingData_mixturedist.csv'

# Resolve each path once, then load.
feature_path = os.path.join(input_path, feature_file)
mixture_path = os.path.join(input_path, mixture_file)
training_path = os.path.join(input_path, training_task_file)

features = np.load(feature_path)           # rows are paired with CID_file entries in a later cell
mixtures_IDs = pd.read_csv(mixture_path)   # mixture definitions table
training_set = pd.read_csv(training_path)  # training task table (its 'Dataset' column is used later)

### Optimization for the RF and XGBoost regressors with `X_features`

In [4]:
# Switch read by the next cell: a truthy value selects the alternative
# (placeholder) feature-preparation branch, 0 selects plain standardisation.
feature_exp  = 0

In [5]:
if not feature_exp:
    # Default path: standardise every feature column (centre, then scale to unit variance).
    scaler = StandardScaler(with_mean=True, with_std=True)
    features = scaler.fit_transform(features)

    # Row i of `features` is paired with entry i of the CID array.
    features_CIDs = np.load(os.path.join(input_path, CID_file))
    CID2features = {cid: features[row] for row, cid in enumerate(features_CIDs)}

    # Build the per-pair inputs and targets with the project helper.
    # NOTE(review): method='avg' presumably averages the molecule features of
    # each mixture -- confirm against format_Xy.
    X, y, num_mixtures, all_pairs_CIDs = format_Xy(
        training_set, mixtures_IDs, CID2features, method='avg'
    )

else:
    # Alternative feature pipeline: intentionally left as a placeholder.
    ...

    

In [6]:
# Flatten each (mixture_1, mixture_2) embedding pair into one row laid out as
# [mixture_1 | mixture_2], and turn the targets into an array.
X_pairs = np.array([np.concatenate((x1, x2)) for x1, x2 in X])
y_true = np.array(y)

# Split the rows back into their two halves. The boundary is taken from the
# data rather than hard-coded (it was 96), so a different embedding width
# cannot silently mis-split the pair. Both mixtures are embedded in the same
# space (distances between them are computed in the next cell), so the
# boundary is the midpoint of the row.
embedding_dim = X_pairs.shape[1] // 2
X_pair1 = X_pairs[:, :embedding_dim]
X_pair2 = X_pairs[:, embedding_dim:]

In [7]:
# Scalar summaries of how far apart the two mixture embeddings of each pair are.
# The three helpers are not imported explicitly in this notebook (presumably they
# come from the project star imports); each receives the pair's two embeddings.
distances = [get_euclidean_distance(mix_a, mix_b) for mix_a, mix_b in X]
similarities = [get_cosine_similarity(mix_a, mix_b) for mix_a, mix_b in X]
angles = [get_cosine_angle(mix_a, mix_b) for mix_a, mix_b in X]

In [8]:
# Composition overlap of each mixture pair, counted on the de-duplicated entries.
# shared_monos: number of entries present in both mixtures.
shared_monos = [len(set(pair[0]) & set(pair[1])) for pair in all_pairs_CIDs]
# diff_monos: number of entries of the first mixture that are absent from the second.
# NOTE(review): this is one-directional (first minus second), not a symmetric
# difference -- confirm that the asymmetry is intended.
diff_monos = [len(set(pair[0]) - set(pair[1])) for pair in all_pairs_CIDs]

In [9]:
# One-hot encode the 'Dataset' column of the training table.
encoder = OneHotEncoder()
datasets = training_set['Dataset'].to_numpy()
# The encoder needs a 2-D (n_samples, 1) input; its sparse result is densified
# so it can be stacked with the other feature columns.
data_arr = encoder.fit_transform(datasets.reshape(-1, 1)).toarray()

In [10]:
### Add features:
# Per-pair scalar summaries become single columns. reshape(-1, 1) lets NumPy
# infer the row count, so this cell no longer assumes exactly 500 pairs.
scalar_columns = [
    np.asarray(values).reshape(-1, 1)
    for values in (distances, similarities, angles, shared_monos, diff_monos)
]

# Column layout of the final design matrix:
# [pair embeddings | distance | similarity | angle | shared | diff | num_mixtures | one-hot dataset]
X_features = np.hstack((X_pairs, *scalar_columns, np.array(num_mixtures), data_arr))

### Search for best parameter space

In [13]:
# Run the random hyper-parameter search once per seed.
# para_search is not imported explicitly here (presumably it comes from the
# project star imports); it prints its own report and returns the best
# Random Forest and XGBoost results for that seed.
seeds = list(range(3))

# Keep every round's winners: rf_best / rbg_best alone would only hold the
# last round once the loop has finished.
rf_bests = []
rbg_bests = []
for seed in seeds:
    print(f"Random search for best hyperparams: round {seed +1} \n")
    rf_best, rbg_best = para_search(seed, X_features, y_true)
    rf_bests.append(rf_best)
    rbg_bests.append(rbg_best)

Random search for best hyperparams: round 1 



  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Random Forest model:
Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
Correlation: 0.9483296208132784
RMSE: 0.06555086953795551

Best XGBoost model:
Hyperparameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Correlation: 0.8044860482247098
RMSE: 0.10508383246155775
Random search for best hyperparams: round 2 



  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Random Forest model:
Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
Correlation: 0.9623717480793464
RMSE: 0.060002213239424146

Best XGBoost model:
Hyperparameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Correlation: 0.8062696410939838
RMSE: 0.10473618251187888
Random search for best hyperparams: round 3 



  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Random Forest model:
Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
Correlation: 0.9806225818096718
RMSE: 0.0462815830460351

Best XGBoost model:
Hyperparameters: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.5}
Correlation: 0.8112106940273409
RMSE: 0.10551013395287083


### Get average performance for the best param combo 

In [25]:
# Hyper-parameters used for the repeated cross-validation below. The values are
# the ones printed for round 3 of the random search above.
rbg_best = dict(
    subsample=0.7,
    n_estimators=200,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.5,
)
rf_best = dict(
    n_estimators=200,
    min_samples_split=2,
    max_features='sqrt',
    max_depth=30,
    bootstrap=True,
)

In [23]:
# Repeated k-fold evaluation of the XGBoost regressor with the chosen
# hyper-parameters: one shuffled 10-fold CV per seed. Each seed contributes the
# mean of its per-fold scores to the two result lists.
random_seeds = [42, 123, 456, 789, 1011]
n_fold = 10  # folds per cross-validation run

xgb_corr_list, xgb_rmse_list = [], []

for seed in random_seeds:
    # The same seed drives the global NumPy RNG, the fold shuffling and the model.
    np.random.seed(seed)
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
    xgb_model = xgb.XGBRegressor(random_state=seed, **rbg_best)

    xgb_corr_fold, xgb_rmse_fold = [], []

    for train_index, test_index in kf.split(X_features):
        X_train, y_train = X_features[train_index], y_true[train_index]
        X_test, y_test = X_features[test_index], y_true[test_index]

        # Fit on the training folds, then predict the held-out fold.
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)

        # Pearson r is the off-diagonal entry of the 2x2 correlation matrix.
        xgb_corr = np.corrcoef(xgb_pred, y_test)[0, 1]
        # NOTE(review): mean_squared_error must be in scope from the notebook's imports.
        xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))

        xgb_corr_fold.append(xgb_corr)
        xgb_rmse_fold.append(xgb_rmse)

    # Collapse the folds of this seed into one score per metric.
    xgb_corr_avg, xgb_rmse_avg = np.mean(xgb_corr_fold), np.mean(xgb_rmse_fold)
    xgb_corr_list.append(xgb_corr_avg)
    xgb_rmse_list.append(xgb_rmse_avg)

In [24]:
# Mean and spread, across seeds, of the per-seed XGBoost CV averages.
xgb_summary = (
    ("R mean:", np.mean(xgb_corr_list)),
    ("R std:", np.std(xgb_corr_list)),
    ("RMSE mean:", np.mean(xgb_rmse_list)),
    ("RMSE std:", np.std(xgb_rmse_list)),
)

print("XGBoost:")
for label, value in xgb_summary:
    print(label, value)

XGBoost:
R mean: 0.5722108295854037
R std: 0.012305155699621435
RMSE mean: 0.13011032718717613
RMSE std: 0.0007585502287922674


In [26]:
# Repeated k-fold evaluation of the Random Forest regressor with the chosen
# hyper-parameters: one shuffled 10-fold CV per seed. Each seed contributes the
# mean of its per-fold scores to the two result lists.
random_seeds = [42, 123, 456, 789, 1011]
n_fold = 10  # folds per cross-validation run

rf_corr_list, rf_rmse_list = [], []

for seed in random_seeds:
    # The same seed drives the global NumPy RNG, the fold shuffling and the model.
    np.random.seed(seed)
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
    # Random Forest model with the best hyper-parameters.
    rf_model = RandomForestRegressor(random_state=seed, **rf_best)

    rf_corr_fold, rf_rmse_fold = [], []

    for train_index, test_index in kf.split(X_features):
        X_train, y_train = X_features[train_index], y_true[train_index]
        X_test, y_test = X_features[test_index], y_true[test_index]

        # Fit on the training folds, then predict the held-out fold.
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_test)

        # Pearson r is the off-diagonal entry of the 2x2 correlation matrix.
        rf_corr = np.corrcoef(rf_pred, y_test)[0, 1]
        # NOTE(review): mean_squared_error must be in scope from the notebook's imports.
        rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

        rf_corr_fold.append(rf_corr)
        rf_rmse_fold.append(rf_rmse)

    # Collapse the folds of this seed into one score per metric.
    rf_corr_avg, rf_rmse_avg = np.mean(rf_corr_fold), np.mean(rf_rmse_fold)
    rf_corr_list.append(rf_corr_avg)
    rf_rmse_list.append(rf_rmse_avg)

In [27]:
# Mean and spread, across seeds, of the per-seed Random Forest CV averages.
rf_summary = (
    ("R mean:", np.mean(rf_corr_list)),
    ("R std:", np.std(rf_corr_list)),
    ("RMSE mean:", np.mean(rf_rmse_list)),
    ("RMSE std:", np.std(rf_rmse_list)),
)

print("RandomForest:")
for label, value in rf_summary:
    print(label, value)

RandomForest:
R mean: 0.624009999222098
R std: 0.01012993384541656
RMSE mean: 0.12392184704104232
RMSE std: 0.0005575734739967653
