# Random Forest Baselines for FART 

In [10]:
import itertools
import math
import random
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight

In [11]:
# Loading the curated dataset from a CSV file
data = pd.read_csv('fart_curated.csv')

# Define a function to convert SMILES to fingerprints
def smiles_to_fingerprints(smiles, n_bits=1024):
    """
    Converts a SMILES string to a molecular fingerprint.

    Parameters
    ----------
    smiles : str
        A SMILES string representing the molecular structure.
    n_bits : int, Optional, default: 1024
        The number of bits in the fingerprint.

    Returns
    -------
    list
        A list of integers representing the molecular fingerprint.
        Returns a zero vector if the SMILES string is invalid.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)
        return mfpgen.GetFingerprint(mol).ToList()
    else:
        return [0] * n_bits  # Return a zero vector if the SMILES is invalid

# Get Morgan fingerprints from SMILES
data['fingerprints'] = data['Standardized SMILES'].apply(smiles_to_fingerprints)



In [None]:
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

def return_scores(all_y_pred, all_y_true, y, title="Define Title"):

    # Get unique classes
    classes = np.unique(all_y_true)

    # Binarize the true and predicted labels for multi-class AUC calculation
    y_true_bin = label_binarize(all_y_true, classes=classes)
    y_pred_bin = label_binarize(all_y_pred, classes=classes)

    # Calculate AUC for each class using One-vs-Rest (OvR) method
  
    # Average AUC across all classes
    avg_auc = roc_auc_score(y_true_bin, y_pred_bin, average="macro", multi_class="ovr")

    # Weighted AUC, which considers the support of each class
    weighted_auc = roc_auc_score(y_true_bin, y_pred_bin, average="weighted", multi_class="ovr")


    print(f"{title}")
    # Calculate overall accuracy
    overall_accuracy = accuracy_score(all_y_true, all_y_pred)

    # Print classification report for per-class metrics
    print("\nPer-Class Classification Report:")
    print(classification_report(all_y_true, all_y_pred, target_names=np.unique(y), digits=4))

    # Calculate weighted and macro averages for precision, recall, and F1 score
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        all_y_true, all_y_pred, average='weighted'
    )

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        all_y_true, all_y_pred, average='macro'
    )

    print(f"Overall Accuracy: {overall_accuracy:.4f}")

    # Print weighted averages
    print("\nWeighted Averages:")
    print(f"Weighted Precision: {precision_weighted:.4f}")
    print(f"Weighted Recall: {recall_weighted:.4f}")
    print(f"Weighted F1 Score: {f1_weighted:.4f}")
    print(f"Weighted Average AUC: {weighted_auc:.4f}")

    # Print unweighted (macro) averages
    print("\nMacro Averages (Unweighted):")
    print(f"Macro Precision: {precision_macro:.4f}")
    print(f"Macro Recall: {recall_macro:.4f}")
    print(f"Macro F1 Score: {f1_macro:.4f}")
    print(f"Average AUC (macro): {avg_auc:.4f}")

    # Optional: Calculate per-class precision, recall, and F1 score explicitly
    # Get per-class metrics using precision_recall_fscore_support without averaging
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        all_y_true, all_y_pred, average=None, labels=np.unique(all_y_true)
    )



# XGBoost on fingerprints

In [17]:
 # Convert features list to a numpy array for modeling
X1 = np.array(data['fingerprints'].tolist())
y1 = data['Canonicalized Taste'].values

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder and transform the labels to integers
y1_encoded = encoder.fit_transform(y1)


In [18]:
# Define the XGBClassifier model, same parameters as for the other RF models
model = XGBClassifier(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.01,
    subsample=0.8,               # now with full subsampling rate
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    device='gpu',  # Ensure GPU is used for tree construction
    random_state=101
)

skf = StratifiedKFold(n_splits=5, random_state=101, shuffle=True)

# Lists to store predictions and true labels across all folds
all_y1_pred = []
all_y1_true = []

# Perform K-fold cross-validation
for train_index, val_index in skf.split(X1, y1_encoded):
    X1_train, X1_val = X1[train_index], X1[val_index]
    y1_train, y1_val = y1_encoded[train_index], y1_encoded[val_index]

    # Fit the model on training data
    model.fit(X1_train, y1_train)

    # Predict probabilities for the test set
    y1_pred_proba = model.predict_proba(X1_val)

    # Predicted class: highest probability
    y1_pred = np.argmax(y1_pred_proba, axis=1)

    # Collect true labels and predictions for evaluation
    all_y1_true.extend(y1_val)
    all_y1_pred.extend(y1_pred)



In [27]:
return_scores(all_y1_pred, all_y1_true, y1, title="XGBoost on Fingerprints")

XGBoost on Fingerprints

Per-Class Classification Report:
              precision    recall  f1-score   support

      bitter     0.8104    0.5943    0.6857      1676
        sour     0.8242    0.8941    0.8577      1605
       sweet     0.9288    0.9257    0.9273      9542
       umami     0.7600    0.3276    0.4578        58
   undefined     0.6338    0.7447    0.6848      2150

    accuracy                         0.8572     15031
   macro avg     0.7915    0.6973    0.7227     15031
weighted avg     0.8616    0.8572    0.8564     15031

Overall Accuracy: 0.8572

Weighted Averages:
Weighted Precision: 0.8616
Weighted Recall: 0.8572
Weighted F1 Score: 0.8564
Weighted Average AUC: 0.8821

Macro Averages (Unweighted):
Macro Precision: 0.7915
Macro Recall: 0.6973
Macro F1 Score: 0.7227
Average AUC (macro): 0.8250


# XGBoost on fingerprints and 15 Mordred Descriptors

In [None]:
from mordred import Calculator, descriptors # requires python version 3.10 or earlier


# Create a Mordred Calculator
calc = Calculator()

# Add specific descriptors to the calculator: https://mordred-descriptor.github.io/documentation/master/descriptors.html
calc.register(descriptors.Autocorrelation.ATSC(0, 'c'))
calc.register(descriptors.Autocorrelation.ATSC(0, 'se'))
calc.register(descriptors.Autocorrelation.AATS(0, 'i'))
calc.register(descriptors.Autocorrelation.ATSC(1, 'p'))
calc.register(descriptors.Autocorrelation.AATSC(2, 'se'))
calc.register(descriptors.Autocorrelation.AATSC(0, 'm'))
calc.register(descriptors.Autocorrelation.AATSC(1, 'Z'))
calc.register(descriptors.Autocorrelation.AATSC(2, 'are'))
calc.register(descriptors.Autocorrelation.AATSC(1, 'pe'))
calc.register(descriptors.AdjacencyMatrix.AdjacencyMatrix('SpDiam'))
calc.register(descriptors.Autocorrelation.ATSC(1, 'c'))
calc.register(descriptors.Autocorrelation.ATSC(1, 'se'))
calc.register(descriptors.Autocorrelation.ATSC(1, 'Z'))
calc.register(descriptors.Autocorrelation.ATSC(1, 'm'))
calc.register(descriptors.Autocorrelation.ATSC(4, 's'))


def generate_descriptors(smiles, calculator=calc):
    """
    Calculates selected descriptors from SMILES

    Parameters
    ----------
    smiles : str
        A SMILES string representing the molecular structure.
    calculator : Calculator
        A mordred Calculator with given descriptors.
    Returns
    -------

    """
    if calculator is None:
        return None

    mol = Chem.MolFromSmiles(smiles)

    try:
      return calculator(mol)

    except Exception as error:
      return None

data['mordred_descriptors'] = data['Standardized SMILES'].apply(generate_descriptors)



In [11]:
 # Convert features list to a numpy array for modeling
X_fingerprints = np.array(data['fingerprints'].tolist())
X_descriptors = np.array(data['mordred_descriptors'].tolist())

X2 = np.concatenate((X_fingerprints, X_descriptors), axis=1)
y2 = data['Canonicalized Taste'].values

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder and transform the labels to integers
y2_encoded = encoder.fit_transform(y2)

In [12]:
from xgboost import XGBClassifier

# Define the XGBClassifier model, same parameters as for the other RF models
model_descriptors = XGBClassifier(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.01,
    subsample=0.8,               # now with full subsampling rate
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    device='gpu',  # Ensure GPU is used for tree construction
    random_state=101
)
skf = StratifiedKFold(n_splits=5, random_state=101, shuffle=True)

# Lists to store predictions and true labels across all folds
all_y2_pred = []
all_y2_true = []

# Perform K-fold cross-validation
for train_index, val_index in skf.split(X2, y2_encoded):
    X2_train, X2_val = X2[train_index], X2[val_index]
    y2_train, y2_val = y2_encoded[train_index], y2_encoded[val_index]

    # Fit the model on training data
    model_descriptors.fit(X2_train, y2_train)

    # Predict probabilities for the test set
    y2_pred_proba = model_descriptors.predict_proba(X2_val)

    # Predicted class: highest probability
    y2_pred = np.argmax(y2_pred_proba, axis=1)

    # Collect true labels and predictions for evaluation
    all_y2_true.extend(y2_val)
    all_y2_pred.extend(y2_pred)



In [28]:
return_scores(all_y2_pred, all_y2_true, y2, title="XGBoost on fp+descriptors")

XGBoost on fp+descriptors

Per-Class Classification Report:
              precision    recall  f1-score   support

      bitter     0.7842    0.6116    0.6872      1676
        sour     0.8292    0.8773    0.8526      1605
       sweet     0.9146    0.9299    0.9222      9542
       umami     0.7273    0.2759    0.4000        58
   undefined     0.6490    0.6949    0.6712      2150

    accuracy                         0.8526     15031
   macro avg     0.7809    0.6779    0.7066     15031
weighted avg     0.8522    0.8526    0.8506     15031

Overall Accuracy: 0.8526

Weighted Averages:
Weighted Precision: 0.8522
Weighted Recall: 0.8526
Weighted F1 Score: 0.8506
Weighted Average AUC: 0.8716

Macro Averages (Unweighted):
Macro Precision: 0.7809
Macro Recall: 0.6779
Macro F1 Score: 0.7066
Average AUC (macro): 0.8133


# Balanced Random Forest

In [23]:
 # Convert features list to a numpy array for modeling
X3 = np.array(data['fingerprints'].tolist())
y3 = data['Canonicalized Taste'].values

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder and transform the labels to integers
y3_encoded = encoder.fit_transform(y3)


In [24]:
from imblearn.ensemble import BalancedRandomForestClassifier

#  Using Balanced Random Forest
model = BalancedRandomForestClassifier(
    n_estimators=150,
    criterion="gini",
    max_depth=15,
    random_state=101
)

skf = StratifiedKFold(n_splits=5, random_state=101, shuffle=True)

# Lists to store predictions and true labels across all folds
all_y3_pred = []
all_y3_true = []

# Perform K-fold cross-validation
for train_index, val_index in skf.split(X3, y3_encoded):
    X3_train, X3_val = X3[train_index], X3[val_index]
    y3_train, y3_val = y3_encoded[train_index], y3_encoded[val_index]

    # Fit the model on training data
    model.fit(X3_train, y3_train)

    # Predict probabilities for the test set
    y3_pred_proba = model.predict_proba(X3_val)

    # Predicted class: highest probability
    y3_pred = np.argmax(y3_pred_proba, axis=1)

    # Collect true labels and predictions for evaluation
    all_y3_true.extend(y3_val)
    all_y3_pred.extend(y3_pred)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [29]:
return_scores(all_y3_pred, all_y3_true, y3, title="Balanced Random Forest on fp")

Balanced Random Forest on fp

Per-Class Classification Report:
              precision    recall  f1-score   support

      bitter     0.6845    0.4039    0.5081      1676
        sour     0.5583    0.8766    0.6822      1605
       sweet     0.9436    0.7785    0.8531      9542
       umami     0.0567    0.7241    0.1051        58
   undefined     0.5263    0.7121    0.6053      2150

    accuracy                         0.7375     15031
   macro avg     0.5539    0.6991    0.5507     15031
weighted avg     0.8105    0.7375    0.7580     15031

Overall Accuracy: 0.7375

Weighted Averages:
Weighted Precision: 0.8105
Weighted Recall: 0.7375
Weighted F1 Score: 0.7580
Weighted Average AUC: 0.8296

Macro Averages (Unweighted):
Macro Precision: 0.5539
Macro Recall: 0.6991
Macro F1 Score: 0.5507
Average AUC (macro): 0.8154


# XGBoost on 2048 bit fingerprints 

In [13]:
# Loading the curated dataset from a CSV file
data = pd.read_csv('fart_curated.csv')

# Get Morgan fingerprints from SMILES
data['2048-fingerprints'] = data['Standardized SMILES'].apply(lambda x: smiles_to_fingerprints(x, n_bits=2048))




In [14]:
 # Convert features list to a numpy array for modeling
X4 = np.array(data['2048-fingerprints'].tolist())
y4 = data['Canonicalized Taste'].values

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder and transform the labels to integers
y4_encoded = encoder.fit_transform(y4)

In [15]:
# Define the XGBClassifier model, same parameters as for the other RF models
model = XGBClassifier(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.01,
    subsample=0.8,               # now with full subsampling rate
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    device='gpu',  # Ensure GPU is used for tree construction
    random_state=101
)

skf = StratifiedKFold(n_splits=5, random_state=101, shuffle=True)

# Lists to store predictions and true labels across all folds
all_y4_pred = []
all_y4_true = []

# Perform K-fold cross-validation
for train_index, val_index in skf.split(X4, y4_encoded):
    X4_train, X4_val = X4[train_index], X4[val_index]
    y4_train, y4_val = y4_encoded[train_index], y4_encoded[val_index]

    # Fit the model on training data
    model.fit(X4_train, y4_train)

    # Predict probabilities for the test set
    y4_pred_proba = model.predict_proba(X4_val)

    # Predicted class: highest probability
    y4_pred = np.argmax(y4_pred_proba, axis=1)

    # Collect true labels and predictions for evaluation
    all_y4_true.extend(y4_val)
    all_y4_pred.extend(y4_pred)



In [16]:
return_scores(all_y4_pred, all_y4_true, y4, title="XGBoost on Fingerprints")

XGBoost on Fingerprints

Per-Class Classification Report:
              precision    recall  f1-score   support

      bitter     0.8122    0.5805    0.6771      1676
        sour     0.8288    0.8991    0.8625      1605
       sweet     0.9274    0.9248    0.9261      9542
       umami     0.7200    0.3103    0.4337        58
   undefined     0.6277    0.7451    0.6814      2150

    accuracy                         0.8556     15031
   macro avg     0.7832    0.6920    0.7162     15031
weighted avg     0.8604    0.8556    0.8546     15031

Overall Accuracy: 0.8556

Weighted Averages:
Weighted Precision: 0.8604
Weighted Recall: 0.8556
Weighted F1 Score: 0.8546
Weighted Average AUC: 0.8804

Macro Averages (Unweighted):
Macro Precision: 0.7832
Macro Recall: 0.6920
Macro F1 Score: 0.7162
Average AUC (macro): 0.8221
