# Model validation with external data
In this notebook I will try to get the performance of the model on an external dataset I have obtained from a public repository, like ChEMBL, PubChem, Therapeutics Data Commons or MoleculeNet.

I need to find a dataset that provides exactly the same kind of experimental data as the data used to train my model of interest.

In [1]:
# In this codeblock I will import the necessary packages and specify the paths to relevant folders
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Paths to the input CSV files, relative to the notebook's working directory.
# model_predictions_file: intended to hold the model's predictions
#   (presumably with 'standard_smiles' and 'score' columns, as used by later cells -- TODO confirm).
# external_dataset_file: the raw external dataset, read with pd.read_csv in the loading cell.
model_predictions_file = '../data/output.csv'
external_dataset_file = '../data/themolecules.csv'

In [3]:
# In this codeblock I will load the model predictions and the external dataset as pandas dataframes
# The predictions dataframe must be loaded here: the de-duplication and metrics cells
# further down both read `model_predictions_df`, and would fail with a NameError on a
# fresh kernel if it were left undefined.
model_predictions_df = pd.read_csv(model_predictions_file)
external_dataset_df = pd.read_csv(external_dataset_file)

In [None]:
# In this codeblock I will process the external dataset so that I have a dataframe with three columns:
# standard SMILES / InChIKey / experimental value.
# Later cells read the 'standard_smiles' and 'experimental_value' columns of the result.
# NOTE(review): `process_external_dataset` is not defined or imported in the cells visible here;
# it must be provided before this cell runs -- TODO confirm where it comes from.
processed_external_dataset_df = process_external_dataset(external_dataset_df)


In [None]:
# Drop from the external dataset every molecule that also appears in the reference set,
# so that the evaluation is not biased by molecules the model has already seen.
# NOTE(review): the reference set is taken from `model_predictions_df`; the stated intent is
# to compare against the model's *training* set -- confirm this dataframe really holds the
# training molecules and not the predictions for the external dataset itself.
reference_smiles = set(model_predictions_df['standard_smiles'])
repeated_molecules = reference_smiles.intersection(processed_external_dataset_df['standard_smiles'])

if len(repeated_molecules) > 0:
    print(f"Warning: {len(repeated_molecules)} molecules are present in both the model training set and the external dataset. Removing them...")
    # Boolean mask marking the rows whose SMILES is one of the repeated molecules.
    is_repeated = processed_external_dataset_df['standard_smiles'].isin(repeated_molecules)
    processed_external_dataset_df = processed_external_dataset_df[~is_repeated]

In [None]:
# In this codeblock I will compare the predictions I obtained with the EMH model against the
# experimental values and check several ML performance metrics

# Pair each prediction with its experimental value by molecule ('standard_smiles').
# The two dataframes are not guaranteed to have the same length or row order (rows may have
# been removed from the external dataset), so comparing the columns positionally would either
# fail on inconsistent lengths or silently score unrelated molecules against each other.
# NOTE(review): a SMILES that appears more than once in either dataframe yields one pair per
# combination -- de-duplicate upstream if that is not intended.
# NOTE(review): if the de-duplication cell filters the external dataset against this same
# `model_predictions_df`, the join below is empty by construction; the training-set molecules
# should then come from a separate source -- confirm.
paired_df = processed_external_dataset_df[['standard_smiles', 'experimental_value']].merge(
    model_predictions_df[['standard_smiles', 'score']],
    on='standard_smiles',
    how='inner',
)

if paired_df.empty:
    raise ValueError(
        "No molecules in common between the model predictions and the processed external "
        "dataset; cannot compute performance metrics."
    )

# Model predictions, aligned row-by-row with the experimental values below
model_scores = paired_df['score']

# Experimental values from the processed external dataset
experimental_values = paired_df['experimental_value']

# Calculate and print the performance metrics (y_true first, y_pred second)
mse = mean_squared_error(experimental_values, model_scores)
r2 = r2_score(experimental_values, model_scores)
mae = mean_absolute_error(experimental_values, model_scores)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Error (MAE): {mae}")