In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw Doyle dataset and keep it untouched so the cleaning step below
# can be re-run or inspected without re-reading the file.
raw_dataset = pd.read_csv('Doyle_original.csv')

# 'Aniline' and 'Product' are not used as model inputs further down, so they
# are dropped. A non-inplace drop yields a new frame instead of mutating the
# one that was just loaded.
original_dataset = raw_dataset.drop(columns=['Aniline', 'Product'])

# 'Yield' is the regression target; fail early (same KeyError as a column
# lookup) if it is missing.
# NOTE(review): this cell previously assigned 'Yield' to itself, which is a
# no-op. If a rescaling (e.g. percent -> fraction) was intended, apply it here.
if 'Yield' not in original_dataset.columns:
    raise KeyError('Yield')

original_dataset.to_csv('Doyle_cleaned.csv', index=False)

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pickle
import torch


def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string to a Morgan fingerprint bit tensor.

    Args:
        smiles: SMILES string, or a missing value (None, NaN, pd.NA).
        radius: Radius passed to the Morgan fingerprint generator.
        n_bits: Length of the fingerprint (``fpSize`` of the generator).

    Returns:
        A 1-D ``torch.float64`` tensor of length ``n_bits``. It is all zeros
        when ``smiles`` is missing or cannot be parsed by RDKit.
    """
    # An identity test against np.nan only matches that exact object; NaNs
    # created elsewhere (float('nan'), np.float64('nan')), None and pd.NA would
    # slip through and make MolFromSmiles raise. Strings are never "missing",
    # so they skip the pd.isna call entirely.
    if not isinstance(smiles, str) and pd.isna(smiles):
        return torch.zeros(n_bits, dtype=torch.float64)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse.
        return torch.zeros(n_bits, dtype=torch.float64)

    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = mfpgen.GetFingerprint(mol)
    return torch.tensor(fp, dtype=torch.float64)

In [3]:
# Columns whose SMILES are fingerprinted and concatenated, in this order, to
# form one feature vector per reaction.
feature_columns = ['Ligand', 'Additive', 'Base', 'Aryl halide']

# The fingerprint depends only on the SMILES value, so each distinct value is
# computed once and reused instead of being recomputed for every row.
fingerprint_cache = {}


def _cached_fingerprint(smiles):
    """Return the fingerprint for `smiles` as a NumPy array, memoised per value."""
    # All non-string (missing) entries share one cache slot; NaN is not a
    # reliable dict key because NaN != NaN.
    key = smiles if isinstance(smiles, str) else None
    if key not in fingerprint_cache:
        fingerprint_cache[key] = np.asarray(smiles_to_fingerprint(smiles))
    return fingerprint_cache[key]


X = np.array([
    np.concatenate([_cached_fingerprint(smiles) for smiles in reaction])
    for reaction in original_dataset[feature_columns].itertuples(index=False, name=None)
])
y = original_dataset['Yield'].values
assert X.shape[0] == len(y), "feature matrix and target must have one row per reaction"
print(X.shape)

model = RandomForestRegressor(n_estimators=1000, random_state=12)
model.fit(X, y)

# Evaluate the model.
# NOTE: predictions are made on the same rows the model was fitted on, so these
# are in-sample (training) errors, not an estimate of generalisation error.
y_pred = model.predict(X)
rmse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(f'Root Mean Squared Error: {rmse}, Mean Absolute Error: {mae}')

# Save the trained model
#with open('Doyle.pkl', 'wb') as f:
#    pickle.dump(model, f)

(3955, 4096)
Root Mean Squared Error: 0.02317244153356333, Mean Absolute Error: 0.01500399830305788
