In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw table and discard the human-readable name columns, leaving the
# SMILES columns as the only identifiers of each reaction component.
original_dataset = pd.read_csv('deoxyfluorination_original.csv').drop(
    columns=['base_name', 'fluoride_name', 'substrate_name']
)

# Bring the three SMILES columns to the front and give them short names;
# every other column keeps its name and relative position.
new_order = ['substrate_smiles', 'base_smiles', 'fluoride_smiles']
new_names = ['substrate', 'base', 'fluoride']
remaining_columns = [col for col in original_dataset.columns if col not in new_order]
original_dataset = original_dataset[new_order + remaining_columns]
original_dataset.columns = new_names + remaining_columns

# Rescale the yield column by 1/100 (presumably percent -> fraction; confirm
# against the source data) before writing the cleaned table to disk.
original_dataset['yield'] = original_dataset['yield'] / 100

original_dataset.to_csv('deoxyfluorination_cleaned.csv', index=False)


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pickle
import torch


# Function to convert SMILES to Morgan Fingerprint
def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint tensor.

    Args:
        smiles: SMILES string of the molecule. Any non-string value (NaN in
            any form, ``None``, ``pd.NA``) is treated as a missing component.
        radius: Radius passed to the Morgan fingerprint generator.
        n_bits: Length of the fingerprint (``fpSize`` of the generator).

    Returns:
        A 1-D ``torch.float64`` tensor of length ``n_bits``. It is all zeros
        when ``smiles`` is missing or when RDKit cannot parse it.
    """
    # Use a type check rather than `smiles is np.nan`: the identity test only
    # matches the np.nan singleton, so float('nan'), None or pd.NA would fall
    # through to RDKit and raise instead of yielding the zero vector.
    if not isinstance(smiles, str):
        return torch.zeros(n_bits, dtype=torch.float64)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: fall back to the same zero vector as a missing one.
        return torch.zeros(n_bits, dtype=torch.float64)
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = mfpgen.GetFingerprint(mol)
    return torch.tensor(fp, dtype=torch.float64)

In [5]:
# Build one feature vector per reaction by concatenating the fingerprints of
# its base, fluoride and substrate, in that order.
component_columns = ('base', 'fluoride', 'substrate')
fingerprints = [
    np.concatenate([smiles_to_fingerprint(row[name]) for name in component_columns])
    for _, row in original_dataset.iterrows()
]

X = np.array(fingerprints)
y = original_dataset['yield'].values
print(X.shape)

# Fit the forest on every available row; the fixed random_state keeps the
# fitted model reproducible across runs.
model = RandomForestRegressor(n_estimators=1000, random_state=12)
model.fit(X, y)

# Evaluate the model. Predictions are made on the same X used for fitting,
# so these are in-sample errors rather than an estimate on held-out data.
y_pred = model.predict(X)
rmse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(f'Root Mean Squared Error: {rmse}, Mean Absolute Error: {mae}')

# Save the trained model
with open('deoxyfluorination.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

(740, 3072)
Root Mean Squared Error: 0.03375579979528082, Mean Absolute Error: 0.02128952565422535
