In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw table, discard the two columns that are not used as model
# inputs, and divide Conversion by 100 (presumably percent -> fraction;
# confirm against the source data).
original_dataset = (
    pd.read_csv('Cernak_original.csv')
    .drop(columns=['Electrophile', 'Product'])
    .assign(Conversion=lambda frame: frame['Conversion'] / 100)
)

# Persist the cleaned table; the row index is not written.
original_dataset.to_csv('Cernak_cleaned.csv', index=False)

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pickle
import torch


def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan fingerprint tensor.

    Args:
        smiles: SMILES string of the molecule. ``None`` or any float NaN
            is treated as a missing value.
        radius: Radius passed to the RDKit Morgan generator.
        n_bits: Length of the fingerprint (``fpSize`` of the generator).

    Returns:
        A 1-D ``torch.float64`` tensor of length ``n_bits``. It is all zeros
        when ``smiles`` is missing or RDKit cannot parse it.
    """
    # Test for NaN by value, not identity: `x is np.nan` only matches the
    # np.nan singleton, so float('nan') / np.float64('nan') / None would
    # otherwise fall through to RDKit and raise.
    if smiles is None or (isinstance(smiles, float) and np.isnan(smiles)):
        return torch.zeros(n_bits, dtype=torch.float64)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: fall back to the same zero vector as a missing value.
        return torch.zeros(n_bits, dtype=torch.float64)
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = mfpgen.GetFingerprint(mol)
    return torch.tensor(fp, dtype=torch.float64)

In [3]:
# Columns holding reagent SMILES, in the order their fingerprints are
# concatenated into one feature vector per row.
reagent_columns = ['Catalyst', 'Base', 'Nucleophile']

feature_blocks = []
for column in reagent_columns:
    # The same reagent occurs in many rows, so fingerprint each distinct
    # SMILES once and broadcast it back to the rows through integer codes
    # instead of re-parsing the molecule for every row.
    codes, unique_smiles = pd.factorize(original_dataset[column])
    lookup = np.stack(
        [np.asarray(smiles_to_fingerprint(smiles)) for smiles in unique_smiles]
        # Extra final row = fingerprint of a missing value.
        + [np.asarray(smiles_to_fingerprint(np.nan))]
    )
    # factorize marks missing values with code -1, which indexes that final row.
    feature_blocks.append(lookup[codes])

# One row per reaction: [catalyst bits | base bits | nucleophile bits].
X = np.hstack(feature_blocks)
y = original_dataset['Conversion'].values
print(X.shape)
model = RandomForestRegressor(n_estimators=1000, random_state=12)
model.fit(X, y)

# NOTE: these metrics are computed on the same rows the model was fitted on,
# so they are in-sample (training) errors and do not measure how well the
# model generalizes. Use a held-out split or cross-validation for that.
y_pred = model.predict(X)
rmse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(f'Root Mean Squared Error: {rmse}, Mean Absolute Error: {mae}')

# Save the trained model
with open('Cernak.pkl', 'wb') as f:
    pickle.dump(model, f)

(1536, 3072)
Root Mean Squared Error: 0.008747582672686749, Mean Absolute Error: 0.003164011653645854
