In [5]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import pickle
import torch

In [6]:
original_dataset = pd.read_csv('Denmark_original.csv')
original_dataset.to_csv('Denmark_cleaned.csv', index=False)

In [7]:
# Function to convert SMILES to Morgan Fingerprint
def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    if smiles is np.nan:
        return torch.zeros(n_bits, dtype=torch.float64)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return torch.zeros(n_bits, dtype=torch.float64)
    #fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = mfpgen.GetFingerprint(mol)
    return torch.tensor(fp, dtype=torch.float64)

In [None]:
fingerprints = []
for idx, row in original_dataset.iterrows():
    catalyst_fp = smiles_to_fingerprint(row['Catalyst'])
    imine_fp = smiles_to_fingerprint(row['Imine'])
    thiol_fp = smiles_to_fingerprint(row['Thiol'])
    combined_fp = np.concatenate([catalyst_fp, imine_fp, thiol_fp])
    fingerprints.append(combined_fp)

X = np.array(fingerprints)
y = original_dataset['Delta_Delta_G'].values
print(X.shape)
model = RandomForestRegressor(n_estimators=1000, random_state=12)
model.fit(X, y)

# Evaluate the model
y_pred = model.predict(X)
mse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
print(f'Root Mean Squared Error: {mse}, Mean Absolute Error: {mae}')

# Save the trained model
with open('Denmark.pkl', 'wb') as f:
    pickle.dump(model, f)

(1075, 3072)
Root Mean Squared Error: 0.07386662005016159, Mean Absolute Error: 0.049466426860590496
