In [None]:
from rdkit import Chem
import numpy as np
import pandas as pd
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
rdDepictor.SetPreferCoordGen(True)
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG
from IPython.display import set_matplotlib_formats
from collections import Counter
import matplotlib.pyplot as plt
import torch
import mols2grid
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from collections import Counter

In [None]:
# Read the complete tab-separated reaction table into a single DataFrame.
# NOTE(review): this is an absolute path on a shared filesystem; the notebook
# only runs where that path is mounted.
full_data_path = '/gpfs/workspace/users/yangq34/Rokas/HTEExternal_toDisclose_results4_CleanedRoger_122021_IDmapped.txt'
d = pd.read_csv(
    filepath_or_buffer=full_data_path,
    sep='\t',
    low_memory=False,  # parse each column in one pass rather than in chunks
)

In [None]:
# Keep only the rows whose standardized keyword is BUCHWALD.
is_buchwald = d['KeyWord_STD'] == 'BUCHWALD'
b = d[is_buchwald].reset_index(drop=True)

# Keep reactions that have a second reactant and no third reactant.
# (Original author's note: rows without reactant_2 and rows with two
# different aryl halides are the ones being removed here.)
has_second_reactant = b['reactant_2_SMILES'].notna()
lacks_third_reactant = b['reactant_3_SMILES'].isna()
b = b.loc[has_second_reactant & lacks_third_reactant].reset_index(drop=True)

In [None]:
# Canonicalizing the Buchwald reactants
def _canonical_smiles(smiles, row, column):
    """Return RDKit's canonical SMILES for a single reactant value.

    Parameters
    ----------
    smiles : object
        Raw cell value; expected to be a SMILES string.
    row : int
        Position of the row within ``b``; used only in the error message.
    column : str
        Source column name; used only in the error message.

    Raises
    ------
    ValueError
        If ``smiles`` is not a string (e.g. a missing value) or RDKit cannot
        parse it.
    """
    # MolFromSmiles returns None for unparseable input; a non-string (NaN) is
    # rejected up front so both cases surface as one descriptive error instead
    # of an argument-type error from inside RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(
            'Row {}: {}={!r} is not a parseable SMILES string'.format(row, column, smiles)
        )
    return Chem.MolToSmiles(mol)


# Iterate positionally over the columns so the result does not depend on the
# index labels of `b`. Both lists are built before either column is written,
# so a failure leaves `b` untouched.
reactant_1_canon = [
    _canonical_smiles(raw, pos, 'Reactant_1_SMILES')
    for pos, raw in enumerate(b['Reactant_1_SMILES'])
]
reactant_2_canon = [
    _canonical_smiles(raw, pos, 'reactant_2_SMILES')
    for pos, raw in enumerate(b['reactant_2_SMILES'])
]
b['react_1_canon'] = reactant_1_canon
b['react_2_canon'] = reactant_2_canon

In [None]:
# Separating the halides and nucleophiles.

# Halogen queries are compiled once, not once per row.
# Each SMARTS matches any Cl / Br / I atom in the molecule; it is not
# restricted to a halogen bonded to an aromatic carbon.
chloro = Chem.MolFromSmarts('[Cl]')
bromo = Chem.MolFromSmarts('[Br]')
iodo = Chem.MolFromSmarts('[I]')
halogen_queries = (chloro, bromo, iodo)

halide = []
amine = []
for smiles_1, smiles_2 in zip(b['react_1_canon'], b['react_2_canon']):
    r1 = Chem.MolFromSmiles(smiles_1)
    # Only reactant 1 is inspected: if it contains a halogen it is taken as
    # the halide, otherwise reactant 2 is taken as the halide without being
    # checked itself.
    if any(r1.HasSubstructMatch(query) for query in halogen_queries):
        halide.append(smiles_1)
        amine.append(smiles_2)
    else:
        halide.append(smiles_2)
        amine.append(smiles_1)

b['halide'] = halide
b['nuc'] = amine

In [None]:
# Separating metal source (including pre-complexed palladium) from ligand.

# Values observed by the original author: nan, Pd(OAc)2, NiCl2.DME, CuI
b['Catalyst_1_Short_Hand'].unique()

# Drop the copper reactions (original note: mislabeled as Buchwalds; they
# belong in the Ullmann dataset). Rows with a missing short-hand are kept.
is_not_copper = b['Catalyst_1_Short_Hand'] != 'CuI'
b = b.loc[is_not_copper]

catalyst_smiles_cols = [
    'catalyst_1_ID_1_SMILES',
    'catalyst_1_ID_2_SMILES',
    'catalyst_2_ID_1_SMILES',
    'catalyst_2_ID_2_SMILES',
]
smiles_missing = b[catalyst_smiles_cols].isna()

# Rows where all four catalyst SMILES are missing but a second-catalyst
# short-hand is present.
# Original note: confirmed with Pfizer that these are baseline reactions.
nocat = b.loc[smiles_missing.all(axis=1) & b['Catalyst_2_Short_Hand'].notna()]

# Rows with at least one catalyst SMILES, followed by the `nocat` rows.
hascat = b.loc[(~smiles_missing).any(axis=1)]
hascat = pd.concat((hascat, nocat)).reset_index(drop=True)

In [None]:
# Consistency check: wherever catalyst_1_ID_2_SMILES is present it is compared
# with catalyst_2_ID_1_SMILES, and every mismatching pair is printed.
first_cat_second_ids = hascat['catalyst_1_ID_2_SMILES']
second_cat_first_ids = hascat['catalyst_2_ID_1_SMILES']
for cat_1_id_2, cat_2_id_1 in zip(first_cat_second_ids, second_cat_first_ids):
    if pd.isna(cat_1_id_2):
        continue  # nothing to compare for this row
    if cat_1_id_2 != cat_2_id_1:
        print('no', cat_1_id_2, cat_2_id_1)

In [None]:
# Some Cleanup.
metal_col = 'catalyst_1_ID_1_SMILES'
second_col = 'catalyst_2_ID_1_SMILES'
pd_acetate_smiles = 'C(O[Pd]OC(C)=O)(C)=O'  # Pd(OAc)2
# Binaphthyl bis(diphenylphosphine), i.e. a BINAP-type ligand.
bisphosphine_smiles = 'c1(c2c(P(c3ccccc3)c3ccccc3)ccc3c2cccc3)c(P(c2ccccc2)c2ccccc2)ccc2c1cccc2'

# Rows listing the bisphosphine in the second slot with no entry in the first
# slot get Pd(OAc)2 filled in as the first-slot entry.
ligand_without_metal = hascat[second_col].eq(bisphosphine_smiles) & hascat[metal_col].isna()
hascat.loc[ligand_without_metal, [metal_col]] = pd_acetate_smiles

# Rows listing Pd(OAc)2 in the second slot: move it to the first slot and
# blank the second. This applies whether or not the first slot already held a
# value, so an existing first-slot entry is overwritten.
pd_in_second_slot = hascat[second_col].eq(pd_acetate_smiles)
hascat.loc[pd_in_second_slot, [metal_col, second_col]] = [pd_acetate_smiles, np.nan]

In [None]:
# Finally separating out the two.
# The one second-slot SMILES that is treated as a ligand when the first slot
# is empty (a di-tert-butyl biaryl phosphine).
lone_ligand_smiles = 'c1(c2c(cc(cc2C(C)C)C(C)C)C(C)C)c(c(c(c(c1C)C)C)C)P(C(C)(C)C)C(C)(C)C'

catalyst = []
ligand = []
slot_pairs = zip(hascat['catalyst_1_ID_1_SMILES'], hascat['catalyst_2_ID_1_SMILES'])
for cat_1, cat_2 in slot_pairs:
    if pd.notna(cat_1):
        # First slot filled: it is the catalyst; the second slot, if any, is the ligand.
        metal_entry = cat_1
        ligand_entry = cat_2 if pd.notna(cat_2) else np.nan
    elif cat_2 == lone_ligand_smiles:
        # First slot empty and second slot is the known ligand: ligand only.
        metal_entry, ligand_entry = np.nan, cat_2
    else:
        # First slot empty: whatever is in the second slot (possibly missing)
        # is recorded as the catalyst, with no ligand.
        metal_entry, ligand_entry = cat_2, np.nan
    catalyst.append(metal_entry)
    ligand.append(ligand_entry)

hascat['catalyst'] = catalyst
hascat['ligand'] = ligand

In [None]:
# Save out dataset as needed.