In [1]:
import re, sys
import pandas as pd
import numpy as np
import json
from rdkit import Chem, DataStructs, RDLogger
from rdkit.Chem import rdChemReactions, AllChem, Draw, PandasTools
RDLogger.DisableLog('rdApp.*')
import warnings
warnings.filterwarnings('ignore')

In [40]:
def remove_bb_smi_label(smi):
    """Replace two-digit-labelled wildcard atoms in a SMILES string with ``[*]``.

    Every occurrence of ``[NN*]`` (opening bracket, exactly two digits, ``*``,
    closing bracket) is rewritten to ``[*]``; all other text is left untouched.

    Parameters
    ----------
    smi : str or missing value
        SMILES text. Non-string values (``None``, or the ``NaN`` that a left
        merge leaves behind for an unmatched key) are returned unchanged
        rather than raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str or the original non-string value
        The SMILES with the labels stripped, or ``smi`` itself when it is
        not a string.
    """
    # Missing values must survive Series.apply; re.sub only accepts str/bytes.
    if not isinstance(smi, str):
        return smi
    # Exactly two digits are matched; labels of any other width are left as-is.
    return re.sub(r"\[\d{2}\*\]", "[*]", smi)

#### Extract

In [49]:
# Molecules: read the raw JSON records and flatten them into a DataFrame.
with open("../data/raw/DORA_Lactam_mols_bbs.json", "r") as f:
    mols = json.load(f)
df_mols = pd.json_normalize(mols)

# Building blocks: same treatment for the second raw JSON file.
with open("../data/raw/DORA_Lactam_bbs.json", "r") as f:
    bbs = json.load(f)
df_bbs = pd.json_normalize(bbs)

In [50]:
# Restrict both frames to the columns the join below relies on.
bb_columns = ["bb_smi", "bb_id"]
mol_columns = ["mol_smi", "mol_id", "A_id", "B_id", "C_id"]
df_bbs = df_bbs[bb_columns]
df_mols = df_mols[mol_columns]

# For each of A, B and C, look up the id in df_bbs and attach the matching
# SMILES as "<letter>_smi". A left join keeps every molecule row; ids with no
# match in df_bbs end up with a missing SMILES value.
for col in ("A", "B", "C"):
    id_col, smi_col = f"{col}_id", f"{col}_smi"
    bb_lookup = df_bbs.rename(columns={"bb_smi": smi_col})
    merged = df_mols.merge(bb_lookup, how="left", left_on=id_col, right_on="bb_id")
    # The join key from the lookup side is redundant once the SMILES is attached.
    df_mols = merged.drop(columns=["bb_id"])

# Discard molecules that lack any of the three building-block ids.
df_mols = df_mols.dropna(subset=[f"{col}_id" for col in ("A", "B", "C")])

In [51]:
# Run remove_bb_smi_label over every value of the three building-block
# SMILES columns, overwriting each column with the cleaned result.
for col in (f"{role}_smi" for role in ("A", "B", "C")):
    cleaned = df_mols[col].apply(remove_bb_smi_label)
    df_mols[col] = cleaned

In [52]:
# Write the processed df_mols table to disk in pickle format.
df_mols.to_pickle("../data/transformed/df_mols_ready.pkl")

In [54]:
# Display the first three rows of df_mols as a quick sanity check.
df_mols.head(3)

Unnamed: 0,mol_smi,mol_id,A_id,B_id,C_id,A_smi,B_smi,C_smi
0,COc1cccc(OC)c1[C@@H]1C[C@H](F)C(=O)N1Cc1ccc(OC...,4,A1,B1,C1,[*]c1c(OC)cccc1OC,[*][C@]1([H])C[C@]([H])(F)C(=O)N1[*],[*]Cc1ccc(OC(F)(F)F)cc1
1,COc1cccc(OC)c1[C@@H]1CC(F)(F)C(=O)N1Cc1ccc(OC(...,8,A1,B2,C1,[*]c1c(OC)cccc1OC,[*][C@]1([H])CC(F)(F)C(=O)N1[*],[*]Cc1ccc(OC(F)(F)F)cc1
2,COc1cccc(OC)c1[C@@H]1CCC(=O)N1Cc1ccc(OC(F)(F)F...,9,A1,B3,C1,[*]c1c(OC)cccc1OC,[*][C@]1([H])CCC(=O)N1[*],[*]Cc1ccc(OC(F)(F)F)cc1
