In [167]:
import os
import re
import json
import pandas as pd
import rdkit.Chem as Chem
from collections import defaultdict
from glob import glob
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors, Lipinski

In [207]:
def calcMolAttr(smiles):
    """Compute the per-compound descriptor values for one SMILES string.

    The molecule is parsed with RDKit and explicit hydrogens are added before
    any descriptor is evaluated.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.

    Returns
    -------
    list
        Thirteen values, positionally aligned with the module-level
        ``attributes`` list: MW (rounded), Atoms, HeavyAtom, RotateBond,
        RingCount, AromaticRing, HBA, HBD, TPSA, cLogP, followed by three
        ``0`` placeholders for cLogD, cLogS and pKa (not computed here).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # readable message instead of an opaque error inside AddHs.
        raise ValueError(f"Cannot parse SMILES: {smiles!r}")
    mol = Chem.AddHs(mol)
    return [
        round(Descriptors.MolWt(mol)),      # MW
        mol.GetNumAtoms(),                  # Atoms (explicit hydrogens included)
        Descriptors.HeavyAtomCount(mol),    # HeavyAtom
        # Rotatable bonds come BEFORE ring count so the values line up with
        # the "RotateBond", "RingCount" order used in `attributes`.
        Lipinski.NumRotatableBonds(mol),    # RotateBond
        Lipinski.RingCount(mol),            # RingCount
        Lipinski.NumAromaticRings(mol),     # AromaticRing
        Descriptors.NumHAcceptors(mol),     # HBA
        Descriptors.NumHDonors(mol),        # HBD
        Descriptors.TPSA(mol),              # TPSA
        Descriptors.MolLogP(mol),           # cLogP
        0, 0, 0,                            # cLogD, cLogS, pKa placeholders
    ]

# Column labels for the descriptor values attached to each compound.
# The labels are consumed positionally (e.g. as DataFrame column names placed
# next to the list returned by calcMolAttr), so the order here matters.
attributes = [
    # size
    "MW", "Atoms", "HeavyAtom",
    # topology
    "RotateBond", "RingCount", "AromaticRing",
    # polarity / hydrogen bonding
    "HBA", "HBD", "TPSA",
    # lipophilicity, solubility and ionisation
    "cLogP", "cLogD", "cLogS", "pKa",
]

def check_column_type(name):
    """Classify a table header cell.

    The header is lower-cased, stripped of surrounding whitespace and then of
    leading/trailing dots before it is classified.

    Parameters
    ----------
    name : str
        Raw header text of one table column.

    Returns
    -------
    str
        ``"Compound"`` for compound-identifier columns (anything starting with
        "comp", or one of cmpd/cpd/no/id/entry/ex), ``"R"`` for substituent
        columns (``r``, ``r1``, ``r'``, ``r10``, ``x``, ``y`` ...), otherwise
        the normalised header text itself.
    """
    name = name.lower().strip().strip(".")
    if name.startswith("comp") or name in ("cmpd", "cpd", "no", "id", "entry", "ex"):
        return "Compound"
    # Match the WHOLE header. The previous pattern `^r[0-9']?|[xy]$` anchored
    # each alternative separately, so every header that merely started with
    # "r" (e.g. "ratio", "ref") was reported as a substituent column.
    if re.fullmatch(r"r[0-9']*|[xy]", name):
        return "R"
    return name
    
import requests

def predict_pka(smi, token=None, timeout=300):
    """Send one SMILES string to the remote prediction service.

    The SMILES is posted as a multipart form field named ``Smiles`` to
    ``http://xundrug.cn:5001/modules/upload0/`` and the ``gen_datas`` entry of
    the JSON reply is returned unchanged.

    Parameters
    ----------
    smi : str
        SMILES string to submit.
    token : str, optional
        API token sent in the ``token`` header. Defaults to the
        ``XUNDRUG_TOKEN`` environment variable when that is set.
    timeout : float or None, optional
        Seconds to wait for the server before giving up (``None`` waits
        forever, which was the previous behaviour).

    Returns
    -------
    object
        Whatever the service stores under ``gen_datas``; its structure is not
        visible from this notebook.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If no answer arrives within ``timeout`` seconds.
    """
    if token is None:
        # NOTE(review): the literal below is a credential committed to the
        # notebook. It is kept only so existing calls keep working; rotate it
        # and supply the replacement through XUNDRUG_TOKEN instead.
        token = os.environ.get("XUNDRUG_TOKEN", 'O05DriqqQLlry9kmpCwms2IJLC0MuLQ7')
    param = {"Smiles": ("tmg", smi)}
    headers = {'token': token}
    response = requests.post(
        url='http://xundrug.cn:5001/modules/upload0/',
        files=param,
        headers=headers,
        timeout=timeout,
    )
    # Surface HTTP failures directly rather than as a confusing JSON/KeyError.
    response.raise_for_status()
    return response.json()['gen_datas']

In [280]:
# Compound table: headerless, tab-separated records of (doi, compound id, SMILES).
cdf = pd.read_csv(
    "../compounds.to-pka.csv",
    sep='\t',
    header=None,
    names=["doi", "compound", "smiles"],
)

# Nested lookup built from the table above: cdb[doi][compound id] -> SMILES.
cdb = defaultdict(dict)
for _doi, _cid, _smi in zip(cdf["doi"], cdf["compound"], cdf["smiles"]):
    cdb[_doi][_cid] = _smi

# Per-article statistics: headerless, tab-separated, six columns.
adf = pd.read_csv(
    "./jmc-article-stat.txt",
    sep='\t',
    header=None,
    names=["doi", "y", "h", "w", "subtype", "venue"],
)

# Corpus: JMC articles whose title contains "Discovery", whose formally
# published paper could be downloaded, and whose body contains tables.
root_dir = "../jmc-article-discovery-formal-table"
# One entry per PDF in the corpus directory: the file name without ".pdf".
doc_list = [
    os.path.basename(pdf_path)[:-len(".pdf")]
    for pdf_path in glob(f"{root_dir}/*.pdf")
]

In [297]:
def process_single(doi):
    """Export every numbered table of one article to CSV with compound data attached.

    Reads ``{doi}.document.json`` and ``{doi}.table.json`` from
    ``../jmc-article-discovery-formal-table-data``. For each entry of the table
    file whose title starts with ``Table <N>``, a file
    ``{doi}-table.<N>.csv`` is written to the current working directory. Every
    data row is extended with the table number, the table title, a scaffold
    SMILES, the compound SMILES and the descriptor values from ``calcMolAttr``;
    compounds are looked up by the first cell of the row, and rows without a
    known compound get empty strings instead.

    Uses the module-level names ``cdb``, ``attributes`` and ``calcMolAttr``.

    Parameters
    ----------
    doi : str
        Article identifier used in the JSON file names and as key into ``cdb``.

    Returns
    -------
    None
    """
    data_dir = "../jmc-article-discovery-formal-table-data"
    # Context managers so the JSON files are closed even if parsing fails.
    with open(f"{data_dir}/{doi}.document.json") as doc_file:
        doc = json.load(doc_file)
    with open(f"{data_dir}/{doi}.table.json") as tbl_file:
        tables = json.load(tbl_file)

    # Raw string: "\s" inside a normal string literal is an invalid escape.
    table_title = re.compile(r"^Table\s+([0-9]+)(?:\.|$)")

    # Table number -> positions in doc["tables"] whose title carries that number.
    table_index = defaultdict(list)
    for ix, doc_table in enumerate(doc["tables"]):
        hit = table_title.match(doc_table["title"])
        if hit:
            table_index[int(hit.group(1))].append(ix)

    # NOTE(review): these scaffolds are hard-coded, so they presumably belong to
    # one specific article; confirm before running this on other DOIs.
    scaffolds = [
        '[R1]CC1=CC=C([C@H]2C[C@@H]2C2=CC=CC(C3=CC=CC=C3)=C2C)N=C1OC',
        '[R1]CC1=C(OC)[X]=C([(O)n][C@H]2C[C@@H]2C2=CC=CC([Ar])=C2C)C=C1',
        '[R1]CC1=CC(Cl)=C([C@H]2C[C@@H]2C2=CC=CC([Ar])=C2C)C=C1OC[R2]'
    ]

    # An article without any registered compound yields an empty mapping
    # instead of crashing on None.items().
    compounds_a = cdb.get(doi) or {}

    # compound id -> [smiles, <descriptor values>]
    extra = {cid: [smiles] + calcMolAttr(smiles) for cid, smiles in compounds_a.items()}
    # Filler for rows whose first cell is not a known compound id.
    blank = [''] * (1 + len(attributes))

    for table in tables:
        hit = table_title.match(table["title"])
        if not hit:
            # Not a "Table N" entry: nothing to name the output file after.
            continue
        tix = int(hit.group(1))

        # Prefer the title recorded in the document; fall back to the table's
        # own title when the document has no table with this number.
        doc_positions = table_index.get(tix)
        title = doc["tables"][doc_positions[0]]["title"] if doc_positions else table["title"]

        content = table["content"]
        if not content:
            continue
        columns = content[0] + ['table_index', 'table_name', 'scaffold', 'smiles'] + attributes

        # `tix` is the printed, 1-based table number while `scaffolds` is a
        # 0-based list: Table 1 -> scaffolds[0], ..., Table 3 -> scaffolds[2].
        scaffold = scaffolds[tix - 1] if 1 <= tix <= len(scaffolds) else ''

        rows = []
        for row in content[1:]:
            adme = extra.get(row[0], blank) if row else blank
            # Build a new list so the loaded JSON rows are not mutated.
            rows.append(row + [tix, title, scaffold] + adme)

        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(f"{doi}-table.{tix}.csv", index=False)
process_single('acs.jmedchem.3c00205')

In [250]:

# Coverage statistics: for every article in doc_list, count how many compound
# ids found in its tables also have a SMILES registered in cdb.
data, has_compounds = [], 0
for doi in doc_list:
    doc_path = f"../jmc-article-discovery-formal-table-data/{doi}.document.json"
    tbl_path = f"../jmc-article-discovery-formal-table-data/{doi}.table.json"
    # Context managers so the JSON files are closed after each article.
    with open(doc_path) as doc_file, open(tbl_path) as tbl_file:
        doc, tables = json.load(doc_file), json.load(tbl_file)

    # NOTE(review): table_index is not consumed by the statistics below; it is
    # kept so this cell still defines the same module-level names.
    table_index = defaultdict(list)
    for ix, table in enumerate(doc["tables"]):
        m = re.findall(r"^Table\s+([0-9]+)(?:\.|$)", table["title"])
        if m:
            table_index[int(m[0])].append(ix)

    # Every compound id mentioned in a "Compound"-type column of a numbered table.
    compounds = set()
    for table in tables:
        if not re.match(r"^Table\s+([0-9]+)(?:\.|$)", table["title"]):
            continue
        content = table["content"]
        if not content:
            continue
        header, body = content[0], content[1:]
        for j, column in enumerate(header):
            if check_column_type(column) != "Compound":
                continue
            for row in body:
                if j >= len(row):
                    # Row shorter than the header: no cell in this column.
                    continue
                # One cell may list several ids separated by ';'.
                for cid in row[j].split(";"):
                    compounds.add(cid.strip())

    compounds_a = cdb.get(doi, None)
    if compounds_a:
        ids = [cid for cid in compounds if cid in compounds_a]
        data.append([doi, len(ids)])
        has_compounds += 1
        print(doi, len(ids))

total_article = len(data)
processed_article = sum(1 for _, n in data if n > 0)
total_smiles = sum(n for _, n in data if n > 0)
print(f"\narticles: {processed_article}/{has_compounds}/{len(doc_list)}, molecules: {total_smiles}\n")

acs.jmedchem.0c02023 2
acs.jmedchem.2c01421 1
acs.jmedchem.1c01171 0
acs.jmedchem.0c01264 8
acs.jmedchem.9b00673 29
acs.jmedchem.1c00904 0
acs.jmedchem.2c01192 7
acs.jmedchem.0c00224 23
acs.jmedchem.1c01207 56
acs.jmedchem.1c00131 23
acs.jmedchem.1c00864 24
acs.jmedchem.2c01964 28
acs.jmedchem.3c00210 20
acs.jmedchem.2c00893 11
acs.jmedchem.2c01233 3
acs.jmedchem.2c00677 0
acs.jmedchem.2c01569 35
acs.jmedchem.1c00441 34
acs.jmedchem.2c01597 5
acs.jmedchem.1c01979 48
acs.jmedchem.2c01568 18
acs.jmedchem.2c00676 37
acs.jmedchem.1c01986 9
acs.jmedchem.0c01305 0
acs.jmedchem.2c01554 39
acs.jmedchem.3c00205 55
acs.jmedchem.1c02069 15
acs.jmedchem.8b00375 15
acs.jmedchem.1c00905 23
acs.jmedchem.1c00044 32
acs.jmedchem.1c00722 46
acs.jmedchem.3c00173 26
acs.jmedchem.5b01685 44
acs.jmedchem.5b00772 53
acs.jmedchem.1c00246 25
acs.jmedchem.2c01420 10
acs.jmedchem.2c01346 0
acs.jmedchem.9b01034 15
acs.jmedchem.2c01422 29
acs.jmedchem.2c01350 6
acs.jmedchem.1c01827 22
acs.jmedchem.2c01146 0
acs.jm

In [None]:
def render(doi, draw_mol=False):
    """Build a descriptor table for the compounds listed in one article's tables.

    Looks the article up in the module-level ``database`` mapping, walks the
    compound ids of each of its tables, resolves every id to its SMILES via the
    article's ``"compounds"`` mapping and computes the descriptor values with
    ``calcMolAttr``.

    Parameters
    ----------
    doi : str
        Key of the article in ``database``.
    draw_mol : bool, optional
        When true, ``PandasTools.AddMoleculeColumnToFrame`` is applied with
        ``'SMILES'`` given as both the source and the target column.

    Returns
    -------
    pandas.DataFrame
        One row per (table, compound) pair with the columns ``ID``, ``SMILES``
        followed by the names in ``attributes``.
    """
    article = database[doi]
    records = []
    for table_entry in article["tables"]:
        for compound_id in table_entry["compounds"].keys():
            smiles_text = article["compounds"][compound_id]
            descriptor_values = calcMolAttr(smiles_text)
            records.append([compound_id, smiles_text] + descriptor_values)

    header = ["ID", "SMILES"] + attributes
    frame = pd.DataFrame(records, columns=header)
    if not draw_mol:
        return frame
    PandasTools.AddMoleculeColumnToFrame(frame, 'SMILES', 'SMILES')
    return frame
render('acs.jmedchem.0c00224', True)

In [None]:
# Dump "<doi>.<compound id><TAB><smiles>" lines for every compound in `database`.
# The context manager closes the file even if a record is malformed and the
# loop raises part-way through.
with open("compounds.to-pka.csv", "w") as output:
    for doi, doc in database.items():
        for cid, smiles in doc["compounds"].items():
            # Only ids shorter than five characters are exported.
            # NOTE(review): presumably filters out non-identifier keys; confirm.
            if len(cid) < 5:
                print(f"{doi}.{cid}\t{smiles}", file=output)