In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# automatically before each cell runs, so edits to local code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
#! pip install global-chem

In [3]:
import ast
import inspect

import datamol as dm
import global_chem
import pandas as pd

def fullname(o):
    """Return the dotted, fully qualified class name of ``o``.

    Args:
        o: A class, or an instance whose class should be described.

    Returns:
        ``"<module>.<ClassName>"``, or just ``"<ClassName>"`` for builtin
        types (e.g. ``"str"`` rather than ``"builtins.str"``).
    """
    klass = o if inspect.isclass(o) else o.__class__
    module = klass.__module__
    # Python 3 calls the builtin module 'builtins'; '__builtin__' is the
    # Python 2 name and is kept only for backward compatibility.
    if module in ("builtins", "__builtin__"):
        return klass.__name__
    return module + "." + klass.__name__

In [4]:
def build_data():
    """Gather the SMILES and SMARTS of every GlobalChem node into one table.

    Builds the GlobalChem network, then, for every node except
    ``global_chem`` and ``common_regex_patterns``, outer-joins the node's
    SMILES and SMARTS mappings on their keys.

    Returns:
        A single ``pandas.DataFrame`` with columns ``iupac``, ``smiles``,
        ``smarts``, ``group`` (the node key) and ``hierarchy`` (the module
        path of the node's class followed by the node key, with every
        ``"global_chem."`` occurrence removed).
    """
    chem = global_chem.GlobalChem()
    chem.build_global_chem_network()

    excluded = ("global_chem", "common_regex_patterns")
    frames = []
    for node_name, node in chem.__NODES__.items():
        if node_name in excluded:
            continue

        # Drop the class name from the qualified name and append the node key.
        package_path = fullname(node).rsplit(".", 1)[0]
        hierarchy = f"{package_path}.{node_name}".replace("global_chem.", "")

        smiles = pd.DataFrame.from_dict(node.get_smiles(), orient="index", columns=["smiles"])
        smarts = pd.DataFrame.from_dict(node.get_smarts(), orient="index", columns=["smarts"])

        # Outer join keeps entries that have only one of the two representations.
        frame = smiles.join(smarts, how="outer").rename_axis("iupac").reset_index()
        frame["group"] = node_name
        frame["hierarchy"] = hierarchy
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)
    

In [5]:
# Build the GlobalChem table and expose the entry label as a generic "name"
# column instead of "iupac".
df = build_data().rename(columns={"iupac": "name"})

In [6]:
#df.dropna(subset=["smiles", "smarts"], inplace=True)

In [7]:
## Add additional information
# Download three tab-separated SMARTS tables (aggregators, functional groups
# and extended functional groups) from the Scopy repository on GitHub.
# NOTE(review): the URLs point at the `master` branch, so the downloaded
# content may change between runs — consider pinning a commit.

aggregator = pd.read_csv("https://raw.githubusercontent.com/kotori-y/Scopy/master/scopy/data/SMARTS/Aggregators.txt", sep="\t")
fg = pd.read_csv("https://raw.githubusercontent.com/kotori-y/Scopy/master/scopy/data/SMARTS/Function_Group.txt", sep="\t")
extended_fg = pd.read_csv("https://raw.githubusercontent.com/kotori-y/Scopy/master/scopy/data/SMARTS/Extended_Functional_Groups.txt", sep="\t")


def check_length(x):
    """Tell whether the Python literal encoded in ``x`` has exactly one element.

    Args:
        x: String holding a Python literal container, e.g. ``"['[#6]']"``.

    Returns:
        ``True`` if the parsed container has length 1, ``False`` otherwise.

    Raises:
        ValueError, SyntaxError: If ``x`` is not a valid Python literal.
    """
    # The strings come from files downloaded over the network, so parse them
    # with ast.literal_eval (literals only) rather than eval, which would
    # execute arbitrary expressions embedded in the data.
    return len(ast.literal_eval(x)) == 1

def curate_additional_groups(df, group, hierarchy=None):
    """Filter a SMARTS table and reshape it to the chemical-groups layout.

    Keeps only rows whose ``Accept`` value is present and passes
    ``check_length``, and whose ``Reject`` value is missing. Rows whose SMARTS
    cannot be parsed by ``dm.from_smarts`` are dropped as well.

    Args:
        df: Table providing ``Name``, ``SMARTS``, ``Accept`` and ``Reject``
            columns. The input frame itself is not modified.
        group: Label stored in the ``group`` column of every returned row.
        hierarchy: Label stored in the ``hierarchy`` column. Defaults to
            ``"medicinal_chemistry.functional_groups.<group>"``.

    Returns:
        A new DataFrame with ``Name``/``SMARTS`` renamed to ``name``/``smarts``,
        added ``smiles``, ``group`` and ``hierarchy`` columns, and the
        ``Accept``/``Reject`` columns removed.
    """
    if hierarchy is None:
        hierarchy = ".".join(["medicinal_chemistry", "functional_groups", group])

    candidates = df.copy()
    # Filter missing Accept values first: check_length cannot parse NaN.
    candidates = candidates[candidates.Accept.notna()]
    single_accept = candidates.Accept.apply(check_length)
    no_reject = candidates.Reject.isna()
    candidates = candidates[single_accept & no_reject]

    curated = candidates.rename(columns={"SMARTS": "smarts", "Name": "name"})
    with dm.without_rdkit_log():
        # "mol" is only a parse check for the SMARTS; it is dropped below.
        curated["mol"] = curated["smarts"].apply(dm.from_smarts)
        curated["smiles"] = curated["smarts"].apply(dm.to_mol).apply(dm.to_smiles)
    curated["group"] = group
    curated["hierarchy"] = hierarchy

    parsable = curated[curated.mol.notna()]
    return parsable.drop(columns=["Reject", "Accept", "mol"])


In [8]:
# Aggregators get their own hierarchy; the two functional-group tables use
# the default "medicinal_chemistry.functional_groups.<group>" hierarchy.
aggregator_new = curate_additional_groups(
    aggregator,
    group="aggregator",
    hierarchy="medicinal_chemistry.aggregator",
)
fg_new = curate_additional_groups(
    fg,
    group="basic_groups",
)
extended_fg_new = curate_additional_groups(
    extended_fg,
    group="extended_groups",
)

In [9]:
# Append the curated Scopy groups to the GlobalChem table.
# NOTE: `df` is rebound to the result, so re-running this cell on its own
# appends the extra groups a second time.
df = pd.concat([df, aggregator_new, fg_new, extended_fg_new], ignore_index=True)

In [10]:
# Write the combined table to CSV without the index column.
# NOTE(review): the path is relative to the working directory; presumably the
# notebook is run from a folder that is a sibling of `medchem/` — confirm.
df.to_csv("../medchem/data/chemical_groups.csv", index=False)

In [11]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,name,smiles,smarts,group,hierarchy
0,perfluorohexanoic acid,C(=O)(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)O,[#6](=[#8])(-[#6](-[#6](-[#6](-[#6](-[#6](-[#9...,emerging_perfluoroalkyls,environment.emerging_perfluoroalkyls.emerging_...
1,perfluoroheptanoic acid,C(=O)(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(...,[#6](=[#8])(-[#6](-[#6](-[#6](-[#6](-[#6](-[#6...,emerging_perfluoroalkyls,environment.emerging_perfluoroalkyls.emerging_...
2,perfluorononanoic acid,C(=O)(C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F...,[#6](=[#8])(-[#6](-[#6](-[#6](-[#6](-[#6](-[#6...,emerging_perfluoroalkyls,environment.emerging_perfluoroalkyls.emerging_...
3,perfluorodecanoic acid,C(=O)(C(C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)...,[#6](=[#8])(-[#6](-[#6](-[#6](-[#6](-[#6](-[#6...,emerging_perfluoroalkyls,environment.emerging_perfluoroalkyls.emerging_...
4,perfluorobutanesulfonic acid,C(C(C(F)(F)S(=O)(=O)O)(F)F)(C(F)(F)F)(F)F,[#6](-[#6](-[#6](-[#9])(-[#9])-[#16](=[#8])(=[...,emerging_perfluoroalkyls,environment.emerging_perfluoroalkyls.emerging_...
