# Classify chemicals using ClassyFire and NPClassifier

### Import modules

In [1]:
from collections import defaultdict
import time
import json
from pandas import json_normalize
from ast import literal_eval

from pybatchclassyfire import *
import pandas as pd
from tqdm import tqdm

from rdkit import RDLogger  
from rdkit.Chem.inchi import MolToInchi, InchiToInchiKey
from rdkit.Chem import MolFromSmiles

DEBUG:requests_cache.backends:Initializing backend: <SQLiteCache(name=http_cache)> demo_cache_pybatch
INFO:rdkit:Enabling RDKit 2022.09.1 jupyter extensions


In [2]:
%config Application.log_level='INFO'
import logging
logging.getLogger('s3fs').setLevel(logging.INFO)
logging.getLogger('botocore').setLevel(logging.INFO)
logging.getLogger('fsspec').setLevel(logging.INFO)

In [3]:
# Turn off RDKit log output for every channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [4]:
# Relative path of the local data directory.
# NOTE(review): not referenced by any of the visible cells below — confirm it is still needed.
DATA_DIR = '../data/'

Load the Newman natural-product and synthetic compound tables

In [12]:
def _read_newman_table(s3_path):
    """Read one tab-separated Newman table, keeping only the name, PubChem and SMILES columns."""
    return pd.read_csv(
        s3_path,
        sep='\t',
        usecols=['curated_name', 'pubchem_name', 'pubchem_id', 'smiles'],
    )


# Both tables share the same layout, so they go through the same reader.
natural_products_df = _read_newman_table(
    's3://enveda-data-kg/others/cdd/newman_natural_products.tsv'
)
synthetics_df = _read_newman_table(
    's3://enveda-data-kg/others/cdd/newman_synthetics.tsv'
)

In [13]:
# (rows, columns) of each table, natural products first.
tuple(table.shape for table in (natural_products_df, synthetics_df))

((398, 4), (935, 4))

In [14]:
# Preview the first two synthetic compounds.
synthetics_df.head(n=2)

Unnamed: 0,curated_name,pubchem_name,pubchem_id,smiles
0,alizapride,6-methoxy-N-[(1-prop-2-enylpyrrolidin-2-yl)met...,135413504,COC1=CC2=C(C=C1C(=O)NCC3CCCN3CC=C)NN=N2
1,amezinium methylsulfate,6-methoxy-1-phenylpyridazin-1-ium-4-amine;meth...,71926,COC1=[N+](N=CC(=C1)N)C2=CC=CC=C2.COS(=O)(=O)[O-]


In [15]:
# Preview the first two natural products.
natural_products_df.head(n=2)

Unnamed: 0,curated_name,pubchem_name,pubchem_id,smiles
0,aclarubicin,"methyl (1R,2R,4S)-4-[(2R,4S,5S,6S)-4-(dimethyl...",451415,CC[C@]1(C[C@@H](C2=C(C3=C(C=C2[C@H]1C(=O)OC)C(...
1,Netilmicin sulfate,"(2R,3R,4R,5R)-2-[(1S,2S,3R,4S,6R)-4-amino-3-[[...",62115,CCN[C@@H]1C[C@@H]([C@H]([C@@H]([C@H]1O[C@@H]2[...


In [16]:
# Union of the distinct SMILES strings found in either table.
smiles_in_data = set().union(
    *(table['smiles'].unique() for table in (natural_products_df, synthetics_df))
)
len(smiles_in_data)

1314

### NP-classifier

In [17]:
# NPClassifier annotations, read from the manually uploaded parquet file.
NP_CLASSIFIER_PARQUET = (
    's3://enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet'
)
np_classifier_df = pd.read_parquet(NP_CLASSIFIER_PARQUET)

In [18]:
# Preview the first five NPClassifier rows.
np_classifier_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,pubchem_openeye_can_smiles,class_results,superclass_results,pathway_results,is_glycoside
2,2,C1=CC(C(C(=C1)C(=O)O)O)O,"['Shikimic acids and derivatives', 'Simple phe...",['Phenolic acids (C6-C1)'],['Shikimates and Phenylpropanoids'],False
3,3,CC(CN)O,[],[],[],False
4,4,C(C(=O)COP(=O)(O)O)N,['Aminoacids'],['Small peptides'],"['Amino acids and Peptides', 'Shikimates and P...",False
6,6,CCN1C=NC2=C(N=CN=C21)N,['Purine alkaloids'],['Pseudoalkaloids'],['Alkaloids'],False
7,7,CCC(C)(C(C(=O)O)O)O,['Hydroxy fatty acids'],['Fatty Acids and Conjugates'],['Fatty acids'],False


In [20]:
# Lookup from SMILES to its NPClassifier annotation, restricted to the
# compounds present in `smiles_in_data`.
smiles_to_np_classifier = {}

# Distinct labels seen at each NPClassifier level among the matched compounds.
all_classes = set()
all_superclasses = set()
all_pathways = set()

# Rows are unpacked positionally, so this relies on the frame having exactly
# six columns in this order; the first column is ignored.
for _, smiles, classes, superclasses, pathways, is_glycoside in np_classifier_df.values:

    if smiles not in smiles_in_data:
        continue

    # The label columns are parsed with `literal_eval` into iterables of
    # labels. Parse each one once and reuse the result for both the global
    # label sets and the per-compound record.
    class_labels = literal_eval(classes)
    superclass_labels = literal_eval(superclasses)
    pathway_labels = literal_eval(pathways)

    all_classes.update(class_labels)
    all_superclasses.update(superclass_labels)
    all_pathways.update(pathway_labels)

    smiles_to_np_classifier[smiles] = {
        "classes": class_labels,
        "superclasses": superclass_labels,
        "pathways": pathway_labels,
        "is_glycoside": is_glycoside,
    }
    

In [21]:
# Number of distinct classes, superclasses and pathways, in that order.
tuple(map(len, (all_classes, all_superclasses, all_pathways)))

(42, 24, 7)

Query the ClassyFire API repeatedly until the number of resolved structures stops increasing

In [17]:
# NOTE(review): `all_inchikeys` is not defined in any visible cell of this
# notebook; it has to exist in the kernel already for this cell to run.
# Define it explicitly (or load it) before relying on Restart & Run All.

# History of resolved counts, one entry per pass. The two leading zeros let
# the stop condition below index [-2] and [-3] from the very first pass.
resolved_ik_number_list = [0, 0]
total_inchi_number = len(all_inchikeys)

while True:

    start_time = time.time()

    print('%s inchikey to resolve' % total_inchi_number )
    # Presumably leaves its results in 'all_json.json', which is cleaned in
    # place and read back below — TODO confirm against pybatchclassyfire.
    get_classifications_cf_mod(all_inchikeys, par_level= 50)

    cleanse('all_json.json', 'all_json.json')

    with open("all_json.json") as classyfire_json_file:
        jsondic = json.load(classyfire_json_file)

    # Flatten the JSON records and keep a single row per SMILES.
    df = json_normalize(jsondic).drop_duplicates('smiles')
    resolved_ik_number = len(df.inchikey)
    resolved_ik_number_list.append( resolved_ik_number )
    print('%s resolved smiles' % resolved_ik_number )
    print("done in --- %s seconds ---" % (time.time() - start_time))

    # Stop once a pass resolves fewer entries than the previous one, or the
    # count is the same as it was two passes ago (no further progress).
    if resolved_ik_number_list[-1] < resolved_ik_number_list[-2] or resolved_ik_number_list[-1] == resolved_ik_number_list[-3]:
        break

69891 inchikey to resolve
64321 resolved smiles
done in --- 473.6717128753662 seconds ---
69891 inchikey to resolve
64321 resolved smiles
done in --- 147.92391800880432 seconds ---
69891 inchikey to resolve
64321 resolved smiles
done in --- 135.92525792121887 seconds ---


In [18]:
# Preview the first five ClassyFire rows.
df.head(n=5)

Unnamed: 0,smiles,inchikey,intermediate_nodes,alternative_parents,molecular_framework,substituents,description,external_descriptors,ancestors,predicted_chebi_terms,...,subclass.name,subclass.description,subclass.chemont_id,subclass.url,direct_parent.name,direct_parent.description,direct_parent.chemont_id,direct_parent.url,identifier,report
0,[H]C1(C)CCC2(CCC3(C)C(=CCC4([H])C5(C)CCC([H])(...,InChIKey=RRIMLWHUVCZACL-UHFFFAOYSA-N,"[{'name': 'Triterpene glycosides', 'descriptio...","[{'name': 'Triterpenoids', 'description': 'Ter...",Aliphatic heteropolycyclic compounds,"[Triterpene saponin, Triterpenoid, Hexose mono...",This compound belongs to the class of organic ...,[],"[Acetals, Alcohols and polyols, Carbohydrates ...","[triterpenoid (CHEBI:36615), hexose (CHEBI:181...",...,Terpene glycosides,Prenol lipids containing a carbohydrate moiety...,CHEMONTID:0002049,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpene saponins,Glycosylated derivatives of triterpene sapogen...,CHEMONTID:0002358,http://classyfire.wishartlab.com/tax_nodes/C00...,,
1,C[C@@]1(CC[C@@]2(C)[C@@H](CC[C@@]3(C)[C@@H]2C=...,InChIKey=CVAILKMOFONEDU-KRJMWWHISA-N,[],"[{'name': 'Sesquiterpenoids', 'description': '...",Aliphatic homopolycyclic compounds,"[18-oxosteroid, Oxosteroid, Sesquiterpenoid, H...",This compound belongs to the class of organic ...,[],"[Benzenoids, Carbonyl compounds, Carboxylic ac...","[sesquiterpenoid (CHEBI:26658), phenanthrenes ...",...,Oxosteroids,Steroid derivatives carrying a C=O group attac...,CHEMONTID:0001194,http://classyfire.wishartlab.com/tax_nodes/C00...,Oxosteroids,Steroid derivatives carrying a C=O group attac...,CHEMONTID:0001194,http://classyfire.wishartlab.com/tax_nodes/C00...,,
2,COC1=C(OC)C2=C(C=C1)C(=O)C1=C(O)C(C3C(OC4=C3C3...,InChIKey=BDURUBOYNAVRCF-UHFFFAOYSA-N,"[{'name': 'Acridines', 'description': 'Organic...","[{'name': 'Hydroquinolones', 'description': 'C...",Aromatic heteropolycyclic compounds,"[Acridone, Dihydroquinolone, Dihydroquinoline,...",This compound belongs to the class of organic ...,[],"[1-hydroxy-2-unsubstituted benzenoids, 1-hydro...","[quinolines (CHEBI:26513), 1-benzofurans (CHEB...",...,Benzoquinolines,Organic compounds containing a benzene fused t...,CHEMONTID:0001908,http://classyfire.wishartlab.com/tax_nodes/C00...,Acridones,Acridines containing a ketone group attached t...,CHEMONTID:0001811,http://classyfire.wishartlab.com/tax_nodes/C00...,,
3,CC(=O)OCC12C(CC(C)(O)C3(CC(OC3=O)c3ccoc3)C1(O)...,InChIKey=PLNVNTKQZPPADC-UHFFFAOYSA-N,[],"[{'name': 'Gamma butyrolactones', 'description...",Aromatic heteropolycyclic compounds,"[Tricarboxylic acid or derivatives, Gamma buty...",This compound belongs to the class of organic ...,[],"[Alcohols and polyols, Carbonyl compounds, Car...","[gamma-lactone (CHEBI:37581), oxolanes (CHEBI:...",...,Tricarboxylic acids and derivatives,Carboxylic acids containing exactly three carb...,CHEMONTID:0001986,http://classyfire.wishartlab.com/tax_nodes/C00...,Tricarboxylic acids and derivatives,Carboxylic acids containing exactly three carb...,CHEMONTID:0001986,http://classyfire.wishartlab.com/tax_nodes/C00...,,
4,COC(=O)[C@H](C\C=C\C(C)(C)O)[C@@H]1[C@@H](O)C[...,InChIKey=KTEXQEFLIOIDKM-GEMWEJHDSA-N,[],"[{'name': 'Cholesterols and derivatives', 'des...",Aliphatic homopolycyclic compounds,"[Triterpenoid, Cholesterol-skeleton, Cholestan...",This compound belongs to the class of organic ...,[],"[16-beta-hydroxysteroids, 16-hydroxysteroids, ...","[cholestanoid (CHEBI:50401), cholanoid (CHEBI:...",...,Triterpenoids,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpenoids,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,,


In [20]:
# Write the ClassyFire table as a gzip-compressed TSV (path is relative to the working directory).
df.to_csv(path_or_buf='classyfire_smiles.tsv.gz', compression='gzip', sep='\t')

In [22]:
# Write the NPClassifier table as a gzip-compressed TSV (path is relative to the working directory).
# NOTE(review): in the visible cells this is the full frame as loaded, not the
# subset matched into `smiles_to_np_classifier` — confirm that is intended.
np_classifier_df.to_csv(
    path_or_buf='npclassifier_smiles.tsv.gz',
    compression='gzip',
    sep='\t',
)