# Classify plant chemicals using ClassyFire and NPClassifier

### Import modules

In [1]:
from collections import defaultdict
import time
import json
from pandas import json_normalize
from ast import literal_eval

from pybatchclassyfire import *
import pandas as pd
from tqdm import tqdm

from rdkit import RDLogger  
from rdkit.Chem.inchi import MolToInchi, InchiToInchiKey
from rdkit.Chem import MolFromSmiles

  from pandas.core.computation.check import NUMEXPR_INSTALLED
INFO:rdkit:Enabling RDKit 2022.09.1 jupyter extensions


In [2]:
# Disable RDKit logging for every channel matching 'rdApp.*' so that
# per-molecule messages do not flood the cell outputs below.
RDLogger.DisableLog('rdApp.*')

In [3]:
# Directory holding the input files, relative to the notebook location.
# NOTE(review): the value ends with '/', and the loaders in this notebook join
# it with another '/', producing paths such as '../data//smiles.tsv.gz'.
DATA_DIR = '../data/'

Load plant-chemical associations

In [4]:
# Read the zipped, tab-separated plant-chemical association table, keeping
# only the two identifier columns used in this notebook.
plant_chemical_path = f'{DATA_DIR}/plant_chemical_associations.tsv.zip'
plant_chemical_columns = ['plant_curie', 'chemical_curie']

plant_chemical_df = pd.read_csv(
    plant_chemical_path,
    sep='\t',
    compression='zip',
    usecols=plant_chemical_columns,
)

In [5]:
# Show the first row to confirm which columns were loaded.
plant_chemical_df.head(1)

Unnamed: 0,plant_curie,chemical_curie
0,ncbitaxon:1000425,pubchem.compound:3527


In [6]:
# Number of distinct values in the chemical_curie column.
len(plant_chemical_df.chemical_curie.unique())

71179

Get SMILES

In [7]:
# Read the gzip-compressed, tab-separated table of SMILES strings.
smiles_path = f'{DATA_DIR}/smiles.tsv.gz'
smiles_df = pd.read_csv(smiles_path, compression='gzip', sep='\t')

In [8]:
# Show the first row; the loop in the next cell unpacks each row as an
# (identifier, SMILES) pair.
smiles_df.head(1)

Unnamed: 0,pubchem id,smiles
0,pubchem:3527,CC(=CCCC1(C(CC2(C(=O)C(=C(C3=CC(=C(C=C3)O)O)O)...


In [9]:
# Build lookups between PubChem CURIEs and SMILES, restricted to chemicals
# that occur in the plant-chemical associations.
chemicals_in_plants = set(plant_chemical_df.chemical_curie.unique())

all_smiles = set()
pubchem_to_smiles = {}

for raw_id, smiles in tqdm(smiles_df.values):

    # Rewrite the 'pubchem:' prefix to 'pubchem.compound:' so identifiers are
    # comparable with the values in chemical_curie.
    if raw_id.startswith('pubchem:'):
        curie = raw_id.replace('pubchem:', 'pubchem.compound:')
    else:
        curie = raw_id

    if curie in chemicals_in_plants:
        all_smiles.add(smiles)
        pubchem_to_smiles[curie] = smiles

# Inverse lookup. NOTE(review): when several PubChem IDs share one SMILES,
# only the last one encountered is kept.
smiles_to_pubchem = dict(
    zip(pubchem_to_smiles.values(), pubchem_to_smiles.keys())
)

100%|██████████| 323369/323369 [00:00<00:00, 495993.42it/s]


In [10]:
# Number of distinct SMILES retained for chemicals found in plants.
len(all_smiles)

69954

In [11]:
# Convert every retained SMILES into an InChIKey.
#
# A SMILES is counted in `skipped` (and contributes no key) when:
#   * RDKit cannot parse it -- MolFromSmiles returns None in that case rather
#     than raising, so the result has to be checked explicitly;
#   * any conversion step raises (e.g. a non-string value such as NaN);
#   * no InChIKey could be produced from the molecule.
all_inchikeys = set()
skipped = 0

for smiles in tqdm(all_smiles):
    try:
        mol = MolFromSmiles(smiles)
        inchikey = InchiToInchiKey(MolToInchi(mol)) if mol is not None else None
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt still
        # stops the loop.
        inchikey = None

    if not inchikey:
        skipped += 1
        # Skip explicitly: never fall through with a stale `mol` left over
        # from a previous iteration.
        continue

    all_inchikeys.add(inchikey)

100%|██████████| 69954/69954 [00:39<00:00, 1793.24it/s]


In [12]:
# Number of SMILES counted as skipped by the InChIKey conversion loop.
skipped

0

In [13]:
# Load the NPClassifier results table from S3.
# NOTE(review): reading an s3:// URI needs S3 access from this environment
# (the captured log shows s3fs/botocore signing the requests).
np_classifier_uri = (
    's3://enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet'
)
np_classifier_df = pd.read_parquet(np_classifier_uri)

DEBUG:botocore.hooks:Changing event name from creating-client-class.iot-data to creating-client-class.iot-data-plane
DEBUG:botocore.hooks:Changing event name from before-call.apigateway to before-call.api-gateway
DEBUG:botocore.hooks:Changing event name from request-created.machinelearning.Predict to request-created.machine-learning.Predict
DEBUG:botocore.hooks:Changing event name from before-parameter-build.autoscaling.CreateLaunchConfiguration to before-parameter-build.auto-scaling.CreateLaunchConfiguration
DEBUG:botocore.hooks:Changing event name from before-parameter-build.route53 to before-parameter-build.route-53
DEBUG:botocore.hooks:Changing event name from request-created.cloudsearchdomain.Search to request-created.cloudsearch-domain.Search
DEBUG:botocore.hooks:Changing event name from docs.*.autoscaling.CreateLaunchConfiguration.complete-section to docs.*.auto-scaling.CreateLaunchConfiguration.complete-section
DEBUG:botocore.hooks:Changing event name from before-parameter-buil

DEBUG:botocore.parsers:Response body:
b''
DEBUG:botocore.hooks:Event needs-retry.s3.HeadObject: calling handler <botocore.retryhandler.RetryHandler object at 0x7fc8596c3e50>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event needs-retry.s3.HeadObject: calling handler <bound method S3RegionRedirector.redirect_from_error of <botocore.utils.S3RegionRedirector object at 0x7fc8596c3eb0>>
DEBUG:s3fs:CALL: head_object - ({},) - {'Bucket': 'enveda-data-kg', 'Key': 'kg3/raw_source_downloads/manual_uploads/np_classifier.parquet'}
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <function sse_md5 at 0x7fc878c4b040>
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <function validate_bucket_name at 0x7fc878c47f70>
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <bound method S3RegionRedirector.redirect_from_cache of <botocore.utils.S3RegionRedirector object at 0x7fc8596c3eb0>>
DEBUG

DEBUG:botocore.hooks:Event choose-signer.s3.HeadObject: calling handler <function set_operation_specific_signer at 0x7fc878c47ca0>
DEBUG:botocore.hooks:Event before-sign.s3.HeadObject: calling handler <bound method S3EndpointSetter.set_endpoint of <botocore.utils.S3EndpointSetter object at 0x7fc848d24040>>
DEBUG:botocore.utils:Checking for DNS compatible bucket for: https://s3.amazonaws.com/enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet
DEBUG:botocore.utils:URI updated to: https://enveda-data-kg.s3.amazonaws.com/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet
DEBUG:botocore.auth:Calculating signature using v4 auth.
DEBUG:botocore.auth:CanonicalRequest:
HEAD
/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet

host:enveda-data-kg.s3.amazonaws.com
x-amz-content-sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
x-amz-date:20230208T120203Z

host;x-amz-content-sha256;x-amz-date
e3b0c44298fc1c149afbf4c8996fb92427ae4

DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x7fc869951dc0>
DEBUG:botocore.hooks:Event needs-retry.s3.GetObject: calling handler <botocore.retryhandler.RetryHandler object at 0x7fc8596c3e50>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event needs-retry.s3.GetObject: calling handler <bound method S3RegionRedirector.redirect_from_error of <botocore.utils.S3RegionRedirector object at 0x7fc8596c3eb0>>
DEBUG:s3fs:CALL: head_object - ({},) - {'Bucket': 'enveda-data-kg', 'Key': 'kg3/raw_source_downloads/manual_uploads/np_classifier.parquet'}
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <function sse_md5 at 0x7fc878c4b040>
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <function validate_bucket_name at 0x7fc878c47f70>
DEBUG:botocore.hooks:Event before-parameter-build.s3.HeadObject: calling handler <bound method S3RegionRedirector.redirect_from_cache of <botocore.util

DEBUG:botocore.hooks:Event choose-signer.s3.HeadObject: calling handler <function set_operation_specific_signer at 0x7fc878c47ca0>
DEBUG:botocore.hooks:Event before-sign.s3.HeadObject: calling handler <bound method S3EndpointSetter.set_endpoint of <botocore.utils.S3EndpointSetter object at 0x7fc848d24040>>
DEBUG:botocore.utils:Checking for DNS compatible bucket for: https://s3.amazonaws.com/enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet
DEBUG:botocore.utils:URI updated to: https://enveda-data-kg.s3.amazonaws.com/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet
DEBUG:botocore.auth:Calculating signature using v4 auth.
DEBUG:botocore.auth:CanonicalRequest:
HEAD
/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet

host:enveda-data-kg.s3.amazonaws.com
x-amz-content-sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
x-amz-date:20230208T120204Z

host;x-amz-content-sha256;x-amz-date
e3b0c44298fc1c149afbf4c8996fb92427ae4

DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x7fc86996a100>
DEBUG:botocore.hooks:Event needs-retry.s3.GetObject: calling handler <botocore.retryhandler.RetryHandler object at 0x7fc8596c3e50>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event needs-retry.s3.GetObject: calling handler <bound method S3RegionRedirector.redirect_from_error of <botocore.utils.S3RegionRedirector object at 0x7fc8596c3eb0>>
DEBUG:fsspec:<File-like object S3FileSystem, enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet> read: 4 - 18573274
DEBUG:s3fs:Fetch: enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet, 4-18512416
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function sse_md5 at 0x7fc878c4b040>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function validate_bucket_name at 0x7fc878c47f70>
DEBUG:botocore.hooks:Event before-parameter-

In [14]:
# Preview the first five rows; the *_results columns hold stringified Python
# lists that are parsed with literal_eval in the next cell.
np_classifier_df.head(5)

Unnamed: 0.1,Unnamed: 0,pubchem_openeye_can_smiles,class_results,superclass_results,pathway_results,is_glycoside
2,2,C1=CC(C(C(=C1)C(=O)O)O)O,"['Shikimic acids and derivatives', 'Simple phe...",['Phenolic acids (C6-C1)'],['Shikimates and Phenylpropanoids'],False
3,3,CC(CN)O,[],[],[],False
4,4,C(C(=O)COP(=O)(O)O)N,['Aminoacids'],['Small peptides'],"['Amino acids and Peptides', 'Shikimates and P...",False
6,6,CCN1C=NC2=C(N=CN=C21)N,['Purine alkaloids'],['Pseudoalkaloids'],['Alkaloids'],False
7,7,CCC(C)(C(C(=O)O)O)O,['Hydroxy fatty acids'],['Fatty Acids and Conjugates'],['Fatty acids'],False


In [15]:
# Collect the NPClassifier annotations for every compound whose SMILES is
# present in `smiles_to_pubchem`.
#
# NOTE: despite its name, `smiles_to_np_classifier` is keyed by the PubChem
# CURIE obtained from `smiles_to_pubchem`, not by the SMILES string.
smiles_to_np_classifier = {}

all_classes = set()
all_superclasses = set()
all_pathways = set()

# Rows are unpacked positionally, so this relies on the column order of
# np_classifier_df; the first column is ignored.
for _, smiles, classes, superclasses, pathways, is_glycoside in np_classifier_df.values:

    if smiles not in smiles_to_pubchem:
        continue

    # The label columns are stringified Python lists; parse each one once
    # and reuse the result for both the label sets and the per-compound record.
    class_labels = literal_eval(classes)
    superclass_labels = literal_eval(superclasses)
    pathway_labels = literal_eval(pathways)

    all_classes.update(class_labels)
    all_superclasses.update(superclass_labels)
    all_pathways.update(pathway_labels)

    smiles_to_np_classifier[smiles_to_pubchem[smiles]] = {
        "classes": class_labels,
        "superclasses": superclass_labels,
        "pathways": pathway_labels,
        "is_glycoside": is_glycoside,
    }
    

In [16]:
# Number of distinct class, superclass and pathway labels collected above.
len(all_classes), len(all_superclasses), len(all_pathways)

(524, 71, 7)

Query the ClassyFire API

In [17]:
# Repeatedly submit the InChIKeys to ClassyFire until the number of resolved
# structures stops improving.
#
# The history is seeded with two zeros so the look-backs at [-2] and [-3] are
# valid on the first pass.
resolved_ik_number_list = [0, 0]
total_inchi_number = len(all_inchikeys)

while True:

    start_time = time.time()

    print('%s inchikey to resolve' % total_inchi_number )

    # NOTE(review): presumably these pybatchclassyfire helpers write the raw
    # results to 'all_json.json' and then clean that file in place -- confirm
    # against the library.
    get_classifications_cf_mod(all_inchikeys, par_level= 50)
    cleanse('all_json.json', 'all_json.json')

    with open("all_json.json") as json_file:
        jsondic = json.load(json_file)

    # One row per distinct SMILES.
    df = json_normalize(jsondic).drop_duplicates('smiles')

    resolved_ik_number = len(df.inchikey)
    resolved_ik_number_list.append(resolved_ik_number)

    print('%s resolved smiles' % resolved_ik_number )
    print("done in --- %s seconds ---" % (time.time() - start_time))

    # Stop when the count dropped compared with the previous pass, or when it
    # equals the count from two passes ago.
    *_, two_passes_ago, previous_pass, latest_pass = resolved_ik_number_list
    if latest_pass < previous_pass or latest_pass == two_passes_ago:
        break

69891 inchikey to resolve
64321 resolved smiles
done in --- 473.6717128753662 seconds ---
69891 inchikey to resolve
64321 resolved smiles
done in --- 147.92391800880432 seconds ---
69891 inchikey to resolve
64321 resolved smiles
done in --- 135.92525792121887 seconds ---


In [18]:
# Preview the flattened ClassyFire results from the last pass of the loop.
df.head()

Unnamed: 0,smiles,inchikey,intermediate_nodes,alternative_parents,molecular_framework,substituents,description,external_descriptors,ancestors,predicted_chebi_terms,...,subclass.name,subclass.description,subclass.chemont_id,subclass.url,direct_parent.name,direct_parent.description,direct_parent.chemont_id,direct_parent.url,identifier,report
0,[H]C1(C)CCC2(CCC3(C)C(=CCC4([H])C5(C)CCC([H])(...,InChIKey=RRIMLWHUVCZACL-UHFFFAOYSA-N,"[{'name': 'Triterpene glycosides', 'descriptio...","[{'name': 'Triterpenoids', 'description': 'Ter...",Aliphatic heteropolycyclic compounds,"[Triterpene saponin, Triterpenoid, Hexose mono...",This compound belongs to the class of organic ...,[],"[Acetals, Alcohols and polyols, Carbohydrates ...","[triterpenoid (CHEBI:36615), hexose (CHEBI:181...",...,Terpene glycosides,Prenol lipids containing a carbohydrate moiety...,CHEMONTID:0002049,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpene saponins,Glycosylated derivatives of triterpene sapogen...,CHEMONTID:0002358,http://classyfire.wishartlab.com/tax_nodes/C00...,,
1,C[C@@]1(CC[C@@]2(C)[C@@H](CC[C@@]3(C)[C@@H]2C=...,InChIKey=CVAILKMOFONEDU-KRJMWWHISA-N,[],"[{'name': 'Sesquiterpenoids', 'description': '...",Aliphatic homopolycyclic compounds,"[18-oxosteroid, Oxosteroid, Sesquiterpenoid, H...",This compound belongs to the class of organic ...,[],"[Benzenoids, Carbonyl compounds, Carboxylic ac...","[sesquiterpenoid (CHEBI:26658), phenanthrenes ...",...,Oxosteroids,Steroid derivatives carrying a C=O group attac...,CHEMONTID:0001194,http://classyfire.wishartlab.com/tax_nodes/C00...,Oxosteroids,Steroid derivatives carrying a C=O group attac...,CHEMONTID:0001194,http://classyfire.wishartlab.com/tax_nodes/C00...,,
2,COC1=C(OC)C2=C(C=C1)C(=O)C1=C(O)C(C3C(OC4=C3C3...,InChIKey=BDURUBOYNAVRCF-UHFFFAOYSA-N,"[{'name': 'Acridines', 'description': 'Organic...","[{'name': 'Hydroquinolones', 'description': 'C...",Aromatic heteropolycyclic compounds,"[Acridone, Dihydroquinolone, Dihydroquinoline,...",This compound belongs to the class of organic ...,[],"[1-hydroxy-2-unsubstituted benzenoids, 1-hydro...","[quinolines (CHEBI:26513), 1-benzofurans (CHEB...",...,Benzoquinolines,Organic compounds containing a benzene fused t...,CHEMONTID:0001908,http://classyfire.wishartlab.com/tax_nodes/C00...,Acridones,Acridines containing a ketone group attached t...,CHEMONTID:0001811,http://classyfire.wishartlab.com/tax_nodes/C00...,,
3,CC(=O)OCC12C(CC(C)(O)C3(CC(OC3=O)c3ccoc3)C1(O)...,InChIKey=PLNVNTKQZPPADC-UHFFFAOYSA-N,[],"[{'name': 'Gamma butyrolactones', 'description...",Aromatic heteropolycyclic compounds,"[Tricarboxylic acid or derivatives, Gamma buty...",This compound belongs to the class of organic ...,[],"[Alcohols and polyols, Carbonyl compounds, Car...","[gamma-lactone (CHEBI:37581), oxolanes (CHEBI:...",...,Tricarboxylic acids and derivatives,Carboxylic acids containing exactly three carb...,CHEMONTID:0001986,http://classyfire.wishartlab.com/tax_nodes/C00...,Tricarboxylic acids and derivatives,Carboxylic acids containing exactly three carb...,CHEMONTID:0001986,http://classyfire.wishartlab.com/tax_nodes/C00...,,
4,COC(=O)[C@H](C\C=C\C(C)(C)O)[C@@H]1[C@@H](O)C[...,InChIKey=KTEXQEFLIOIDKM-GEMWEJHDSA-N,[],"[{'name': 'Cholesterols and derivatives', 'des...",Aliphatic homopolycyclic compounds,"[Triterpenoid, Cholesterol-skeleton, Cholestan...",This compound belongs to the class of organic ...,[],"[16-beta-hydroxysteroids, 16-hydroxysteroids, ...","[cholestanoid (CHEBI:50401), cholanoid (CHEBI:...",...,Triterpenoids,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpenoids,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,,


In [20]:
# Save the ClassyFire results (de-duplicated on SMILES) as a gzip-compressed
# TSV in the current working directory; the index is written as the first column.
df.to_csv('classyfire_smiles.tsv.gz', sep='\t', compression='gzip')

In [22]:
# Save the NPClassifier table as a gzip-compressed TSV in the current working
# directory.
# NOTE(review): this writes the table exactly as loaded, not only the rows
# matched to plant chemicals -- confirm that is intended.
np_classifier_df.to_csv('npclassifier_smiles.tsv.gz', sep='\t', compression='gzip')