# Classifying chemicals using NPClassifier

### Import modules

In [None]:
from ast import literal_eval

import pandas as pd
from tqdm import tqdm

from rdkit import RDLogger  
from rdkit.Chem.inchi import MolToInchi, InchiToInchiKey
from rdkit.Chem import MolFromSmiles

In [None]:
# Turn off RDKit logging for every channel matching 'rdApp.*' so the parsing
# loops further down do not flood the notebook output.
RDLogger.DisableLog('rdApp.*')

In [None]:
# Relative path of the local data directory.
# NOTE(review): no visible cell reads this constant — confirm it is still needed.
DATA_DIR = '../data/'

Load plant-chemical associations

In [None]:
# Load only the two identifier columns of the plant-chemical association table
# (tab-separated, gzip-compressed, read straight from S3).
plant_chemical_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_chemical_associations.tsv.gz',
    sep='\t',
    compression='gzip',
    usecols=['plant_curie', 'chemical_curie'],
)

In [None]:
# Preview the first row to check the loaded columns.
plant_chemical_df.head(1)

In [None]:
# Number of distinct values in the `chemical_curie` column.
len(plant_chemical_df.chemical_curie.unique())

Get SMILES

In [None]:
# Load the SMILES table (tab-separated, gzip-compressed) from S3.
smiles_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/smiles.tsv.gz',
    compression='gzip',
    sep='\t',
)

In [None]:
# Preview the first row of the SMILES table.
smiles_df.head(1)

In [None]:
# Keep only the SMILES of chemicals that appear in the plant-chemical
# associations, and build lookups in both directions.
chemicals_in_plants = set(plant_chemical_df.chemical_curie.unique())

pubchem_to_smiles = {}
all_smiles = set()

# Each row of `smiles_df` is unpacked as (identifier, SMILES).
for raw_id, structure in tqdm(smiles_df.values):
    # Identifiers prefixed 'pubchem:' are rewritten to the 'pubchem.compound:'
    # prefix before being matched against the association table.
    curie = (
        raw_id.replace('pubchem:', 'pubchem.compound:')
        if raw_id.startswith('pubchem:')
        else raw_id
    )

    if curie in chemicals_in_plants:
        all_smiles.add(structure)
        pubchem_to_smiles[curie] = structure

# Reverse lookup; if several identifiers share one SMILES string, the one
# inserted last wins.
smiles_to_pubchem = {
    structure: curie
    for curie, structure in pubchem_to_smiles.items()
}

In [None]:
# Number of distinct SMILES strings retained by the filtering loop.
len(all_smiles)

In [None]:
# Convert every retained SMILES string to an InChIKey, counting the ones that
# cannot be converted in `skipped`.
all_inchikeys = set()
skipped = 0

for smiles in tqdm(all_smiles):
    try:
        mol = MolFromSmiles(smiles)
    except Exception:
        # Non-string input (e.g. a missing value) makes RDKit raise; treat it
        # like any other unparsable entry instead of aborting the loop.
        mol = None

    # RDKit signals an unparsable SMILES by returning None rather than
    # raising, so the failure has to be detected explicitly.
    if mol is None:
        skipped += 1
        continue

    inchi = MolToInchi(mol)
    # NOTE(review): an empty InChI presumably means generation failed for
    # this molecule — counted as skipped rather than hashed into a bogus key.
    if not inchi:
        skipped += 1
        continue

    all_inchikeys.add(InchiToInchiKey(inchi))

In [None]:
# Display the value of the `skipped` counter after the InChIKey conversion loop.
skipped

In [None]:
# Load the NPClassifier table from a parquet file on S3.
np_classifier_df = pd.read_parquet(
    's3://enveda-data-kg/kg3/raw_source_downloads/manual_uploads/np_classifier.parquet'
)

In [None]:
# Preview the first five rows of the NPClassifier table.
np_classifier_df.head(5)

In [None]:
# Collect NPClassifier annotations for the chemicals retained earlier.
# Despite its name, `smiles_to_np_classifier` is keyed by the identifier that
# `smiles_to_pubchem` maps each SMILES string to, not by the SMILES itself.
smiles_to_np_classifier = {}

all_classes = set()
all_superclasses = set()
all_pathways = set()

# Each row is unpacked positionally into six fields; the first is unused.
for _, smiles, classes, superclasses, pathways, is_glycoside in np_classifier_df.values:

    if smiles not in smiles_to_pubchem:
        continue

    # The three annotation columns are stored as Python-literal strings.
    # Parse each exactly once and reuse the result for both the global
    # vocabularies and the per-chemical record.
    class_list = literal_eval(classes)
    superclass_list = literal_eval(superclasses)
    pathway_list = literal_eval(pathways)

    all_classes.update(class_list)
    all_superclasses.update(superclass_list)
    all_pathways.update(pathway_list)

    smiles_to_np_classifier[smiles_to_pubchem[smiles]] = {
        "classes": class_list,
        "superclasses": superclass_list,
        "pathways": pathway_list,
        "is_glycoside": is_glycoside,
    }
    

In [None]:
# Sizes of the class, superclass and pathway vocabularies collected above.
len(all_classes), len(all_superclasses), len(all_pathways)

In [None]:
# Write the NPClassifier table to a gzip-compressed TSV in the current
# working directory.
# NOTE(review): in the visible cells `np_classifier_df` is never filtered, so
# this saves the table as loaded (with its index column), not the
# `smiles_to_np_classifier` mapping built in this notebook — confirm that this
# is the intended output.
np_classifier_df.to_csv('npclassifier_smiles.tsv.gz', sep='\t', compression='gzip')