# Exploring the differences between medicinal and non medicinal plants using chemical classes

### Import modules

In [23]:
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

# Import taxonomy
from utils import get_genus_and_family_info_for_plants

Load plant-chemical and plant-disease associations

In [13]:
plant_disease_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_disease_associations.tsv.gz',
    compression='gzip',
    sep='\t',
)

In [14]:
plant_disease_df.head(1)

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8


Medicinal plants

In [15]:
medicinal_plants = plant_disease_df.plant_curie.unique()

Load chemicals

In [10]:
plant_chemical_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/plant_chemical_associations.tsv.zip',
    compression='zip',
    sep='\t',
    usecols=[
        'plant_curie',
        'chemical_curie',
    ]
)

In [11]:
plant_chemical_df.head(1)

Unnamed: 0,plant_curie,chemical_curie
0,ncbitaxon:1000425,pubchem.compound:3527


In [6]:
len(plant_chemical_df.chemical_curie.unique())

71179

Load ClassyFire classes

In [16]:
classyfire_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/classyfire_smiles.tsv.gz',
    sep='\t',
    compression='gzip',
)

  classyfire_df = pd.read_csv(


In [17]:
classyfire_df.head(1)

Unnamed: 0.1,Unnamed: 0,smiles,inchikey,intermediate_nodes,alternative_parents,molecular_framework,substituents,description,external_descriptors,ancestors,...,subclass.name,subclass.description,subclass.chemont_id,subclass.url,direct_parent.name,direct_parent.description,direct_parent.chemont_id,direct_parent.url,identifier,report
0,0,[H]C1(C)CCC2(CCC3(C)C(=CCC4([H])C5(C)CCC([H])(...,InChIKey=RRIMLWHUVCZACL-UHFFFAOYSA-N,"[{'name': 'Triterpene glycosides', 'descriptio...","[{'name': 'Triterpenoids', 'description': 'Ter...",Aliphatic heteropolycyclic compounds,"['Triterpene saponin', 'Triterpenoid', 'Hexose...",This compound belongs to the class of organic ...,[],"['Acetals', 'Alcohols and polyols', 'Carbohydr...",...,Terpene glycosides,Prenol lipids containing a carbohydrate moiety...,CHEMONTID:0002049,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpene saponins,Glycosylated derivatives of triterpene sapogen...,CHEMONTID:0002358,http://classyfire.wishartlab.com/tax_nodes/C00...,,


Load NPClassifier classes

In [19]:
np_classifier_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/npclassifier_smiles.tsv.gz',
    sep='\t',
    compression='gzip',
)

In [20]:
np_classifier_df.head(1)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pubchem_openeye_can_smiles,class_results,superclass_results,pathway_results,is_glycoside
0,2,2,C1=CC(C(C(=C1)C(=O)O)O)O,"['Shikimic acids and derivatives', 'Simple phe...",['Phenolic acids (C6-C1)'],['Shikimates and Phenylpropanoids'],False


In [21]:
smiles_df = pd.read_csv(
    's3://enveda-datascience/daniel_domingo/smiles.tsv.gz',
    sep='\t',
    compression='gzip',
)

In [22]:
pubchem_to_smiles = {}
all_smiles = set()

chemicals_in_plants = set(plant_chemical_df.chemical_curie.unique())

for pubchem_id, smiles in tqdm(smiles_df.values):
        
    if pubchem_id.startswith('pubchem:'):
        pubchem_id = pubchem_id.replace('pubchem:', 'pubchem.compound:')
        
    if pubchem_id not in chemicals_in_plants:
        continue
    
    all_smiles.add(smiles)
    pubchem_to_smiles[pubchem_id] = smiles

smiles_to_pubchem = {
    smiles: pubchem
    for pubchem, smiles in pubchem_to_smiles.items()
}

100%|██████████| 323369/323369 [00:00<00:00, 469135.53it/s]


#### Make Compound -> Class mapping

In [None]:
# Iterate through each df, map the pubchem using the smiles-> pubchem and make a pubchem->classes dict

#### Create plant vectors
e.g., plant A: [class1, class2, class3, ...]

In [None]:
def create_np_classifier_vectors():
    
    ...
    
    
def create_classyfire_vectors():
    
    ...

In [None]:
plant_to_classyfire_vectors = create_classyfire_vectors()

In [None]:
plant_to_npclassifier_vectors = create_np_classifier_vectors()

#### Load Plant -> Family mappings

In [None]:
_, family_to_species = get_genus_and_family_info_for_plants(
    set(plant_chemical_df.plant_curie.unique())
)

1. Iterate through families and separate the family into two groups (medicinal and non-medicinal plants)
2. For each group, aggregate the vector of all its plant

Plot PCA of the vectors for each family (2 categories per family)
Color by group (one color for medicinal plants and another one for non-medicinal ones)