In [2]:
import pandas as pd
import numpy as np
import pickle
import time

In [4]:
import os
os.chdir('..')

In [13]:
from config import mfp
from src.data_loader import load_ctd
import src.tools.chemidr.id_map as id_map

In [118]:
hdata = load_ctd()

In [138]:
ctd_mesh = hdata.drop_duplicates('ChemicalID').copy().reset_index(drop=True)

mesh_cid_map = []

start = time.time()
for mesh in ctd_mesh.ChemicalID.tolist():
    mesh_cid_map.append(id_map.mesh2pid(mesh))

    if not i % 400:
        print(i, 'loops done in', (time.time() - start) / 60, 'min')
    i+=1
    
    # uncomment to re-run, or load saved id's below
#     pickle.dump( mesh_cid_map, open( "misc_save/ctd_mesh_cid_map.pkl", "wb" ) )

11600 loops done in 0.0064910014470418295 min
12000 loops done in 3.0944095174471538 min
12400 loops done in 6.416819866498312 min
12800 loops done in 9.554338236649832 min
13200 loops done in 12.79873957236608 min
13600 loops done in 15.923254267374675 min
14000 loops done in 19.15265403588613 min
14400 loops done in 22.403776121139526 min
14800 loops done in 25.879218264420828 min
15200 loops done in 29.407595113913217 min
15600 loops done in 32.89271252155304 min


In [14]:
with open(mfp("misc_save/ctd_mesh_cid_map.pkl"), "rb") as f:
    ids = pickle.load(f)

ids_dict = {}
for d in ids:
    ids_dict.update(d)

cids = [float(v['cid']) for _, v in ids_dict.items()]

hchems = pd.DataFrame({'chem_id_p' : cids, 'ChemicalName' : [np.nan] * len(cids)})
hchems['chem_id'] = np.nan
hchems['chem_id'] = hchems['chem_id'].astype(object)
hchems.at[hchems[hchems.chem_id_p.notnull()].index, 'chem_id'] = id_map.cids2inchis(hchems[hchems.chem_id_p.notnull()].chem_id_p.tolist(), use_prefix=True)

In [20]:
def disambiguation_table(fm_q, fm_uq, fdb_d_q, fdb_d_uq, usda):
    fm_ids = list(set( fm_q[fm_q['chem_id'].notnull()].chem_id.tolist() + fm_uq[fm_uq['chem_id'].notnull()].chem_id.tolist() ))
    fdb_ids = list(set( fdb_d_q[fdb_d_q['chem_id'].notnull()].chem_id.tolist() + fdb_d_uq[fdb_d_uq['chem_id'].notnull()].chem_id.tolist() ))
    usda_ids = list(set( usda[(usda['chem_id'].notnull()) & (usda.Nutr_Val > 0)].chem_id.tolist() ))

    disp = pd.DataFrame({
            '' : ['foodmine', 'foodb', 'usda'],
            'quant' : [
                len( fm_q[fm_q.average_mean > 0].merge(hchems, how='inner', on='chem_id').dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') ),
                len( fdb_d_q[fdb_d_q.standard_content > 0].merge(hchems, how='inner', on='chem_id').dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') ),
                len( usda[usda.Nutr_Val > 0].merge(hchems, how='inner', on='chem_id').dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') )
            ],
            'unquant' : [
                len( fm_uq.merge(hchems, how='inner', on='chem_id').dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') ),
                len( fdb_d_uq.merge(hchems, how='inner', on='chem_id').dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') ),
                0
            ],
            'absent' : [
                len( hchems[~hchems.chem_id.isin(fm_ids)].dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') )
                + len( hchems[hchems.chem_id.isnull()].ChemicalName.drop_duplicates() ),
                len( hchems[~hchems.chem_id.isin(fdb_ids)].dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') )
                + len( hchems[hchems.chem_id.isnull()].ChemicalName.drop_duplicates() ),
                len( hchems[~hchems.chem_id.isin(usda_ids)].dropna(subset=['chem_id'], axis=0).drop_duplicates('chem_id') )
                + len( hchems[hchems.chem_id.isnull()].ChemicalName.drop_duplicates() )
            ]
        })

    disp['total'] = disp.sum(axis=1)
    display(disp)

### Load data related to garlic

In [21]:
fdb_d = pd.read_pickle('misc_save/garlic_foodb_food_dump.pkl')
fdb_d.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)

# Creates a list of the unique chemicals in garlic from FooDB
foodb_food_lower = list(set( fdb_d.chem_id.tolist() ))

# Creates a separate dataframe that holds chemicals for garlic in foodb with a real quantification
fdb_d_q = fdb_d[fdb_d.standard_content.notnull()][['chem_id', 'chem_id_p', 'chem_id_f', 'orig_source_id','name', 'standard_content']].drop_duplicates()

# Creates a separate dataframe that holds chemicals for garlic in foodb without a real quantification
fdb_d_uq = fdb_d[fdb_d.standard_content.isnull()][['chem_id', 'chem_id_p', 'chem_id_f', 'orig_source_id', 'name', 'standard_content']].reset_index()

q_ids = list(set( fdb_d_q.chem_id.tolist() ))
q_names = list(set( fdb_d_q.name.tolist() ))
fdb_d_uq = fdb_d_uq[(~fdb_d_uq.chem_id.fillna('-').isin(q_ids)) & (~fdb_d_uq.name.fillna('-').isin(q_names))]


# Load FoodMine data
fm = pd.read_pickle('misc_save/garlic_fm.pkl')
fm_q = pd.read_pickle('misc_save/garlic_fm_quant.pkl')
fm_uq = pd.read_pickle('misc_save/garlic_fm_unquant.pkl')
fm.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)
fm_q.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)
fm_uq.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)


# Load USDA data
usda = pd.read_pickle('misc_save/garlic_usda.pkl')
usda.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)

In [22]:
disambiguation_table(fm_q, fm_uq, fdb_d_q, fdb_d_uq, usda)

Unnamed: 0,Unnamed: 1,quant,unquant,absent,total
0,foodmine,138,24,12086,12248
1,foodb,89,485,11674,12248
2,usda,37,0,12211,12248


### Load data related to cocoa

In [21]:
fdb_d = pd.read_pickle('misc_save/cocoa_foodb_food_dump.pkl')
fdb_d.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)

# Creates a list of the unique chemicals in cocoa from FooDB
foodb_food_lower = list(set( fdb_d.chem_id.tolist() ))

# Creates a separate dataframe that holds chemicals for cocoa in foodb with a real quantification
fdb_d_q = fdb_d[fdb_d.standard_content.notnull()][['chem_id', 'chem_id_p', 'chem_id_f', 'orig_source_id','name', 'standard_content']].drop_duplicates()

# Creates a separate dataframe that holds chemicals for cocoa in foodb without a real quantification
fdb_d_uq = fdb_d[fdb_d.standard_content.isnull()][['chem_id', 'chem_id_p', 'chem_id_f', 'orig_source_id', 'name', 'standard_content']].reset_index()

q_ids = list(set( fdb_d_q.chem_id.tolist() ))
q_names = list(set( fdb_d_q.name.tolist() ))
fdb_d_uq = fdb_d_uq[(~fdb_d_uq.chem_id.fillna('-').isin(q_ids)) & (~fdb_d_uq.name.fillna('-').isin(q_names))]


# Loads FoodMine data
fm = pd.read_pickle('misc_save/cocoa_fm.pkl')
fm_q = pd.read_pickle('misc_save/cocoa_fm_quant.pkl')
fm_uq = pd.read_pickle('misc_save/cocoa_fm_unquant.pkl')
fm.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)
fm_q.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)
fm_uq.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)


# Load USDA data
usda = pd.read_pickle('misc_save/cocoa_usda.pkl')
usda.rename(columns={'pubchem_id' : 'chem_id_p', 'foodb_id' : 'chem_id_f'}, inplace=True)

In [22]:
disambiguation_table(fm_q, fm_uq, fdb_d_q, fdb_d_uq, usda)

Unnamed: 0,Unnamed: 1,quant,unquant,absent,total
0,foodmine,138,24,12086,12248
1,foodb,72,467,11709,12248
2,usda,25,0,12223,12248
