In [7]:
import pandas as pd
import numpy as np
from time import time, sleep
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
mpl.rcParams['figure.dpi']= 150

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

import urllib.request as request
from lxml import etree

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:
import importlib
import sys
# Add the local ./misc directory to the module search path.
sys.path.append('./misc')

# Import the helper module, then reload it so that re-running this cell
# re-executes misc/pubmed_util.py; the final import binds it to the alias PMD.
import misc.pubmed_util
importlib.reload(misc.pubmed_util)
import misc.pubmed_util as PMD

In [3]:
# Food whose cached tables are loaded below; swap the comment to switch food.
food = 'garlic'
#food = 'cocoa'

# Three cached tables for the selected food. They are tagged further down as
# coming from the 'pilot', 'FooDB' and 'USDA' sources respectively.
food_data = pd.read_pickle('misc_save/' + food + '_food_data.pkl')
foodb_food = pd.read_pickle('misc_save/' + food + '_foodb_food.pkl')
usda = pd.read_pickle('misc_save/' + food + '_usda_conv.pkl')

# FooDB rows are kept only when they report a content value and that value
# is expressed in one of the accepted unit spellings.
u = ['mg/100g', 'mg/100']
foodb_food = foodb_food[foodb_food.standard_content.notnull()
                        & foodb_food.units.isin(u)]

# Bring FooDB and USDA onto the shared column names ('chemical',
# 'average_mean') and tag every table with the source it came from.
foodb_food = (
    foodb_food
    .rename(columns={'orig_source_name' : 'chemical', 'standard_content':'average_mean'})
    .assign(source='FooDB')
)
usda = (
    usda
    .rename(columns = {'nut_desc' : 'chemical', 'usda_amount' : 'average_mean'})
    .assign(source='USDA')
)
food_data = food_data.assign(source='pilot')

In [4]:
def filt(df, simple=False):
    """Return the distinct chemical rows of ``df`` that have a ``chem_id_p``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'chemical' and 'chem_id_p'; when ``simple`` is false it
        must also contain 'average_mean' and 'source'.
    simple : bool, default False
        If true, keep only the 'chemical' and 'chem_id_p' columns.
        Otherwise also keep 'average_mean' and 'source'.

    Returns
    -------
    pandas.DataFrame
        The selected columns with duplicate rows removed, rows lacking a
        'chem_id_p' dropped, and a fresh 0..n-1 index.
    """
    wanted = ['chemical', 'chem_id_p']
    if not simple:
        wanted = wanted + ['average_mean', 'source']

    unique_rows = df[wanted].drop_duplicates()
    identified = unique_rows[unique_rows['chem_id_p'].notnull()]

    return identified.reset_index(drop=True)

# Stack the (chemical, chem_id_p) pairs of all three sources into one table.
# ignore_index=True gives every row its own label: each filt() result is
# indexed from 0, so without it the labels repeat across sources and
# label-based writes such as chems.at[idx, ...] would touch several rows.
chems = pd.concat([filt(food_data, simple=True),
                   filt(foodb_food, simple=True),
                   filt(usda, simple=True)],
                  ignore_index=True)

In [26]:
# Look up a SMILES string for every row through PMD.pubchem_SMILE, passing
# chem_id_p as an integer-formatted string. Results are gathered in row order
# and written as one whole column, so the outcome is correct even when the
# index labels of chems are not unique (a per-label .at write would overwrite
# every row sharing that label).
smiles = []
for cid in chems['chem_id_p']:
    smiles.append(PMD.pubchem_SMILE(str(int(cid))))
    sleep(.1)  # pause between lookups; presumably rate limiting, confirm against PMD
chems['SMILE'] = smiles

In [5]:
# Replace chems with the SMILES table cached on disk for this food.
# Un-comment the next line to write the in-memory table to that cache instead:
#chems.to_pickle('misc_save/' + food + '_SMILES.pkl')
smiles_cache = 'misc_save/' + food + '_SMILES.pkl'
chems = pd.read_pickle(smiles_cache)

# Fingerprint Analysis

In [8]:
def get_fingerprint_string(SMILE):
    """Encode a molecule as a space-separated string of Morgan identifiers.

    The radius-1 Morgan fingerprint of the molecule is read as a mapping of
    substructure identifier -> occurrence count, and each identifier is
    written out once per occurrence.

    Parameters
    ----------
    SMILE : str
        SMILES string of the molecule.

    Returns
    -------
    str
        Identifiers joined by single spaces, in the iteration order of the
        fingerprint's non-zero elements.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``SMILE``.
    """
    mol = Chem.MolFromSmiles(SMILE)
    if mol is None:
        # A failed parse yields None; stop here with the offending input
        # instead of letting the fingerprint call fail with an opaque error.
        raise ValueError('Could not parse SMILES: {!r}'.format(SMILE))

    # Dictionary of substructure identifiers and their counts.
    sub_dict = AllChem.GetMorganFingerprint(mol, 1).GetNonzeroElements()

    fingerprint = []
    for key, value in sub_dict.items():
        # extend() grows the list in place rather than rebuilding it each time.
        fingerprint.extend([str(key)] * value)

    return ' '.join(fingerprint)

# Unordered fingerprint string for every molecule. The values are computed in
# row order and assigned as a whole column, which stays correct even if the
# index labels of chems are not unique (chems.at[idx, ...] would then write to
# every row sharing a label).
chems['fingerprint'] = [get_fingerprint_string(smile) for smile in chems['SMILE']]

In [9]:
def get_ordered_fingerprint_string(SMILE):
    """Encode a molecule as Morgan identifiers ordered by atom and radius.

    The radius-1 Morgan fingerprint is computed with ``bitInfo`` so that each
    identifier comes with the (atom index, radius) pairs at which it occurs.
    One token is emitted per occurrence, sorted by atom index and then by
    radius.

    Parameters
    ----------
    SMILE : str
        SMILES string of the molecule.

    Returns
    -------
    str
        Identifiers joined by single spaces; an empty string when the
        fingerprint reports no occurrences.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``SMILE``.
    """
    mol = Chem.MolFromSmiles(SMILE)
    if mol is None:
        # A failed parse yields None; report the offending input explicitly.
        raise ValueError('Could not parse SMILES: {!r}'.format(SMILE))

    # Filled in by the fingerprint call: identifier -> sequence of
    # (atom index, radius) entries.
    bi = {}
    AllChem.GetMorganFingerprint(mol, radius=1, bitInfo=bi)

    # One (atom index, radius, identifier) record per occurrence. A plain list
    # replaces the row-by-row DataFrame.append, which newer pandas no longer has.
    occurrences = [
        (env[0], env[1], str(key))
        for key, value in bi.items()
        for env in value
    ]
    # Stable sort on (atom index, radius) only, matching the previous
    # sort_values(by=['order', 'radius']).
    occurrences.sort(key=lambda occ: (occ[0], occ[1]))

    return ' '.join(occ[2] for occ in occurrences)

# Atom-ordered fingerprint string for every molecule. This replaces whatever
# the 'fingerprint' column held before (the unordered pass writes to the same
# column). Values are assigned positionally as a whole column, so the result
# is correct even if the index labels of chems are not unique.
chems['fingerprint'] = [get_ordered_fingerprint_string(smile) for smile in chems['SMILE']]

In [10]:
# Replace chems with the fingerprint table cached on disk for this food.
# Un-comment the next line to write the in-memory table to that cache instead:
#chems.to_pickle('misc_save/' + food + '_fingerprints.pkl')
fingerprints_cache = 'misc_save/' + food + '_fingerprints.pkl'
chems = pd.read_pickle(fingerprints_cache)