In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path
import dietrx
from dietrx import app
from dietrx import db
from dietrx.models import *

import pandas as pd

DATA = Path('../data/version-3/')

In [3]:
def add_to_db(obj, dl, batch_size=10000):
    """Insert one ``obj`` row per mapping in ``dl``, committing in batches.

    Parameters
    ----------
    obj : callable
        Model class (or any callable) invoked as ``obj(**item)`` to build a row.
    dl : iterable of dict
        Keyword arguments for one row each; generators are accepted.
    batch_size : int, optional
        Number of added rows between intermediate commits (default 10000).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    Exception
        Anything raised while building, adding or committing rows is re-raised
        after the session has been rolled back.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    try:
        for i, item in enumerate(dl, start=1):
            db.session.add(obj(**item))

            # Commit periodically to bound the size of the pending transaction.
            if i % batch_size == 0:
                db.session.commit()
        db.session.commit()
    except Exception:
        # Roll back so the shared session stays usable for the next cell;
        # batches that were already committed are not undone.
        db.session.rollback()
        raise
    
    
def remove_from_db(obj, batch_size=10000):
    """Delete every row returned by ``obj.query.all()``, committing in batches.

    Rows are deleted one at a time through ``db.session.delete`` rather than
    with a single bulk statement.

    Parameters
    ----------
    obj : model class
        Must expose ``obj.query.all()`` returning the rows to delete.
    batch_size : int, optional
        Number of deleted rows between intermediate commits (default 10000).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    Exception
        Anything raised while querying, deleting or committing is re-raised
        after the session has been rolled back.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    try:
        for i, o in enumerate(obj.query.all(), start=1):
            db.session.delete(o)

            # Commit periodically to bound the size of the pending transaction.
            if i % batch_size == 0:
                db.session.commit()

        db.session.commit()
    except Exception:
        # Roll back so the shared session stays usable for the next cell;
        # batches that were already committed are not undone.
        db.session.rollback()
        raise

### Delete previous data in DB

In [48]:
# Empty every table, in the same order as before: association models first,
# then the entity models (presumably so dependent rows go first -- confirm
# against the schema).
for _model in (
    Food_disease,
    Disease_gene,
    Food_gene,
    Food_chemical,
    Chemical_disease,
    Food,
    Disease,
    Gene,
    References,
    Chemical,
):
    remove_from_db(_model)

In [9]:
def clean_colnames(df):
    """Normalise the column labels of ``df`` in place.

    Every space in a label becomes a hyphen and the label is lower-cased.
    The frame is modified directly; nothing is returned.
    """
    normalised = []
    for label in df.columns:
        normalised.append(label.replace(' ', '-').lower())
    df.columns = normalised

## A. Lexicons

### Food Lexicon

In [49]:
# Read the food lexicon and normalise its headers (spaces -> hyphens, lower case).
df = pd.read_csv(DATA.joinpath('food-lexicon-automated.tsv'), sep='\t', encoding='utf-8')
clean_colnames(df)

# The two FooDB grouping columns are removed before the frame is used further.
df = df.drop(columns=['foodb-group', 'foodb-subgroup'])
df.head()

Unnamed: 0,common-names,food-id,food-category,food-name,scientific-name,tax-id
0,alexanders; horse parsley,Plant ID:40962,Miscellaneous,Alexanders,Smyrnium olusatrum,40962.0
1,,Plant ID:942083,Miscellaneous,Scrophularia umbrosa,Scrophularia umbrosa,942083.0
2,dotted blazing star,Plant ID:344074,Miscellaneous,Dotted blazing star,Liatris punctata,344074.0
3,plymouth pear,Plant ID:761867,Miscellaneous,Plymouth pear,Pyrus cordata,761867.0
4,,Plant ID:49166,Miscellaneous,Rhododendron kaempferi,Rhododendron kaempferi,49166.0


In [50]:
# Replace NaN with None so missing values reach the database as NULL.
# The frame is cast to ``object`` first: on recent pandas versions
# ``where(..., None)`` leaves NaN in place in float columns such as ``tax-id``.
df = df.astype(object).where(df.notna(), None)

# Rename columns to match db schema.
df = df.rename(columns={
    'common-names': 'common_names',
    'food-name': 'display_name',
    'food-id': 'food_id',
    'food-category': 'food_category',
    'scientific-name': 'scientific_name',
    'tax-id': 'tax_id',
})

# One dict of keyword arguments per row. ``orient='records'`` keeps every row
# even when index labels repeat, unlike the ``.T.to_dict()`` round trip.
data_list = df.to_dict(orient='records')

In [51]:
# Build one Food object per dict in data_list and commit them in batches.
add_to_db(Food, data_list)

### Disease Lexicon

In [52]:
# Read the disease lexicon; the first file column serves as the index on load.
df = pd.read_csv(
    DATA.joinpath('disease-lexicon.tsv'),
    sep='\t',
    encoding='utf-8',
    index_col=0,
)
clean_colnames(df)

# Replace the loaded index with a fresh 0..n-1 range, discarding the old labels.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,disease-id,disease-name,other-ids,parent-ids,disease-treenum,disease-synonyms,disease-category
0,MESH:D008105,"Liver Cirrhosis, Biliary",DO:DOID:12236|OMIM:109720|OMIM:614220|OMIM:614221,MESH:D002780|MESH:D008103,C06.130.120.135.250.250|C06.552.150.250|C06.55...,"Biliary Cirrhosis|Biliary Cirrhosis, Primary|B...",Digestive system disease
1,MESH:D009369,Neoplasms,DO:DOID:162,MESH:C,C04,Benign Neoplasm|Benign Neoplasms|Cancer|Cancer...,Cancer
2,MESH:D005892,"Gingivitis, Necrotizing Ulcerative",DO:DOID:13924|DO:DOID:9673,MESH:D005674|MESH:D005891|MESH:D007239,C01.252.400.388.350.400|C01.539.424|C07.465.71...,Acute Membranous Gingivitides|Acute Membranous...,Bacterial infection or mycosis|Mouth disease
3,MESH:D010020,Osteonecrosis,DO:DOID:10159,MESH:D001847|MESH:D009336,C05.116.852|C23.550.717.732,Aseptic Necrosis of Bone|Avascular Necrosis of...,Musculoskeletal disease|Pathology (process)
4,MESH:D014522,Urethral Diseases,DO:DOID:732,MESH:D014570,C12.777.767|C13.351.968.767,"Diseases, Urethral|Disease, Urethral|Urethral ...",Urogenital disease (female)|Urogenital disease...


In [53]:
# Replace NaN with None so missing values reach the database as NULL.
# Casting to ``object`` first makes this independent of column dtype and
# pandas version.
df = df.astype(object).where(df.notna(), None)

# Rename to match schema
df = df.rename(columns={
    'disease-id': 'disease_id',
    'disease-name': 'disease_name',
    'disease-category': 'disease_category',
    'disease-synonyms': 'disease_synonyms',
})

# Keep only selected columns.
df = df[['disease_id', 'disease_name', 'disease_category']]

# One dict of keyword arguments per row; ``orient='records'`` keeps every row
# even when index labels repeat.
data_list = df.to_dict(orient='records')

In [54]:
# Build one Disease object per dict in data_list and commit them in batches.
add_to_db(Disease, data_list)

### Gene Lexicon

In [55]:
# The gene lexicon is the set of distinct gene ids in the gene-disease table.
# Only that column is read, and the path goes through DATA like every other
# load in this notebook instead of repeating the directory literal.
df = (
    pd.read_csv(DATA/'gene-disease.tsv', sep='\t', encoding='utf-8', usecols=['gene-id'])
    .drop_duplicates(subset=['gene-id'])
)
df.head()

Unnamed: 0,gene-id
0,1588
1,26504
2,9499
3,29940
4,113189


In [56]:
# Rename to match the schema, then build one keyword-argument dict per row.
df = df.rename(columns={'gene-id': 'gene_id'})

data_list = [*df.T.to_dict().values()]

In [57]:
# Build one Gene object per dict in data_list and commit them in batches.
add_to_db(Gene, data_list)

### References

In [58]:
# Read the publication records; the file's 'Unnamed: 0' column becomes the index.
df = pd.read_csv(
    DATA.joinpath('publication-records.tsv'),
    sep='\t',
    encoding='utf-8',
    index_col='Unnamed: 0',
)
clean_colnames(df)
df.head()

Unnamed: 0,authors,date,journal-name,journal-name-abbrv.,pmid,publication-type,title
0,Elkiran T|Harputluoglu H|Yasar U|Babaoglu MO|D...,2007 Jan-Feb,Methods and findings in experimental and clini...,Methods Find Exp Clin Pharmacol,17344941,Clinical Trial|Journal Article,Differential alteration of drug-metabolizing e...
1,Severins N|Mensink RP|Plat J,2015 Feb,"Nutrition, metabolism, and cardiovascular dise...",Nutr Metab Cardiovasc Dis,25456153,Journal Article|Randomized Controlled Trial,Effects of lutein-enriched egg yolk in butterm...
2,Li C|Zuo C|Deng G|Kuang R|Yang Q|Hu C|Sheng O|...,2013,PloS one,PLoS One,23922960,Journal Article,Contamination of bananas with beauvericin and ...
3,Güneser S|Atici A|Cengizler I|Alparslan N,1996 May-Jun,Allergologia et immunopathologia,Allergol Immunopathol (Madr),8766742,Journal Article,Inhalant allergens: as a cause of respiratory ...
4,Brvar M|Bunc M,2009 Sep 9,Cases journal,Cases J,19918387,Journal Article,High-degree atrioventricular block in acute et...


In [59]:
# Replace NaN with None so missing values reach the database as NULL.
# Casting to ``object`` first makes this independent of column dtype and
# pandas version.
df = df.astype(object).where(df.notna(), None)

# Rename to match schema; columns not listed here already carry their final name.
df = df.rename(columns={
    'journal-name': 'journal_name',
    'journal-name-abbrv.': 'journal_name_abbr',
    'publication-type': 'publication_type',
})

# One dict of keyword arguments per row; ``orient='records'`` keeps every row
# even when index labels repeat.
data_list = df.to_dict(orient='records')

In [60]:
# Build one References object per dict in data_list and commit them in batches.
add_to_db(References, data_list)

### Chemicals

In [61]:
# Collect the distinct PubChem ids that occur in the food-chemical table.
fchem = pd.read_csv(DATA.joinpath('food-chemical.tsv'), sep='\t')
pubchems = {pubchem_id for pubchem_id in fchem['pubchem-id']}

In [62]:
# Insert one Chemical per PubChem id through the shared batching helper
# rather than a hand-copied add/commit loop; add_to_db builds each row as
# Chemical(pubchem_id=...) and commits every 10000 rows by default.
add_to_db(Chemical, ({'pubchem_id': item} for item in pubchems))

## B. Associations

### Food-disease

In [63]:
df = pd.read_csv(DATA/'food-disease.tsv', sep='\t', encoding='utf-8')
df = df.where(df.notna(), None)

# Lower-case the association labels with the vectorised string accessor:
# ``apply(str.lower)`` raises TypeError as soon as one value is missing,
# whereas ``.str.lower()`` leaves missing entries alone.
df['association'] = df['association'].str.lower()

df.head()

Unnamed: 0,association,disease-id,disease-term,food-id,food-term,food-type,pmid,sentence
0,negative,MESH:D004487,edema,Plant ID:4045,Celery,Plant,7398283,|Celery| and parsley were confirmed as etiolog...
1,negative,MESH:D014581,urticaria,Plant ID:4045,Celery,Plant,7398283,|Celery| and parsley were confirmed as etiolog...
2,negative,MESH:D004487,edema,Plant ID:4043,parsley,Plant,7398283,Celery and |parsley| were confirmed as etiolog...
3,negative,MESH:D014581,urticaria,Plant ID:4043,parsley,Plant,7398283,Celery and |parsley| were confirmed as etiolog...
4,positive,MESH:D000544,Alzheimer's disease,Plant ID:136217,turmeric,Plant,24273069,"Evidence suggests that curcumin, the phytochem..."


In [66]:
def _join_pmids(pmids):
    """Concatenate the PMIDs of one group into a single '|'-separated string."""
    return '|'.join(map(str, pmids))


# One row per (food, disease, association) triple carrying all of its PMIDs.
_food_disease_keys = ['food-id', 'disease-id', 'association']
df = df.groupby(_food_disease_keys).agg({'pmid': _join_pmids})

# Constant annotation columns for these associations.
df['type'] = 'Curated'
df['inference-network'] = ''

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pmid,type,inference-network
food-id,disease-id,association,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlcoholicBev ID:17,MESH:D001523,positive,7195588,Curated,
AlcoholicBev ID:17,MESH:D003866,positive,6798614,Curated,
AlcoholicBev ID:17,MESH:D004487,negative,27873566,Curated,
AlcoholicBev ID:17,MESH:D011041,negative,11639831,Curated,
AlcoholicBev ID:18,MESH:D001249,negative,6639863,Curated,


### Gene-disease

In [69]:
# Read the gene-disease table and rename its columns in one chained expression.
df = pd.read_csv(
    DATA.joinpath('gene-disease.tsv'),
    sep='\t',
    encoding='utf-8',
    index_col='Unnamed: 0',
).rename(columns={
    'gene-id': 'gene_id',
    'disease-id': 'disease_id',
    'source': 'reference',
})

# Constant annotation columns for these associations.
df['type'] = 'Curated'
df['inference-network'] = ''

df.head()

Unnamed: 0,disease_id,gene_id,reference,type,inference-network
0,MESH:C000591739,1588,ORPHANET,Curated,
1,MESH:C000596385,26504,UNIPROT,Curated,
2,MESH:C000598645,9499,CTD_human;ORPHANET;UNIPROT,Curated,
3,MESH:C000600608,29940,ORPHANET,Curated,
4,MESH:C000600608,113189,CTD_human;ORPHANET;UNIPROT,Curated,


### Food-gene

In [71]:
# Find food-gene associations: join the food-disease and gene-disease tables
# on their shared disease id, then discard the columns not needed downstream.
fd = pd.read_csv(DATA.joinpath('food-disease.tsv'), sep='\t', encoding='utf-8')
gd = pd.read_csv(DATA.joinpath('gene-disease.tsv'), sep='\t', encoding='utf-8')
food_gene = pd.merge(fd, gd, on='disease-id').drop(columns=['pmid', 'association', 'source'])

In [79]:
def _join_disease_ids(disease_ids):
    """Join the disease ids linking one food-gene pair into a '|'-separated string."""
    return '|'.join(disease_ids)


# Collapse to one row per (food, gene) pair; each distinct linking disease id
# is kept once and the ids are concatenated.
food_gene = (
    food_gene
    .drop_duplicates(subset=['food-id', 'gene-id', 'disease-id'])
    .groupby(['food-id', 'gene-id'])
    .agg({'disease-id': _join_disease_ids})
)

food_gene['type'] = 'Inferred'
food_gene = food_gene.rename(columns={'disease-id': 'inference-network'})

food_gene.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,inference-network,type
food-id,gene-id,Unnamed: 2_level_1,Unnamed: 3_level_1
AlcoholicBev ID:17,2,MESH:D003866,Inferred
AlcoholicBev ID:17,15,MESH:D003866,Inferred
AlcoholicBev ID:17,52,MESH:D003866,Inferred
AlcoholicBev ID:17,81,MESH:D004487,Inferred
AlcoholicBev ID:17,100,MESH:D004487,Inferred


### Food-chemicals

In [93]:
# Read the food-chemical table and append the two constant annotation columns.
df = pd.read_csv(DATA.joinpath('food-chemical.tsv'), sep='\t', encoding='utf-8')
for _column, _value in (('type', 'Curated'), ('inference-network', '')):
    df[_column] = _value
df.head()

Unnamed: 0,food-id,pubchem-id,content,reference,type,inference-network
0,Plant ID:100170,5280378,Detected but not quantified,KNApSAcK,Curated,
1,Plant ID:100170,44260113,Detected but not quantified,KNApSAcK,Curated,
2,Plant ID:100506,1150,Detected but not quantified,KNApSAcK,Curated,
3,Plant ID:100506,5202,Detected but not quantified,KNApSAcK,Curated,
4,Plant ID:100506,5372945,Detected but not quantified,KNApSAcK,Curated,


### Chemical-disease

In [101]:
cd = pd.read_csv(
    DATA.joinpath('chemical-disease.tsv'),
    sep='\t',
    encoding='utf-8',
    index_col='Unnamed: 0',
)

# Drop marker/mechanism associations wherever therapeutic is available.
# This relies on string ordering: a descending sort on 'type' puts
# 'therapeutic' before 'marker/mechanism' (assuming those lowercase values),
# and drop_duplicates then keeps the first row per (chemical, disease) pair.
cd = (
    cd
    .sort_values(by='type', ascending=False)
    .drop_duplicates(subset=['pubchem-id', 'disease-id'])
    .rename(columns={'type': 'association'})
)

# Constant annotation columns, added in the same order as before.
for _column, _value in (('reference', 'CTD'), ('type', 'Inferred'), ('inference-network', '')):
    cd[_column] = _value

cd.head()

Unnamed: 0,pubchem-id,disease-id,association,reference,type,inference-network
0,5280378,MESH:D002583,therapeutic,CTD,Inferred,
56668,3776,MESH:D012772,therapeutic,CTD,Inferred,
56396,688674,MESH:D009336,therapeutic,CTD,Inferred,
56355,936,MESH:D003921,therapeutic,CTD,Inferred,
56353,936,MESH:D012393,therapeutic,CTD,Inferred,
