In [1]:
from pathlib import Path
import pandas as pd
from collections import defaultdict

DATA = Path('data/version-3/')

In [2]:
import sys
import os
os.chdir('..')

from pathlib import Path
from dietrx import *
from dietrx.models import *

In [3]:
def add_to_db(obj, dl):
    for i, item in enumerate(dl):
        db.session.add(obj(**item))

        if (i+1) % 50000 == 0:
            db.session.commit()
    db.session.commit()
    
    
def remove_from_db(obj):
    for i, o in enumerate(obj.query.all()):
        db.session.delete(o)
        
        if (i+1) % 10000 == 0:
            db.session.commit()
            
    db.session.commit()

In [4]:
def clean_colnames(df):
    df.columns = [col.replace(' ', '-').lower() for col in df.columns]

### Remove data

In [5]:
# remove_from_db(Food_disease)
# remove_from_db(Disease_gene)
# remove_from_db(Food_gene)
# remove_from_db(Food_chemical)
# remove_from_db(Chemical_disease)
# remove_from_db(Food)
# remove_from_db(Disease)
# remove_from_db(Gene)
# remove_from_db(References)
# remove_from_db(Chemical)
# remove_from_db(Chemical_gene)

# A. Lexicons

### Food Lexicon

In [6]:
df = pd.read_csv(DATA/'food-lexicon-automated.tsv', sep='\t', encoding='utf-8')
clean_colnames(df)

del df['foodb-group'], df['foodb-subgroup']
df.head()

Unnamed: 0,common-names,food-id,food-category,food-name,scientific-name,tax-id
0,alexanders; horse parsley,Plant ID:40962,Miscellaneous,Alexanders,Smyrnium olusatrum,40962.0
1,,Plant ID:942083,Miscellaneous,Scrophularia umbrosa,Scrophularia umbrosa,942083.0
2,dotted blazing star,Plant ID:344074,Miscellaneous,Dotted blazing star,Liatris punctata,344074.0
3,plymouth pear,Plant ID:761867,Miscellaneous,Plymouth pear,Pyrus cordata,761867.0
4,,Plant ID:49166,Miscellaneous,Rhododendron kaempferi,Rhododendron kaempferi,49166.0


In [7]:
# Replace NaN with None
df = df.where((pd.notnull(df)), None)

# Rename columns to match db schema.
df.rename(columns={
    'common-names': 'common_names',
    'food-name': 'display_name',
    'food-id': 'food_id',
    'food-category': 'food_category',
    'scientific-name': 'scientific_name',
    'tax-id':'tax_id',
}, inplace=True)

data_list= list(df.T.to_dict().values())
data_list[0]

{'common_names': 'alexanders; horse parsley',
 'display_name': 'Alexanders',
 'food_category': 'Miscellaneous',
 'food_id': 'Plant ID:40962',
 'scientific_name': 'Smyrnium olusatrum',
 'tax_id': 40962.0}

In [8]:
add_to_db(Food, data_list)

### Disease Lexicon

In [25]:
df = pd.read_csv(DATA/'disease-lexicon.tsv', sep='\t', encoding='utf-8', index_col=0)
clean_colnames(df)
df.reset_index(inplace=True, drop=True)
df.head()

b'Skipping line 5374: expected 9 fields, saw 10\n'


Unnamed: 0,disease_id,disease_name,disease_category,disease_synonyms
0,MESH:C538288,10p Deletion Syndrome (Partial),Congenital abnormality|Genetic disease (inborn...,"Chromosome 10, 10p- Partial|Chromosome 10, mon..."
1,MESH:C535484,13q deletion syndrome,Congenital abnormality|Genetic disease (inborn...,Chromosome 13q deletion|Chromosome 13q deletio...
2,MESH:C579849,15q24 Microdeletion,Congenital abnormality|Genetic disease (inborn...,15q24 Deletion|15q24 Microdeletion Syndrome|In...
3,MESH:C579850,16p11.2 Deletion Syndrome,Congenital abnormality|Genetic disease (inborn...,
4,MESH:C567076,"17,20-Lyase Deficiency, Isolated",Congenital abnormality|Endocrine system diseas...,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C..."


In [10]:
df = df.where((pd.notnull(df)), None)

# Rename to match schema
df.rename(columns={
    'disease-id': 'disease_id',
    'disease-name': 'disease_name',
    'disease-category': 'disease_category',
    'disease-synonyms': 'disease_synonyms'
}, inplace=True)

# Keep only selected columns.
df = df[['disease_id', 'disease_name', 'disease_category']]

data_list= list(df.T.to_dict().values())

In [11]:
add_to_db(Disease, data_list)

### Gene Lexicon

In [12]:
df = pd.read_csv(DATA/'gene-disease.tsv', sep='\t', encoding='utf-8')[['gene-id']]
df.drop_duplicates(subset=['gene-id'], inplace=True)
df.head()

Unnamed: 0,gene-id
0,1588
1,26504
2,9499
3,29940
4,113189


In [13]:
df.rename(columns={
    'gene-id': 'gene_id',
}, inplace=True)


data_list= list(df.T.to_dict().values())

In [14]:
add_to_db(Gene, data_list)

### References

In [15]:
df = pd.read_csv(DATA/'publication-records.tsv', sep='\t', encoding='utf-8', index_col='Unnamed: 0')
clean_colnames(df)
df.head()

Unnamed: 0,authors,date,journal-name,journal-name-abbrv.,pmid,publication-type,title
0,Elkiran T|Harputluoglu H|Yasar U|Babaoglu MO|D...,2007 Jan-Feb,Methods and findings in experimental and clini...,Methods Find Exp Clin Pharmacol,17344941,Clinical Trial|Journal Article,Differential alteration of drug-metabolizing e...
1,Severins N|Mensink RP|Plat J,2015 Feb,"Nutrition, metabolism, and cardiovascular dise...",Nutr Metab Cardiovasc Dis,25456153,Journal Article|Randomized Controlled Trial,Effects of lutein-enriched egg yolk in butterm...
2,Li C|Zuo C|Deng G|Kuang R|Yang Q|Hu C|Sheng O|...,2013,PloS one,PLoS One,23922960,Journal Article,Contamination of bananas with beauvericin and ...
3,Güneser S|Atici A|Cengizler I|Alparslan N,1996 May-Jun,Allergologia et immunopathologia,Allergol Immunopathol (Madr),8766742,Journal Article,Inhalant allergens: as a cause of respiratory ...
4,Brvar M|Bunc M,2009 Sep 9,Cases journal,Cases J,19918387,Journal Article,High-degree atrioventricular block in acute et...


In [16]:
df = df.where((pd.notnull(df)), None)

df.rename(columns={
    'authors': 'authors',
    'journal-name': 'journal_name',
    'journal-name-abbrv.': 'journal_name_abbr',
    'publication-type':'publication_type',
},inplace=True)

data_list= list(df.T.to_dict().values())
data_list[0]

{'authors': 'Elkiran T|Harputluoglu H|Yasar U|Babaoglu MO|Dincel AK|Altundag K|Ozisik Y|Guler N|Bozkurt A',
 'date': '2007 Jan-Feb',
 'journal_name': 'Methods and findings in experimental and clinical pharmacology',
 'journal_name_abbr': 'Methods Find Exp Clin Pharmacol',
 'pmid': 17344941,
 'publication_type': 'Clinical Trial|Journal Article',
 'title': 'Differential alteration of drug-metabolizing enzyme activities after cyclophosphamide/adriamycin administration in breast cancer patients.'}

In [17]:
add_to_db(References, data_list)

### Chemicals

In [18]:
chem = pd.read_csv(DATA/'chemical-lexicon.tsv', sep='\t', encoding='utf-8')

In [25]:
df = chem.where((pd.notnull(chem)), None)

df.rename(columns={
    'canonical_smiles': 'smiles',
},inplace=True)

del df['inchi']
del df['inchikey']

df.drop_duplicates(['pubchem_id'], inplace=True)

print('done')

done


In [26]:
for i in range(0, len(df), 50000):
    data_list = list(df.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Chemical, data_list)
    print(i)
print('done')

0
done


## B. Associations

In [7]:
food_dis = pd.read_csv(DATA/'food-disease.tsv', sep='\t', encoding='utf-8')
food_dis.drop_duplicates(['food-id', 'disease-id', 'association', 'pmid'], inplace=True)
food_dis = food_dis.where((pd.notnull(food_dis)), None)
food_dis['association'] = food_dis['association'].apply(str.lower)
food_dis = food_dis.loc[:, ['food-id', 'disease-id', 'association', 'pmid']]
food_dis['pmid'] = food_dis['pmid'].apply(str)
food2dis = defaultdict(lambda: set(),
                       food_dis.groupby('food-id').agg(lambda s: set(s))['disease-id'].to_dict())
food_dis.head(1)

Unnamed: 0,food-id,disease-id,association,pmid
0,Plant ID:4045,MESH:D004487,negative,7398283


In [8]:
chem_dis = pd.read_csv(DATA/'chemical-disease.tsv', sep='\t', encoding='utf-8', index_col='Unnamed: 0')
chem_dis = chem_dis.groupby(['pubchem-id', 'disease-id']).agg(lambda s: sorted(s)[-1]).reset_index()
dis2chem = defaultdict(lambda: set(),
                       chem_dis.groupby('disease-id').agg(lambda s: set(s))['pubchem-id'].to_dict())
chem2dis = defaultdict(lambda: set(),
                       chem_dis.groupby('pubchem-id').agg(lambda s: set(s))['disease-id'].to_dict())

chem_dis.head(1)

Unnamed: 0,pubchem-id,disease-id,type
0,19,MESH:D007674,therapeutic


In [9]:
food_chem = pd.read_csv(DATA/'food-chemical.tsv', sep='\t', encoding='utf-8')
food2chem = defaultdict(lambda: set(),
                        food_chem.groupby('food-id').agg(lambda s: set(s))['pubchem-id'].to_dict())

food_chem.head(1)

Unnamed: 0,food-id,pubchem-id,content,reference
0,Plant ID:100170,5280378,Detected but not quantified,KNApSAcK


In [10]:
gene_dis = pd.read_csv(DATA/'gene-disease.tsv', sep='\t', encoding='utf-8', index_col='Unnamed: 0')
dis2gene = defaultdict(lambda: set(),
                        gene_dis.groupby('disease-id').agg(lambda s: set(s))['gene-id'].to_dict())
gene2dis = defaultdict(lambda: set(),
                        gene_dis.groupby('gene-id').agg(lambda s: set(s))['disease-id'].to_dict())

gene_dis.head()

Unnamed: 0,disease-id,gene-id,source
0,MESH:C000591739,1588,ORPHANET
1,MESH:C000596385,26504,UNIPROT
2,MESH:C000598645,9499,CTD_human;ORPHANET;UNIPROT
3,MESH:C000600608,29940,ORPHANET
4,MESH:C000600608,113189,CTD_human;ORPHANET;UNIPROT


In [11]:
chem_gene = pd.read_csv(DATA/'chemical-gene.tsv', sep='\t', encoding='utf-8')
chem2gene = defaultdict(lambda: set(),
                        chem_gene.groupby('pubchem-id').agg(lambda s: set(s))['gene-id'].to_dict())
gene2chem = defaultdict(lambda: set(),
                        chem_gene.groupby('gene-id').agg(lambda s: set(s))['pubchem-id'].to_dict())
chem_gene.head()

Unnamed: 0,pubchem-id,gene-id,interaction-actions,source
0,19,351,affects^binding|decreases^reaction,CTD
1,19,4313,decreases^activity|decreases^reaction|decrease...,CTD
2,51,3091,affects^binding|affects^cotreatment|increases^...,CTD
3,51,7428,affects^binding|affects^cotreatment|increases^...,CTD
4,51,8290,affects^cotreatment|decreases^methylation|incr...,CTD


### Redundancy check for lexicons

In [13]:
alldis = set(food_dis['disease-id']) | set(gene_dis['disease-id']) | set(chem_dis['disease-id'])

In [16]:
alldis - set(dislex['disease-id'])

{'OMIM:259710',
 'MESH:D006222',
 'MESH:C563657',
 'MESH:C535504',
 'MESH:D018500',
 'MESH:C537548',
 'MESH:D006550',
 'MESH:C536653',
 'MESH:C566909',
 'MESH:D005532',
 'MESH:C567734',
 'MESH:C535896',
 'MESH:D006530',
 'MESH:D020516',
 'MESH:C567373',
 'MESH:C535464',
 'MESH:D008591',
 'MESH:C536545',
 'OMIM:614499',
 'MESH:C564591',
 'MESH:C535535',
 'MESH:D017675',
 'MESH:C562876',
 'MESH:C563425',
 'MESH:C538070',
 'MESH:C565706',
 'MESH:D002288',
 'MESH:D013684',
 'MESH:C566865',
 'MESH:D001606',
 'MESH:D006013',
 'MESH:C566296',
 'MESH:C536171',
 'MESH:C537172',
 'MESH:C564334',
 'MESH:C565557',
 'MESH:C563384',
 'MESH:C567684',
 'MESH:D012133',
 'MESH:C536601',
 'MESH:C567057',
 'MESH:C565168',
 'MESH:D017091',
 'MESH:C563463',
 'OMIM:612124',
 'OMIM:615065',
 'MESH:C538167',
 'MESH:C536416',
 'MESH:C566932',
 'MESH:D010254',
 'OMIM:610251',
 'MESH:C536595',
 'MESH:D057174',
 'MESH:D003323',
 'OMIM:612126',
 'MESH:D005271',
 'MESH:C563136',
 'MESH:C567234',
 'MESH:C535742',
 'M

In [14]:
alldisa

{'MESH:D017445',
 'OMIM:259710',
 'MESH:C565677',
 'MESH:D006222',
 'MESH:D000033',
 'MESH:D015535',
 'MESH:D017240',
 'MESH:C563657',
 'MESH:D002037',
 'MESH:C535504',
 'MESH:D018500',
 'MESH:D015228',
 'MESH:D004410',
 'MESH:C537548',
 'MESH:D004428',
 'MESH:D006550',
 'MESH:C536653',
 'MESH:C567640',
 'MESH:C566909',
 'MESH:C567832',
 'MESH:D003922',
 'MESH:D005889',
 'MESH:D005532',
 'MESH:C567734',
 'MESH:C567678',
 'MESH:C565292',
 'MESH:C535896',
 'MESH:D013724',
 'OMIM:300888',
 'OMIM:614650',
 'MESH:D009783',
 'MESH:C567285',
 'MESH:C567543',
 'MESH:D006849',
 'MESH:D002532',
 'MESH:D004948',
 'MESH:D061085',
 'MESH:D006530',
 'MESH:D028361',
 'MESH:D020516',
 'MESH:D002771',
 'MESH:C567373',
 'MESH:C565312',
 'MESH:C535464',
 'MESH:D016388',
 'OMIM:614592',
 'MESH:D058502',
 'OMIM:300494',
 'MESH:D008591',
 'MESH:C538326',
 'MESH:C536545',
 'OMIM:614499',
 'MESH:D001941',
 'MESH:C564591',
 'MESH:D013121',
 'MESH:C535535',
 'MESH:D017675',
 'OMIM:300855',
 'MESH:D013281',
 'ME

### 1. Food-Page

##### Food-disease

In [32]:
# Curated
neg_fd = food_dis[food_dis.association == 'negative'].\
    groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s))[['pmid']].reset_index(())
neg_fd.reset_index(inplace=True)
neg_fd.rename(columns={'pmid': 'negative-pmid'}, inplace=True)

pos_fd = food_dis[food_dis.association == 'positive'].\
    groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s))[['pmid']]
pos_fd.reset_index(inplace=True)
pos_fd.rename(columns={'pmid': 'positive-pmid'}, inplace=True)

food_dis_curated = pos_fd.merge(neg_fd, on=['food-id', 'disease-id'], how='outer').fillna('')

# Inferred
fd_inferred = list()
for food, chemicals in food2chem.items():
    for chem in chemicals:
        for dis in chem2dis[chem]:
            fd_inferred.append([food, dis, str(chem)])
            
fd_inferred = pd.DataFrame(fd_inferred, 
                           columns=['food-id', 'disease-id', 'pubchem-id'])
fd_inferred = fd_inferred.groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s)).reset_index()
del pos_fd, neg_fd

fd_inferred.head(2)

Unnamed: 0,food-id,disease-id,pubchem-id
0,Plant ID:100170,MESH:D002583,5280378
1,Plant ID:100170,MESH:D058186,5280378


In [33]:
food_dis_db = food_dis_curated.merge(fd_inferred, on=['food-id', 'disease-id'], how='outer').fillna('')
food_dis_db['weight'] = food_dis_db.apply(lambda row: len(row['positive-pmid'].split('|')) +
                                          len(row['positive-pmid'].split('|')), 1)
del fd_inferred, food_dis_curated

food_dis_db.head()

Unnamed: 0,food-id,disease-id,positive-pmid,negative-pmid,pubchem-id,weight
0,AlcoholicBev ID:17,MESH:D001523,7195588,,,2
1,AlcoholicBev ID:17,MESH:D003866,6798614,,,2
2,AlcoholicBev ID:18,MESH:D005756,11848298,,,2
3,AlcoholicBev ID:18,MESH:D013272,17295722,9615879|17295722,,2
4,AlcoholicBev ID:2,MESH:C537766,25604939,,,2


In [34]:
df = food_dis_db.where((pd.notnull(food_dis_db)), None)

df.rename(columns={
    'food-id': 'food_id',
    'disease-id': 'disease_id',
    'positive-pmid': 'positive_pmid',
    'negative-pmid': 'negative_pmid',
    'pubchem-id': 'pubchem_id',
    'weight':'weight',
},inplace=True)

data_list= list(df.T.to_dict().values())

add_to_db(Food_disease, data_list)

##### Food-chemical

In [35]:
food_chem_db = food_chem.where((pd.notnull(food_chem)), None)
food_chem_db.head()

Unnamed: 0,food-id,pubchem-id,content,reference
0,Plant ID:100170,5280378,Detected but not quantified,KNApSAcK
1,Plant ID:100170,44260113,Detected but not quantified,KNApSAcK
2,Plant ID:100506,1150,Detected but not quantified,KNApSAcK
3,Plant ID:100506,5202,Detected but not quantified,KNApSAcK
4,Plant ID:100506,5372945,Detected but not quantified,KNApSAcK


In [36]:
df = food_chem_db.where((pd.notnull(food_chem_db)), None)

df.rename(columns={
    'pubchem-id': 'pubchem_id',
    'food-id': 'food_id',
    'reference': 'references',
    'type':'type_relation',
    'inference-network':'inference_network'
},inplace=True)

data_list= list(df.T.to_dict().values())

add_to_db(Food_chemical, data_list)

#### Food-gene

In [37]:
# Inferred throuch chemicals
fg_inf_chem = list()
for food, chemlst in food2chem.items():
    for chem in chemlst:
        for gene in chem2gene[chem]:
            fg_inf_chem.append([food, str(chem), gene])
            
fg_inf_chem = pd.DataFrame(fg_inf_chem, columns=['food-id', 'pubchem-id', 'gene-id'])
fg_inf_chem = fg_inf_chem.groupby(['food-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()
fg_inf_chem.head(2)

Unnamed: 0,food-id,gene-id,pubchem-id
0,Plant ID:100170,196,5280378
1,Plant ID:100170,581,5280378


In [38]:
# Inferred throuch diseases
fg_inf_dis = list()
for food, dislst in food2dis.items():
    for dis in dislst:
        for gene in dis2gene[dis]:
            fg_inf_dis.append([food, dis, gene])
            
fg_inf_dis = pd.DataFrame(fg_inf_dis, columns=['food-id', 'disease-id', 'gene-id'])
fg_inf_dis = fg_inf_dis.groupby(['food-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()
fg_inf_dis.head(2)

Unnamed: 0,food-id,gene-id,disease-id
0,AlcoholicBev ID:17,2,MESH:D003866
1,AlcoholicBev ID:17,15,MESH:D003866


In [39]:
food_gene_db = fg_inf_dis.merge(fg_inf_chem, on=['food-id', 'gene-id'], how='outer').fillna('')
food_gene_db.rename(columns={'disease-id': 'via-diseases', 'pubchem-id': 'via-chemicals'}, inplace=True)

del fg_inf_chem, fg_inf_dis

In [40]:
food_gene_db.head()

Unnamed: 0,food-id,gene-id,via-diseases,via-chemicals
0,AlcoholicBev ID:17,2,MESH:D003866,
1,AlcoholicBev ID:17,15,MESH:D003866,
2,AlcoholicBev ID:17,52,MESH:D003866,
3,AlcoholicBev ID:17,81,MESH:D004487,
4,AlcoholicBev ID:17,100,MESH:D004487,


In [41]:
# TAKES LOT OF TIME!
# food_gene_db['weight'] = food_gene_db.apply(lambda row: len(row['via-diseases'].split('|')) +
#                                                         len(row['via-chemicals'].split('|')), 1)

In [42]:
df = food_gene_db.where((pd.notnull(food_gene_db)), None)

df.rename(columns={
    'gene-id': 'gene_id',
    'food-id': 'food_id',
    'via-diseases': 'via_diseases',
    'via-chemicals': 'via_chemicals'
},inplace=True)

print('done')

done


In [43]:
for i in range(0, len(df), 50000):
    data_list = list(df.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Food_gene, data_list)
    print(i)
print('done')

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000
2050000
2100000
2150000
2200000
2250000
2300000
2350000
2400000
2450000
2500000
2550000
2600000
2650000
2700000
2750000
2800000
2850000
2900000
2950000
3000000
3050000
3100000
3150000
3200000
3250000
3300000
3350000
3400000
3450000
3500000
3550000
3600000
3650000
3700000
3750000
done


In [44]:
len(df)

3792433

### 2. Disease page

#### Disease-gene

In [10]:
# Curated
dg_curated = gene_dis.copy(deep=True)

# Inferred through chemicals
dg_inferred = list()
for disease, chemlst in dis2chem.items():
    for chem in chemlst:
        for gene in chem2gene[chem]:
            dg_inferred.append([disease, str(chem), gene])
            
dg_inferred = pd.DataFrame(dg_inferred, columns=['disease-id', 'pubchem-id', 'gene-id'])
dg_inferred = dg_inferred.groupby(['disease-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()
dg_inferred.head(2)

Unnamed: 0,disease-id,gene-id,pubchem-id
0,MESH:C531617,14,14985
1,MESH:C531617,35,14985


In [11]:
# Merge
disease_gene_db = dg_curated.merge(dg_inferred, on=['disease-id', 'gene-id'], how='outer').fillna('')
disease_gene_db.rename(columns={'pubchem-id': 'via-chemicals'}, inplace=True)
del dg_inferred, dg_curated

disease_gene_db.head(1)

Unnamed: 0,disease-id,gene-id,source,via-chemicals
0,MESH:C000591739,1588,ORPHANET,


(array([      0,       1,       2, ..., 5150214, 5150215, 5150216]),
 array([3, 3, 3, ..., 2, 2, 2]))

In [47]:
df = disease_gene_db.where((pd.notnull(disease_gene_db)), None)

df.rename(columns={
    'gene-id': 'gene_id',
    'disease-id': 'disease_id',
    'via-chemicals':'via_chemicals',
    'source': 'reference'
},inplace=True)
print('done')

done


In [48]:
for i in range(0, len(df), 50000):
    data_list = list(df.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Disease_gene, data_list)
    print(i)

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000
2050000
2100000
2150000
2200000
2250000
2300000
2350000
2400000
2450000
2500000
2550000
2600000
2650000
2700000
2750000
2800000
2850000
2900000
2950000
3000000
3050000
3100000
3150000
3200000
3250000
3300000
3350000
3400000
3450000
3500000
3550000
3600000
3650000
3700000
3750000
3800000
3850000
3900000
3950000
4000000
4050000
4100000
4150000
4200000
4250000
4300000
4350000
4400000
4450000
4500000
4550000
4600000
4650000
4700000
4750000
4800000
4850000
4900000
4950000
5000000
5050000
5100000
5150000


In [49]:
len(df)

5150217

#### Disease-chemical

In [50]:
# Curated
chem_dis_curated = chem_dis.copy(deep=True)
chem_dis_curated['pubchem-id'] = chem_dis_curated['pubchem-id'].apply(str)

# Inferred through genes
dc_inferred = list()
for disease, genelst in dis2gene.items():
    for gene in genelst:
        for chem in gene2chem[gene]:
            dc_inferred.append([disease, str(chem), str(gene)])
            
dc_inferred = pd.DataFrame(dc_inferred, columns=['disease-id', 'pubchem-id', 'gene-id'])
dc_inferred = dc_inferred.groupby(['disease-id', 'pubchem-id']).agg(lambda s: '|'.join(s)).reset_index()
dc_inferred.head(2)

Unnamed: 0,disease-id,pubchem-id,gene-id
0,MESH:C000591739,10680,1588
1,MESH:C000591739,14403,1588


In [51]:
# Merge
disease_chem_db = dc_inferred.merge(chem_dis_curated, on=['disease-id', 'pubchem-id'], how='outer').fillna('')
disease_chem_db.rename(columns={'gene-id': 'via-genes'}, inplace=True)
# del disease_chem_db['type']
del dc_inferred, chem_dis_curated

disease_chem_db.head(1)

Unnamed: 0,disease-id,pubchem-id,via-genes,type
0,MESH:C000591739,10680,1588,


In [52]:
df = disease_chem_db.where((pd.notnull(disease_chem_db)), None)

df.rename(columns={
    'pubchem-id': 'pubchem_id',
    'disease-id': 'disease_id',
    'type':'type_relation',
    'via-genes':'via_genes'
},inplace=True)
print('done')

done


In [53]:
len(df)

284612

In [54]:
for i in range(0, len(df), 50000):
    data_list = list(df.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Chemical_disease, data_list)
    print(i)

0
50000
100000
150000
200000
250000


### 3. Gene page

#### Gene-chemical

In [55]:
# Curated
chem_gene_curated = chem_gene.copy(deep=True)
chem_gene_curated['gene-id'] = chem_gene_curated['gene-id'].apply(str)
chem_gene_curated['pubchem-id'] = chem_gene_curated['pubchem-id'].apply(str)

# Inferred through diseases
gc_inferred = list()
for gene, dislst in gene2dis.items():
    for dis in dislst:
        for chem in dis2chem[dis]:
            gc_inferred.append([str(gene), str(chem), dis])
            
gc_inferred = pd.DataFrame(gc_inferred, columns=['gene-id', 'pubchem-id', 'disease-id'])
gc_inferred = gc_inferred.groupby(['gene-id', 'pubchem-id']).agg(lambda s: '|'.join(s)).reset_index()
gc_inferred.head(2)

Unnamed: 0,gene-id,pubchem-id,disease-id
0,10,10208,MESH:D056486
1,10,10212,MESH:D056486


In [56]:
# Merge
chem_gene_db = gc_inferred.merge(chem_gene_curated, on=['gene-id', 'pubchem-id'], how='outer').fillna('')
chem_gene_db.rename(columns={'disease-id': 'via-diseases'}, inplace=True)
del chem_gene_db['source']
del gc_inferred, chem_gene_curated

chem_gene_db.head(1)

Unnamed: 0,gene-id,pubchem-id,via-diseases,interaction-actions
0,10,10208,MESH:D056486,


In [57]:
df = chem_gene_db.where((pd.notnull(chem_gene_db)), None)

df.rename(columns={
    'pubchem-id': 'pubchem_id',
    'gene-id': 'gene_id',
    'via-diseases': 'via_diseases',
    'interaction-actions': 'interaction_actions'
},inplace=True)
print('done')

done


In [58]:
len(df)

581465

In [59]:
for i in range(0, len(df), 50000):
    data_list = list(df.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Chemical_gene, data_list)
    print(i)

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000


In [60]:
app.elasticsearch.indices.delete(index='food', ignore=[400, 404])
app.elasticsearch.indices.delete(index='disease', ignore=[400, 404])
app.elasticsearch.indices.delete(index='gene', ignore=[400, 404])


Food.reindex("food_id")
Disease.reindex("disease_id")
Gene.reindex("gene_id")

app.elasticsearch.indices.delete(index='chemical', ignore=[400, 404])

Chemical.reindex("pubchem_id")