In [1]:
import sys
import os
os.chdir('..')

In [2]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
import requests 
from bs4 import BeautifulSoup
import time
import pickle

from dietrx import *
from dietrx.models import *

DATA = Path('data/version-3/')

In [3]:
def clean_colnames(df):
    df.columns = [col.replace(' ', '-').lower() for col in df.columns]

In [4]:
def add_to_db(obj, dl):
    for i, item in enumerate(dl):
        db.session.add(obj(**item))

        if (i+1) % 5000 == 0:
            db.session.commit()
    db.session.commit()
    
def add_to_db_df(obj, df):
    for i, row in df.iterrows():
        db.session.add(obj(**row.to_dict()))

        if (i+1) % 5000 == 0:
            db.session.commit()
    db.session.commit()
    
    
def remove_from_db(obj):
    obj.query.delete()            
    db.session.commit()

In [5]:
# remove_from_db(Food_disease)
# remove_from_db(Food_chemical)
# remove_from_db(Food_gene)
# remove_from_db(Disease_gene)
# remove_from_db(Chemical_disease)
# remove_from_db(Chemical_gene)
# remove_from_db(Food)
# remove_from_db(Disease)
# remove_from_db(Gene)
# remove_from_db(References)
# remove_from_db(Chemical)

## Vanilla Associations

#### a. Food-Disease

In [6]:
food_dis = pd.read_csv(DATA/'food-disease.tsv', sep='\t', encoding='utf-8')
food_dis.drop_duplicates(['food-id', 'disease-id', 'association', 'pmid'], inplace=True)
food_dis = food_dis.where((pd.notnull(food_dis)), None)
food_dis['association'] = food_dis['association'].apply(str.lower)
food_dis = food_dis.loc[:, ['food-id', 'disease-id', 'association', 'pmid']]
food_dis['pmid'] = food_dis['pmid'].apply(str)
food2dis = defaultdict(lambda: set(),
                       food_dis.groupby('food-id').agg(lambda s: set(s))['disease-id'].to_dict())
food_dis.head(1)

Unnamed: 0,food-id,disease-id,association,pmid
0,Plant ID:4045,MESH:D004487,negative,7398283


#### b. Food-Chemical

In [7]:
food_chem = pd.read_csv(DATA/'food-chemical.tsv', sep='\t', encoding='utf-8')
food2chem = defaultdict(lambda: set(),
                        food_chem.groupby('food-id').agg(lambda s: set(s))['pubchem-id'].to_dict())

food_chem.head(1)

Unnamed: 0,food-id,pubchem-id,content,reference
0,Plant ID:100170,5280378,Detected but not quantified,KNApSAcK


#### c. Gene-Disease

In [8]:
gene_dis = pd.read_csv(DATA/'gene-disease.tsv', sep='\t', encoding='utf-8', index_col='Unnamed: 0')
dis2gene = defaultdict(lambda: set(),
                        gene_dis.groupby('disease-id').agg(lambda s: set(s))['gene-id'].to_dict())
gene2dis = defaultdict(lambda: set(),
                        gene_dis.groupby('gene-id').agg(lambda s: set(s))['disease-id'].to_dict())

gene_dis.head()

Unnamed: 0,disease-id,gene-id,source
0,MESH:C000591739,1588,ORPHANET
1,MESH:C000596385,26504,UNIPROT
2,MESH:C000598645,9499,CTD_human;ORPHANET;UNIPROT
3,MESH:C000600608,29940,ORPHANET
4,MESH:C000600608,113189,CTD_human;ORPHANET;UNIPROT


#### d. Chemical-Disease

In [9]:
chem_dis = pd.read_csv(DATA/'chemical-disease.tsv', sep='\t', encoding='utf-8', index_col='Unnamed: 0')
chem_dis = chem_dis.groupby(['pubchem-id', 'disease-id']).agg(lambda s: sorted(s)[-1]).reset_index()
dis2chem = defaultdict(lambda: set(),
                       chem_dis.groupby('disease-id').agg(lambda s: set(s))['pubchem-id'].to_dict())
chem2dis = defaultdict(lambda: set(),
                       chem_dis.groupby('pubchem-id').agg(lambda s: set(s))['disease-id'].to_dict())

chem_dis.head(1)

Unnamed: 0,pubchem-id,disease-id,type
0,19,MESH:D007674,therapeutic


#### e. Chemical-Gene

In [10]:
chem_gene = pd.read_csv(DATA/'chemical-gene.tsv', sep='\t', encoding='utf-8')
chem2gene = defaultdict(lambda: set(),
                        chem_gene.groupby('pubchem-id').agg(lambda s: set(s))['gene-id'].to_dict())
gene2chem = defaultdict(lambda: set(),
                        chem_gene.groupby('gene-id').agg(lambda s: set(s))['pubchem-id'].to_dict())
chem_gene.head()

Unnamed: 0,pubchem-id,gene-id,interaction-actions,source
0,19,351,affects^binding|decreases^reaction,CTD
1,19,4313,decreases^activity|decreases^reaction|decrease...,CTD
2,51,3091,affects^binding|affects^cotreatment|increases^...,CTD
3,51,7428,affects^binding|affects^cotreatment|increases^...,CTD
4,51,8290,affects^cotreatment|decreases^methylation|incr...,CTD


In [13]:
set([act.split('^')[0] for actionlst in chem_gene['interaction-actions'] for act in actionlst.split('|')])

{'affects', 'decreases', 'increases'}

In [14]:
set([act.split('^')[1] for actionlst in chem_gene['interaction-actions'] for act in actionlst.split('|')])

{'ADP-ribosylation',
 'N-linked glycosylation',
 'O-linked glycosylation',
 'abundance',
 'acetylation',
 'activity',
 'alkylation',
 'amination',
 'binding',
 'carboxylation',
 'chemical synthesis',
 'cleavage',
 'cotreatment',
 'degradation',
 'export',
 'expression',
 'farnesylation',
 'folding',
 'geranoylation',
 'glucuronidation',
 'glutathionylation',
 'glycation',
 'glycosylation',
 'hydrolysis',
 'hydroxylation',
 'import',
 'lipidation',
 'localization',
 'metabolic processing',
 'methylation',
 'mutagenesis',
 'nitrosation',
 'oxidation',
 'phosphorylation',
 'prenylation',
 'reaction',
 'reduction',
 'response to substance',
 'secretion',
 'splicing',
 'stability',
 'sulfation',
 'sumoylation',
 'transport',
 'ubiquitination',
 'uptake'}

## Lexicons

### A. Food 

In [13]:
# Read file
foodlex = pd.read_csv(DATA/'food-lexicon-automated.tsv', sep='\t', encoding='utf-8')
clean_colnames(foodlex)

del foodlex['foodb-group'], foodlex['foodb-subgroup']

# Replace NaN with None
foodlex = foodlex.where((pd.notnull(foodlex)), None)

# Rename columns to match db schema.
foodlex.rename(columns={
    'common-names': 'common_names',
    'food-name': 'display_name',
    'food-id': 'food_id',
    'food-category': 'food_category',
    'scientific-name': 'scientific_name',
    'tax-id':'tax_id',
}, inplace=True)

foodlex.head()

Unnamed: 0,common_names,food_id,food_category,display_name,scientific_name,tax_id
0,alexanders; horse parsley,Plant ID:40962,Miscellaneous,Alexanders,Smyrnium olusatrum,40962
1,,Plant ID:942083,Miscellaneous,Scrophularia umbrosa,Scrophularia umbrosa,942083
2,dotted blazing star,Plant ID:344074,Miscellaneous,Dotted blazing star,Liatris punctata,344074
3,plymouth pear,Plant ID:761867,Miscellaneous,Plymouth pear,Pyrus cordata,761867
4,,Plant ID:49166,Miscellaneous,Rhododendron kaempferi,Rhododendron kaempferi,49166


In [14]:
# Redundancy check
assert len((set(food_dis['food-id']) | set(food_chem['food-id'])) - set(foodlex['food_id'])) == 0,\
    'Food lexicon is not exhaustive!'

In [None]:
data_list= list(foodlex.T.to_dict().values())
add_to_db(Food, data_list)
del data_list

### B. Disease

In [16]:
dislex = pd.read_csv(DATA/'CTD_diseases_new.tsv', sep='\t', encoding='utf-8', error_bad_lines=False)

# Keep only selected columns.
dislex = dislex[['DiseaseID', 'DiseaseName', 'SlimMappings', 'Synonyms']]

# Replace nan with None
dislex = dislex.where((pd.notnull(dislex)), None)

# Rename to match schema
dislex.rename(columns={
    'DiseaseID': 'disease_id',
    'DiseaseName': 'disease_name',
    'SlimMappings': 'disease_category',
    'Synonyms': 'disease_synonyms'
}, inplace=True)

dislex.head()

b'Skipping line 5374: expected 9 fields, saw 10\n'


Unnamed: 0,disease_id,disease_name,disease_category,disease_synonyms
0,MESH:C538288,10p Deletion Syndrome (Partial),Congenital abnormality|Genetic disease (inborn...,"Chromosome 10, 10p- Partial|Chromosome 10, mon..."
1,MESH:C535484,13q deletion syndrome,Congenital abnormality|Genetic disease (inborn...,Chromosome 13q deletion|Chromosome 13q deletio...
2,MESH:C579849,15q24 Microdeletion,Congenital abnormality|Genetic disease (inborn...,15q24 Deletion|15q24 Microdeletion Syndrome|In...
3,MESH:C579850,16p11.2 Deletion Syndrome,Congenital abnormality|Genetic disease (inborn...,
4,MESH:C567076,"17,20-Lyase Deficiency, Isolated",Congenital abnormality|Endocrine system diseas...,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C..."


In [17]:
# Redundancy check
assert len((set(food_dis['disease-id']) | set(chem_dis['disease-id']) | set(gene_dis['disease-id'])) 
           - set(dislex['disease_id'])) == 0,\
    'Disease lexicon is not exhaustive!'

# Subset disease lexicon
dislex = dislex.set_index('disease_id').\
    loc[list(set(food_dis['disease-id']) | set(chem_dis['disease-id']) | set(gene_dis['disease-id']))].\
    reset_index()

In [24]:
add_to_db_df(Disease, dislex)

### C. Chemical

In [26]:
chem = pd.read_csv(DATA/'chemical-lexicon.tsv', sep='\t', encoding='utf-8').drop_duplicates(['pubchem_id'])

# Replace nan with null values
chem = chem.where((pd.notnull(chem)), None)

# Rename columns
chem.rename(columns={'canonical_smiles': 'smiles',},inplace=True)

del chem['inchi'], chem['inchikey']

In [19]:
# Redundancy check
assert len((set(food_chem['pubchem-id']) | set(chem_dis['pubchem-id']) | set(chem_gene['pubchem-id'])) 
           - set(chem['pubchem_id'])) == 0,\
    'Chemical lexicon is not exhaustive!'

In [None]:
for i in range(0, len(chem), 50000):
    data_list = list(chem.loc[i:50000-1+i, :].T.to_dict().values())
    add_to_db(Chemical, data_list)
    print(i)
print('done')

### D. Gene

In [20]:
records = pickle.load(open('data/version-3/gene-lexicon-raw.p', 'rb'))
genelex = pd.DataFrame(records)

# Subset
genelex = genelex[['gene_id', 'Description', 'Name', 'Organism',
                   'OtherAliases', 'OtherDesignations']]

# Keep scientific name of organism only
genelex['Organism'] = genelex['Organism'].map(lambda s: s['ScientificName'])

# Rename columns
genelex.rename(columns={
    'Description': 'gene_name',
    'Name': 'gene_symbol',
    'Organism': 'organism',
    'OtherAliases': 'other_symbols',
    'OtherDesignations': 'synonyms'
}, inplace=True)

del records

genelex.head(2)

Unnamed: 0,gene_id,gene_name,gene_symbol,organism,other_symbols,synonyms
0,1,alpha-1-B glycoprotein,A1BG,Homo sapiens,"A1B, ABG, GAB, HYST2477",alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,alpha-2-macroglobulin,A2M,Homo sapiens,"A2MD, CPAMD5, FWP007, S863-7",alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [21]:
# Redundancy check
assert len((set(chem_gene['gene-id']) | set(gene_dis['gene-id'])) 
           - set(genelex['gene_id'])) == 0,\
    'Gene lexicon is not exhaustive!'

In [22]:
add_to_db_df(Gene, genelex)

### E. References

In [111]:
references = pd.read_csv(DATA/'publication-records.tsv', sep='\t', encoding='utf-8')
clean_colnames(references)

references = references.where((pd.notnull(references)), None)

references.rename(columns={
    'authors': 'authors',
    'journal-name': 'journal_name',
    'journal-name-abbrv.': 'journal_name_abbr',
    'publication-type':'publication_type',
},inplace=True)

pmids_not_mapped = list(str(s) for s in set(food_dis['pmid'].apply(int)) - set(references['pmid']))
references = references.set_index('pmid').\
    reindex(index=references.pmid.tolist() + pmids_not_mapped).fillna('').\
    reset_index()

references.head()

Unnamed: 0,pmid,authors,date,journal_name,journal_name_abbr,publication_type,title
0,17344941,Elkiran T|Harputluoglu H|Yasar U|Babaoglu MO|D...,2007 Jan-Feb,Methods and findings in experimental and clini...,Methods Find Exp Clin Pharmacol,Clinical Trial|Journal Article,Differential alteration of drug-metabolizing e...
1,25456153,Severins N|Mensink RP|Plat J,2015 Feb,"Nutrition, metabolism, and cardiovascular dise...",Nutr Metab Cardiovasc Dis,Journal Article|Randomized Controlled Trial,Effects of lutein-enriched egg yolk in butterm...
2,23922960,Li C|Zuo C|Deng G|Kuang R|Yang Q|Hu C|Sheng O|...,2013,PloS one,PLoS One,Journal Article,Contamination of bananas with beauvericin and ...
3,8766742,Güneser S|Atici A|Cengizler I|Alparslan N,1996 May-Jun,Allergologia et immunopathologia,Allergol Immunopathol (Madr),Journal Article,Inhalant allergens: as a cause of respiratory ...
4,19918387,Brvar M|Bunc M,2009 Sep 9,Cases journal,Cases J,Journal Article,High-degree atrioventricular block in acute et...


In [121]:
add_to_db_df(References, references)

## Final Associations

### Food-Disease

In [47]:
# Curated
neg_fd = food_dis[food_dis.association == 'negative'].\
    groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s))[['pmid']].reset_index(())
neg_fd.reset_index(inplace=True)
neg_fd.rename(columns={'pmid': 'negative-pmid'}, inplace=True)

pos_fd = food_dis[food_dis.association == 'positive'].\
    groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s))[['pmid']]
pos_fd.reset_index(inplace=True)
pos_fd.rename(columns={'pmid': 'positive-pmid'}, inplace=True)

food_dis_curated = pos_fd.merge(neg_fd, on=['food-id', 'disease-id'], how='outer').fillna('')

# Inferred
fd_inferred = list()
for food, chemicals in food2chem.items():
    for chem in chemicals:
        for dis in chem2dis[chem]:
            fd_inferred.append([food, dis, str(chem)])
            
fd_inferred = pd.DataFrame(fd_inferred, 
                           columns=['food-id', 'disease-id', 'pubchem-id'])
fd_inferred = fd_inferred.groupby(['food-id', 'disease-id']).agg(lambda s: '|'.join(s)).reset_index()
del pos_fd, neg_fd

fd_inferred.head(2)

Unnamed: 0,food-id,disease-id,pubchem-id
0,Plant ID:100170,MESH:D002583,5280378
1,Plant ID:100170,MESH:D058186,5280378


In [48]:
food_dis_db = food_dis_curated.merge(fd_inferred, on=['food-id', 'disease-id'], how='outer').fillna('')
food_dis_db['weight'] = food_dis_db.apply(lambda row: len(row['positive-pmid'].split('|')) +
                                          len(row['negative-pmid'].split('|')), 1)
del fd_inferred, food_dis_curated

food_dis_db = food_dis_db.where((pd.notnull(food_dis_db)), None)

food_dis_db.rename(columns={
    'food-id': 'food_id',
    'disease-id': 'disease_id',
    'positive-pmid': 'positive_pmid',
    'negative-pmid': 'negative_pmid',
    'pubchem-id': 'pubchem_id',
    'weight':'weight',
}, inplace=True)

food_dis_db.head()

Unnamed: 0,food_id,disease_id,positive_pmid,negative_pmid,pubchem_id,weight
0,AlcoholicBev ID:17,MESH:D001523,7195588,,,2
1,AlcoholicBev ID:17,MESH:D003866,6798614,,,2
2,AlcoholicBev ID:18,MESH:D005756,11848298,,,2
3,AlcoholicBev ID:18,MESH:D013272,17295722,9615879|17295722,,3
4,AlcoholicBev ID:2,MESH:C537766,25604939,,,2


In [47]:
add_to_db_df(Food_disease, food_dis_db)

### Food-chemical

In [43]:
food_chem_db = food_chem.where((pd.notnull(food_chem)), None)

food_chem_db = food_chem_db.where((pd.notnull(food_chem_db)), None)

food_chem_db.rename(columns={
    'pubchem-id': 'pubchem_id',
    'food-id': 'food_id',
    'reference': 'references',
    'type':'type_relation',
    'inference-network':'inference_network'
}, inplace=True)

food_chem_db.head()

Unnamed: 0,food_id,pubchem_id,content,references
0,Plant ID:100170,5280378,Detected but not quantified,KNApSAcK
1,Plant ID:100170,44260113,Detected but not quantified,KNApSAcK
2,Plant ID:100506,1150,Detected but not quantified,KNApSAcK
3,Plant ID:100506,5202,Detected but not quantified,KNApSAcK
4,Plant ID:100506,5372945,Detected but not quantified,KNApSAcK


In [49]:
add_to_db_df(Food_chemical, food_chem_db)

### Food-gene

In [24]:
# Inferred through chemicals
fg_inf_chem = list()
for food, chemlst in food2chem.items():
    for chem in chemlst:
        for gene in chem2gene[chem]:
            fg_inf_chem.append([food, str(chem), gene])
            
fg_inf_chem = pd.DataFrame(fg_inf_chem, columns=['food-id', 'pubchem-id', 'gene-id'])
fg_inf_chem = fg_inf_chem.groupby(['food-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()

In [25]:
# Inferred throuch diseases
fg_inf_dis = list()
for food, dislst in food2dis.items():
    for dis in dislst:
        for gene in dis2gene[dis]:
            fg_inf_dis.append([food, dis, gene])
            
fg_inf_dis = pd.DataFrame(fg_inf_dis, columns=['food-id', 'disease-id', 'gene-id'])
fg_inf_dis = fg_inf_dis.groupby(['food-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()

In [26]:
food_gene_db = fg_inf_dis.merge(fg_inf_chem, on=['food-id', 'gene-id'], how='outer').fillna('')
food_gene_db.rename(columns={
    'gene-id': 'gene_id',
    'food-id': 'food_id',
    'disease-id': 'via_diseases', 
    'pubchem-id': 'via_chemicals'}, inplace=True)

del fg_inf_chem, fg_inf_dis

# food_gene_db['weight'] = food_gene_db.apply(lambda row: len(row['via_diseases'].split('|')) +
#                                             len(row['via_chemicals'].split('|')), 1)


food_gene_db = food_gene_db.where((pd.notnull(food_gene_db)), None)

food_gene_db.head()

Unnamed: 0,food_id,gene_id,via_diseases,via_chemicals
0,AlcoholicBev ID:17,2,MESH:D003866,
1,AlcoholicBev ID:17,15,MESH:D003866,
2,AlcoholicBev ID:17,52,MESH:D003866,
3,AlcoholicBev ID:17,81,MESH:D004487,
4,AlcoholicBev ID:17,100,MESH:D004487,


In [27]:
# del food_gene_db['weight']
add_to_db_df(Food_gene, food_gene_db)

In [28]:
del food_gene_db

### Disease Gene

In [29]:
# Curated
dg_curated = gene_dis.copy(deep=True)

# Inferred through chemicals
dg_inferred = list()
for disease, chemlst in dis2chem.items():
    for chem in chemlst:
        for gene in chem2gene[chem]:
            dg_inferred.append([disease, str(chem), gene])
            
dg_inferred = pd.DataFrame(dg_inferred, columns=['disease-id', 'pubchem-id', 'gene-id'])
dg_inferred = dg_inferred.groupby(['disease-id', 'gene-id']).agg(lambda s: '|'.join(s)).reset_index()
dg_inferred.head(2)

Unnamed: 0,disease-id,gene-id,pubchem-id
0,MESH:C531617,14,14985
1,MESH:C531617,35,14985


In [30]:
# Merge
disease_gene_db = dg_curated.merge(dg_inferred, on=['disease-id', 'gene-id'], how='outer').fillna('')
disease_gene_db.rename(columns={'pubchem-id': 'via-chemicals'}, inplace=True)
del dg_inferred, dg_curated

disease_gene_db = disease_gene_db.where((pd.notnull(disease_gene_db)), None)

disease_gene_db.rename(columns={
    'gene-id': 'gene_id',
    'disease-id': 'disease_id',
    'via-chemicals':'via_chemicals',
    'source': 'reference'
}, inplace=True)

disease_gene_db.head(1)

Unnamed: 0,disease_id,gene_id,reference,via_chemicals
0,MESH:C000591739,1588,ORPHANET,


In [31]:
add_to_db_df(Disease_gene, disease_gene_db)

### Disease chemical

In [31]:
# Curated
chem_dis_curated = chem_dis.copy(deep=True)
chem_dis_curated['pubchem-id'] = chem_dis_curated['pubchem-id'].apply(str)

# Inferred through genes
dc_inferred = list()
for disease, genelst in dis2gene.items():
    for gene in genelst:
        for chem in gene2chem[gene]:
            dc_inferred.append([disease, str(chem), str(gene)])
            
dc_inferred = pd.DataFrame(dc_inferred, columns=['disease-id', 'pubchem-id', 'gene-id'])
dc_inferred = dc_inferred.groupby(['disease-id', 'pubchem-id']).agg(lambda s: '|'.join(s)).reset_index()
dc_inferred.head(2)

Unnamed: 0,disease-id,pubchem-id,gene-id
0,MESH:C000591739,10680,1588
1,MESH:C000591739,14403,1588


In [32]:
# Merge
disease_chem_db = dc_inferred.merge(chem_dis_curated, on=['disease-id', 'pubchem-id'], how='outer').fillna('')
disease_chem_db.rename(columns={'gene-id': 'via-genes'}, inplace=True)
# del disease_chem_db['type']
del dc_inferred, chem_dis_curated

disease_chem_db.head(1)

Unnamed: 0,disease-id,pubchem-id,via-genes,type
0,MESH:C000591739,10680,1588,


In [93]:
# disease_chem_db['weight'] = disease_chem_db['via-genes'].apply(lambda s: len(s.split('|')))

In [78]:
disease_chem_db = disease_chem_db.where((pd.notnull(disease_chem_db)), None)

disease_chem_db.rename(columns={
    'pubchem-id': 'pubchem_id',
    'disease-id': 'disease_id',
    'type':'type_relation',
    'via-genes':'via_genes'
},inplace=True)

In [79]:
Chemical_disease.query.delete()

284612

In [80]:
add_to_db_df(Chemical_disease, disease_chem_db)

### Gene Chemical

In [32]:
# Curated
chem_gene_curated = chem_gene.copy(deep=True)
chem_gene_curated['gene-id'] = chem_gene_curated['gene-id'].apply(str)
chem_gene_curated['pubchem-id'] = chem_gene_curated['pubchem-id'].apply(str)

# Inferred through diseases
gc_inferred = list()
for gene, dislst in gene2dis.items():
    for dis in dislst:
        for chem in dis2chem[dis]:
            gc_inferred.append([str(gene), str(chem), dis])
            
gc_inferred = pd.DataFrame(gc_inferred, columns=['gene-id', 'pubchem-id', 'disease-id'])
gc_inferred = gc_inferred.groupby(['gene-id', 'pubchem-id']).agg(lambda s: '|'.join(s)).reset_index()

In [33]:
# Merge
chem_gene_db = gc_inferred.merge(chem_gene_curated, on=['gene-id', 'pubchem-id'], how='outer').fillna('')
chem_gene_db.rename(columns={'disease-id': 'via-diseases'}, inplace=True)
del chem_gene_db['source']
del gc_inferred, chem_gene_curated

In [34]:
chem_gene_db = chem_gene_db.where((pd.notnull(chem_gene_db)), None)

chem_gene_db.rename(columns={
    'pubchem-id': 'pubchem_id',
    'gene-id': 'gene_id',
    'via-diseases': 'via_diseases',
    'interaction-actions': 'interaction_actions'
}, inplace=True)

In [35]:
add_to_db_df(Chemical_gene, chem_gene_db)

### Chemical Images

In [16]:
from rdkit.Chem import MolFromSmiles, Draw
from rdkit.Chem.AllChem import Compute2DCoords
from shutil import copy

In [19]:
!rm /dietrx/static/images/molecules/*

rm: cannot remove '/dietrx/static/images/molecules/*': No such file or directory


In [20]:
for mol in Chemical.query.all():
    try:
        m = MolFromSmiles(mol.smiles)
        tmp = Compute2DCoords(m)
        Draw.MolToFile(m, 'dietrx/static/images/molecules/' + str(mol.pubchem_id) + '.png')
    except:
        print(mol.mol_id, mol.smiles)
        copy('dietrx/static/images/no-image.png', 
             'dietrx/static/images/molecules/' + str(mol.pubchem_id) + '.png')

### SDF Files

In [21]:
import requests
covered = set()

In [42]:
for i, mol in enumerate(Chemical.query.all()):
    if i not in covered:
        r = requests.get('https://cactus.nci.nih.gov/chemical/structure/%s/file?format=sdf&get3d=True' % mol.smiles)
        with open('dietrx/static/sdf_files/%i.sdf' % mol.pubchem_id, 'w') as f:
            f.write(r.text)

            covered.add(i)
        
    if i % 10 == 0:
        print("Completed: %i" %i)

Completed: 0
Completed: 10
Completed: 20
Completed: 30
Completed: 40
Completed: 50
Completed: 60
Completed: 70
Completed: 80
Completed: 90
Completed: 100
Completed: 110
Completed: 120
Completed: 130
Completed: 140
Completed: 150
Completed: 160
Completed: 170
Completed: 180
Completed: 190
Completed: 200
Completed: 210
Completed: 220
Completed: 230
Completed: 240
Completed: 250
Completed: 260
Completed: 270
Completed: 280
Completed: 290
Completed: 300
Completed: 310
Completed: 320
Completed: 330
Completed: 340
Completed: 350
Completed: 360
Completed: 370
Completed: 380
Completed: 390
Completed: 400
Completed: 410
Completed: 420
Completed: 430
Completed: 440
Completed: 450
Completed: 460
Completed: 470
Completed: 480
Completed: 490
Completed: 500
Completed: 510
Completed: 520
Completed: 530
Completed: 540
Completed: 550
Completed: 560
Completed: 570
Completed: 580
Completed: 590
Completed: 600
Completed: 610
Completed: 620
Completed: 630
Completed: 640
Completed: 650
Completed: 660
Comple

SSLError: HTTPSConnectionPool(host='cactus.nci.nih.gov', port=443): Max retries exceeded with url: /chemical/structure/C(=CC(=O)O)C(=O)O/file?format=sdf&get3d=True (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),))

In [36]:
sum(disease_chem_db['via-genes'] == '')

4183

In [41]:
gene_dis[gene_dis['disease-id'] == 'MESH:D009369']

Unnamed: 0,disease-id,gene-id,source


In [39]:
disease_chem_db[disease_chem_db['disease-id'] == 'MESH:D009369']

Unnamed: 0,disease-id,pubchem-id,via-genes,type
280454,MESH:D009369,137,,marker/mechanism
280541,MESH:D009369,241,,marker/mechanism
280937,MESH:D009369,896,,therapeutic
280963,MESH:D009369,935,,marker/mechanism
280999,MESH:D009369,938,,therapeutic
281139,MESH:D009369,1054,,marker/mechanism
281230,MESH:D009369,1130,,marker/mechanism
281249,MESH:D009369,1140,,marker/mechanism
281393,MESH:D009369,2236,,marker/mechanism
281415,MESH:D009369,2353,,therapeutic


### Refrest Elastic Search

In [5]:
app.elasticsearch.indices.delete(index='food', ignore=[400, 404])
app.elasticsearch.indices.delete(index='disease', ignore=[400, 404])
app.elasticsearch.indices.delete(index='gene', ignore=[400, 404])


Food.reindex("food_id")
Disease.reindex("disease_id")
Gene.reindex("gene_id")

app.elasticsearch.indices.delete(index='chemical', ignore=[400, 404])

Chemical.reindex("pubchem_id")