# Validate NN on VMH unseen database
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Let's take the NN developed in Opa/ and test it out on virtual metabolic human data

In [107]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from pandas_ml import ConfusionMatrix
import json
import subprocess
import pickle
import math
import scipy.io
import urllib.request


#Set random seed
np.random.seed(1606)

### Import VMH Disease List

In [109]:
# Read the tab-separated VMH disease list into a DataFrame
v_dis = pd.read_csv('data/recon-store-diseases-1.tsv', delimiter='\t')

In [110]:
v_dis.shape

(257, 27)

In [111]:
v_dis.sample(2)

Unnamed: 0,abbreviation,name,dtype,subtype,inheritance,organ,omim,prevalence,references,ghr,...,genereviews,clingendosage,igsr1000genoms,gwascataloge,gwascentral,geno2mp,clinvar,lovd,malacard,omim_symptons
135,SIAL,Sialidosis,Inherited metabolic disorder,Carbohydrate disorder,Autosomal recessive,"Nervous system, eye, liver, spleen",256550,The overall prevalence of sialidosis is unknow...,PMID: 14517945,sialidosis,...,,NEU1,ENSG00000228691,NEU1,NEU1,NEU1,608272.0,NEU1,neuraminidase_deficiency,256550.0
195,TSD,Tay-Sachs Disease,Inherited metabolic disorder,Carbohydrate disorder,Autosomal recessive,"Nervous system, eye, muscle",272800,Tay-Sachs disease is very rare in the general ...,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,tay-sachs-disease,...,NBK1218,HEXA,ENSG00000213614,,HEXA,HEXA,606869.0,HEXA,tay_sachs_disease,272800.0


In [112]:
# v_dis = v_dis[['abbreviation', 'name', 'dtype', 'subtype', 'organ', 'omim', 'ghr', 'orphanet',
#  'cellLines',
#  'clinicaltrials',
#  'eurogenetest',
#  'geneticalliance',
#  'gard',
#  'igsr1000genoms',
#  'gwascentral',
#  'clinvar',
#  'malacard',
#  'omim_symptons']]

In [113]:
# Keep only the four columns used in the rest of the notebook
v_dis = v_dis.loc[:, [
    'abbreviation',
    'name',
    'omim',
    'gwascentral',
]]

In [114]:
# Remove the row at position 2 (the third row) of the disease table.
# NOTE(review): the reason this row is excluded is not visible here — confirm.
# The drop is positional, so re-running this cell removes one further row each time.
v_dis = v_dis.drop(v_dis.index[[2]])

In [115]:
# Force abbreviations to str so they can be concatenated into the API URL
v_dis['abbreviation'] = v_dis['abbreviation'].astype(str)

### Get paired chems for these diseases
Using the disease abbreviations above, call the VMH API to get the chemicals (metabolites) associated with each disease.

In [117]:
import pdb
def get_met(abbrev):
    """
    Query the VMH biomarker API for one disease and return its first
    'Increased' metabolite.

    Every entry of the response's 'results' list is examined. (The previous
    version returned during the first loop iteration, so a disease whose first
    biomarker was not 'Increased' produced an all-NaN row even when a later
    biomarker was 'Increased'.)

    Parameters
    ----------
    abbrev : str
        VMH disease abbreviation, e.g. 'ARG'.

    Returns
    -------
    list or None
        * [abbrev, metName, value, cid, chebi, chembl] for the first biomarker
          whose 'value' is 'Increased';
        * [np.nan] * 6 when biomarkers exist but none is a well-formed
          'Increased' entry;
        * None when the API returns no biomarkers at all.

    Raises
    ------
    urllib.error.URLError
        If the API call itself fails (not caught here).
    KeyError
        If the response has no 'results' key.
    """
    # Local import so the cell works without touching the notebook's import cell
    from urllib.parse import quote

    # Quote the abbreviation so unusual characters cannot break the query string
    url = 'https://www.vmh.life/_api/biomarkers/?disease=' + quote(str(abbrev), safe='')

    # The context manager closes the HTTP response once it has been read
    with urllib.request.urlopen(url) as reply:
        payload = json.loads(reply.read().strip())  # bytes -> dict

    biomarkers = payload['results']
    if not biomarkers:
        return None

    for biomarker in biomarkers:
        try:
            met = biomarker['metabolite']
            row = [
                abbrev,
                met['fullName'],
                biomarker['value'],
                met['pubChemId'],
                met['cheBlId'],
                met['chembl'],
            ]
        except (KeyError, TypeError) as e:
            # Malformed entry: report it and keep looking at the remaining ones
            print('Exception: ', e)
            continue
        if row[2] == 'Increased':
            return row

    # Biomarkers were listed, but none of them is a usable 'Increased' entry
    return [np.nan] * 6

In [118]:
# One API call per disease abbreviation; each result is stored in a single column
vmh_df = pd.DataFrame({'column': v_dis['abbreviation'].apply(get_met)})

In [119]:
# Keep only the rows for which get_met returned something other than None
vmh_df = vmh_df.loc[[cell is not None for cell in vmh_df['column']]]

In [121]:
# Spread the list held in `column` over six named columns; the helper frame reuses
# vmh_df's index so the values stay on their original rows.
vmh_df[['DiseaseAbbrev', 'metName', 'value', 'cid', 'chebi', 'chembl']] = pd.DataFrame(vmh_df.column.values.tolist(), index= vmh_df.index)
# The raw list column and the `value` column are removed again.
# NOTE(review): the saved output of the following cell still lists a `value` column,
# so that output appears to predate these deletions — re-run to confirm.
del vmh_df['column']
del vmh_df['value']

In [122]:
# Renumber the rows from 0, discarding the old index, then preview the table
vmh_df = vmh_df.reset_index(drop=True)
vmh_df.head()

Unnamed: 0,DiseaseAbbrev,metName,value,cid,chebi,chembl
0,3MGA,3-Hydroxy-Isovaleryl Carnitine,Increased,53915061,73027.0,
1,ARG,L-Arginine,Increased,6322,16467.0,CHEMBL1485
2,ASA,Citrulline,Increased,9750,16349.0,CHEMBL444814
3,BTD,Propionylcarnitine,Increased,107738,28867.0,
4,BKT,Tiglyl Carnitine,Increased,22833596,,


### Get dis-gofuncs 
Get the GO functions for each disease, using the single gene that VMH gives for it.


In [None]:
# Write the gene names (one per line) to v_genes.txt for the UniProt ID mapping step
genes = v_dis['gwascentral']
np.savetxt(r'v_genes.txt', genes.values, fmt='%s')

#### NOTE the next step is MANUAL
Go to https://www.uniprot.org/uploadlists/, upload the `v_genes.txt` file created above, and ask it to convert
Gene Names to UniProt IDs. Then download the result (uncompressed, as a mapping table) and save it as `data/UniprotIDs.tab`, which is where the next cell reads it from.

In [None]:
# Load the hand-made gene name -> UniProt ID mapping table (first two columns only)
df_uni_ids = pd.read_csv('data/UniprotIDs.tab', delimiter='\t', usecols=[0, 1])
df_uni_ids.columns = ['GeneID', 'UniprotID']
# Gene identifiers are handled as strings from here on
df_uni_ids['GeneID'] = df_uni_ids['GeneID'].astype(str)

In [None]:
df_uni_ids.sample(3)

In [None]:
# The gene/UniProt table is the base; add each gene's disease (OMIM id) by lookup.
# Genes without a match in the VMH disease table get None.
gen2dis = dict(zip(v_dis['gwascentral'], v_dis['omim']))
df_uni_ids['OMIM'] = df_uni_ids['GeneID'].map(gen2dis.get)

In [None]:
# Next: translate UniProt IDs into GO functions.
# Load the GOA file (UniProt ID -> GO function). It is read without a header row and
# its first 30 lines are skipped (presumably the file's leading comment block — confirm).
go_funcs = pd.read_csv('../goa_human.gaf', delimiter='\t', skiprows=30, header=None)

In [None]:
# Name the two columns we need (position 1 -> UniProt ID, position 4 -> GO function)
# and keep only those
go_funcs = go_funcs.rename(columns={
    go_funcs.columns[1]: "UniprotID",
    go_funcs.columns[4]: "gofunc",
})
col_list = ['UniprotID', 'gofunc']
df_go = go_funcs.loc[:, col_list]

In [None]:
df_uni_ids.head()

In [None]:
# Join the GO functions onto the gene/UniProt/OMIM table by UniProt ID.
# The join is an outer one, but every row with a missing value is dropped right after.
df_uni_ids = pd.merge(df_uni_ids, df_go, how='outer', on='UniprotID').dropna()

In [None]:
df_uni_ids.head()

In [None]:
# Build the full GO URL for each row: '<' + OBO base URL + GO id with ':' -> '_' + '>'
df_uni_ids['go_url'] = '<http://purl.obolibrary.org/obo/' + df_uni_ids['gofunc'].str.replace(':', '_') + '>'

In [None]:
# Disease association table: only the disease id (OMIM) and its GO URL are exported
col_list_d = ['OMIM', 'go_url']
df_d = df_uni_ids.loc[:, col_list_d]

In [None]:
# Write one association file for chemicals and one for diseases (one table row per line).
# NOTE(review): `df_c` is not defined anywhere in the visible notebook (only `df_d` is
# built above), so on a fresh kernel the first call raises NameError — confirm where the
# chemical associations are meant to come from.
np.savetxt(r'associations_c.txt', df_c.values, fmt='%s')
np.savetxt(r'associations_d.txt', df_d.values, fmt='%s')

In [None]:
# Concatenate the two association files into a single file, `myassociations`.
# The first command overwrites `myassociations` (`>`); the second appends to it (`>>`).
# The return codes of the shell commands are not checked.
subprocess.call('cat associations_c.txt > myassociations', shell=True)
subprocess.call('cat associations_d.txt >> myassociations', shell=True)

In [None]:
# Create entities.lst to inform opa2vec which entities we want vectors for.
# df_d only carries the columns 'OMIM' and 'go_url', so the disease identifiers are read
# from 'OMIM' (the previous `df_d.DiseaseID` lookup raised AttributeError).
# NOTE(review): `df_c` (with a `ChemicalID` column) is not defined in the visible
# notebook and must exist before this cell runs — confirm.
entities = df_d.OMIM.unique().tolist() + df_c.ChemicalID.unique().tolist()
np.savetxt(r'entities.lst', entities, fmt='%s')

In [None]:
# Actually we have to make these into vectors at the same time as the opa-nn vecs so
# 1. get chem-gofuncs
# 2. integrate both into vec creation along with opa-nn vecs

### Get chem-gofuncs

In [None]:
# Does VMH give us chem-gene?
# - with current data
# - with other api call
# ELSE are these chems in CTD
# ELSE are these chems in DisGeNet

### Get assoc genes for diseases (ii) pair them to CTD via semantic matching/omim

In [None]:
# # Map the vmh disease abbreviations to our disease list
# # import diseases
# c_dis = pd.read_csv('../ctd-to-nt/all-diseases-w-genes-ctd-idsnamesgenes.csv')
# print('CTD chems: ', c_dis.shape[0])

In [None]:
# c_dis.head()

In [None]:
# # Match up diseases in two ways, OMIM and semantic similarity (may be ~ no matches...)
# # Use a measure of distance to match up disease names from ctd and from VMH 
# from difflib import SequenceMatcher
# import pdb

# def similar(a, b):
#     return SequenceMatcher(None, a, b).ratio()

# def create_map(std_list, flawed_list):
#     flawed_list = (n for n in flawed_list)
#     team_map = {}
#     best_score = {}
#     for team in flawed_list:
#         scores = [similar(team, std_team) for std_team in std_list]
#         highest = max(scores)
#         if highest > 0.8:
#             index = scores.index(max(scores))
#             team_map[team] = std_list[index]
#     return team_map

In [None]:
## KEEP commented out unless you haven't made this map yet
# mapboy = create_map(v_dis.name, c_dis.DiseaseName.unique())

# # Here we export the dictionary in a way that's easily imported as dict
# import pickle 

# with open('Uniprot_HINO_map'+ '.pkl', 'wb') as f:
#         pickle.dump(mapboy, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# ## The commented section above makes a map of vmh dis to ctd dis, importing here (computationally expensive) 
# def load_obj(name):
#     with open(name + '.pkl', 'rb') as f:
#         return pickle.load(f)

# vdis2cdis = load_obj('vmhdis_to_ctddis')

In [None]:
# vdis2cdis

In [None]:
# # These are the incorrect mappings I've identified for a 0.8 similarity cutoff
# remove = ('Turner Syndrome', 'Werner Syndrome', 'Enterokinase Deficiency', 'Prolidase Deficiency')
# vdis2cdis = {key: vdis2cdis[key] for key in vdis2cdis if key not in remove}

In [None]:
# sorted(c_dis.DiseaseName.unique())[700:]

In [None]:
# sorted(v_dis.name)

In [None]:
# API script to get disease-marker pairs from VMH