# Validate NN on SIDER unseen database
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Let's take the NN developed in Opa/ and test it out on an unseen database

Ways to improve <br>
- get more chems through disgenet
- get more diseases by running opa2vec on ctd data freshly

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from pandas_ml import ConfusionMatrix
import json
import subprocess
import pickle
import math

#Set random seed
np.random.seed(1606)

In [2]:
# Right, what databases? Where can I get unseen chem-disease associations
# Virtual Metabolic Human - ?nutrients
# Sider - drugs
# I want environmental chemicals, could use EPA toxic list but probably all in training database
# How does this validation thing work, (i) import trained model (ii) create features for the chemical/diseases
# (iii) predict

In [3]:
# Import database of unqiue diseases with their vectors from opa-nn

In [4]:
# import new chemicals with their actual disease associations, extract unique chems

## 1. Sider

In [5]:
# Import sider (all side effects)
# SE = side effect
# CID1 - "flat compound", i.e. stereo-isomers have been merged into one compound
# CID2 - stereo-specific compound id
colnames = ['CID1', 'CID2', 'UMLS', 'UMLS2Type', 'UMLS2', 'SEname']
sider = pd.read_csv('../validation/data/meddra_all_se.tsv', sep='\t', names=colnames)

In [6]:
sider.sample(3)

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname
161261,CID100005314,CID000005314,C0042510,PT,C0042510,Ventricular fibrillation
59539,CID100003143,CID000148123,C0151738,LLT,C0151738,Large intestine perforation
87586,CID100003690,CID000003690,C0020542,PT,C0020542,Pulmonary hypertension


### Get Disease MESH IDs for sider side effects
Problem here is that sider uses UMLS, convert this to MESH
<br> Commenting out the next few cells as the mapping process is intensive and I've saved map

In [7]:
# # Import CTD Chemical-Disease Original CSV to get disease names, try semantic matching to get UMLS-MESH conversion
# Read in CTD sample, skipping the intro rows
cols = ['DiseaseID', 'DiseaseName', 'DirectEvidence']
col_types = {   
    'DiseaseID': 'category',
    'DiseaseName': 'category',
    'DirectEvidence': 'category'
}
df_cd = pd.read_csv('../ctd-to-nt/csvs/CTD_chemicals_diseases.csv', skiprows=27, usecols=cols, dtype=col_types)
df_cd = df_cd.drop(0)
df_cd = df_cd.dropna(subset=['DirectEvidence']) # drop if it doesn't have direct evidence

In [8]:
# df_cd.head()

In [9]:
# Make a mesh disease name to mesh id map for later use
mesh_get_id = dict(zip(df_cd.DiseaseName, df_cd.DiseaseID))

In [10]:
# # Process DiseaseID so as to be usable in url
# df_cd['DiseaseID'] = df_cd['DiseaseID'].str.replace('MESH:', '')

# #Specify type to optimise
# df_cd['ChemicalID'] = df_cd.ChemicalID.astype(str)
# # df_cd['InferenceGeneSymbol'] = df_cd.InferenceGeneSymbol.astype(str)

In [11]:
# Use a measure of distance to match up disease names from ctd (MESH) and from sider (UMLS) 
from difflib import SequenceMatcher
import pdb

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def create_map(std_list, flawed_list):
    flawed_list = (n for n in flawed_list)
    team_map = {}
    best_score = {}
    for team in flawed_list:
        scores = [similar(team, std_team) for std_team in std_list]
        highest = max(scores)
        if highest > 0.8:
            index = scores.index(max(scores))
            team_map[team] = std_list[index]
    return team_map

In [12]:
# umls = sorted(sider.SEname.unique())
# mesh = sorted(df_cd.DiseaseName.unique())

In [13]:
# Commenting out as takes ages, and am saving the map as pickle object
# umls_mesh_map = create_map(umls, mesh)
# umls_mesh_map_mod = {value:key for (key, value) in umls_mesh_map_mod.items()}

In [14]:
# print(ummap)

In [15]:
# # These are the incorrect mappings I've identified for a 0.8 similarity cutoff
# remove = ('Agraphia', 'Angina, Stable', 'Cerebrospinal Fluid Otorrhea', 'Confusion',
#          'Endarteritis', 'Fetal Growth Retardation', 'Glucose Intolerance', 'Hearing Disorders',
#          'Hemoperitoneum', 'Hepatitis, Animal', 'Hip Contracture', 'Hyperoxaluria',
#          'Hyperoxia', 'Hyperpigmentation','Hypolipoproteinemias',  'Intestinal Diseases',
#          'Milk Hypersensitivity', 'Mucositis', 'Murine Acquired Immunodeficiency Syndrome', 
#          'Muscle Neoplasms', 'Mycotoxicosis', 'Olfaction Disorders','Osteopetrosis',
#          'Peanut Hypersensitivity', 'Pharyngeal Neoplasms', 'Polycystic liver disease',
#          'Pseudohypoparathyroidism', 'Psychomotor Agitation', 'Pulmonary Emphysema',
#          'Purpura, Thrombocytopenic', 'Renal Insufficiency', 'Sciatic Neuropathy',
#          'Simian Acquired Immunodeficiency Syndrome', 'Spinal Curvatures', 'Sporotrichosis',
#          'Vipoma', 'Vitamin A Deficiency', 'Vitamin D Deficiency', 'Vitamin E Deficiency',
#          'Wheat Hypersensitivity')
# umls_mesh_map_mod = {key: umls_mesh_map[key] for key in umls_mesh_map if key not in remove}
# # Muscle neoplasms is not the same as muscle spams
# # 'Olfaction Disorders' != 'Ovulation disorder'

In [16]:
# # Export map of UMLS:MESH
# with open('umls_mesh_map'+ '.pkl', 'wb') as f:
#         pickle.dump(umls_mesh_map_mod, f, pickle.HIGHEST_PROTOCOL)

In [17]:
# Loading the map from pickle object - if you haven't created it you may need to uncomment above lines
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

umls_mesh_map_mod = load_obj('umls_mesh_map')

In [18]:
#Use the umls-mesh map to add mesh col to sider

In [19]:
sider['MESH'] = sider.SEname.map(lambda x: umls_mesh_map_mod.get(str(x)))

In [20]:
sider.head()

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,Abdominal Pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,Abdominal Pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,Abdominal Pain


In [21]:
print('total sider rows: ', sider.shape[0])
print('sider rows with mesh value: ', sider[sider.MESH.map(lambda x: x is not None)].shape[0])

total sider rows:  309849
sider rows with mesh value:  146773


In [22]:
sider_mod = sider[sider.MESH.map(lambda x: x is not None)]

In [23]:
sider_mod.sample(2)

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
127407,CID100004595,CID000004595,C0013384,LLT,C0013384,Dyskinesia,Dyskinesias
300125,CID116132446,CID016132446,C0030305,PT,C0030305,Pancreatitis,Pancreatitis


In [24]:
# Split out the two CID columns NOTE that each row can now potentially be two - one for each CID1 and CID2
sider1 = sider_mod[['CID1', 'MESH']]
sider2 = sider_mod[['CID2', 'MESH']]
sider1.columns = ['CID', 'MESH']
sider2.columns = ['CID', 'MESH']
sider_mod = pd.concat([sider1, sider2], ignore_index=True)

In [25]:
sider_mod.sample(2)

Unnamed: 0,CID,MESH
215305,CID000004927,Pruritus
161941,CID000002540,Hypersensitivity


In [26]:
print('Sider shape: ', sider_mod.shape[0])
sider_mod = sider_mod.drop_duplicates()
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESH.unique().shape[0])

Sider shape:  293546
Total unique correlated chem:dis observations:  145635
Unique chems:  2968
Unique diseases:  1034


In [27]:
# Chop out all chems that are in our training database
# Read in training db chems (opa-nn notebook)
chems_in_nn = pd.read_csv('../opa/chemsInNN.txt', names=['Chem'])
chems_in_nn = chems_in_nn.dropna().drop_duplicates()
chems_in_nn.shape[0]

# Now chop from SIDER db
nnChems = list(chems_in_nn.Chem)
sider_mod['inNN'] = sider_mod.CID.map(lambda x: x in nnChems)
sider_mod = sider_mod[sider_mod.inNN.map(lambda x: x == False)]
sider_mod = sider_mod[['CID', 'MESH']]
sider_mod = sider_mod.reset_index(drop=True)

In [28]:
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESH.unique().shape[0])

Total unique correlated chem:dis observations:  145635
Unique chems:  2968
Unique diseases:  1034


In [29]:
# Now we have a set of chem:dis that are not in the NN training set

In [30]:
# Next: Make each vector for these
# Then: Run NN on them

# Chemical entity - Gene Ontology embeddings (via associated genes)
# Disease entity - Gene Ontology embeddings (via associated genes)
# Disease entity - Human Phenotype Ontology embeddings (via associated phenotypes)
# Disease entity - Mammalian Phenotype Ontology embeddings (via associated phenotypes)
# Chemical entity - Chemical Entities of Biological Interest (CHEBI ) Ontology embeddings
# Disease entity - Disease Ontology embeddings
# Chemical entity - Human Interaction Network Ontology embeddings (via associated genes)
# Disease entity - Human Interaction Network Ontology embeddings (via associated genes)


In [31]:
# SIDER-GO vecs
# For this I need chem-gene associations and disease-gene associations
# Sources: CTD, Disgenet

### Sider Go vecs
First get IDs

In [32]:
sider_mod.sample(2)

Unnamed: 0,CID,MESH
119514,CID005311051,Hyperglycemia
5944,CID100002375,Dysuria


In [33]:
# Turn CID to CTD chemical ID with this map I made earlier 
# Load the map from pickle object
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

ctd_cid_map = load_obj('../opa/ctd_cid_map')

In [34]:
# Will need to standardise the CID and decode from bytes
def cid_standardiser (cid):
    # Must be format CID + 9 int chars, starting with 1 seemingly
    cid = int(cid)
    output = 'CID1' + '0' * (8 - len(str(cid))) + str(cid)
    return output

ctd_cid_map_df = pd.DataFrame.from_dict(ctd_cid_map, orient='index')

In [35]:
# Process and reverse map
ctd_cid_map_df[0] = ctd_cid_map_df[0].str.decode('utf-8')
ctd_cid_map_df[0] = ctd_cid_map_df[0].map(lambda x: cid_standardiser(x))
ctd_cid_map = dict(zip(ctd_cid_map_df[0], ctd_cid_map_df.index.values))

In [36]:
# Now we have the map, apply it to our sider df
sider_mod['ChemicalID'] = sider_mod.CID.map(lambda x: ctd_cid_map.get(x))

In [37]:
print('chem:dis combos: ', sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)].shape[0])
print('unique chems: ',sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)].ChemicalID.nunique())

chem:dis combos:  31280
unique chems:  595


In [38]:
sider_mod = sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)]
sider_mod['MESHid'] = sider_mod.MESH.map(lambda x: mesh_get_id.get(x))
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid
40,CID100000119,Angioedema,D005680,MESH:D000799
41,CID100000119,Pain,D005680,MESH:D010146
42,CID100000119,Urticaria,D005680,MESH:D014581
43,CID100000137,Anemia,D000622,MESH:D000740
44,CID100000137,Aphasia,D000622,MESH:D001037


In [39]:
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESHid.unique().shape[0])
## Note that we're losing a lot when we take only chems in CTD - see if we can get gene assocs from elsewhere

Total unique correlated chem:dis observations:  31280
Unique chems:  595
Unique diseases:  886


### Get chem-gene-vecs and dis-gene-vecs premade from CTD data

In [40]:
# Import GOFUNC vecs directly, for diseases
with open('../opa/go-gofuncs.lst', 'r') as file:
    text = file.read()
    
# Strip and split vector data into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']

# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [41]:
df[df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))].shape# 586

(845, 2)

In [42]:
# Get the chemical vecs, delete any row without a chemical vec
chem_go_vecs = df[df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))]
chem_to_vec = dict(zip(chem_go_vecs.ID, chem_go_vecs.Vector))
sider_mod['ChemGoVec'] = sider_mod.ChemicalID.map(lambda x: chem_to_vec.get(x))
sider_mod = sider_mod[sider_mod.ChemGoVec.map(lambda x: x is not None)]

In [43]:
# Get the disease vecs, delete any row without a disease vec
dis_go_vecs = df[df.ID.map(lambda x: 'MESH' in x)]
dis_to_vec = dict(zip(dis_go_vecs.ID, dis_go_vecs.Vector))
sider_mod['DisGoVec'] = sider_mod.MESHid.map(lambda x: dis_to_vec.get(x))
sider_mod = sider_mod[sider_mod.DisGoVec.map(lambda x: x is not None)]

In [44]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec
76,CID100000143,Anorexia,D002955,MESH:D000855,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-1.35700975e-04, 1.38808534e-01, 3.22125033e-..."
80,CID100000143,Diarrhea,D002955,MESH:D003967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03242041, 0.13026166, 0.02192746, -0.127020..."
83,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03163584, 0.12489144, 0.02595111, -0.142592..."
87,CID100000143,Pruritus,D002955,MESH:D011537,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.00294329, 0.09807964, -0.07707691, -0.0823..."
88,CID100000143,Stomatitis,D002955,MESH:D013280,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.011678, 0.09770971, 0.02659312, -0.0894378..."


In [45]:
has_dis_vec = sider_mod.DisGoVec.map(lambda x: x is not np.nan)
has_chem_vec = sider_mod.ChemGoVec.map(lambda x: x is not None)
sider_mod = sider_mod[has_dis_vec & has_chem_vec]
print('Number of chem-dis pairs with gofuncs: ', sider_mod.shape[0])
print('Number of chems: ', sider_mod.ChemicalID.nunique())
print('Number of diseases: ', sider_mod.MESHid.nunique())

Number of chem-dis pairs with gofuncs:  1366
Number of chems:  62
Number of diseases:  166


### Del any pairs in the original NN dataset

In [46]:
# Now to make it a real blind test we must del any chem-dis pairs in the NN db
nn_chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
nn_chem_dis.columns = ['ChemicalID', 'MESHid']

# Remove from sider_mod any chem-dis pairs that exist in nn_chem_dis
combined_cd = pd.merge(sider_mod[['ChemicalID', 'MESHid']], nn_chem_dis, on=['ChemicalID', 'MESHid'], how='left', indicator='Exist')
combined_cd['Exist'] = np.where(combined_cd.Exist == 'both', True, False)
not_in_nn = [not i for i in list(combined_cd.Exist)]
sider_mod = sider_mod[not_in_nn]

In [47]:
has_dis_vec = sider_mod.DisGoVec.map(lambda x: x is not np.nan)
has_chem_vec = sider_mod.ChemGoVec.map(lambda x: x is not None)
sider_mod = sider_mod[has_dis_vec & has_chem_vec]
print('Number of chem-dis pairs with gofuncs: ', sider_mod.shape[0])
print('Number of chems: ', sider_mod.ChemicalID.nunique())
print('Number of diseases: ', sider_mod.MESHid.nunique())

Number of chem-dis pairs with gofuncs:  991
Number of chems:  62
Number of diseases:  151


### Add control rows (all above are correlated)

In [48]:
# Add control rows (all above are correlated)
sider_mod['Correlation'] = 1

In [49]:
sider.head()

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,Abdominal Pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,Abdominal Pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,Abdominal Pain


In [50]:
# Add unrelated pairs - control obs
no_rows = (sider_mod.shape[0]-1)    # This is a parameter to be tuned --> how many uncorrelated pairs do we want
print('Original shape: ', sider_mod.shape)
sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
print('Shape after dropping duplicates: ', sider_mod.shape)

# Randomly select chems and diseases (as many as there are related pairs)
df_chems = sider_mod[['ChemicalID', 'ChemGoVec']].drop_duplicates(subset=['ChemicalID']).reset_index(drop=True)
df_dis = sider_mod[['MESHid', 'DisGoVec', 'MESH']].drop_duplicates(subset=['MESHid']).reset_index(drop=True)
df_chems.columns = ['ID', 'Vector']
df_dis.columns = ['ID', 'Vector', 'MESH']

# print('chem size: ', df_chems.shape[0])
# print('dis size: ', df_dis.shape[0])

no_chems = len(df_chems) - 1
no_dis = len(df_dis) - 1
rand_chems = np.random.choice(no_chems, no_rows, replace=True)
rand_dis = np.random.choice(no_dis, no_rows, replace=True)

# Add the new pairs as rows
for x in range(0, no_rows):
    int1 = rand_chems[x]
    int2 = rand_dis[x]
    chem, chemvec = df_chems.loc[int1, 'ID'], df_chems.loc[int1, 'Vector']
    dis, disvec, mesh = df_dis.loc[int2, 'ID'], df_dis.loc[int2, 'Vector'], df_dis.loc[int2, 'MESH']
    sider_mod = sider_mod.append({'ChemicalID':chem, 'MESHid':dis, 'ChemGoVec':chemvec, 'DisGoVec': disvec, 'Correlation':0, 'MESH': mesh}, ignore_index=True)

print('Shape after adding controls: ', sider_mod.shape)
# Drop any duplicates (removes known correlated pairs accidentally generated as uncorrelated)
sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
print('Shape after dropping duplicates: ', sider_mod.shape)

Original shape:  (991, 7)
Shape after dropping duplicates:  (991, 7)
Shape after adding controls:  (1981, 7)
Shape after dropping duplicates:  (1657, 7)


In [51]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",1
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.05275472, 0.06301367, -0.17321466, -0.0895...",1
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.0238973, 0.07814123, 0.0345676, -0.08827752...",1
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03092173, 0.13223884, -0.04477117, -0.14742...",1
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-0.01242278, 0.09383001, -0.07544384, -0.1446...","[0.02409604, 0.13129537, 0.0031278, -0.1579988...",1


In [52]:
# sider_mod[['MESH', 'ChemicalID', 'Correlation']].sort_values(['ChemicalID'])

In [53]:
# Manually looking at chem-dis associations. Some don't seem to exist like
# Hypertension	D019793
# Neoplasms	D019793
# Don't seem to exist - google search but do exist in sider

In [54]:
sider_mod.sample(5)

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation
423,CID100003394,Skin Ulcer,D005480,MESH:D012883,"[-0.035742, 0.03456548, -0.217711, -0.06264855...","[-0.00760662, 0.10853698, -0.04041342, -0.1150...",1
1641,,Adenocarcinoma of lung,D005013,MESH:C538231,"[-6.23983219e-02, 3.05526033e-02, -6.98827058e...","[-1.99492890e-02, 1.23391971e-01, 5.40936664e-...",0
156,CID100002818,Hypertriglyceridemia,D003024,MESH:D015228,"[0.00269767, 0.14648503, 0.00703032, -0.138347...","[0.01583762, 0.14497107, 0.0456062, -0.1102462...",1
1646,,Trigeminal Neuralgia,D012968,MESH:D014277,"[0.01459336, 0.10860223, 0.03106978, -0.113938...","[0.00975664, -0.00295015, -0.18856503, -0.0529...",0
1559,,Hypersensitivity,D004221,MESH:D006967,"[-4.41320464e-02, 4.18050997e-02, -1.52748421e...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",0


##  SIDER Phenotype Ontology 

In [55]:
# First get DOIDs --> importing map and applying it:
mapper = pd.read_csv('../opa/chem_dis_to_CID_DOID.csv')
print(mapper.DOID.nunique()) # 1671
mesh_to_doid = dict(zip(mapper.ID, mapper.DOID))
sider_mod['DOID'] = sider_mod.MESHid.map(lambda x: mesh_to_doid.get(x))

1671


In [56]:
# Standardise the DOIDs
def doid_standardiser (doid):
    doid = doid.replace(':', '_')
    return doid

sider_mod['DOID'] = sider_mod.DOID.map(lambda x: np.nan if isinstance(x, float) else doid_standardiser(x))

In [57]:
# Simply load in the premade dis-phenVec maps
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

dis_mpVec = load_obj('../opa/dis_mpVec_map')

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

dis_hpVec = load_obj('../opa/dis_hpVec_map')

In [58]:
# Apply the maps to add phenVecs to our dataframe
empty_vec = [0] * 200

sider_mod['disPhenVecMP'] = sider_mod.DOID.map(lambda x: dis_mpVec.get(x, empty_vec))
sider_mod['disPhenVecHP'] = sider_mod.DOID.map(lambda x: dis_hpVec.get(x, empty_vec))

In [59]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",1,DOID_1205,"[3.10053602e-02, 1.36995822e-01, 5.86080104e-0...","[4.44377996e-02, 1.47881836e-01, 7.03755245e-0..."
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.05275472, 0.06301367, -0.17321466, -0.0895...",1,DOID_1555,"[2.47163400e-02, 1.18226737e-01, 4.61919121e-0...","[4.06458825e-02, 1.50512770e-01, 5.22731468e-0..."
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.0238973, 0.07814123, 0.0345676, -0.08827752...",1,DOID_3021,"[2.60288436e-02, 1.05936542e-01, 4.98992279e-0...","[4.38529812e-02, 1.56046316e-01, 5.53010181e-0..."
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03092173, 0.13223884, -0.04477117, -0.14742...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-0.01242278, 0.09383001, -0.07544384, -0.1446...","[0.02409604, 0.13129537, 0.0031278, -0.1579988...",1,DOID_0060224,"[0.05471873, 0.15233922, 0.07463891, -0.128690...","[6.88141435e-02, 1.53615937e-01, 6.51953071e-0..."


In [60]:
# Right let's add the rest of the features to our dataset

### Add CHEBI vecs

In [61]:
# Import chem2chebivec map made in opa-nn notebook
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

chem2chebi = load_obj('../opa/chem2chebi')

In [62]:
sider_mod.sample(4)

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP
1121,,Proteinuria,D003024,MESH:D011507,"[0.00269767, 0.14648503, 0.00703032, -0.138347...","[-1.85592696e-02, 1.07028866e-02, -1.92884237e...",0,DOID_576,"[0.03868523, 0.12156112, 0.06080182, -0.112411...","[0.05968903, 0.15474011, 0.05591857, -0.121531..."
1032,,Otitis Media,D002065,MESH:D010033,"[0.03464986, 0.08828709, -0.02281343, -0.14911...","[0.0259653, 0.10290584, 0.05516887, -0.1001262...",0,DOID_10754,"[0.02782048, 0.11976513, 0.06038493, -0.120898...","[0.04213005, 0.1586637, 0.04728878, -0.1188094..."
203,CID100003121,Deafness,D014635,MESH:D003638,"[-7.86130223e-03, 1.40450239e-01, -6.48044944e...","[0.03536899, 0.11699601, 0.04886917, -0.118581...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
366,CID100003386,Hyperplasia,D005473,MESH:D006965,"[0.05209794, 0.00067142, -0.4113431, -0.202348...","[0.01746579, 0.14595346, 0.05720224, -0.153178...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [63]:
sider_mod['CHEBIvec'] = sider_mod.ChemicalID.map(lambda x: chem2chebi.get(x, empty_vec))

### Add DO Vecs

In [64]:
# Import Gofunc vec file
with open('../opa/do-vecs.lst', 'r') as file:
    text = file.read()
    
# Strip and split vector data into list of lists [disease, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']

# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

# Make a map of it (DisID to DOvec)
dis_to_DOvec = dict(zip(df.ID, df.Vector))

In [65]:
sider_mod['DOvec'] = sider_mod.MESHid.map(lambda x: dis_to_DOvec.get(x))

In [66]:
# Change the DO vec elements from string to floats
sider_mod['DOvec'] = sider_mod.DOvec.map(lambda x: [float(i) for i in x])

In [67]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,CHEBIvec,DOvec
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",1,DOID_1205,"[3.10053602e-02, 1.36995822e-01, 5.86080104e-0...","[4.44377996e-02, 1.47881836e-01, 7.03755245e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.0323501714, 0.0718148202, 0.0226857904, -0...."
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.05275472, 0.06301367, -0.17321466, -0.0895...",1,DOID_1555,"[2.47163400e-02, 1.18226737e-01, 4.61919121e-0...","[4.06458825e-02, 1.50512770e-01, 5.22731468e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.02033558, 0.07687499, 0.03339575, -0.061590..."
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.0238973, 0.07814123, 0.0345676, -0.08827752...",1,DOID_3021,"[2.60288436e-02, 1.05936542e-01, 4.98992279e-0...","[4.38529812e-02, 1.56046316e-01, 5.53010181e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.0153563516, 0.0718724281, 0.0305716358, -0...."
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03092173, 0.13223884, -0.04477117, -0.14742...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.00961015, 0.0239394, 0.01371201, -0.0367966..."
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-0.01242278, 0.09383001, -0.07544384, -0.1446...","[0.02409604, 0.13129537, 0.0031278, -0.1579988...",1,DOID_0060224,"[0.05471873, 0.15233922, 0.07463891, -0.128690...","[6.88141435e-02, 1.53615937e-01, 6.51953071e-0...","[0.01396446, 0.05335214, 0.02592678, -0.054371...","[0.02116747, 0.06971144, 0.04062635, -0.064116..."


### Get HINO Vecs

In [68]:
# Import HINO vec file
with open('../opa/hinoVecs.lst', 'r') as file:
    text = file.read()
    
# Strip and split vector data into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']

# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

# Make a map of it (DisID to DOvec)
entity_to_HINOvec = dict(zip(df.ID, df.Vector))

In [69]:
sider_mod['dis_HINOvec'] = sider_mod.MESHid.map(lambda x: entity_to_HINOvec.get(x))
sider_mod['chem_HINOvec'] = sider_mod.ChemicalID.map(lambda x: entity_to_HINOvec.get(x))

In [70]:
print('HINO dis vecs: ', sider_mod[sider_mod.dis_HINOvec.map(lambda x: x is not None)].shape[0])
print('HINO chem vecs: ', sider_mod[sider_mod.chem_HINOvec.map(lambda x: x is not None)].shape[0])
at_least_one = sider_mod.chem_HINOvec.map(lambda x: x is not None) | sider_mod.dis_HINOvec.map(lambda x: x is not None)
print('At least one hino vec: ', sider_mod[at_least_one].shape[0])

HINO dis vecs:  1294
HINO chem vecs:  1021
At least one hino vec:  1516


In [71]:
# Add empty vecs in place of None
empty_vec = [0] * 200

for col in ['dis_HINOvec', 'chem_HINOvec']:
    sider_mod[col] = sider_mod[col].map(lambda x: empty_vec if x is None else x)

In [72]:
# Change the HINO vec elements from string to floats
sider_mod['dis_HINOvec'] = sider_mod.dis_HINOvec.map(lambda x: [float(i) for i in x])
sider_mod['chem_HINOvec'] = sider_mod.chem_HINOvec.map(lambda x: [float(i) for i in x])

In [73]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,CHEBIvec,DOvec,dis_HINOvec,chem_HINOvec
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",1,DOID_1205,"[3.10053602e-02, 1.36995822e-01, 5.86080104e-0...","[4.44377996e-02, 1.47881836e-01, 7.03755245e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.0323501714, 0.0718148202, 0.0226857904, -0....","[0.01789101, 0.04353557, 0.01829723, -0.042048...","[0.01958054, 0.07949963, 0.04380728, -0.093200..."
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[-0.05275472, 0.06301367, -0.17321466, -0.0895...",1,DOID_1555,"[2.47163400e-02, 1.18226737e-01, 4.61919121e-0...","[4.06458825e-02, 1.50512770e-01, 5.22731468e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.02033558, 0.07687499, 0.03339575, -0.061590...","[0.02427921, 0.08827087, 0.04033868, -0.078898...","[0.01958054, 0.07949963, 0.04380728, -0.093200..."
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.0238973, 0.07814123, 0.0345676, -0.08827752...",1,DOID_3021,"[2.60288436e-02, 1.05936542e-01, 4.98992279e-0...","[4.38529812e-02, 1.56046316e-01, 5.53010181e-0...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.0153563516, 0.0718724281, 0.0305716358, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01958054, 0.07949963, 0.04380728, -0.093200..."
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.04375384, 0.05983105, -0.11726998, -0.0517...","[0.03092173, 0.13223884, -0.04477117, -0.14742...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01772805, 0.05946666, 0.02581723, -0.057734...","[0.00961015, 0.0239394, 0.01371201, -0.0367966...","[0.01203498, 0.06432136, 0.02607773, -0.058572...","[0.01958054, 0.07949963, 0.04380728, -0.093200..."
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-0.01242278, 0.09383001, -0.07544384, -0.1446...","[0.02409604, 0.13129537, 0.0031278, -0.1579988...",1,DOID_0060224,"[0.05471873, 0.15233922, 0.07463891, -0.128690...","[6.88141435e-02, 1.53615937e-01, 6.51953071e-0...","[0.01396446, 0.05335214, 0.02592678, -0.054371...","[0.02116747, 0.06971144, 0.04062635, -0.064116...","[0.01673175, 0.06547476, 0.02241637, -0.051255...","[0.01246304, 0.07983294, 0.04100963, -0.077603..."


### Get PRO vecs

In [74]:
# Import PRO vec file
with open('../opa/PROVecs.lst', 'r') as file:
    text = file.read()
    
# Strip and split vector data into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']

# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

# Make a map of it (DisID to DOvec)
entity_to_PROvec = dict(zip(df.ID, df.Vector)) 

In [75]:
sider_mod['dis_PROvec'] = sider_mod.MESHid.map(lambda x: entity_to_PROvec.get(x))
sider_mod['chem_PROvec'] = sider_mod.ChemicalID.map(lambda x: entity_to_PROvec.get(x))

In [76]:
# Add empty vecs in place of None
empty_vec = [0] * 200

for col in ['dis_PROvec', 'chem_PROvec']:
    sider_mod[col] = sider_mod[col].map(lambda x: empty_vec if x is None else x)

In [77]:
# Change the vec elements from string to floats
sider_mod['dis_PROvec'] = sider_mod.dis_PROvec.map(lambda x: [float(i) for i in x])
sider_mod['chem_PROvec'] = sider_mod.chem_PROvec.map(lambda x: [float(i) for i in x])

In [78]:
sider_mod.sample(10)

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,CHEBIvec,DOvec,dis_HINOvec,chem_HINOvec,dis_PROvec,chem_PROvec
132,CID100002662,Pulmonary Embolism,D000068579,MESH:D011655,"[0.01938764, 0.07840507, 0.02419372, -0.066259...","[-0.05989128, 0.02124369, -0.07228915, -0.0245...",1,DOID_9477,"[0.02989602, 0.10579331, 0.0458674, -0.1000161...","[6.47266880e-02, 1.49182171e-01, 4.98331375e-0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0156938918, 0.0867972076, 0.0221232176, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01547952, 0.05871394, 0.02636163, -0.041966...","[0.04649611, 0.14973469, 0.11586712, -0.120807..."
644,CID100027991,Hallucinations,D003894,MESH:D006212,"[2.54816320e-02, 1.22387677e-01, 5.23497686e-0...","[4.16016653e-02, 9.38132107e-02, -1.80803277e-...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.65594146e-02, 4.69885767e-02, 2.39662267e-0...","[0.00156835, 0.03837546, 0.02150043, -0.032972...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0087855449, 0.035341766, 0.01706201, -0.048...","[0.01499267, 0.04976962, 0.03062077, -0.041347...","[0.0039960877, 0.0426524729, 0.0267111398, -0...."
765,CID100068740,Pain,C088658,MESH:D010146,"[0.04418474, 0.14310175, 0.04817571, -0.159905...","[2.24024318e-02, 9.60943699e-02, -6.57454357e-...",1,DOID_0060164,"[2.07049586e-02, 1.04310833e-01, 5.95965236e-0...","[0.0864669, 0.18244573, 0.04888484, -0.1034179...","[0.02015562, 0.06090377, 0.02371735, -0.060520...","[0.0221906733, 0.0621656179, 0.0342238694, -0....","[0.0214250777, 0.0636401176, 0.0447132327, -0....","[0.00700311828, 0.0423441194, 0.0239823423, -0...","[0.0285669453, 0.0819910243, 0.0499954, -0.095...","[-0.00724289, 0.09012652, 0.08922295, -0.08863..."
1832,,Acidosis,C520809,MESH:D000138,"[0.01699807, 0.13581963, 0.01922014, -0.101776...","[0.01823403, 0.12453435, 0.03510754, -0.120218...",0,DOID_0050758,"[2.46017128e-02, 1.35439306e-01, 6.50083050e-0...","[4.31502573e-02, 1.76954478e-01, 6.35614619e-0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01223107, 0.05447845, 0.03592256, -0.060255...","[0.00281852437, 0.040843837, 0.0216121934, -0....","[0.00833679456, 0.0382306054, 0.024758298, -0....","[0.0162582137, 0.0336972848, 0.0148332668, -0....","[0.01273698, 0.03527592, 0.01804266, -0.033962..."
1003,,Fibrosis,C056516,MESH:D005355,"[-2.25543845e-02, -3.97761073e-03, -1.65494725...","[0.01802431, 0.14452943, -0.00467963, -0.15955...",0,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01953488, 0.05832418, 0.0215925, -0.0734598...","[0.011856921, 0.0321784094, 0.0178192798, -0.0...","[0.00930883, 0.04089586, 0.01740959, -0.036715...","[0.0188581459, 0.0900882706, 0.0319500118, -0....","[0.00741399, 0.0757233, 0.045437, -0.065031, -...","[0.03806945, 0.14591888, 0.09889416, -0.108370..."
805,CID100082146,Hypersensitivity,C095105,MESH:D006967,"[0.003139, 0.13455677, 0.05174527, -0.15150066...","[0.03163584, 0.12489144, 0.02595111, -0.142592...",1,DOID_1205,"[3.10053602e-02, 1.36995822e-01, 5.86080104e-0...","[4.44377996e-02, 1.47881836e-01, 7.03755245e-0...","[0.01104784, 0.0811751, 0.02639464, -0.0813666...","[0.0323501714, 0.0718148202, 0.0226857904, -0....","[0.01789101, 0.04353557, 0.01829723, -0.042048...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00470749, 0.04630517, 0.02642858, -0.050063...","[0.0234674681, 0.0828227624, 0.0572618805, -0...."
1119,,Dystonia,C108128,MESH:D004421,"[0.03336684, 0.13025133, -0.00576218, -0.14790...","[0.03494442, 0.12329976, 0.03048507, -0.136564...",0,DOID_543,"[0.03337827, 0.13778381, 0.06419638, -0.123143...","[0.04519903, 0.15312935, 0.05736172, -0.116072...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0312586166, 0.0807410479, 0.0347139612, -0....","[0.0108330026, 0.0483490452, 0.0155456848, -0....","[0.00272642, 0.03967267, 0.02175706, -0.040885...","[0.00218241196, 0.0473134592, 0.0247607734, -0...","[0.00342612, 0.05763958, 0.04024423, -0.069848..."
603,CID100005978,Intestinal Obstruction,D014750,MESH:D007415,"[0.03043379, 0.11411092, 0.03300637, -0.118642...","[3.89536209e-02, 1.31734654e-01, 1.25595992e-0...",1,DOID_8437,"[0.0298487, 0.11577346, 0.05007022, -0.1088261...","[0.04601289, 0.16350427, 0.06664635, -0.132718...","[4.79505537e-03, 9.33838561e-02, 5.36614992e-0...","[0.0280748326, 0.0803671926, 0.0389995761, -0....","[0.020531056, 0.059273284, 0.029718777, -0.056...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02114966, 0.07192397, 0.03859979, -0.070619...","[0.0042272, 0.03712083, 0.0267113, -0.03179786..."
393,CID100003386,Hyperuricemia,D005473,MESH:D033461,"[0.05209794, 0.00067142, -0.4113431, -0.202348...","[0.02256741, 0.11404881, 0.0079848, -0.1397816...",1,DOID_1920,"[0.02471046, 0.14064701, 0.05976346, -0.115151...","[0.03516927, 0.16577831, 0.05603116, -0.125716...","[0.00727077, 0.05299819, 0.02481131, -0.049112...","[0.0183377, 0.07108285, 0.03803683, -0.0703049...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.042719543, 0.127159715, 0.054618571, -0.146...","[0.01784863, 0.06257563, 0.03386438, -0.049829...","[0.01558697, 0.19574451, 0.18315063, -0.181546..."
1790,,Subarachnoid Hemorrhage,C010792,MESH:D013345,"[0.01050896, 0.11482349, 0.05574952, -0.128762...","[-0.02982739, 0.10760874, -0.05823579, -0.0920...",0,DOID_0060228,"[0.01072248, 0.09699773, 0.04329898, -0.102470...","[0.05319184, 0.16859166, 0.05600307, -0.116780...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.02268115, 0.06369793, 0.02509757, -0.040526...","[0.015868254, 0.0460937805, 0.0181534849, -0.0...","[0.00610753242, 0.0467655025, 0.0222043451, -0...","[0.00855476316, 0.0393077545, 0.0167582519, -0...","[0.00938532, 0.053098, 0.0328143, -0.04804737,..."


## SIDER Run NN

In [113]:
# Load model (saved in opa-nn notebook)
from tensorflow.keras.models import load_model
# nn14022019auc921GoPhenCHEdoHI
# nn15022019auc937GoPhenCHEdoHIpro
# nn14022019auc921GoPhenCHEdoHI --> .586
# nn15022019auc92GoPhenCHEdoHIpro --> .50
# nn20022019auc878GoHINO.h5
# nn20022019auc866GoHINO
# nn20022019auc859GOHINOnosparsity
model = load_model('../opa/nn20022019auc866GoHINO.h5')



In [114]:
# Now let's see if saving and loading the model worked
# Create a vector from go vec + empty vecs for the other desired vecs
# Use NN to make predictions
# Evaluate these

In [115]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 200)               160200    
_________________________________________________________________
dense_13 (Dense)             (None, 60)                12060     
_________________________________________________________________
dense_14 (Dense)             (None, 10)                610       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 172,881
Trainable params: 172,881
Non-trainable params: 0
_________________________________________________________________


### Preprocess vecs

In [116]:
# # I thiiink that this model expects input shape 1600, so add empty vecs for the cols I don't have yet
# empty_vec = [0] * 200

# cols_to_do = ['disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec', ]

# for col in cols_to_do:
#     sider_mod[col] = np.nan
#     sider_mod[col] = sider_mod[col].map(lambda x: empty_vec)

In [117]:
# Need to turn all to float
all_vecs = ['ChemGoVec', 'DisGoVec', 'disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec', ]

for col in all_vecs:
    sider_mod[col] = sider_mod[col].map(lambda x: [float(i) for i in x])

In [118]:
print(sider_mod[sider_mod.Correlation == 1].shape[0])
print(sider_mod[sider_mod.Correlation == 0].shape[0])
print(sider_mod.shape)

881
776
(1657, 16)


In [119]:
sider_mod.sample(3)

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,CHEBIvec,DOvec,dis_HINOvec,chem_HINOvec,dis_PROvec,chem_PROvec
1344,,Thrombocytosis,C111237,MESH:D013922,"[0.0407695882, 0.122441672, 0.0625707433, -0.1...","[0.0222676, 0.13557586, 0.01347457, -0.1086524...",0,DOID_2228,"[0.02181925, 0.17985186, 0.07445057, -0.141800...","[0.0573725812, 0.141926616, 0.0507249758, -0.1...","[0.01256534, 0.05737388, 0.02561303, -0.061355...","[0.0136506511, 0.0822437853, 0.0422460325, -0....","[0.0113859, 0.04435341, 0.00967744, -0.0428643...","[0.00699205697, 0.0367214382, 0.0206637774, -0...","[0.00984975137, 0.0654935837, 0.0565586984, -0...","[0.00701486599, 0.0914235339, 0.0594028272, -0..."
1168,,Hypercholesterolemia,C101866,MESH:D006937,"[0.05118491, 0.07795462, -0.02230867, -0.11712...","[0.00705798483, 0.113484874, 0.0160176642, -0....",0,DOID_13810,"[0.0243070964, 0.125378847, 0.0576208457, -0.1...","[0.03911279, 0.14988306, 0.05798033, -0.125094...","[-0.00241162, 0.06804803, 0.0312441, -0.088556...","[0.0182029065, 0.0606179424, 0.0304756258, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00871678907, 0.0451454818, 0.0274254084, -0...","[0.02465567, 0.06237349, 0.02876557, -0.051365...","[0.01428858, 0.0444684476, 0.0245946459, -0.04..."
1304,,Hypertension,C108128,MESH:D006973,"[0.03336684, 0.13025133, -0.00576218, -0.14790...","[0.006351477, 0.0485302098, -0.200872347, -0.1...",0,DOID_10763,"[0.03504562, 0.14703396, 0.06828971, -0.125510...","[0.0560684949, 0.168276355, 0.0629214868, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.015685454, 0.0748481154, 0.0308311991, -0.0...","[0.02415318, 0.10397863, 0.04948746, -0.097153...","[0.00272642, 0.03967267, 0.02175706, -0.040885...","[0.05884882, 0.1363091, 0.08159841, -0.1346005...","[0.00342612, 0.05763958, 0.04024423, -0.069848..."


In [120]:
# # Optionally remove all empty vecs
# empty_vec = [0.0] * 200

# for col in ['DisGoVec', 'ChemGoVec', 'disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec', 'dis_PROvec']:
#     sider_mod[col] = sider_mod[col].map(lambda x: np.nan if x == empty_vec else x)
    
# sider_mod = sider_mod.dropna(subset=['DisGoVec', 'ChemGoVec', 'disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec'])

In [121]:
print(sider_mod[sider_mod.Correlation == 1].shape[0])
print(sider_mod[sider_mod.Correlation == 0].shape[0])
print(sider_mod.shape)

881
776
(1657, 16)


In [122]:
# # Download sider_mod to run NN on it in Opa-nn notebook (compare and see if model load component failing)
# sider_mod.to_csv('Sider_val.csv')

In [123]:
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# gofuncs = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# DMPvecs = pd.DataFrame(sider_mod.disPhenVecHP.values.tolist(), index= sider_mod.index)
# DHPvecs = pd.DataFrame(sider_mod.disPhenVecMP.values.tolist(), index= sider_mod.index)
# disPvecs = DMPvecs.merge(DHPvecs, how='outer', left_index=True, right_index=True)

# all_X = disPvecs.merge(gofuncs, how='outer', left_index=True, right_index=True)

# CHEBvecs = pd.DataFrame(sider_mod.CHEBIvec.values.tolist(), index = sider_mod.index)
# all_X = CHEBvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# DOvecs = pd.DataFrame(sider_mod.DOvec.values.tolist(), index = sider_mod.index)
# all_X = DOvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
# cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
# hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
# all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [124]:
# # try out dropping na rows --> does of course boost AUC significantly
# sider_mod = sider_mod.dropna()
# # need to re-add uncorrelated rows tho if dropping na

# # Add unrelated pairs - control obs
# no_rows = (sider_mod.shape[0]-1)   # This is a parameter to be tuned --> how many uncorrelated pairs do we want
# print('Original shape: ', sider_mod.shape)
# sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
# print('Shape after dropping duplicates: ', sider_mod.shape)

# # Randomly select chems and diseases (as many as there are related pairs)
# df_chems = sider_mod[['ChemicalID', 'ChemGoVec']].drop_duplicates(subset=['ChemicalID']).reset_index(drop=True)
# df_dis = sider_mod[['MESHid', 'DisGoVec', 'MESH']].drop_duplicates(subset=['MESHid']).reset_index(drop=True)
# df_chems.columns = ['ID', 'Vector']
# df_dis.columns = ['ID', 'Vector', 'MESH']

# # print('chem size: ', df_chems.shape[0])
# # print('dis size: ', df_dis.shape[0])

# no_chems = len(df_chems) - 1
# no_dis = len(df_dis) - 1
# rand_chems = np.random.choice(no_chems, no_rows, replace=True)
# rand_dis = np.random.choice(no_dis, no_rows, replace=True)

# # Add the new pairs as rows
# for x in range(0, no_rows):
#     int1 = rand_chems[x]
#     int2 = rand_dis[x]
#     chem, chemvec = df_chems.loc[int1, 'ID'], df_chems.loc[int1, 'Vector']
#     dis, disvec, mesh = df_dis.loc[int2, 'ID'], df_dis.loc[int2, 'Vector'], df_dis.loc[int2, 'MESH']
#     sider_mod = sider_mod.append({'ChemicalID':chem, 'MESHid':dis, 'ChemGoVec':chemvec, 'DisGoVec': disvec, 'Correlation':0, 'MESH': mesh}, ignore_index=True)

# print('Shape after adding controls: ', sider_mod.shape)
# # Drop any duplicates (removes known correlated pairs accidentally generated as uncorrelated)
# sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
# print('Shape after dropping duplicates: ', sider_mod.shape)

# # and re add empty vecs
# empty_vec = [0] * 200

# sider_mod['disPhenVecMP'] = sider_mod.DOID.map(lambda x: dis_mpVec.get(x, empty_vec))
# sider_mod['disPhenVecHP'] = sider_mod.DOID.map(lambda x: dis_hpVec.get(x, empty_vec))

In [125]:
# # Version for phen and gofunc vecs
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# gofuncs = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# DMPvecs = pd.DataFrame(sider_mod.disPhenVecHP.values.tolist(), index= sider_mod.index)
# DHPvecs = pd.DataFrame(sider_mod.disPhenVecMP.values.tolist(), index= sider_mod.index)
# disPvecs = DMPvecs.merge(DHPvecs, how='outer', left_index=True, right_index=True)

# all_X = disPvecs.merge(gofuncs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [126]:
# # Version for HINO, DO, CHEBI, disphen and gofunc vecs
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# gofuncs = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# DMPvecs = pd.DataFrame(sider_mod.disPhenVecHP.values.tolist(), index= sider_mod.index)
# DHPvecs = pd.DataFrame(sider_mod.disPhenVecMP.values.tolist(), index= sider_mod.index)
# disPvecs = DMPvecs.merge(DHPvecs, how='outer', left_index=True, right_index=True)

# all_X = disPvecs.merge(gofuncs, how='outer', left_index=True, right_index=True)

# CHEBvecs = pd.DataFrame(sider_mod.CHEBIvec.values.tolist(), index = sider_mod.index)
# all_X = CHEBvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# DOvecs = pd.DataFrame(sider_mod.DOvec.values.tolist(), index = sider_mod.index)
# all_X = DOvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
# cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
# hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
# all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [127]:
# # Version for PRO, HINO, DO, CHEBI, disphen and gofunc vecs
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# gofuncs = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# DMPvecs = pd.DataFrame(sider_mod.disPhenVecHP.values.tolist(), index= sider_mod.index)
# DHPvecs = pd.DataFrame(sider_mod.disPhenVecMP.values.tolist(), index= sider_mod.index)
# disPvecs = DMPvecs.merge(DHPvecs, how='outer', left_index=True, right_index=True)

# all_X = disPvecs.merge(gofuncs, how='outer', left_index=True, right_index=True)

# CHEBvecs = pd.DataFrame(sider_mod.CHEBIvec.values.tolist(), index = sider_mod.index)
# all_X = CHEBvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# DOvecs = pd.DataFrame(sider_mod.DOvec.values.tolist(), index = sider_mod.index)
# all_X = DOvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
# cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
# hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
# all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

# dPROvecs = pd.DataFrame(sider_mod.dis_PROvec.values.tolist(), index=sider_mod.index)
# cPROvecs = pd.DataFrame(sider_mod.chem_PROvec.values.tolist(), index=sider_mod.index)
# PROvecs = cPROvecs.merge(dPROvecs, how='outer', left_index=True, right_index=True)
# all_X = all_X.merge(PROvecs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [128]:
# # Version for gofunc vecs and CHEBI
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# CHEBvecs = pd.DataFrame(sider_mod.CHEBIvec.values.tolist(), index = sider_mod.index)
# all_X = CHEBvecs.merge(all_X, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [129]:
# # Version for gofunc vecs
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [130]:
# # Version for gofunc vecs and HINO
# # For Keras, need to turn inputs into numpy arrays instead of pandas df
# # First create single np array of all vecs... not pretty:
# Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
# Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
# all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

# dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
# cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
# hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
# all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

# all_X = np.array(all_X)

In [131]:
# Version for gofunc vecs and HINO
# For Keras, need to turn inputs into numpy arrays instead of pandas df
# First create single np array of all vecs... not pretty:
Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

all_X = np.array(all_X)

In [132]:
# Now create np array of the y output
all_y = np.array(sider_mod.Correlation)

In [133]:
print('y shape: ', all_y.shape)
print('X shape: ', all_X.shape)

y shape:  (1657,)
X shape:  (1657, 800)


In [134]:
# sider_mod[['ChemicalID', 'MESHid', 'Correlation', 'ChemGoVec', 'DisGoVec']]

In [135]:
# Now I have my validation db (tho small...) so Run NN, get predictions and accuracy

In [136]:
# 2. Compile the model (give it loss func, optimise func and eval metric)
model.compile(optimizer=tf.train.AdamOptimizer(), # determines how the model is adapted based on loss func
              loss='binary_crossentropy', # measure of accuracy during training
              metrics=['accuracy']) # measure for train and testing steps 

In [137]:
# Accuracy
test_loss, test_acc = model.evaluate(all_X, all_y)
print('Test accuracy:', test_acc)

Test accuracy: 0.5588412796364682


In [138]:
# Get actual predictions for test set
predictions = model.predict(all_X)
rounded_predictions = [int(float(round(x[0]))) for x in predictions]

In [139]:
# ROC AUC
print('ROC AUC: ', roc_auc_score(all_y, predictions))

ROC AUC:  0.5623486080718958


In [140]:
print('Chems :', sider_mod.ChemicalID.nunique())
print('Dis :', sider_mod.MESH.nunique())
print('chem:dis obs: ', sider_mod.shape[0])
print('of which are uncorrelated: ', sider_mod[sider_mod.Correlation == 0].shape[0])

Chems : 62
Dis : 151
chem:dis obs:  1657
of which are uncorrelated:  776
