# Validate NN on SIDER unseen database
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Let's take the NN developed in Opa/ and test it out on an unseen database

Ways to improve <br>
- get more chems through disgenet
- get more diseases by running opa2vec on ctd data freshly

In [361]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from pandas_ml import ConfusionMatrix
import json
import subprocess
import pickle
import math

#Set random seed
np.random.seed(1606)

In [362]:
# Right, what databases? Where can I get unseen chem-disease associations
# Virtual Metabolic Human - ?nutrients
# Sider - drugs
# I want environmental chemicals, could use EPA toxic list but probably all in training database
# How does this validation thing work, (i) import trained model (ii) create features for the chemical/diseases
# (iii) predict

In [363]:
# Import database of unqiue diseases with their vectors from opa-nn

In [364]:
# import new chemicals with their actual disease associations, extract unique chems

## 1. Sider

In [365]:
# Import sider (all side effects)
# SE = side effect
# CID1 - "flat compound", i.e. stereo-isomers have been merged into one compound
# CID2 - stereo-specific compound id
colnames = ['CID1', 'CID2', 'UMLS', 'UMLS2Type', 'UMLS2', 'SEname']
sider = pd.read_csv('../validation/data/meddra_all_se.tsv', sep='\t', names=colnames)

In [366]:
sider.sample(3)

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname
161261,CID100005314,CID000005314,C0042510,PT,C0042510,Ventricular fibrillation
59539,CID100003143,CID000148123,C0151738,LLT,C0151738,Large intestine perforation
87586,CID100003690,CID000003690,C0020542,PT,C0020542,Pulmonary hypertension


### Get Disease MESH IDs for sider side effects
Problem here is that sider uses UMLS, convert this to MESH
<br> Commenting out the next few cells as the mapping process is intensive and I've saved map

In [367]:
# # Import CTD Chemical-Disease Original CSV to get disease names, try semantic matching to get UMLS-MESH conversion
# Read in CTD sample, skipping the intro rows
cols = ['DiseaseID', 'DiseaseName', 'DirectEvidence']
col_types = {   
    'DiseaseID': 'category',
    'DiseaseName': 'category',
    'DirectEvidence': 'category'
}
df_cd = pd.read_csv('../ctd-to-nt/csvs/CTD_chemicals_diseases.csv', skiprows=27, usecols=cols, dtype=col_types)
df_cd = df_cd.drop(0)
df_cd = df_cd.dropna(subset=['DirectEvidence']) # drop if it doesn't have direct evidence

In [368]:
# df_cd.head()

In [369]:
# Make a mesh disease name to mesh id map for later use
mesh_get_id = dict(zip(df_cd.DiseaseName, df_cd.DiseaseID))

In [370]:
# # Process DiseaseID so as to be usable in url
# df_cd['DiseaseID'] = df_cd['DiseaseID'].str.replace('MESH:', '')

# #Specify type to optimise
# df_cd['ChemicalID'] = df_cd.ChemicalID.astype(str)
# # df_cd['InferenceGeneSymbol'] = df_cd.InferenceGeneSymbol.astype(str)

In [371]:
# Use a measure of distance to match up disease names from ctd (MESH) and from sider (UMLS) 
from difflib import SequenceMatcher
import pdb

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def create_map(std_list, flawed_list):
    flawed_list = (n for n in flawed_list)
    team_map = {}
    best_score = {}
    for team in flawed_list:
        scores = [similar(team, std_team) for std_team in std_list]
        highest = max(scores)
        if highest > 0.8:
            index = scores.index(max(scores))
            team_map[team] = std_list[index]
    return team_map

In [372]:
# umls = sorted(sider.SEname.unique())
# mesh = sorted(df_cd.DiseaseName.unique())

In [373]:
# Commenting out as takes ages, and am saving the map as pickle object
# umls_mesh_map = create_map(umls, mesh)
# umls_mesh_map_mod = {value:key for (key, value) in umls_mesh_map_mod.items()}

In [374]:
# print(ummap)

In [375]:
# # These are the incorrect mappings I've identified for a 0.8 similarity cutoff
# remove = ('Agraphia', 'Angina, Stable', 'Cerebrospinal Fluid Otorrhea', 'Confusion',
#          'Endarteritis', 'Fetal Growth Retardation', 'Glucose Intolerance', 'Hearing Disorders',
#          'Hemoperitoneum', 'Hepatitis, Animal', 'Hip Contracture', 'Hyperoxaluria',
#          'Hyperoxia', 'Hyperpigmentation','Hypolipoproteinemias',  'Intestinal Diseases',
#          'Milk Hypersensitivity', 'Mucositis', 'Murine Acquired Immunodeficiency Syndrome', 
#          'Muscle Neoplasms', 'Mycotoxicosis', 'Olfaction Disorders','Osteopetrosis',
#          'Peanut Hypersensitivity', 'Pharyngeal Neoplasms', 'Polycystic liver disease',
#          'Pseudohypoparathyroidism', 'Psychomotor Agitation', 'Pulmonary Emphysema',
#          'Purpura, Thrombocytopenic', 'Renal Insufficiency', 'Sciatic Neuropathy',
#          'Simian Acquired Immunodeficiency Syndrome', 'Spinal Curvatures', 'Sporotrichosis',
#          'Vipoma', 'Vitamin A Deficiency', 'Vitamin D Deficiency', 'Vitamin E Deficiency',
#          'Wheat Hypersensitivity')
# umls_mesh_map_mod = {key: umls_mesh_map[key] for key in umls_mesh_map if key not in remove}
# # Muscle neoplasms is not the same as muscle spams
# # 'Olfaction Disorders' != 'Ovulation disorder'

In [376]:
# # Export map of UMLS:MESH
# with open('umls_mesh_map'+ '.pkl', 'wb') as f:
#         pickle.dump(umls_mesh_map_mod, f, pickle.HIGHEST_PROTOCOL)

In [377]:
# Loading the map from pickle object - if you haven't created it you may need to uncomment above lines
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

umls_mesh_map_mod = load_obj('umls_mesh_map')

In [378]:
#Use the umls-mesh map to add mesh col to sider

In [379]:
sider['MESH'] = sider.SEname.map(lambda x: umls_mesh_map_mod.get(str(x)))

In [380]:
sider.head()

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,Abdominal Pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,Abdominal Pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,Abdominal Pain


In [381]:
print('total sider rows: ', sider.shape[0])
print('sider rows with mesh value: ', sider[sider.MESH.map(lambda x: x is not None)].shape[0])

total sider rows:  309849
sider rows with mesh value:  146773


In [382]:
sider_mod = sider[sider.MESH.map(lambda x: x is not None)]

In [383]:
sider_mod.sample(2)

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
127407,CID100004595,CID000004595,C0013384,LLT,C0013384,Dyskinesia,Dyskinesias
300125,CID116132446,CID016132446,C0030305,PT,C0030305,Pancreatitis,Pancreatitis


In [384]:
# Split out the two CID columns NOTE that each row can now potentially be two - one for each CID1 and CID2
sider1 = sider_mod[['CID1', 'MESH']]
sider2 = sider_mod[['CID2', 'MESH']]
sider1.columns = ['CID', 'MESH']
sider2.columns = ['CID', 'MESH']
sider_mod = pd.concat([sider1, sider2], ignore_index=True)

In [385]:
sider_mod.sample(2)

Unnamed: 0,CID,MESH
215305,CID000004927,Pruritus
161941,CID000002540,Hypersensitivity


In [386]:
print('Sider shape: ', sider_mod.shape[0])
sider_mod = sider_mod.drop_duplicates()
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESH.unique().shape[0])

Sider shape:  293546
Total unique correlated chem:dis observations:  145635
Unique chems:  2968
Unique diseases:  1034


In [387]:
# Chop out all chems that are in our training database
# Read in training db chems (opa-nn notebook)
chems_in_nn = pd.read_csv('../opa/chemsInNN.txt', names=['Chem'])
chems_in_nn = chems_in_nn.dropna().drop_duplicates()
chems_in_nn.shape[0]

# Now chop from SIDER db
nnChems = list(chems_in_nn.Chem)
sider_mod['inNN'] = sider_mod.CID.map(lambda x: x in nnChems)
sider_mod = sider_mod[sider_mod.inNN.map(lambda x: x == False)]
sider_mod = sider_mod[['CID', 'MESH']]
sider_mod = sider_mod.reset_index(drop=True)

In [388]:
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESH.unique().shape[0])

Total unique correlated chem:dis observations:  145635
Unique chems:  2968
Unique diseases:  1034


In [389]:
# Now we have a set of chem:dis that are not in the NN training set

In [390]:
# Next: Make each vector for these
# Then: Run NN on them

# Chemical entity - Gene Ontology embeddings (via associated genes)
# Disease entity - Gene Ontology embeddings (via associated genes)
# Disease entity - Human Phenotype Ontology embeddings (via associated phenotypes)
# Disease entity - Mammalian Phenotype Ontology embeddings (via associated phenotypes)
# Chemical entity - Chemical Entities of Biological Interest (CHEBI ) Ontology embeddings
# Disease entity - Disease Ontology embeddings
# Chemical entity - Human Interaction Network Ontology embeddings (via associated genes)
# Disease entity - Human Interaction Network Ontology embeddings (via associated genes)


In [391]:
# SIDER-GO vecs
# For this I need chem-gene associations and disease-gene associations
# Sources: CTD, Disgenet

### Sider Go vecs
First get IDs

In [392]:
sider_mod.sample(2)

Unnamed: 0,CID,MESH
119514,CID005311051,Hyperglycemia
5944,CID100002375,Dysuria


In [393]:
# Turn CID to CTD chemical ID with this map I made earlier 
# Load the map from pickle object
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

ctd_cid_map = load_obj('../opa/ctd_cid_map')

In [394]:
# Will need to standardise the CID and decode from bytes
def cid_standardiser (cid):
    # Must be format CID + 9 int chars, starting with 1 seemingly
    cid = int(cid)
    output = 'CID1' + '0' * (8 - len(str(cid))) + str(cid)
    return output

ctd_cid_map_df = pd.DataFrame.from_dict(ctd_cid_map, orient='index')

In [395]:
# Process and reverse map
ctd_cid_map_df[0] = ctd_cid_map_df[0].str.decode('utf-8')
ctd_cid_map_df[0] = ctd_cid_map_df[0].map(lambda x: cid_standardiser(x))
ctd_cid_map = dict(zip(ctd_cid_map_df[0], ctd_cid_map_df.index.values))

In [396]:
# Now we have the map, apply it to our sider df
sider_mod['ChemicalID'] = sider_mod.CID.map(lambda x: ctd_cid_map.get(x))

In [397]:
print('chem:dis combos: ', sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)].shape[0])
print('unique chems: ',sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)].ChemicalID.nunique())

chem:dis combos:  31280
unique chems:  595


In [398]:
sider_mod = sider_mod[sider_mod.ChemicalID.map(lambda x: x is not None)]
sider_mod['MESHid'] = sider_mod.MESH.map(lambda x: mesh_get_id.get(x))
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid
40,CID100000119,Angioedema,D005680,MESH:D000799
41,CID100000119,Pain,D005680,MESH:D010146
42,CID100000119,Urticaria,D005680,MESH:D014581
43,CID100000137,Anemia,D000622,MESH:D000740
44,CID100000137,Aphasia,D000622,MESH:D001037


In [399]:
print('Total unique correlated chem:dis observations: ', sider_mod.shape[0])
print('Unique chems: ', sider_mod.CID.unique().shape[0])
print('Unique diseases: ', sider_mod.MESHid.unique().shape[0])
## Note that we're losing a lot when we take only chems in CTD - see if we can get gene assocs from elsewhere

Total unique correlated chem:dis observations:  31280
Unique chems:  595
Unique diseases:  886


### Get chem-gene-vecs and dis-gene-vecs premade from CTD data

In [400]:
# Import GOFUNC vecs directly, for diseases
with open('../opa/go-gofuncs.lst', 'r') as file:
    text = file.read()
    
# Strip and split vector data into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']

# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [401]:
df[df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))].shape# 586

(586, 2)

In [402]:
# Get the chemical vecs, delete any row without a chemical vec
chem_go_vecs = df[df.ID.map(lambda x: ('MESH' not in x) & ('OMIM' not in x))]
chem_to_vec = dict(zip(chem_go_vecs.ID, chem_go_vecs.Vector))
sider_mod['ChemGoVec'] = sider_mod.ChemicalID.map(lambda x: chem_to_vec.get(x))
sider_mod = sider_mod[sider_mod.ChemGoVec.map(lambda x: x is not None)]

In [403]:
# Get the disease vecs, delete any row without a disease vec
dis_go_vecs = df[df.ID.map(lambda x: 'MESH' in x)]
dis_to_vec = dict(zip(dis_go_vecs.ID, dis_go_vecs.Vector))
sider_mod['DisGoVec'] = sider_mod.MESHid.map(lambda x: dis_to_vec.get(x))
sider_mod = sider_mod[sider_mod.DisGoVec.map(lambda x: x is not None)]

In [404]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec
76,CID100000143,Anorexia,D002955,MESH:D000855,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-0.03718876, 0.12608664, -0.0080918, -0.14357..."
80,CID100000143,Diarrhea,D002955,MESH:D003967,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[7.43628596e-04, 1.31800339e-01, 2.09368542e-0..."
83,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01048629, 0.12093927, 0.02683925, -0.134863..."
87,CID100000143,Pruritus,D002955,MESH:D011537,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-4.40565981e-02, 8.48817080e-02, -7.02312589e..."
88,CID100000143,Stomatitis,D002955,MESH:D013280,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-4.93375286e-02, 8.82932022e-02, 1.08785890e-..."


In [405]:
has_dis_vec = sider_mod.DisGoVec.map(lambda x: x is not np.nan)
has_chem_vec = sider_mod.ChemGoVec.map(lambda x: x is not None)
sider_mod = sider_mod[has_dis_vec & has_chem_vec]
print('Number of chem-dis pairs with gofuncs: ', sider_mod.shape[0])
print('Number of chems: ', sider_mod.ChemicalID.nunique())
print('Number of diseases: ', sider_mod.MESHid.nunique())

Number of chem-dis pairs with gofuncs:  1366
Number of chems:  62
Number of diseases:  166


### Del any pairs in the original NN dataset

In [406]:
# Now to make it a real blind test we must del any chem-dis pairs in the NN db
nn_chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
nn_chem_dis.columns = ['ChemicalID', 'MESHid']

# Remove from sider_mod any chem-dis pairs that exist in nn_chem_dis
combined_cd = pd.merge(sider_mod[['ChemicalID', 'MESHid']], nn_chem_dis, on=['ChemicalID', 'MESHid'], how='left', indicator='Exist')
combined_cd['Exist'] = np.where(combined_cd.Exist == 'both', True, False)
not_in_nn = [not i for i in list(combined_cd.Exist)]
sider_mod = sider_mod[not_in_nn]

In [407]:
has_dis_vec = sider_mod.DisGoVec.map(lambda x: x is not np.nan)
has_chem_vec = sider_mod.ChemGoVec.map(lambda x: x is not None)
sider_mod = sider_mod[has_dis_vec & has_chem_vec]
print('Number of chem-dis pairs with gofuncs: ', sider_mod.shape[0])
print('Number of chems: ', sider_mod.ChemicalID.nunique())
print('Number of diseases: ', sider_mod.MESHid.nunique())

Number of chem-dis pairs with gofuncs:  991
Number of chems:  62
Number of diseases:  151


### Add control rows (all above are correlated)

In [408]:
# Add control rows (all above are correlated)
sider_mod['Correlation'] = 1

In [409]:
sider.head()

Unnamed: 0,CID1,CID2,UMLS,UMLS2Type,UMLS2,SEname,MESH
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,Abdominal Pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,Abdominal Pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,Abdominal Pain


In [410]:
# Add unrelated pairs - control obs
no_rows = (sider_mod.shape[0]-1)   # This is a parameter to be tuned --> how many uncorrelated pairs do we want
print('Original shape: ', sider_mod.shape)
sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
print('Shape after dropping duplicates: ', sider_mod.shape)

# Randomly select chems and diseases (as many as there are related pairs)
df_chems = sider_mod[['ChemicalID', 'ChemGoVec']].drop_duplicates(subset=['ChemicalID']).reset_index(drop=True)
df_dis = sider_mod[['MESHid', 'DisGoVec', 'MESH']].drop_duplicates(subset=['MESHid']).reset_index(drop=True)
df_chems.columns = ['ID', 'Vector']
df_dis.columns = ['ID', 'Vector', 'MESH']

# print('chem size: ', df_chems.shape[0])
# print('dis size: ', df_dis.shape[0])

no_chems = len(df_chems) - 1
no_dis = len(df_dis) - 1
rand_chems = np.random.choice(no_chems, no_rows, replace=True)
rand_dis = np.random.choice(no_dis, no_rows, replace=True)

# Add the new pairs as rows
for x in range(0, no_rows):
    int1 = rand_chems[x]
    int2 = rand_dis[x]
    chem, chemvec = df_chems.loc[int1, 'ID'], df_chems.loc[int1, 'Vector']
    dis, disvec, mesh = df_dis.loc[int2, 'ID'], df_dis.loc[int2, 'Vector'], df_dis.loc[int2, 'MESH']
    sider_mod = sider_mod.append({'ChemicalID':chem, 'MESHid':dis, 'ChemGoVec':chemvec, 'DisGoVec': disvec, 'Correlation':0, 'MESH': mesh}, ignore_index=True)

print('Shape after adding controls: ', sider_mod.shape)
# Drop any duplicates (removes known correlated pairs accidentally generated as uncorrelated)
sider_mod = sider_mod.drop_duplicates(subset=['ChemicalID', 'MESHid'], keep=False)
print('Shape after dropping duplicates: ', sider_mod.shape)

Original shape:  (991, 7)
Shape after dropping duplicates:  (991, 7)
Shape after adding controls:  (1981, 7)
Shape after dropping duplicates:  (1657, 7)


In [411]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01048629, 0.12093927, 0.02683925, -0.134863...",1
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-0.15222934, 0.05032845, -0.21946777, -0.0608...",1
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01984936, 0.0847866, 0.04233291, -0.0797925...",1
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[1.11526530e-03, 1.28473654e-01, 3.04674823e-0...",1
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-6.34962916e-02, 1.08258978e-01, -2.02546176e...","[-1.60670001e-03, 1.17270418e-01, 1.88340824e-...",1


In [412]:
# sider_mod[['MESH', 'ChemicalID', 'Correlation']].sort_values(['ChemicalID'])

In [413]:
# Manually looking at chem-dis associations. Some don't seem to exist like
# Hypertension	D019793
# Neoplasms	D019793
# Don't seem to exist - google search but do exist in sider

In [414]:
sider_mod.sample(5)

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation
423,CID100003394,Skin Ulcer,D005480,MESH:D012883,"[-0.19691008, 0.01246815, -0.21318772, -0.0904...","[-0.03957883, 0.13396174, -0.0498878, -0.09962...",1
1641,,Adenocarcinoma of lung,D005013,MESH:C538231,"[-0.09967206, 0.05885228, -0.1085031, -0.05162...","[-0.05705886, 0.09871884, 0.00668764, -0.11565...",0
156,CID100002818,Hypertriglyceridemia,D003024,MESH:D015228,"[-0.0243297, 0.09980957, -0.01168219, -0.12672...","[-3.41259576e-02, 1.32385924e-01, 4.26359326e-...",1
1646,,Trigeminal Neuralgia,D012968,MESH:D014277,"[1.50684863e-02, 1.09219313e-01, 3.79371457e-0...","[-0.14551874, 0.05946752, -0.18298474, -0.0806...",0
1559,,Hypersensitivity,D004221,MESH:D006967,"[-0.15010051, 0.04692498, -0.16275029, -0.0622...","[0.01048629, 0.12093927, 0.02683925, -0.134863...",0


##  SIDER Phenotype Ontology 

In [415]:
# First get DOIDs --> importing map and applying it:
mapper = pd.read_csv('../opa/chem_dis_to_CID_DOID.csv')
print(mapper.DOID.nunique()) # 1671
mesh_to_doid = dict(zip(mapper.ID, mapper.DOID))
sider_mod['DOID'] = sider_mod.MESHid.map(lambda x: mesh_to_doid.get(x))

1671


In [416]:
# Standardise the DOIDs
def doid_standardiser (doid):
    doid = doid.replace(':', '_')
    return doid

sider_mod['DOID'] = sider_mod.DOID.map(lambda x: np.nan if isinstance(x, float) else doid_standardiser(x))

In [417]:
# Simply load in the premade dis-phenVec maps
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

dis_mpVec = load_obj('../opa/dis_mpVec_map')

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

dis_hpVec = load_obj('../opa/dis_hpVec_map')

In [418]:
# Apply the maps to add phenVecs to our dataframe
empty_vec = [0] * 200

sider_mod['disPhenVecMP'] = sider_mod.DOID.map(lambda x: dis_mpVec.get(x, empty_vec))
sider_mod['disPhenVecHP'] = sider_mod.DOID.map(lambda x: dis_hpVec.get(x, empty_vec))

In [419]:
sider_mod.head()

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01048629, 0.12093927, 0.02683925, -0.134863...",1,DOID_1205,"[3.05852331e-02, 1.36921287e-01, 5.83447553e-0...","[4.44701687e-02, 1.48065820e-01, 7.01489896e-0..."
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-0.15222934, 0.05032845, -0.21946777, -0.0608...",1,DOID_1555,"[0.02479337, 0.11779714, 0.04578669, -0.118837...","[0.04158028, 0.15021026, 0.05177102, -0.128954..."
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01984936, 0.0847866, 0.04233291, -0.0797925...",1,DOID_3021,"[2.61312630e-02, 1.06001623e-01, 5.01475073e-0...","[4.42636572e-02, 1.56471714e-01, 5.58946170e-0..."
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[1.11526530e-03, 1.28473654e-01, 3.04674823e-0...",1,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-6.34962916e-02, 1.08258978e-01, -2.02546176e...","[-1.60670001e-03, 1.17270418e-01, 1.88340824e-...",1,DOID_0060224,"[5.46471141e-02, 1.52429357e-01, 7.47762397e-0...","[0.06875926, 0.15412922, 0.06408796, -0.108529..."


In [420]:
# Right let's add the rest of the features to our dataset

## SIDER Run NN

In [425]:
# Load model (saved in opa-nn notebook)
from tensorflow.keras.models import load_model

model = load_model('../opa/nn020219auc907.h5')



In [426]:
# Now let's see if saving and loading the model worked
# Create a vector from go vec + empty vecs for the other desired vecs
# Use NN to make predictions
# Evaluate these

In [427]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dense (Dense)                (None, 200)               320200    
_________________________________________________________________
dense_1 (Dense)              (None, 60)                12060     
_________________________________________________________________
dense_2 (Dense)              (None, 10)                610       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 332,881
Trainable params: 332,881
Non-trainable params: 0
_________________________________________________________________


### Preprocess vecs

In [428]:
# I thiiink that this model expects input shape 1600, so add empty vecs for the cols I don't have yet
empty_vec = [0] * 200

cols_to_do = ['disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec', ]

for col in cols_to_do:
    sider_mod[col] = np.nan
    sider_mod[col] = sider_mod[col].map(lambda x: empty_vec)

In [429]:
# Need to turn all to float
all_vecs = ['ChemGoVec', 'DisGoVec', 'disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 
            'dis_HINOvec', 'chem_HINOvec', ]

for col in all_vecs:
    sider_mod[col] = sider_mod[col].map(lambda x: [float(i) for i in x])

In [430]:
# Download sider_mod to run NN on it in Opa-nn notebook (compare and see if model load component failing)
sider_mod.to_csv('Sider_val.csv')

In [431]:
# For Keras, need to turn inputs into numpy arrays instead of pandas df
# First create single np array of all vecs... not pretty:
Dvecs = pd.DataFrame(sider_mod.DisGoVec.values.tolist(), index= sider_mod.index)
Cvecs = pd.DataFrame(sider_mod.ChemGoVec.values.tolist(), index= sider_mod.index)
gofuncs = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)

DMPvecs = pd.DataFrame(sider_mod.disPhenVecHP.values.tolist(), index= sider_mod.index)
DHPvecs = pd.DataFrame(sider_mod.disPhenVecMP.values.tolist(), index= sider_mod.index)
disPvecs = DMPvecs.merge(DHPvecs, how='outer', left_index=True, right_index=True)

all_X = disPvecs.merge(gofuncs, how='outer', left_index=True, right_index=True)

CHEBvecs = pd.DataFrame(sider_mod.CHEBIvec.values.tolist(), index = sider_mod.index)
all_X = CHEBvecs.merge(all_X, how='outer', left_index=True, right_index=True)

DOvecs = pd.DataFrame(sider_mod.DOvec.values.tolist(), index = sider_mod.index)
all_X = DOvecs.merge(all_X, how='outer', left_index=True, right_index=True)

dHINOvecs = pd.DataFrame(sider_mod.dis_HINOvec.values.tolist(), index=sider_mod.index)
cHINOvecs = pd.DataFrame(sider_mod.chem_HINOvec.values.tolist(), index=sider_mod.index)
hinovecs = cHINOvecs.merge(dHINOvecs, how='outer', left_index=True, right_index=True)
all_X = all_X.merge(hinovecs, how='outer', left_index=True, right_index=True)

all_X = np.array(all_X)

In [432]:
# Now create np array of the y output
all_y = np.array(sider_mod.Correlation)

In [433]:
print('y shape: ', all_y.shape)
print('X shape: ', all_X.shape)

y shape:  (1657,)
X shape:  (1657, 1600)


In [434]:
# sider_mod[['ChemicalID', 'MESHid', 'Correlation', 'ChemGoVec', 'DisGoVec']]

In [435]:
# Now I have my validation db (tho small...) so Run NN, get predictions and accuracy

In [436]:
# 2. Compile the model (give it loss func, optimise func and eval metric)
model.compile(optimizer=tf.train.AdamOptimizer(), # determines how the model is adapted based on loss func
              loss='binary_crossentropy', # measure of accuracy during training
              metrics=['accuracy']) # measure for train and testing steps 

In [437]:
# Accuracy
test_loss, test_acc = model.evaluate(all_X, all_y)
print('Test accuracy:', test_acc)

Test accuracy: 0.4966807485921741


In [438]:
# Get actual predictions for test set
predictions = model.predict(all_X)
rounded_predictions = [int(float(round(x[0]))) for x in predictions]

In [439]:
# ROC AUC
print('ROC AUC: ', roc_auc_score(all_y, predictions))
# .52...
# Right, options:
# (i) Model is shit and doesn't work and thesis isn't looking great
# (ii) WRONG -> Loading the model in isn't working properly - download data from here and run it in opa-nn
# (iii) Sample size is too small to detect pattern, get bigger validation db...seems v unlikely
# (iv) WRONG (prob)-> Good chance that a significant amount of the controls are actually correlated, import
# controls from opa-nn training database? Is that cheating tho - actually, given that the controls are taken
# from chems and diseases that have approval, it's prob safe to assume uncorr as otherwise would be included as corr
# (v) Some of these side effects occur in less than 1% of patients... seems like quite a bulls eye! Try another 
# dataset
# (vi) CHASE is a genius and highlighted that opa2vec will project the same data in different ways upon different
# trainings so I need to train the validation vecs along with the original vecs

ROC AUC:  0.5753010285874768


In [440]:
print('Chems :', sider_mod.ChemicalID.nunique())
print('Dis :', sider_mod.MESH.nunique())
print('chem:dis obs: ', sider_mod.shape[0])
print('of which are uncorrelated: ', sider_mod[sider_mod.Correlation == 0].shape[0])

Chems : 62
Dis : 151
chem:dis obs:  1657
of which are uncorrelated:  776


In [441]:
sider_mod

Unnamed: 0,CID,MESH,ChemicalID,MESHid,ChemGoVec,DisGoVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,CHEBIvec,DOvec,dis_HINOvec,chem_HINOvec
0,CID100000143,Hypersensitivity,D002955,MESH:D006967,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01048629, 0.12093927, 0.02683925, -0.134863...",1,DOID_1205,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,CID100000143,Urticaria,D002955,MESH:D014581,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[-0.15222934, 0.05032845, -0.21946777, -0.0608...",1,DOID_1555,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,CID100000143,Acute Kidney Injury,D002955,MESH:D058186,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.01984936, 0.0847866, 0.04233291, -0.0797925...",1,DOID_3021,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,CID100000143,Disease Progression,D002955,MESH:D018450,"[-0.1928972, 0.04133245, -0.13697416, -0.05781...","[0.0011152653, 0.128473654, 0.00304674823, -0....",1,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,CID100000681,Atrial Fibrillation,D004298,MESH:D001281,"[-0.0634962916, 0.108258978, -0.00202546176, -...","[-0.00160670001, 0.117270418, 0.0188340824, -0...",1,DOID_0060224,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,CID100000681,Vomiting,D004298,MESH:D014839,"[-0.0634962916, 0.108258978, -0.00202546176, -...","[-0.10008354, 0.11469808, -0.0721268, -0.11735...",1,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,CID100001727,Mental Disorders,D015761,MESH:D001523,"[0.0176533423, 0.108793855, 0.049405124, -0.11...","[0.00456702, 0.10299499, 0.01434879, -0.119055...",1,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,CID100001727,Hypersensitivity,D015761,MESH:D006967,"[0.0176533423, 0.108793855, 0.049405124, -0.11...","[0.01048629, 0.12093927, 0.02683925, -0.134863...",1,DOID_1205,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
11,CID100001727,Urinary Tract Infections,D015761,MESH:D014552,"[0.0176533423, 0.108793855, 0.049405124, -0.11...","[-0.0405895673, 0.150183201, 0.0180282425, -0....",1,DOID_13148,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12,CID100001727,Urticaria,D015761,MESH:D014581,"[0.0176533423, 0.108793855, 0.049405124, -0.11...","[-0.15222934, 0.05032845, -0.21946777, -0.0608...",1,DOID_1555,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
