# Notebook: Use pretrained NN to make predictions specifically for one disease or chemical
<b> Author: </b> Ian Coleman <br>

In [48]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import subprocess
import math
from tensorflow.keras.models import load_model
import ast
# Set random seed: seeds NumPy's global (legacy) random number generator so
# NumPy-based randomness in this notebook is repeatable between runs.
# NOTE(review): this call does not seed TensorFlow or Python's `random` module.
np.random.seed(1606)

In [49]:
# Load model (saved in opa-nn notebook)
# The path is relative, so the .h5 file has to be in the notebook's working directory.
model = load_model('nn26022019auc82FULL.h5')



In [50]:
# Load the table of chemical/disease rows and their vector columns
# (the vectors arrive as strings and are parsed in a later cell).
# The path is relative to the notebook's working directory.
df = pd.read_csv('nndf.csv')
# Display (rows, columns) as a quick sanity check.
df.shape

(17739, 20)

In [51]:
# Keep a single row per disease (the first one encountered) so that each
# disease is scored exactly once.
df = df.drop_duplicates(subset='DiseaseID', keep='first')
df.shape

(2492, 20)

In [52]:
# Select a chemical to predict for --> stick with those that have most features
# Ozone = D010126
choice = 'D010126'

# Chemical-side vector columns copied from the chosen chemical onto every row.
# NOTE(review): chemPhenVecHP / chemPhenVecMP and Correlation are left untouched,
# so after this cell those columns still describe each row's original chemical.
chem_cols = ['CVec', 'CHEBIvec', 'chem_HINOvec', 'chem_PROvec']

# The lookup runs against whatever rows df currently holds, so the chosen
# chemical may have no row left. Fail with a clear message instead of the bare
# IndexError that `.iloc[0]` would raise on an empty selection.
matches = df[df['ChemicalID'] == choice]
if matches.empty:
    raise ValueError(
        f"ChemicalID {choice!r} not found in df; cannot copy its feature vectors"
    )
chosen_one = matches.iloc[0]

# Turn every disease row into a (chosen chemical, disease) pair by broadcasting
# the chosen chemical's id and vectors down each column.
df['ChemicalID'] = choice
for col in chem_cols:
    df[col] = chosen_one[col]

In [53]:
# Eyeball a few random rows to confirm the chemical columns were overwritten
df.sample(n=5)

Unnamed: 0,index,ChemicalID,DiseaseID,DVec,CVec,Correlation,DOID,disPhenVecMP,disPhenVecHP,chemPhenVecHP,chemPhenVecMP,CHEBIvec,DOvec,dis_HINOvec,chem_HINOvec,dis_PROvec,chem_PROvec,train,test,val
11853,11871,D010126,OMIM:615887,"[0.01818166, 0.12378526, 0.04889142, -0.121179...","[0.03669577, 0.12194263, 0.06072473, -0.123892...",0,DOID_0110063,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01126783, 0.06393044, 0.02156465, -0.055335...","[0.0136271901, 0.0530568585, 0.019042287, -0.0...","[0.0141532, 0.03931979, 0.02005169, -0.0302570...","[0.0162163, 0.04033889, 0.02631078, -0.0336999...","[0.00288361, 0.07592782, 0.03343415, -0.038101...",True,False,False
13305,13330,D010126,MESH:C536385,"[0.00938240439, 0.10453891, 0.0762157366, -0.1...","[0.03669577, 0.12194263, 0.06072473, -0.123892...",0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01199584, 0.03398103, 0.01708337, -0.036862...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0141532, 0.03931979, 0.02005169, -0.0302570...","[0.0220800377, 0.0591374822, 0.0259745289, -0....","[0.00288361, 0.07592782, 0.03343415, -0.038101...",True,False,False
15529,15572,D010126,MESH:C567564,"[0.00841829, 0.140784, 0.02630615, -0.15054174...","[0.03669577, 0.12194263, 0.06072473, -0.123892...",0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00321890693, 0.0142608797, 0.00761737814, -...","[0.01036113, 0.04542729, 0.01591639, -0.044820...","[0.0141532, 0.03931979, 0.02005169, -0.0302570...","[-0.00064169, 0.05391175, 0.02826553, -0.05401...","[0.00288361, 0.07592782, 0.03343415, -0.038101...",True,False,False
9712,9717,D010126,MESH:D008265,"[0.0302413, 0.12132155, 0.06380817, -0.1173168...","[0.03669577, 0.12194263, 0.06072473, -0.123892...",0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0076338821, 0.033348184, 0.016061915, -0.02...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0141532, 0.03931979, 0.02005169, -0.0302570...","[0.00994788, 0.03832113, 0.02065783, -0.035911...","[0.00288361, 0.07592782, 0.03343415, -0.038101...",False,True,False
10791,10804,D010126,MESH:C537466,"[0.0364489332, 0.124290623, 0.0150852408, -0.1...","[0.03669577, 0.12194263, 0.06072473, -0.123892...",0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00016291378, -0.0018969665, -0.0002104037, ...","[0.00513098, 0.05196941, 0.03235548, -0.054002...","[0.0141532, 0.03931979, 0.02005169, -0.0302570...","[-0.01113216, 0.07152898, 0.06170911, -0.05302...","[0.00288361, 0.07592782, 0.03343415, -0.038101...",True,False,False


In [54]:
# Check how the vector cells are stored in the first row
type(df['CVec'].iloc[0])

str

In [55]:
# Need to turn all to float
all_vecs = ['CVec', 'DVec', 'disPhenVecMP', 'disPhenVecHP', 'CHEBIvec', 'DOvec', 'dis_HINOvec', 'chem_HINOvec', 'dis_PROvec', 'chem_PROvec' ]


def _to_float_list(value):
    """Return `value` as a list of Python floats.

    `value` is either the string form of a list (as read from the CSV) or an
    already-parsed sequence. Strings are parsed with `ast.literal_eval`;
    anything else is iterated as-is, so re-running this cell on converted
    columns is a no-op instead of a ValueError from `literal_eval`.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return [float(i) for i in value]


# Parse and convert each vector column in a single pass.
for col in all_vecs:
    df[col] = df[col].map(_to_float_list)

In [56]:
# Version for GoFuncs, DIS-Phens, CHEBI, DO, HINO, PRO
# For Keras, need to turn inputs into a single numpy array instead of a pandas df.
#
# Column blocks are stacked left-to-right in exactly the order the previous
# chain of index merges produced:
#   chem_PRO, dis_PRO, DO, CHEBI, disPhenHP, disPhenMP, DVec, CVec, chem_HINO, dis_HINO
# NOTE(review): this is presumably the layout the saved model was trained on
# (opa-nn notebook) -- confirm there before reordering anything.
#
# Stacking arrays directly avoids merging frames whose integer column labels
# overlap: the automatic '_x'/'_y' suffixes ended up producing duplicate column
# labels, which recent pandas versions reject with a MergeError. It also keeps
# the rows of all_X in the same order as the rows of df by construction.
# The intermediate per-vector DataFrames are no longer created.
FEATURE_ORDER = [
    'chem_PROvec', 'dis_PROvec',
    'DOvec', 'CHEBIvec',
    'disPhenVecHP', 'disPhenVecMP',
    'DVec', 'CVec',
    'chem_HINOvec', 'dis_HINOvec',
]

# Each column holds one list of floats per row; np.array turns it into an
# (n_rows, vector_length) block. dtype=float makes ragged vectors fail loudly
# rather than being padded.
all_X = np.hstack([
    np.array(df[col].tolist(), dtype=float)
    for col in FEATURE_ORDER
])

In [57]:
# Sanity-check the feature matrix; the recorded run produced (2492, 2000),
# i.e. one row per remaining disease and 2000 input features.
all_X.shape #(2492, 2000)

(2492, 2000)

In [58]:
# Score every row of the feature matrix with the pretrained network
# (each row is one disease paired with the chosen chemical).
predictions = model.predict(all_X)

# Hard 0/1 call per row: round the first output value to the nearest integer
rounded_predictions = [
    int(float(round(score_row[0])))
    for score_row in predictions
]

# Keep the raw scores and the rounded calls next to the rows they belong to
df['preds'] = predictions
df['rpreds'] = rounded_predictions

In [70]:
# Build id -> name lookups for chemicals and diseases from the id/name CSV
id2name = pd.read_csv('../ctd-to-nt/chemid_chemname_disid_disname.csv')
chem_id2name = dict(zip(id2name['ChemicalID'], id2name['# ChemicalName']))
dis_id2name = dict(zip(id2name['DiseaseID'], id2name['DiseaseName']))

# Attach human-readable names; an id missing from a lookup yields None
df['ChemName'] = df['ChemicalID'].map(chem_id2name.get)
df['DisName'] = df['DiseaseID'].map(dis_id2name.get)

In [None]:
# Turns out the above doesn't have all the dis ones
# Load a second CSV, presumably a fuller disease id -> name table (going by its file name).
# NOTE(review): this frame is only loaded here; no cell shown uses it to fill
# the missing df['DisName'] values -- TODO confirm / finish.
dis_boyos = pd.read_csv('../ctd-to-nt/dis_id2name.csv')

In [72]:
# Random sample of predictions with readable names for a quick look
df.loc[:, ['ChemName', 'DiseaseID', 'DisName', 'preds', 'rpreds']].sample(n=14)

Unnamed: 0,ChemName,DiseaseID,DisName,preds,rpreds
10584,Ozone,MESH:C562688,,0.028114,0
12317,Ozone,OMIM:613376,,0.007485,0
11845,Ozone,MESH:C567766,,0.003209,0
13231,Ozone,OMIM:616033,,0.011611,0
10640,Ozone,OMIM:300850,,0.000103,0
9072,Ozone,MESH:C580388,,0.078727,0
9343,Ozone,MESH:D011218,Prader-Willi Syndrome,0.006781,0
9735,Ozone,OMIM:613958,,0.496149,0
17179,Ozone,MESH:C537758,,0.02644,0
204,Ozone,MESH:D019957,Motor Skills Disorders,0.734108,1
