# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for each. Train a NN to predict positive chem-dis relationships from these vectors

In [358]:
# TODO 
# Currently I'm only creating the uncorrelated group from mixing corr dis and chems, should not be done this way

In [359]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

### 1. Import Vectors and Pre-Process them

In [360]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [361]:
# Strip and split it into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

In [362]:
# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']
# df.head()

In [363]:
# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

In [364]:
# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [365]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [366]:
# Now we have chems/diseases and their respective vector
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


### 2. Create DF for NN
From the ID-Vector dataframe we will now create a df matching each chem with each disease of the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

In [367]:
# Step 1: Import file of proven chem-dis positive associations (created in ctd-to-nt notebook from ctd data)
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [368]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [369]:
# First create is_chem col in df to differentiate between chem and disease
# df['is_chem'] = df.ID.map(lambda x: ':' not in x) # as len of disease ID is always 8

In [370]:
# We only want the chems and diseases that we have vectors for
df.shape

(2970, 2)

In [371]:
# Reshape chem_dis to to only keep lines where both chem and dis have a vec
chem_dis['DiseaseID'] = chem_dis['DiseaseID'].astype(str)
df['ID'] = df['ID'].astype(str)
id_list = df.ID.tolist() # list of chems+diseases with vecs

chem_dis['hasDVec'] = chem_dis.DiseaseID.map(lambda x: x in id_list)
chem_dis['hasCVec'] = chem_dis.ChemicalID.map(lambda x: x in id_list)
chem_dis = chem_dis.loc[(chem_dis['hasDVec'] == True) & (chem_dis['hasCVec'] == True)]
chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [372]:
# Now create a df:
# cID, dID, cVec, dVec, correlate?
# We'll build it on chem_dis...but all current relationships are positive. Need to create non-related pairs c-d
# out_df = chem_dis.copy()
# out_df['correlate'] = 1
# out_df.shape # 62015, should aim to make this many non-related associations too

In [373]:
# So iterate through vecs and create a line for it if there is a rel with a dis that has a vec
# chem_dis['dVec'] = np.nan
# chem_dis['cVec'] = np.nan
# chem_dis['dVec'] = np.where(df.ID == chem_dis.DiseaseID, df.Vector, None)

In [374]:
chem_dis.shape

(6660, 2)

In [375]:
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


In [376]:
# chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [377]:
# merge all info into one df
# NB this does end up only containing correlated pairs... so this should be restructured
df_d = df.copy()
df_d.columns= ['DiseaseID', 'DVec']
df_c = df.copy()
df_c.columns= ['ChemicalID', 'CVec']
df1 = pd.merge(chem_dis, df_d, on='DiseaseID')
df1 = pd.merge(df1, df_c, on='ChemicalID')

In [378]:
df1['Correlation'] = 1 # currently only have correlated in there

In [379]:
df[-5:]

Unnamed: 0,ID,Vector
2965,C009277,"[1.23863621e-02, 8.82702619e-02, 4.14445959e-0..."
2966,D010476,"[9.61125921e-03, 9.76962894e-02, 5.69957085e-0..."
2967,D007545,"[0.0261362, 0.10923246, 0.0478084, -0.09515817..."
2968,C503700,"[1.94576792e-02, 1.17052965e-01, 2.78214607e-0..."
2969,D007840,"[0.02687103, 0.08320867, 0.05077124, -0.084266..."


In [380]:
df1[-5:]

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
6655,C523184,MESH:D010493,"[2.42044590e-02, 8.11136067e-02, 3.06675881e-0...","[0.01392082, 0.10170084, 0.03771368, -0.081525...",1
6656,D010662,MESH:D003876,"[1.61286499e-02, 1.31367072e-01, 2.22104788e-0...","[2.04818249e-02, 1.09417193e-01, 7.35951886e-0...",1
6657,D010938,MESH:D003877,"[6.56600296e-03, 1.54068500e-01, -5.23654372e-...","[0.02128523, 0.0977059, 0.04150996, -0.0879026...",1
6658,D017973,MESH:D012208,"[0.04091055, 0.11387356, 0.05788051, -0.136686...","[2.57809879e-03, 1.36724383e-01, 5.32903746e-0...",1
6659,C023336,MESH:C536561,"[2.78481357e-02, 9.72110108e-02, 4.88455780e-0...","[3.04495990e-02, 9.69514623e-02, 3.08338068e-0...",1


In [381]:
chem_dis[chem_dis.ChemicalID == 'C023336']

Unnamed: 0,ChemicalID,DiseaseID
59045,C023336,MESH:C536561


In [382]:
df1.shape

(6660, 5)

In [383]:
# Randomly generate control set - pairs of unrelated diseases and chemicals and add as rows 
# Change this to randomly select from df instead of chem_dis
no_rows = df1.shape[0]-1
print('shape: ', no_rows)
for x in range(0, no_rows):
    int1 = randint(0, df1.shape[0]-1)
    int2 = randint(0, df1.shape[0]-1)
    chem, chemvec = df1.loc[int1, 'ChemicalID'], df1.loc[int1, 'CVec']
    dis, disvec = df1.loc[int2, 'DiseaseID'], df1.loc[int2, 'DVec']
    df1 = df1.append({'ChemicalID':chem, 'DiseaseID':dis, 'CVec':chemvec, 'DVec':disvec, 'Correlation':0}, ignore_index=True)
    
# Delete any duplicates induced in generating rows (note that this will remove any known correlations 
# from the new rows)
df1 = df1.drop_duplicates(subset=['ChemicalID', 'DiseaseID'], keep=False)

shape:  6659


In [413]:
# no_rows = df1.shape[0]-1
# print('shape: ', no_rows)

chems = df.ID.map(lambda x: 'MESH' in x)
dis = df.ID.map(lambda x: 'MESH' not in x)

len(df.ID)
# dis =

# for x in range(0, no_rows):
    

2970

In [416]:
'MESH' in df.ID[0]

True

In [384]:
df1.shape

(9609, 5)

In [386]:
# Convert the elements of the vectors to actual numbers
df1['DVec'] = df1.DVec.map(lambda x: [float(i) for i in x])
df1['CVec'] = df1.CVec.map(lambda x: [float(i) for i in x])

### 2. Preprocess
Now that we have the df ready, let's split it into train/test/validation sets and convert it into numpy arrays so it can be consumed by a Keras NN

In [388]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
0,C049584,MESH:D001943,"[-0.00754089653, 0.0284954235, -0.145941272, -...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
1,C049584,MESH:D018270,"[0.01976116, 0.098279193, 0.0369541571, -0.089...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [389]:
# # Split df into train, test, val
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [390]:
# For Keras, need to turn inputs into numpy arrays instead of pandas df
# First create single np array of the two vectors CONCERN: should these be two separate inputs?
Dvecs = pd.DataFrame(df1.DVec.values.tolist(), index= df1.index)
Cvecs = pd.DataFrame(df1.CVec.values.tolist(), index= df1.index)
all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)
all_X = np.array(all_X)

In [391]:
# Now create np array of the y output
all_y = np.array(df1.Correlation)

In [392]:
print('y shape: ', all_y.shape)
print('X shape: ', all_X.shape)

y shape:  (9609,)
X shape:  (9609, 400)


In [393]:
# Split into train, test, val
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [394]:
type(X_train)

numpy.ndarray

### 3. Establish NN Model

In [395]:
# 1. Establish the model architecture
model = keras.Sequential([
    keras.layers.Dense(200, activation=tf.nn.relu), #Dense layers are fully connected, this one 128 nodes
    keras.layers.Dense(200, activation=tf.nn.relu), #it's safe to say that I don't know what I'm doing here
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
# ??? How is the number of nodes decided? Final layer has no. of outcome vars

In [396]:
# 2. Compile the model (give it loss func, optimise func and eval metric)
model.compile(optimizer=tf.train.AdamOptimizer(), # determines how the model is adapted based on loss func
              loss='binary_crossentropy', # measure of accuracy during training
              metrics=['accuracy']) # measure for train and testing steps 

In [403]:
# 3. Train
model.fit(X_train, y_train, epochs=16) # ??? How is the number of epochs decided, rule of thumb? elbow?

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7fd33db09208>

In [404]:
# 4. Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.5624349635796045


In [402]:
# # This is pointless - manually verifying accuracy test
# # Round predictions to int based on threshold, run accuracy-test manually
# predictions = model.predict(X_test)
# threshold = predictions[:].sum()/len(predictions) # Threshold is the mean value of predictions
# predictions = [float(round(x[0]-threshold+0.5)) for x in predictions]
# manual_accuracy = sklearn.metrics.accuracy_score(y_test, predictions, normalize=True, sample_weight=None)
# print(manual_accuracy)

0.5109261186264308


### 4. Calculate Cosine Similary
So NN was a bust... will return to it but for now taking a step back and a broad view of which methods to use
to predict chem-dis association <br>
Starting with cosine sim of the dis and chem vecs

In [251]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
5,C049584,MESH:D018450,"[3.39546017e-02, 1.23659089e-01, 3.62569839e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [401]:
# Calculate out the cosine similarity and see if there's a difference between groups (there isn't)
def cosine_sim (row):
    return cosine_similarity(np.array(row.DVec).reshape(1, -1), np.array(row.CVec).reshape(1, -1))[0][0]

df1['cosine_sim'] = df1.apply(lambda row: cosine_sim(row), axis=1)

# Compare cosine sim of correlated and uncorrelated groups
print('Cosine mean with no correlation: ', df1[df1.Correlation == 1 ].cosine_sim.mean())
print('Cosine mean with correlation: ', df1[df1.Correlation == 0 ].cosine_sim.mean())

Cosine mean with no correlation:  0.8946985295116548
Cosine mean with correlation:  0.8967258397244313


In [405]:
df1.sample(5)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation,cosine_sim
5777,D012293,MESH:D009402,"[0.0257995, 0.11954114, 0.05746227, -0.1107729...","[0.01611397, 0.13506274, 0.06212807, -0.140545...",1,0.972719
12723,D011794,MESH:D007249,"[0.01034204, 0.13031472, 0.01779367, -0.105695...","[0.02154884, 0.13381697, -0.02840679, -0.10185...",0,0.964045
1557,D014028,MESH:D001943,"[-0.00754089653, 0.0284954235, -0.145941272, -...","[0.05815332, 0.083392, -0.2269482, -0.03284321...",1,0.963383
16,C049584,MESH:D010190,"[-0.0205049776, 0.0931701958, -0.068471916, -0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,0.804406
9555,D010100,MESH:D009421,"[0.03341816, 0.12678742, 0.03506398, -0.108211...","[-0.01983569, 0.13055551, 0.03985594, -0.09913...",0,0.944355
