# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for each. Train a NN to predict positive chem-dis relationships from these vectors

In [158]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split

### 1. Import Vectors and Pre-Process them

In [81]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [82]:
# Strip and split it into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

In [83]:
# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']
# df.head()

In [84]:
# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

In [85]:
# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [86]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [87]:
# Now we have chems/diseases and their respective vector
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


### 2. Create DF for NN
From the ID-Vector dataframe we will now create a df matching each chem with each disease of the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

In [88]:
# Step 1: Import file of proven chem-dis positive associations (created in ctd-to-nt notebook from ctd data)
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [89]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [90]:
# First create is_chem col in df to differentiate between chem and disease
# df['is_chem'] = df.ID.map(lambda x: ':' not in x) # as len of disease ID is always 8

In [91]:
# We only want the chems and diseases that we have vectors for
df.shape

(2970, 2)

In [92]:
# Reshape chem_dis to to only keep lines where both chem and dis have a vec
chem_dis['DiseaseID'] = chem_dis['DiseaseID'].astype(str)
df['ID'] = df['ID'].astype(str)
id_list = df.ID.tolist()
chem_dis['hasDVec'] = chem_dis.DiseaseID.map(lambda x: x in id_list)
chem_dis['hasCVec'] = chem_dis.ChemicalID.map(lambda x: x in id_list)
chem_dis = chem_dis.loc[(chem_dis['hasDVec'] == True) & (chem_dis['hasCVec'] == True)]
chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [93]:
# Now create a df:
# cID, dID, cVec, dVec, correlate?
# We'll build it on chem_dis...but all current relationships are positive. Need to create non-related pairs c-d
# out_df = chem_dis.copy()
# out_df['correlate'] = 1
# out_df.shape # 62015, should aim to make this many non-related associations too

(6660, 3)

In [94]:
# So iterate through vecs and create a line for it if there is a rel with a dis that has a vec
# chem_dis['dVec'] = np.nan
# chem_dis['cVec'] = np.nan
# chem_dis['dVec'] = np.where(df.ID == chem_dis.DiseaseID, df.Vector, None)

In [95]:
chem_dis.shape

(6660, 2)

In [96]:
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


In [97]:
# chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [98]:
# merge all info into one df
df_d = df.copy()
df_d.columns= ['DiseaseID', 'DVec']
df_c = df.copy()
df_c.columns= ['ChemicalID', 'CVec']
df1 = pd.merge(chem_dis, df_d, on='DiseaseID')
df1 = pd.merge(df1, df_c, on='ChemicalID')

In [99]:
df1['Correlation'] = 1

In [100]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
0,C049584,MESH:D001943,"[-7.54089653e-03, 2.84954235e-02, -1.45941272e...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [101]:
df1.shape

(6660, 5)

In [102]:
# Randomly generate control set - pairs of unrelated diseases and chemicals and add as rows 
no_rows = df1.shape[0]-1
print('shape: ', no_rows)
for x in range(0, no_rows):
    int1 = randint(0, df1.shape[0]-1)
    int2 = randint(0, df1.shape[0]-1)
    chem, chemvec = df1.loc[int1, 'ChemicalID'], df1.loc[int1, 'CVec']
    dis, disvec = df1.loc[int2, 'DiseaseID'], df1.loc[int2, 'DVec']
    df1 = df1.append({'ChemicalID':chem, 'DiseaseID':dis, 'CVec':chemvec, 'DVec':disvec, 'Correlation':0}, ignore_index=True)
    
# Delete any duplicates induced in generating rows (note that this will remove any known correlations 
# from the new rows)
df1 = df1.drop_duplicates(subset=['ChemicalID', 'DiseaseID'], keep=False)

shape:  6659


In [103]:
df1.shape

(10692, 5)

In [106]:
df1.sample(13)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
3292,D016572,MESH:D003920,"[0.01597756, 0.1102011, 0.04125728, -0.1060583...","[0.02098043, 0.16350506, 0.02600217, -0.136558...",1
1337,D012906,MESH:D002869,"[0.02050007, 0.1214596, 0.02021665, -0.1083158...","[0.0088706, 0.12644653, 0.05610176, -0.0973913...",1
9826,D013749,MESH:D020521,"[6.00448437e-02, 4.87491116e-02, -1.77989587e-...","[-0.00484554, 0.04194254, -0.2854546, 0.007800...",0
9545,D004317,MESH:D014777,"[0.01738031, 0.12546273, 0.05568044, -0.116583...","[0.0052996, 0.09436312, 0.06031594, -0.1170692...",0
9891,D002220,MESH:D065886,"[0.03868974, 0.04606202, -0.11462147, -0.04161...","[2.61190552e-02, 1.01711757e-01, 4.12642285e-0...",0
15750,D010100,MESH:D009135,"[0.03400159, 0.11989318, 0.02914303, -0.126909...","[-0.01983569, 0.13055551, 0.03985594, -0.09913...",0
13683,D011078,MESH:D000013,"[2.95486022e-02, 1.07405908e-01, 3.65215577e-0...","[0.02907555, 0.13805275, 0.03605509, -0.150835...",0
2885,D001151,MESH:D014397,"[3.88203412e-02, 1.17690191e-01, -5.02661616e-...","[-3.86975444e-04, 1.26777455e-01, 1.97975617e-...",1
7548,C089750,MESH:D006984,"[2.17946768e-02, 8.44956636e-02, 3.87270674e-0...","[2.25237999e-02, 1.30792275e-01, 3.84024456e-0...",0
12921,C088658,MESH:D009203,"[0.01541551, 0.11135558, -0.05661469, -0.04524...","[1.76655445e-02, -8.61010998e-02, -6.38402700e...",0


### 2. Preprocess
Now that we have the df ready, let's split it into train/test/validation sets and convert it into numpy arrays so it can be consumed by a Keras NN

In [109]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
0,C049584,MESH:D001943,"[-7.54089653e-03, 2.84954235e-02, -1.45941272e...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [None]:
# # Split df into train, test, val
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [168]:
# For Keras, need to turn inputs into numpy arrays instead of pandas df
# First create single np array of the two vectors CONCERN: should these be two separate inputs?
Dvecs = pd.DataFrame(df1.DVec.values.tolist(), index= df1.index)
Cvecs = pd.DataFrame(df1.CVec.values.tolist(), index= df1.index)
all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)
all_X = np.array(all_X)

In [169]:
# Now create np array of the y output
all_y = np.array(df1.Correlation)

In [170]:
print('y shape: ', all_y.shape)
print('X shape: ', all_X.shape)

y shape:  (10692,)
X shape:  (10692, 400)


In [171]:
# Split into train, test, val
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [172]:
type(X_train)

numpy.ndarray

### 3. Establish NN Model

In [173]:
# 1. Establish the model architecture
model = keras.Sequential([
    keras.layers.Dense(200, activation=tf.nn.relu), #Dense layers are fully connected, this one 128 nodes
    keras.layers.Dense(200, activation=tf.nn.relu), #it's safe to say that I don't know what I'm doing here
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
# ??? How is the number of nodes decided? Final layer has no. of outcome vars

In [174]:
# 2. Compile the model (give it loss func, optimise func and eval metric)
model.compile(optimizer=tf.train.AdamOptimizer(), # determines how the model is adapted based on loss func
              loss='binary_crossentropy', # measure of accuracy during training
              metrics=['accuracy']) # measure for train and testing steps 

In [175]:
# 3. Train
model.fit(X_train, y_train, epochs=5) # ??? How is the number of epochs decided, rule of thumb? elbow?

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3a5ee51550>

In [176]:
# 4. Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.5357643760159061
