# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for each. Train a NN to predict positive chem-dis relationships from these vectors

In [190]:
# TODO 
# Currently I'm only creating the control group from mixing non-control group, ideally use original unrelated c-ds

In [271]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import random
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

### 1. Import Vectors and Pre-Process them

In [192]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [193]:
# Strip and split it into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

In [194]:
# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']
# df.head()

In [195]:
# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

In [196]:
# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [197]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [198]:
# Now we have chems/diseases and their respective vector
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


### 2. Create DF for NN
From the ID-Vector dataframe we will now create a df matching each chem with each disease of the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

In [199]:
# Step 1: Import file of proven chem-dis positive associations (created in ctd-to-nt notebook from ctd data)
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [200]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [201]:
# First create is_chem col in df to differentiate between chem and disease
# df['is_chem'] = df.ID.map(lambda x: ':' not in x) # as len of disease ID is always 8

In [202]:
# We only want the chems and diseases that we have vectors for
df.shape

(2970, 2)

In [203]:
# Reshape chem_dis to to only keep lines where both chem and dis have a vec
chem_dis['DiseaseID'] = chem_dis['DiseaseID'].astype(str)
df['ID'] = df['ID'].astype(str)
id_list = df.ID.tolist()
chem_dis['hasDVec'] = chem_dis.DiseaseID.map(lambda x: x in id_list)
chem_dis['hasCVec'] = chem_dis.ChemicalID.map(lambda x: x in id_list)
chem_dis = chem_dis.loc[(chem_dis['hasDVec'] == True) & (chem_dis['hasCVec'] == True)]
chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [204]:
# Now create a df:
# cID, dID, cVec, dVec, correlate?
# We'll build it on chem_dis...but all current relationships are positive. Need to create non-related pairs c-d
# out_df = chem_dis.copy()
# out_df['correlate'] = 1
# out_df.shape # 62015, should aim to make this many non-related associations too

In [205]:
# So iterate through vecs and create a line for it if there is a rel with a dis that has a vec
# chem_dis['dVec'] = np.nan
# chem_dis['cVec'] = np.nan
# chem_dis['dVec'] = np.where(df.ID == chem_dis.DiseaseID, df.Vector, None)

In [206]:
chem_dis.shape

(6660, 2)

In [207]:
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


In [208]:
# chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [232]:
# merge all info into one df
df_d = df.copy()
df_d.columns= ['DiseaseID', 'DVec']
df_c = df.copy()
df_c.columns= ['ChemicalID', 'CVec']
df1 = pd.merge(chem_dis, df_d, on='DiseaseID')
df1 = pd.merge(df1, df_c, on='ChemicalID')

In [233]:
df1['Correlation'] = 1

In [234]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
0,C049584,MESH:D001943,"[-7.54089653e-03, 2.84954235e-02, -1.45941272e...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [235]:
df1.shape

(6660, 5)

In [236]:
# Randomly generate control set - pairs of unrelated diseases and chemicals and add as rows 
no_rows = df1.shape[0]-1
print('shape: ', no_rows)
for x in range(0, no_rows):
    int1 = randint(0, df1.shape[0]-1)
    int2 = randint(0, df1.shape[0]-1)
    chem, chemvec = df1.loc[int1, 'ChemicalID'], df1.loc[int1, 'CVec']
    dis, disvec = df1.loc[int2, 'DiseaseID'], df1.loc[int2, 'DVec']
    df1 = df1.append({'ChemicalID':chem, 'DiseaseID':dis, 'CVec':chemvec, 'DVec':disvec, 'Correlation':0}, ignore_index=True)
    
# Delete any duplicates induced in generating rows (note that this will remove any known correlations 
# from the new rows)
df1 = df1.drop_duplicates(subset=['ChemicalID', 'DiseaseID'], keep=False)

shape:  6659


In [237]:
df1.shape

(9514, 5)

In [238]:
df1.sample(13)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
105,D001564,MESH:D009361,"[-0.01793019, 0.02332558, -0.13887669, -0.0121...","[-8.07521492e-03, 1.39567137e-01, -2.41595390e...",1
11694,C089730,MESH:D003866,"[0.01702759, 0.12427577, 0.03813349, -0.110617...","[0.01647617, 0.16031805, 0.03482624, -0.119994...",0
3281,D016572,MESH:D010146,"[3.53458934e-02, 1.41089112e-01, 2.37864740e-0...","[0.02098043, 0.16350506, 0.02600217, -0.136558...",1
12657,D037742,MESH:D065886,"[0.03868974, 0.04606202, -0.11462147, -0.04161...","[0.01644101, 0.0993525, 0.04270832, -0.0963581...",0
1329,D012906,MESH:D006333,"[0.01560365, 0.13829835, -0.0064437, -0.112833...","[0.0088706, 0.12644653, 0.05610176, -0.0973913...",1
1050,C023036,MESH:D011041,"[0.02073777, 0.14103304, 0.07279226, -0.122865...","[0.03414991, 0.12547259, -0.00204764, -0.14949...",1
3248,D019256,MESH:D006948,"[0.02679902, 0.13058257, 0.04513128, -0.107747...","[0.03078445, 0.11899038, 0.05539357, -0.125800...",1
4477,D017239,MESH:D006967,"[3.33185270e-02, 1.05473764e-01, 1.61803439e-0...","[3.05738989e-02, 1.24812022e-01, 4.42865677e-0...",1
5976,D019821,MESH:D018908,"[1.6617071e-02, 9.9785693e-02, 5.6648266e-02, ...","[5.62723773e-03, 1.45457312e-01, 3.73894423e-0...",1
9505,D015232,MESH:D020300,"[0.02078298, 0.11145127, 0.0520744, -0.1102614...","[1.96472630e-02, 1.13115221e-01, 6.78690001e-0...",0


### 2. Preprocess
Now that we have the df ready, let's split it into train/test/validation sets and convert it into numpy arrays so it can be consumed by a Keras NN

In [239]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
5,C049584,MESH:D018450,"[3.39546017e-02, 1.23659089e-01, 3.62569839e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [240]:
# # Split df into train, test, val
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [241]:
# For Keras, need to turn inputs into numpy arrays instead of pandas df
# First create single np array of the two vectors CONCERN: should these be two separate inputs?
Dvecs = pd.DataFrame(df1.DVec.values.tolist(), index= df1.index)
Cvecs = pd.DataFrame(df1.CVec.values.tolist(), index= df1.index)
all_X = Dvecs.merge(Cvecs, how='outer', left_index=True, right_index=True)
all_X = np.array(all_X)

In [242]:
# Now create np array of the y output
all_y = np.array(df1.Correlation)

In [243]:
print('y shape: ', all_y.shape)
print('X shape: ', all_X.shape)

y shape:  (9514,)
X shape:  (9514, 400)


In [244]:
# Split into train, test, val
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [245]:
type(X_train)

numpy.ndarray

### 3. Establish NN Model

In [246]:
# 1. Establish the model architecture
model = keras.Sequential([
    keras.layers.Dense(200, activation=tf.nn.relu), #Dense layers are fully connected, this one 128 nodes
    keras.layers.Dense(200, activation=tf.nn.relu), #it's safe to say that I don't know what I'm doing here
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
# ??? How is the number of nodes decided? Final layer has no. of outcome vars

In [247]:
# 2. Compile the model (give it loss func, optimise func and eval metric)
model.compile(optimizer=tf.train.AdamOptimizer(), # determines how the model is adapted based on loss func
              loss='binary_crossentropy', # measure of accuracy during training
              metrics=['accuracy']) # measure for train and testing steps 

In [248]:
# 3. Train
model.fit(X_train, y_train, epochs=6) # ??? How is the number of epochs decided, rule of thumb? elbow?

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7fd34afc30b8>

In [249]:
# 4. Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.5622700998580149


In [228]:
# # This is pointless - manually verifying accuracy test
# # Round predictions to int based on threshold, run accuracy-test manually
# predictions = model.predict(X_test)
# threshold = predictions[:].sum()/len(predictions) # Threshold is the mean value of predictions
# predictions = [int(round(x[0]-threshold+0.5)) for x in predictions]
# manual_accuracy = sklearn.metrics.accuracy_score(y_test, predictions, normalize=True, sample_weight=None)
# print(manual_accuracy)

### 4. Calculate Cosine Similary
So NN was a bust... will return to it but for now taking a step back and a broad view of which methods to use
to predict chem-dis association <br>
Starting with cosine sim of the dis and chem vecs

In [251]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
5,C049584,MESH:D018450,"[3.39546017e-02, 1.23659089e-01, 3.62569839e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [260]:
df1['dvector'] = df1.DVec.map(lambda x: [float(i) for i in x])# ((int(i) for i in x)) for x in df1.DVec)
df1['cvector'] = df1.CVec.map(lambda x: [float(i) for i in x])

In [261]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation,dvector,cvector
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,"[0.01976116, 0.098279193, 0.0369541571, -0.089...","[0.02189679, 0.10079688, 0.04159389, -0.099326..."
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326..."
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326..."
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326..."
5,C049584,MESH:D018450,"[3.39546017e-02, 1.23659089e-01, 3.62569839e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1,"[0.0339546017, 0.123659089, 0.0362569839, -0.1...","[0.02189679, 0.10079688, 0.04159389, -0.099326..."


In [266]:
df1.dvector[1]

[0.01976116,
 0.098279193,
 0.0369541571,
 -0.0897530913,
 -0.154745102,
 0.0290236436,
 0.0530408919,
 -0.000902482425,
 0.0720053613,
 -0.041336827,
 0.00992028508,
 -0.103465043,
 -0.014664581,
 0.0093777962,
 0.148570582,
 0.0985531509,
 -0.0434654877,
 -0.0865896568,
 0.00383703806,
 -0.00602399092,
 0.0253834855,
 0.0271703824,
 0.0382526852,
 0.0886162743,
 -0.0628179833,
 -0.0523640253,
 0.0281912461,
 -0.106265023,
 0.0880977809,
 0.0111200521,
 0.0212230459,
 -0.00216637924,
 -0.0410975516,
 0.110580273,
 0.0171194188,
 0.0934675559,
 0.101760432,
 -0.046714969,
 0.113564804,
 0.0228218939,
 -0.0267247092,
 0.0281053428,
 -0.0340921618,
 0.180602431,
 -0.15899086,
 -0.0727024078,
 0.0102252197,
 -3.06672082e-05,
 0.0731203109,
 0.0928461328,
 0.0489640646,
 -0.087873444,
 -0.0385674052,
 -0.101904385,
 -0.0293791089,
 0.0235408749,
 0.101567715,
 0.0562058017,
 0.0511286631,
 -0.0166602544,
 -0.020712819,
 0.0195202716,
 -0.00397341931,
 0.0343304649,
 0.121722341,
 -0.012347

In [269]:
s = tf.losses.cosine_distance(tf.nn.l2_normalize(df1.dvector[1], 0), tf.nn.l2_normalize(df1.cvector[1], 0), axis=0)

In [270]:
tf.losses.cosine_distance(
    labels,
    predictions,
    axis=None,
    weights=1.0,
    scope=None,
    loss_collection=tf.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS,
    dim=None
)

<tf.Tensor 'cosine_distance_loss_1/value:0' shape=() dtype=float32>

In [279]:

df1['cosine_sim'] = cosine_similarity(np.array(df1.dvector[1]).reshape(1, -1), np.array(df1.cvector[1]).reshape(1, -1))

array([[0.98706982]])

In [291]:
def cosine_sim (row):
    return cosine_similarity(np.array(row.dvector).reshape(1, -1), np.array(row.cvector).reshape(1, -1))[0][0]

In [292]:
df1['cosine_sim'] = df1.apply(lambda row: cosine_sim(row), axis=1)

In [293]:
# type(cosine_similarity(np.array(df1.dvector[1]).reshape(1, -1), np.array(df1.cvector[1]).reshape(1, -1))[0][0])

In [296]:
df1.cosine_sim.mean()

0.8967551173162478

In [295]:
df1.sample(13)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation,dvector,cvector,cosine_sim
4487,D017239,MESH:D010523,"[-1.09452317e-02, 1.57868296e-01, 1.80701411e-...","[3.05738989e-02, 1.24812022e-01, 4.42865677e-0...",1,"[-0.0109452317, 0.157868296, 0.00180701411, -0...","[0.0305738989, 0.124812022, 0.0442865677, -0.1...",0.877209
12465,D014747,MESH:D007248,"[1.71829686e-02, 1.39605537e-01, 2.36382452e-0...","[0.01618104, 0.07219438, -0.15823099, -0.02158...",0,"[0.0171829686, 0.139605537, 0.00236382452, -0....","[0.01618104, 0.07219438, -0.15823099, -0.02158...",0.869425
12161,D017258,MESH:D001281,"[0.03029574, 0.12470473, 0.04246138, -0.127076...","[3.73844840e-02, 9.46905091e-02, 2.48691831e-0...",0,"[0.03029574, 0.12470473, 0.04246138, -0.127076...","[0.037384484, 0.0946905091, 0.0248691831, -0.1...",0.973461
2781,D026023,MESH:D005911,"[3.08072437e-02, 1.20687835e-01, 4.16728817e-0...","[0.03310541, 0.08051842, 0.04012659, -0.070637...",1,"[0.0308072437, 0.120687835, 0.0416728817, -0.1...","[0.03310541, 0.08051842, 0.04012659, -0.070637...",0.969631
5827,C030852,MESH:D013280,"[0.02112314, 0.11517609, 0.04242558, -0.101617...","[0.0010688, 0.11910306, 0.03999582, -0.1281371...",1,"[0.02112314, 0.11517609, 0.04242558, -0.101617...","[0.0010688, 0.11910306, 0.03999582, -0.1281371...",0.970135
13097,D003976,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[2.63617318e-02, 1.08001307e-01, 4.83879410e-0...",0,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.0263617318, 0.108001307, 0.048387941, -0.09...",0.892396
5929,D010894,MESH:D002779,"[0.00174257, 0.08889782, -0.01832172, -0.07990...","[1.70917287e-02, 9.46692526e-02, 4.82771657e-0...",1,"[0.00174257, 0.08889782, -0.01832172, -0.07990...","[0.0170917287, 0.0946692526, 0.0482771657, -0....",0.861249
10247,C049109,MESH:D006331,"[1.17974998e-02, 1.22592472e-01, 2.86617447e-0...","[0.06243994, 0.11764007, 0.00298701, -0.105037...",0,"[0.0117974998, 0.122592472, 0.0286617447, -0.1...","[0.06243994, 0.11764007, 0.00298701, -0.105037...",0.963265
1973,C006632,MESH:D017180,"[0.04513699, 0.1102139, 0.02985132, -0.1074163...","[0.01304277, 0.08549409, 0.03833252, -0.092752...",1,"[0.04513699, 0.1102139, 0.02985132, -0.1074163...","[0.01304277, 0.08549409, 0.03833252, -0.092752...",0.983344
2384,D010634,MESH:D019954,"[0.02417529, 0.09007905, 0.0597823, -0.1178615...","[0.02074566, 0.09746573, 0.02772122, -0.109691...",1,"[0.02417529, 0.09007905, 0.0597823, -0.1178615...","[0.02074566, 0.09746573, 0.02772122, -0.109691...",0.941925
