# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors

In [80]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randint


### 1. Import Vectors and Pre-Process them

In [81]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [82]:
# Strip and split it into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

In [83]:
# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ID', 'Vector']
# df.head()

In [84]:
# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

In [85]:
# Turn vector column into a list
df['Vector'] = df.Vector.map(lambda x: x.split(','))

In [86]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [87]:
# Now we have 
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


### 2. Create DF for NN
From the ID-Vector DF we will now create a DF matching each chem with each disease of the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

I'm running into a problem here...
Disgenet uses UMLS ID for diseases
CTD uses MESH for diseases

I need to either: 
convert between MESH and UMLS --> Waiting for my UMLS membership, can't see how to do it without
OR recreate vectors using only CTD diseases
OR create a new chem_dis list from Disgenet --> Non-existent

In [88]:
# Step 1: Import file of proven chem-dis positive associations (created in ctd-to-nt notebook)
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [89]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [90]:
# First create is_chem col in df to differentiate between chem and disease
# df['is_chem'] = df.ID.map(lambda x: ':' not in x) # as len of disease ID is always 8

In [91]:
# We only want the chems and diseases that we have vectors for
df.shape

(2970, 2)

In [92]:
# Reshape chem_dis to to only keep lines where both chem and dis have a vec
chem_dis['DiseaseID'] = chem_dis['DiseaseID'].astype(str)
df['ID'] = df['ID'].astype(str)
id_list = df.ID.tolist()
chem_dis['hasDVec'] = chem_dis.DiseaseID.map(lambda x: x in id_list)
chem_dis['hasCVec'] = chem_dis.ChemicalID.map(lambda x: x in id_list)
chem_dis = chem_dis.loc[(chem_dis['hasDVec'] == True) & (chem_dis['hasCVec'] == True)]
chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [93]:
# Now create a df:
# cID, dID, cVec, dVec, correlate?
# We'll build it on chem_dis...but all current relationships are positive. Need to create non-related pairs c-d
out_df = chem_dis.copy()
out_df['correlate'] = 1
out_df.shape # 62015, should aim to make this many non-related associations too

(6660, 3)

In [94]:
# So iterate through vecs and create a line for it if there is a rel with a dis that has a vec
# chem_dis['dVec'] = np.nan
# chem_dis['cVec'] = np.nan
# chem_dis['dVec'] = np.where(df.ID == chem_dis.DiseaseID, df.Vector, None)

In [95]:
chem_dis.shape

(6660, 2)

In [96]:
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


In [97]:
# chem_dis = chem_dis.drop(['hasDVec','hasCVec'], axis=1)

In [98]:
# merge all info into one df
df_d = df.copy()
df_d.columns= ['DiseaseID', 'DVec']
df_c = df.copy()
df_c.columns= ['ChemicalID', 'CVec']
df1 = pd.merge(chem_dis, df_d, on='DiseaseID')
df1 = pd.merge(df1, df_c, on='ChemicalID')

In [99]:
df1['Correlation'] = 1

In [100]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
0,C049584,MESH:D001943,"[-7.54089653e-03, 2.84954235e-02, -1.45941272e...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
1,C049584,MESH:D018270,"[1.97611600e-02, 9.82791930e-02, 3.69541571e-0...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
2,C049584,MESH:D019457,"[0.03360923, 0.10056757, 0.05314376, -0.113913...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
3,C049584,MESH:D003110,"[0.00136586, 0.13832065, 0.02338981, -0.113038...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1
4,C049584,MESH:D015179,"[-0.02237691, 0.07948194, -0.09784327, -0.0321...","[0.02189679, 0.10079688, 0.04159389, -0.099326...",1


In [101]:
df1.shape

(6660, 5)

In [102]:
# Randomly generate pairs of diseases and chemicals and add as rows 
no_rows = df1.shape[0]-1
print('shape: ', no_rows)
for x in range(0,no_rows):
    int1 = randint(0, df1.shape[0]-1)
    int2 = randint(0, df1.shape[0]-1)
    chem, chemvec = df1.loc[int1, 'ChemicalID'], df1.loc[int1, 'CVec']
    dis, disvec = df1.loc[int2, 'DiseaseID'], df1.loc[int2, 'DVec']
    df1 = df1.append({'ChemicalID':chem, 'DiseaseID':dis, 'CVec':chemvec, 'DVec':disvec, 'Correlation':0}, ignore_index=True)
    
# Delete any duplicates induced in generating rows (note that this will remove any known correlations 
# from the new rows)
df1 = df1.drop_duplicates(subset=['ChemicalID', 'DiseaseID'], keep=False)

shape:  6659


In [103]:
df1.shape

(10692, 5)

In [105]:
df1.sample(13)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec,Correlation
11510,D002104,MESH:D002294,"[0.01329628, 0.1473947, 0.05748451, -0.1255661...","[-0.00324665, 0.14041205, 0.03548243, -0.13014...",0
11053,C047246,MESH:D011014,"[-0.00139776, 0.12390826, -0.04465656, -0.0741...","[-0.00669882, 0.11344735, 0.06861784, -0.12697...",0
12087,C000944,MESH:D064420,"[0.01095377, 0.15810992, 0.06086655, -0.125184...","[0.01467722, 0.11732066, 0.04879517, -0.127387...",0
2333,D037742,MESH:D009336,"[3.90566746e-03, 1.15480758e-01, 3.81208509e-0...","[0.01644101, 0.0993525, 0.04270832, -0.0963581...",1
12144,C030852,MESH:D011658,"[2.57783663e-02, 1.35575280e-01, -5.06892130e-...","[0.0010688, 0.11910306, 0.03999582, -0.1281371...",0
7613,D000069470,MESH:D001523,"[0.03826718, 0.11923131, 0.05030534, -0.103037...","[0.00129271, 0.14051303, 0.04350722, -0.126025...",0
11546,D015232,MESH:D001249,"[0.01036902, 0.14740449, 0.02375182, -0.115398...","[1.96472630e-02, 1.13115221e-01, 6.78690001e-0...",0
6628,D014451,MESH:D009135,"[0.03400159, 0.11989318, 0.02914303, -0.126909...","[0.02183472, 0.10678482, 0.03412488, -0.086460...",1
9583,C017947,MESH:D009135,"[0.03400159, 0.11989318, 0.02914303, -0.126909...","[-0.06307664, 0.026673, -0.14727059, -0.054864...",0
13936,D013759,MESH:D003967,"[0.03470691, 0.1128696, 0.03834436, -0.1075954...","[2.25929990e-02, 7.96100125e-02, 4.26383689e-0...",0
