# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook, which used chemical GO functions
    and disease GO functions to create vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors.

In [106]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. Import Vectors and Pre-Process them

In [107]:
# TODO: adapt this notebook to account for the fact that AllVectorResults.lst now contains disease vectors
# as well as chemical vectors. The IDs are very similar, but it looks like disease IDs are always length 8
# and chemical IDs never are. Verified: disease IDs are length 8, chemical IDs are length 7 or 10.

In [108]:
# Read the whole vector results file into a single string for parsing below
with open('AllVectorResults.lst', 'r') as vec_file:
    text = vec_file.read()

In [109]:
# Flatten the file to one line, cut it at every closing bracket, then separate
# each piece into [ID, vector text] at the opening bracket
flattened = text.replace('\n', '')
text = [record.strip().split(' [') for record in flattened.split(']')]

In [110]:
# Build a two-column frame: the identifier and its (still unparsed) vector text
df = pd.DataFrame(text, columns=['ID', 'Vector'])
# df.head()

In [111]:
# Clean: drop rows with a missing ID or vector, then rewrite each vector's
# whitespace-separated numbers as a comma-separated string.
# str.split() with no argument splits on any run of whitespace (and ignores
# leading/trailing whitespace), so arbitrarily long padding or tabs collapse to
# a single comma -- a fixed chain of replace() calls only handles short runs of spaces.
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: ','.join(x.split()))

In [112]:
# Break each comma-joined vector string into a list of its components.
# NOTE: the components are still strings after this cell; nothing here converts them to floats.
df['Vector'] = df['Vector'].apply(lambda joined: joined.split(','))

# df = df['Vector'].str.split(',', expand=True)
# df = df.join(vec_split, lsuffix='_df', rsuffix='_vec_split')
# df['chemVec'] = np.nan
# for index in range(df.shape[0]):
#     df['chemVec'][index] = df.iloc[index, 2:].tolist()

In [113]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [114]:
# Now we have one row per ID, with its vector stored as a list
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


### 2. Create DF for NN
From the ID-Vector DF we will now create a DF that pairs each chem with each disease, with the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

I'm running into a problem here...
Disgenet uses UMLS ID for diseases
CTD uses MESH for diseases

I need to either: 
convert between MESH and UMLS --> Waiting for my UMLS membership, can't see how to do it without
OR recreate vectors using only CTD diseases
OR create a new chem_dis list from Disgenet --> Non-existent

In [115]:
# Step 1: Import file of proven chem-dis positive associations (created in ctd-to-nt notebook)
# The path is relative, so this cell depends on the notebook's working directory.
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
# Preview the first rows (the displayed output shows ChemicalID and DiseaseID columns)
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [116]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [117]:
# First create is_chem col in df to differentiate between chem and disease
# df['is_chem'] = df.ID.map(lambda x: ':' not in x) # as len of disease ID is always 8

In [118]:
# We only want the chems and diseases that we have vectors for
df.shape

(2970, 2)

In [119]:
# Reshape chem_dis to only keep lines where both chem and dis have a vec (TRAINING SET).
# Series.isin performs a hashed, vectorised membership test, avoiding a full scan of
# the ID list for every row of chem_dis.
chem_dis['DiseaseID'] = chem_dis['DiseaseID'].astype(str)
df['ID'] = df['ID'].astype(str)
id_list = df.ID.tolist()
# Flag columns are kept because a later cell drops them by name
chem_dis['hasDVec'] = chem_dis.DiseaseID.isin(id_list)
chem_dis['hasCVec'] = chem_dis.ChemicalID.isin(id_list)
chem_dis = chem_dis.loc[chem_dis['hasDVec'] & chem_dis['hasCVec']]

In [120]:
# So iterate through vecs and create a line for it if there is a rel with a dis that has a vec
# chem_dis['dVec'] = np.nan
# chem_dis['cVec'] = np.nan
# chem_dis['dVec'] = np.where(df.ID == chem_dis.DiseaseID, df.Vector, None)

In [121]:
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID,hasDVec,hasCVec
1042,C049584,MESH:D001943,True,True
1044,C049584,MESH:D018270,True,True
1047,C049584,MESH:D019457,True,True
1048,C049584,MESH:D003110,True,True
1049,C049584,MESH:D015179,True,True


In [122]:
df.head()

Unnamed: 0,ID,Vector
0,MESH:D012559,"[0.01491615, -0.00155747, -0.30986652, 0.04035..."
1,MESH:D009404,"[3.82804424e-02, 1.29408345e-01, 3.75053808e-0..."
2,MESH:D001749,"[-0.01025235, 0.00664143, -0.30367315, 0.15593..."
3,MESH:D011471,"[-0.0130785, -0.02445601, -0.46697775, 0.13181..."
4,MESH:D008106,"[-0.06240484, 0.00166245, -0.5013923, 0.116841..."


In [123]:
# The membership flags have served their purpose; remove them again
chem_dis = chem_dis.drop(columns=['hasDVec', 'hasCVec'])

In [124]:
# Two renamed copies of the ID/Vector table: one keyed as diseases, one as chemicals
df_d = df.rename(columns={'ID': 'DiseaseID', 'Vector': 'DVec'})
df_c = df.rename(columns={'ID': 'ChemicalID', 'Vector': 'CVec'})
# Attach the disease vector, then the chemical vector, to every association row
df1 = chem_dis.merge(df_d, on='DiseaseID').merge(df_c, on='ChemicalID')

In [125]:
df.loc[df.ID == 'D001564']

Unnamed: 0,ID,Vector
2651,D001564,"[-8.07521492e-03, 1.39567137e-01, -2.41595390e..."


In [126]:
df1.sample(13)

Unnamed: 0,ChemicalID,DiseaseID,DVec,CVec
5730,D015474,MESH:D015228,"[0.02243945, 0.13646156, 0.04679821, -0.125776...","[0.03241549, 0.12670228, 0.02712511, -0.124088..."
4651,D014520,MESH:D002972,"[0.01854516, 0.12501726, 0.05249919, -0.130227...","[-1.22959409e-02, -2.69480702e-02, -4.49473411..."
1718,D001335,MESH:D047928,"[0.01676517, 0.13132495, 0.04465051, -0.102658...","[1.78187005e-02, -4.97377068e-02, -4.06166583e..."
31,D000393,MESH:D000740,"[0.02901717, 0.0821704, 0.0358058, -0.09371509...","[0.01339481, 0.1094034, 0.04425092, -0.1222666..."
4786,D008694,MESH:D017202,"[-5.09201623e-02, -4.54995222e-03, -2.45105401...","[0.02017646, 0.09857961, 0.04948057, -0.097299..."
4176,D002220,MESH:D013375,"[0.04933756, 0.10612826, -0.10598502, -0.04282...","[2.61190552e-02, 1.01711757e-01, 4.12642285e-0..."
3951,D014212,MESH:D012008,"[0.00973559, 0.11164339, -0.05571327, -0.06229...","[4.39150184e-02, 6.36065304e-02, -2.76675701e-..."
1614,D014028,MESH:D050171,"[0.02702204, 0.10942674, 0.04576094, -0.099105...","[0.05815332, 0.083392, -0.2269482, -0.03284321..."
2320,D016685,MESH:D011697,"[0.03362812, 0.1131022, 0.02494448, -0.0973135...","[1.03593338e-02, 9.08397809e-02, 4.65230122e-0..."
5136,C024746,MESH:D014029,"[0.04185405, 0.13143839, 0.01233958, -0.099147...","[-5.57276001e-03, 1.14237055e-01, 5.88921085e-..."


In [None]:
# Step 3: For each line check the chem-dis reference df to see if positive rel exists, if so encode 1 else 0

In [None]:
# # Import disease list (created in opa2vec notebook that created vectors)
# diseases = pd.read_csv('diseases.lst', header=None, skiprows=1) # Skipping first row as will be nan
# diseases.shape # 1264 diseases...

In [None]:
# df.head()

In [None]:
# diseases.head()

In [192]:
# Import directly evidenced chemical-disease positive relationships from CTD
# NOTE: this re-reads the same CSV that Step 1 loads, so chem_dis is rebound to the
# full table and the vector-based filtering applied to it earlier is discarded.
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [None]:
df.head()

In [None]:
## Drop the chem_dis rows whose chemical does not occur in df
# NOTE(review): this reads df.ChemicalID, but df is given the columns ID/Vector
# earlier in the notebook -- confirm which frame this cell is meant to run against.
print(chem_dis.shape)  # shape before filtering
chemsers = df.ChemicalID.unique()
bools = chem_dis['ChemicalID'].map(lambda chem_id: chem_id in chemsers)
chem_dis = chem_dis.loc[bools]
chem_dis.shape  # shape after filtering

In [None]:
# Report how many distinct chemicals and diseases are present in chem_dis
n_chems = len(chem_dis['ChemicalID'].unique())
print('Number chems: ', n_chems)
n_diseases = len(chem_dis['DiseaseID'].unique())
print('Number diseases: ', n_diseases)

In [None]:
# Add one all-NaN column to df per distinct disease ID found in chem_dis
for disease_id in chem_dis['DiseaseID'].unique():
    df[disease_id] = np.nan

In [None]:
df.head()

In [None]:
# For each chem-disease relationship set cell to one
def check_assoc(row):
    """Mark every disease associated with this row's chemical with a 1 in ``df``.

    Intended to be used as ``df.apply(check_assoc, axis=1)``. Reads the
    module-level ``chem_dis`` table and writes into the module-level ``df``.

    Args:
        row: One row of ``df`` (a Series). Its ``ChemicalID`` value selects the
            matching associations and its ``name`` (the row's index label in
            ``df``) selects the row that is written to.

    Returns:
        None. ``df`` is modified in place.

    NOTE(review): requires ``row`` to carry a ``ChemicalID`` field; the ``df``
    built earlier in this notebook has the columns ID/Vector -- confirm.
    """
    linked_diseases = chem_dis.loc[chem_dis.ChemicalID == row.ChemicalID, 'DiseaseID']
    for disease_id in linked_diseases.unique():
        # Write to this row of df (row.name). The index of the chem_dis match is
        # unrelated to df's index and would flag the wrong row or append a new one.
        df.loc[row.name, disease_id] = 1
    
    
# convert np.nan to 0 for col in df


In [None]:
chem_dis.head()

In [None]:
df.apply(check_assoc, axis=1)

In [None]:
df.head() 
df["MESH:D048629"].unique()

In [None]:
df.shape