# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. That notebook took chemical GO functions
    and disease GO functions and created vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors.

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

  from ._conv import register_converters as _register_converters


### 1. Import Vectors and Pre-Process them

In [2]:
# TODO: adapt this to account for the fact that AllVectorResults.lst now contains not only chemical
# vectors but also disease ones. The IDs look very similar, but disease IDs are always length 8
# and chemical IDs never are (verified: disease IDs are 8 characters, chemical IDs are 7 or 10).

In [3]:
# Read the whole ID/vector dump into a single string
with open('data/AllVectorResults.lst') as vec_file:
    text = vec_file.read()

In [4]:
# Flatten the dump onto one line, cut it after every closing bracket, then
# separate each piece at the opening bracket into [ID, vector-text]
flat_text = text.replace('\n', '')
text = [chunk.strip().split(' [') for chunk in flat_text.split(']')]

In [5]:
# Build a two-column frame with one row per [ID, vector-text] pair
df = pd.DataFrame(text, columns=['ID', 'Vector'])
# df.head()

In [6]:
# Clean
# Drop rows with a missing value (e.g. pieces that had no ' [' separator and
# therefore ended up without a vector).
df = df.dropna()
# Turn the whitespace-separated vector text into comma-separated text.
# str.split() with no arguments splits on any run of whitespace and ignores
# leading/trailing whitespace, so no empty components can be produced; the
# previous chain of fixed-width replace() calls left double spaces (and thus
# empty components) for some longer runs and did not handle tabs.
df['Vector'] = df.Vector.map(lambda x: ','.join(x.split()))

In [10]:
# Break each comma-joined vector string into a list of its components
df['Vector'] = df['Vector'].str.split(',')

# df = df['Vector'].str.split(',', expand=True)
# df = df.join(vec_split, lsuffix='_df', rsuffix='_vec_split')
# df['chemVec'] = np.nan
# for index in range(df.shape[0]):
#     df['chemVec'][index] = df.iloc[index, 2:].tolist()

Unnamed: 0,ID,Vector
407,C009277,"[0.02475145, 0.08796439, 0.04819478, -0.088675..."
408,D010476,"[0.01687066, 0.10008188, 0.04525531, -0.097604..."
409,D007545,"[0.0204481, 0.10942723, 0.04233734, -0.1014265..."
410,C503700,"[0.02763922, 0.11034967, 0.03813923, -0.120829..."
411,D007840,"[0.01837787, 0.08553177, 0.0417898, -0.0809339..."


In [190]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [16]:
# Show the last five rows of the frame built so far
df.tail(5)

Unnamed: 0,ID,Vector,is_chem
407,C009277,"[0.02475145, 0.08796439, 0.04819478, -0.088675...",True
408,D010476,"[0.01687066, 0.10008188, 0.04525531, -0.097604...",True
409,D007545,"[0.0204481, 0.10942723, 0.04233734, -0.1014265...",True
410,C503700,"[0.02763922, 0.11034967, 0.03813923, -0.120829...",True
411,D007840,"[0.01837787, 0.08553177, 0.0417898, -0.0809339...",True


### 2. Create DF for NN
From the ID-Vector DF we will now create a DF matching each chem with each disease, with the following columns:
ChemID DisID ChemVec DisVec PositiveAssociationExists(binary)

In [13]:
# Step 1: Load the proven chem-dis positive associations (file created in the ctd-to-nt notebook)
assoc_csv = '../ctd-to-nt/chem-dis-pos-assocs.csv'
chem_dis = pd.read_csv(assoc_csv)
chem_dis.head(5)

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [None]:
# Step 2: Iterate through each chem and create a line for it with each dis

In [14]:
# Add an is_chem flag to df to tell chemicals from diseases:
# disease IDs are always 8 characters long, so any other length is a chemical
df['is_chem'] = df['ID'].str.len() != 8

In [None]:
# Step 3: For each line check the chem-dis reference df to see if positive rel exists, if so encode 1 else 0

In [None]:
# # Import disease list (created in opa2vec notebook that created vectors)
# diseases = pd.read_csv('diseases.lst', header=None, skiprows=1) # Skipping first row as will be nan
# diseases.shape # 1264 diseases...

In [None]:
# df.head()

In [None]:
# diseases.head()

In [192]:
# Import directly evidenced chemical-disease positive relationships from CTD
# NOTE(review): this reads the same CSV that Step 1 already loaded into
# chem_dis, so it resets chem_dis to the unfiltered file contents.
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
# Preview the first rows (columns shown in the output: ChemicalID, DiseaseID)
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [None]:
# Preview the first rows of df before filtering chem_dis against it
df.head()

In [None]:
## Get rid of rows from chem_dis that have chems that aren't in df
print(chem_dis.shape)  # shape before filtering, for comparison with the result below
# df keeps every identifier in its 'ID' column (it has no 'ChemicalID'
# column); the rows flagged by 'is_chem' are the chemicals.
chemsers = df.loc[df.is_chem, 'ID'].unique()
# isin() does the membership test in one vectorised pass instead of scanning
# the ID array once per chem_dis row.
bools = chem_dis.ChemicalID.isin(chemsers)
chem_dis = chem_dis[bools]
chem_dis.shape

In [None]:
# Report how many distinct chemicals and diseases remain in chem_dis
n_chems = chem_dis['ChemicalID'].nunique(dropna=False)
print('Number chems: ', n_chems)
n_diseases = chem_dis['DiseaseID'].nunique(dropna=False)
print('Number diseases: ', n_diseases)

In [None]:
# Add one column per distinct disease in chem_dis, initialised to NaN
disease_ids = chem_dis['DiseaseID'].unique()
for disease_id in disease_ids:
    df[disease_id] = np.nan

In [None]:
# Preview df after the per-disease columns have been added
df.head()

In [None]:
# For each chem-disease relationship set cell to one, if no relationship then set to 0
def check_assoc(row):
    """Mark in ``df`` every disease positively associated with this row's chemical.

    Meant to be used as ``df.apply(check_assoc, axis=1)``. Works purely by
    side effect on the module-level ``df``; cells without an association are
    left untouched (they are not set to 0 here).

    Args:
        row: One row of ``df`` as a Series. Its ``'ID'`` value is matched
            against ``chem_dis.ChemicalID`` and its ``name`` (the row's index
            label in ``df``) selects the row that gets written.

    Returns:
        None.
    """
    # df stores identifiers in 'ID'; it has no 'ChemicalID' column.
    is_this_chem = chem_dis.ChemicalID == row['ID']
    # Use every association for the chemical (no .head() cap of five) and
    # de-duplicate so each cell is written once.
    for disease_id in chem_dis.loc[is_this_chem, 'DiseaseID'].unique():
        # Write to this row of df. The chem_dis row label must not be used
        # here: it addresses an unrelated df row or appends a new one.
        df.loc[row.name, disease_id] = 1
    
    
# convert np.nan to 0 for col in df


In [None]:
# Preview the association table that check_assoc looks chemicals up in
chem_dis.head()

In [None]:
# Call check_assoc once per row of df (axis=1 passes each row as a Series).
# check_assoc has no return statement, so the useful effect is what it writes
# into df, not the value returned by apply.
df.apply(check_assoc, axis=1)

In [None]:
# Spot-check the result: preview df, then list the distinct values found in
# one disease column
df.head() 
df["MESH:D048629"].unique()

In [None]:
# Final (rows, columns) size of df
df.shape