# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook, which used the GO functions of the chemicals
    and of the diseases to create vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors.

In [195]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. Import Chem Vectors and Pre-Process them

In [196]:
# TODO: adapt this section to account for the fact that AllVectorResults.lst will now contain not only
# chemical vectors but also disease ones. The IDs are very similar, but it looks like disease IDs may
# always be of length 8 while chemical IDs never are... verify

In [197]:
# Read the whole opa2vec result file into memory as a single string
with open('data/AllVectorResults.lst', 'r') as vec_file:
    text = vec_file.read()

In [186]:
# Parse the raw text into a list of [id, vector-text] lists:
#   1. remove every newline,
#   2. cut the text at each closing bracket,
#   3. trim each piece and split it at ' [' to separate the ID from the vector text.
# A piece that contains no ' [' (e.g. whatever follows the last ']') becomes a one-element list.
flattened = text.replace('\n', '')
text = [chunk.strip().split(' [') for chunk in flattened.split(']')]

In [187]:
# Load the parsed [id, vector-text] lists into a two-column data frame
column_names = ['ChemicalID', 'ChemicalVector']
df = pd.DataFrame(text)
df.columns = column_names
# df.head()

In [188]:
# Clean
# 1. Drop every row that contains a missing value (e.g. a piece that had no vector part).
# 2. Rewrite each vector string from whitespace-separated to comma-separated.
#    str.split() with no argument splits on runs of whitespace of ANY length and ignores
#    leading/trailing whitespace. The previous chain of fixed-width .replace() calls left
#    a double separator (and therefore an empty element) for long runs such as 11 spaces,
#    and did not handle tabs.
df = df.dropna()
df['ChemicalVector'] = df.ChemicalVector.map(lambda x: ','.join(x.split()))

In [189]:
# Replace each comma-separated vector string with a list of its components
# (the components are still strings after this step)
def _to_components(vector_text):
    return vector_text.split(',')


df['ChemicalVector'] = df['ChemicalVector'].map(_to_components)
df.head()

# df = df['Vector'].str.split(',', expand=True)
# df = df.join(vec_split, lsuffix='_df', rsuffix='_vec_split')
# df['chemVec'] = np.nan
# for index in range(df.shape[0]):
#     df['chemVec'][index] = df.iloc[index, 2:].tolist()

Unnamed: 0,ChemicalID,ChemicalVector
0,D015032,"[-0.01185622, -0.31878912, -0.89908963, 0.0717..."
1,C085514,"[0.0223429, 0.1116555, 0.02859181, -0.1335976,..."
2,C104536,"[4.91102971e-02, 1.35097533e-01, -2.54380330e-..."
3,C088658,"[-1.5123323e-02, -3.2596567e-01, -1.0544300e+0..."
4,D014635,"[-6.37703110e-03, -4.31791008e-01, -1.22665536..."


In [190]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output

In [191]:
# Now we have one row per ID, with the columns ChemicalID and ChemicalVector
# (the vector stored as a list); preview the first rows
df.head()

Unnamed: 0,ChemicalID,ChemicalVector
0,D015032,"[-0.01185622, -0.31878912, -0.89908963, 0.0717..."
1,C085514,"[0.0223429, 0.1116555, 0.02859181, -0.1335976,..."
2,C104536,"[4.91102971e-02, 1.35097533e-01, -2.54380330e-..."
3,C088658,"[-1.5123323e-02, -3.2596567e-01, -1.0544300e+0..."
4,D014635,"[-6.37703110e-03, -4.31791008e-01, -1.22665536..."


0    [-0.01185622, -0.31878912, -0.89908963, 0.0717...
1    [0.0223429, 0.1116555, 0.02859181, -0.1335976,...
2    [4.91102971e-02, 1.35097533e-01, -2.54380330e-...
3    [-1.5123323e-02, -3.2596567e-01, -1.0544300e+0...
4    [-6.37703110e-03, -4.31791008e-01, -1.22665536...
Name: chemVec, dtype: object

### 2. Add Diseases to DF
Binary-encode the presence of a positive association between each disease and each chemical.

In [None]:
# # Import disease list (created in opa2vec notebook that created vectors)
# diseases = pd.read_csv('diseases.lst', header=None, skiprows=1) # Skipping first row as will be nan
# diseases.shape # 1264 diseases...

In [None]:
# df.head()

In [None]:
# diseases.head()

In [192]:
# Import directly evidenced chemical-disease positive relationships from CTD.
# The path is relative to the directory the notebook is run from.
# Later cells use the ChemicalID and DiseaseID columns of this table.
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
# Preview the first rows
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [None]:
# Preview the chemical-vector frame (ChemicalID, ChemicalVector)
df.head()

In [None]:
## Get rid of rows from chem_dis that have chems that aren't in df
# (only associations whose chemical appears in df.ChemicalID are kept)
print(chem_dis.shape)  # shape before filtering
chemsers = df.ChemicalID.unique()
# Vectorised membership test: one hashed lookup pass instead of a linear
# `x in ndarray` scan for every row of chem_dis
bools = chem_dis.ChemicalID.isin(chemsers)
chem_dis = chem_dis[bools]
chem_dis.shape  # shape after filtering

In [None]:
# Report how many distinct chemicals and diseases chem_dis currently holds
chem_count = chem_dis['ChemicalID'].unique().size
print('Number chems: ', chem_count)
disease_count = chem_dis['DiseaseID'].unique().size
print('Number diseases: ', disease_count)

In [None]:
# Add one column per distinct disease in chem_dis, initially filled with NaN
disease_ids = chem_dis['DiseaseID'].unique()
for disease_id in disease_ids:
    df[disease_id] = np.nan

In [None]:
# Preview df again (this cell follows the one that adds the per-disease columns)
df.head()

In [None]:
# For each chem-disease relationship set the matching cell of df to one
def check_assoc(row):
    """Mark every disease positively associated with this row's chemical.

    Meant to be called once per row of the global ``df``, e.g. via
    ``df.apply(check_assoc, axis=1)``. For every row of the global
    ``chem_dis`` whose ChemicalID equals ``row.ChemicalID``, the cell of
    ``df`` at (this row, that DiseaseID column) is set to 1. Cells without
    an association are left untouched; turning them into 0 is a separate step.

    Args:
        row: one row of ``df`` as a Series. ``row.name`` is the row's index
            label in ``df`` and ``row.ChemicalID`` is its chemical ID.

    Returns:
        None. ``df`` is modified in place.
    """
    # All diseases linked to this chemical. The earlier .head() call silently
    # limited this to the first five associations per chemical.
    disease_ids = chem_dis.loc[chem_dis.ChemicalID == row.ChemicalID, 'DiseaseID'].unique()
    for disease_id in disease_ids:
        # Address the cell through the row's own label in df. The index labels of
        # chem_dis are unrelated to df's: using them marked the wrong row, or
        # appended a brand-new row when the label did not exist in df.
        # NOTE: this only overwrites existing cells; pandas does not support
        # adding or removing rows/columns of a frame while apply() iterates over it.
        df.loc[row.name, disease_id] = 1
    
    
# TODO: convert the remaining np.nan values to 0 for each disease column in df


In [None]:
# Preview the chemical-disease association table
chem_dis.head()

In [None]:
# Call check_assoc once per row of df (axis=1 passes each row as a Series).
# It is run for its side effect on df; the result of apply() is not stored.
df.apply(check_assoc, axis=1)

In [None]:
# NOTE(review): the result of this head() call is discarded and it is not the cell's
# last expression, so it presumably displays nothing — confirm whether it is still needed
df.head() 
# Spot-check a single disease column: list the distinct values it contains
df["MESH:D048629"].unique()

In [None]:
# (number of rows, number of columns) of df
df.shape