# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook, which used the GO functions of chemicals
    and diseases to create a vector for each chemical. Train a neural network (NN) to predict diseases from these
    chemical vectors.

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. Import Chem Vectors and Pre-Process them

In [3]:
# Read the whole OPA2Vec results file into a single string; it is parsed afterwards
with open('AllVectorResults.lst') as file:
    text = file.read()

In [4]:
# Parse the raw dump into a list of [chem, vec] pairs, one step at a time
flattened = text.replace('\n', '')   # undo line wrapping inside the file
records = flattened.split(']')        # every record is terminated by ']'
# ' [' separates the chemical ID from its vector. Whatever follows the final ']'
# (normally an empty string) contains no ' [' and so becomes a one-element list.
text = [record.strip().split(' [') for record in records]

In [5]:
# Build the data frame with its column names in one step.
# Rows shorter than two items are padded with None in 'Vector'.
df = pd.DataFrame(text, columns=['ChemicalID', 'Vector'])

In [6]:
# Clean: drop rows that have no vector, then turn each whitespace-separated
# vector string into a comma-separated one.
# str.split() with no argument trims both ends and collapses any run of whitespace
# (spaces or tabs, however long), so no empty fields can appear between commas.
# The previous chain of fixed-width replace() calls only collapsed short runs of
# spaces and left tabs untouched.
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning when
# writing a column on the result of dropna().
df = df.dropna()
df = df.assign(Vector=df['Vector'].map(lambda vec: ','.join(vec.split())))

In [7]:
# Split the vector into a column per number.
# The new columns are labelled 0..n-1 and still hold strings at this point;
# cast them (e.g. with astype(float)) before using them as numeric model input.
vec_split = df['Vector'].str.split(',', expand=True)
# Join onto just the two base columns so that re-running this cell rebuilds the
# same frame. Joining onto a df that already carried the split columns triggered
# the suffixes and left duplicated '0_df' / '0_vec_split' style columns behind.
df = df[['ChemicalID', 'Vector']].join(vec_split)

In [8]:
# df.loc[:,0].head()
# Modelling note: use binary cross-entropy (BCE) loss for the classification, as recommended by Jun,
# with a sigmoid output layer


### 2. Add Diseases to DF
Binary-encode the presence of a positive association between each disease and each chemical.

In [17]:
# Load the disease list that was written out by the opa2vec notebook (the one that created the vectors)
diseases = pd.read_csv(
    'diseases.lst',
    header=None,  # treat the file as having no header row
    skiprows=1,   # skip the first line, which would otherwise load as NaN
)
diseases.shape  # 1264 diseases...

In [20]:
# Preview the chemical frame: ChemicalID, the comma-separated Vector string, then one column per vector component
df.head()

Unnamed: 0,ChemicalID,Vector,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,D015032,"-0.01185622,-0.31878912,-0.89908963,0.07175528...",-0.01185622,-0.31878912,-0.89908963,0.07175528,0.20856386,-0.6810764,0.7155862,0.3877636,...,0.4799509,0.49532512,-0.40308106,0.440478,-0.14763047,-0.33496988,0.21969438,0.49740723,-0.08506261,0.07757662
1,C085514,"0.0223429,0.1116555,0.02859181,-0.1335976,-0.2...",0.0223429,0.1116555,0.02859181,-0.1335976,-0.23470162,0.034631,0.12097855,0.00488628,...,0.02691469,0.01887334,-0.05138162,0.11950745,-0.02516304,-0.10212526,0.08233052,0.1476164,-0.0013641,0.07410253
2,C104536,"4.91102971e-02,1.35097533e-01,-2.54380330e-03,...",0.0491102971,0.135097533,-0.0025438033,-0.139362305,-0.244591922,0.0478408448,0.143908396,-0.0237005409,...,0.0486359373,-0.0125287324,-0.031518843,0.146263734,-0.0474830456,-0.113526262,0.097283937,0.153202027,0.0104252342,0.0923839062
3,C088658,"-1.5123323e-02,-3.2596567e-01,-1.0544300e+00,2...",-0.015123323,-0.32596567,-1.05443,0.20313551,0.3505004,-0.82078874,0.86272287,0.34606051,...,0.43986505,0.46400654,-0.45179743,0.34801915,-0.21653381,-0.2952297,0.2747623,0.52579957,0.024951532,0.086974278
4,D014635,"-6.37703110e-03,-4.31791008e-01,-1.22665536e+0...",-0.0063770311,-0.431791008,-1.22665536,0.168801114,0.49764058,-0.93002069,0.801523268,0.506105006,...,0.611794293,0.517952442,-0.392865539,0.526886761,-0.212535933,-0.357861817,0.322705418,0.43379885,-0.167800769,0.0785325244


In [21]:
# Preview the disease identifiers (a single column, labelled 0 because the file is read with header=None)
diseases.head()

Unnamed: 0,0
0,C2750090
1,C0342708
2,C0039292
3,C1970456
4,C1845028
