# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. Import Chem Vectors and Pre-Process them

In [3]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [4]:
# Strip and split it into list of lists [chem, vec]
text = text.replace('\n', '')
text = text.split(']')
text = [item.strip().split(' [') for item in text]

In [5]:
# Turn it into a data frame
df = pd.DataFrame(text)
df.columns = ['ChemicalID', 'Vector']
# df.head()

In [6]:
# Clean
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))

In [7]:
# Split the vector into a column per number
vec_split = df['Vector'].str.split(',', expand=True)
df = df.join(vec_split, lsuffix='_df', rsuffix='_vec_split')

In [8]:
# df.loc[:,0].head()
# BCE binary classification --> The loss function recommended by Jun
# sigmoid output


### 2. Add Diseases to DF
Binary encode presence of positive association between each disease and each chem

In [17]:
# # Import disease list (created in opa2vec notebook that created vectors)
# diseases = pd.read_csv('diseases.lst', header=None, skiprows=1) # Skipping first row as will be nan
# diseases.shape # 1264 diseases...

In [24]:
# df.head()

In [25]:
# diseases.head()

In [23]:
# Import directly evidenced chemical-disease positive relationships from CTD
chem_dis = pd.read_csv('../ctd-to-nt/chem-dis-pos-assocs.csv')
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
0,C112297,MESH:D006948
1,C112297,MESH:D012640
2,C425777,MESH:D006948
3,C013567,MESH:D006333
4,C418863,MESH:D013262


In [26]:
df.head()

Unnamed: 0,ChemicalID,Vector,0,1,2,3,4,5,6,7,...,190,191,192,193,194,195,196,197,198,199
0,D015032,"-0.01185622,-0.31878912,-0.89908963,0.07175528...",-0.01185622,-0.31878912,-0.89908963,0.07175528,0.20856386,-0.6810764,0.7155862,0.3877636,...,0.4799509,0.49532512,-0.40308106,0.440478,-0.14763047,-0.33496988,0.21969438,0.49740723,-0.08506261,0.07757662
1,C085514,"0.0223429,0.1116555,0.02859181,-0.1335976,-0.2...",0.0223429,0.1116555,0.02859181,-0.1335976,-0.23470162,0.034631,0.12097855,0.00488628,...,0.02691469,0.01887334,-0.05138162,0.11950745,-0.02516304,-0.10212526,0.08233052,0.1476164,-0.0013641,0.07410253
2,C104536,"4.91102971e-02,1.35097533e-01,-2.54380330e-03,...",0.0491102971,0.135097533,-0.0025438033,-0.139362305,-0.244591922,0.0478408448,0.143908396,-0.0237005409,...,0.0486359373,-0.0125287324,-0.031518843,0.146263734,-0.0474830456,-0.113526262,0.097283937,0.153202027,0.0104252342,0.0923839062
3,C088658,"-1.5123323e-02,-3.2596567e-01,-1.0544300e+00,2...",-0.015123323,-0.32596567,-1.05443,0.20313551,0.3505004,-0.82078874,0.86272287,0.34606051,...,0.43986505,0.46400654,-0.45179743,0.34801915,-0.21653381,-0.2952297,0.2747623,0.52579957,0.024951532,0.086974278
4,D014635,"-6.37703110e-03,-4.31791008e-01,-1.22665536e+0...",-0.0063770311,-0.431791008,-1.22665536,0.168801114,0.49764058,-0.93002069,0.801523268,0.506105006,...,0.611794293,0.517952442,-0.392865539,0.526886761,-0.212535933,-0.357861817,0.322705418,0.43379885,-0.167800769,0.0785325244


In [43]:
## Get rid of rows from chem_dis that have chems that aren't in df
print(chem_dis.shape)
chemsers = df.ChemicalID.unique()
bools = chem_dis.ChemicalID.map(lambda x: x in chemsers)
chem_dis = chem_dis[bools]
chem_dis.shape

(11818, 2)


(11818, 2)

In [45]:
print('Number chems: ', len(chem_dis.ChemicalID.unique()))
print('Number diseases: ', len(chem_dis.DiseaseID.unique()))

Number chems:  294
Number diseases:  1748


In [46]:
# Create column for each disease, nan columns
for name in chem_dis.DiseaseID.unique():
    df[name] = np.nan

In [47]:
df.head()

Unnamed: 0,ChemicalID,Vector,0,1,2,3,4,5,6,7,...,MESH:C538178,MESH:D059226,MESH:D012608,MESH:C564286,MESH:D000210,MESH:D010020,MESH:D020447,MESH:D013009,MESH:D043183,MESH:D014605
0,D015032,"-0.01185622,-0.31878912,-0.89908963,0.07175528...",-0.01185622,-0.31878912,-0.89908963,0.07175528,0.20856386,-0.6810764,0.7155862,0.3877636,...,,,,,,,,,,
1,C085514,"0.0223429,0.1116555,0.02859181,-0.1335976,-0.2...",0.0223429,0.1116555,0.02859181,-0.1335976,-0.23470162,0.034631,0.12097855,0.00488628,...,,,,,,,,,,
2,C104536,"4.91102971e-02,1.35097533e-01,-2.54380330e-03,...",0.0491102971,0.135097533,-0.0025438033,-0.139362305,-0.244591922,0.0478408448,0.143908396,-0.0237005409,...,,,,,,,,,,
3,C088658,"-1.5123323e-02,-3.2596567e-01,-1.0544300e+00,2...",-0.015123323,-0.32596567,-1.05443,0.20313551,0.3505004,-0.82078874,0.86272287,0.34606051,...,,,,,,,,,,
4,D014635,"-6.37703110e-03,-4.31791008e-01,-1.22665536e+0...",-0.0063770311,-0.431791008,-1.22665536,0.168801114,0.49764058,-0.93002069,0.801523268,0.506105006,...,,,,,,,,,,


In [76]:
# For each chem-disease relationship set cell to one, if no relationship then set to 0
def check_assoc(row):
    for index, r in chem_dis[chem_dis.ChemicalID == row.ChemicalID].head().iterrows():
#         row[r.DiseaseID] = 1
        print(r.DiseaseID)
        df.loc[index, r.DiseaseID] = 1
    
    
# convert np.nan to 0 for col in df


In [75]:
chem_dis.head()

Unnamed: 0,ChemicalID,DiseaseID
1038,C049584,MESH:D058739
1039,C049584,MESH:D000230
1040,C049584,MESH:D000236
1041,C049584,MESH:D001284
1042,C049584,MESH:D001943


In [77]:
df.apply(check_assoc, axis=1)

MESH:D000169
MESH:C538178
MESH:D054058
MESH:D000370
MESH:D000505
MESH:D064793
MESH:D058186
MESH:D000210
MESH:D000505
MESH:D000740
MESH:D001281
MESH:D015746
MESH:D000014
MESH:D000015
MESH:D000138
MESH:D000140
MESH:D058186
MESH:D000743
MESH:D000748
MESH:D000787
MESH:D000855
MESH:D001259
MESH:D019970
MESH:D003681
MESH:D003967
MESH:D005221
MESH:D015746
MESH:D000380
MESH:D000505
MESH:D000782
MESH:D000787
MESH:D015746
MESH:D000014
MESH:D000140
MESH:D058186
MESH:D000370
MESH:D056486
MESH:D009336
MESH:D001919
MESH:D007022
MESH:D012208
MESH:D019446
MESH:D008569
MESH:D009336
MESH:D009845
MESH:D015431
MESH:D000138
MESH:D007022
MESH:D000860
MESH:D009336
MESH:D011014
MESH:D015746
MESH:D020434
MESH:D000014
MESH:D000015
MESH:D000051
MESH:D001919
MESH:D003643
MESH:D064420
MESH:D005207
MESH:D015877
MESH:D000022
MESH:D058186
MESH:D055371
MESH:C538231
MESH:D056151
MESH:D050197
MESH:D006331
MESH:D007859
MESH:D011014
MESH:D011230
MESH:D009336
MESH:D049188
MESH:D058186
MESH:D000419
MESH:D000707
MESH:D000741

MESH:D019465
MESH:D012734
MESH:D020964
MESH:D001249
MESH:D059366
MESH:D001943
MESH:D016535
MESH:D017449
MESH:D000170
MESH:D001289
MESH:D001321
MESH:D001327
MESH:D002375
MESH:D058186
MESH:D000380
MESH:D000707
MESH:D000740
MESH:D000741
MESH:D009336
MESH:D000380
MESH:D000743
MESH:D000855
MESH:D001037
MESH:D001259
MESH:D000169
MESH:D000740
MESH:D000741
MESH:D001855
MESH:D001922
MESH:D006528
MESH:D002318
MESH:D002561
MESH:D056486
MESH:D003324
MESH:D000014
MESH:D018248
MESH:D001284
MESH:D056486
MESH:D003920
MESH:D000303
MESH:D000309
MESH:D000855
MESH:D001714
MESH:D002386
MESH:D064420
MESH:D017109
MESH:D001480
MESH:D002375
MESH:D020246
MESH:D015746
MESH:D058186
MESH:D000380
MESH:D000740
MESH:D000741
MESH:D002493
MESH:D009207
MESH:D000014
MESH:D000015
MESH:D001327
MESH:D001927
MESH:D001929
MESH:D058186
MESH:D000782
MESH:D001201
MESH:D001847
MESH:D002289
MESH:D058186
MESH:D000236
MESH:D001169
MESH:D001172
MESH:D001943
MESH:D019970
MESH:D007174
MESH:D012640
MESH:D019966
MESH:D015746
MESH:D000014

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
18135    None
18136    None
39986    None
39987    None
39988    None
39989    None
18731    None
18732    None
18733    None
18734    None
18735    None
51742    None
53490    None
53491    None
53492    None
53493    None
53494    None
46123    None
46124    None
46125    None
32568    None
32569    None
32570    None
32571    None
32572    None
57379    None
57380    None
33350    None
33351    None
33352    None
Length: 1651, dtype: object

In [80]:
df.head() 
df["MESH:D048629"].unique()

array([nan,  1.])