# Notebook: Use NN to predict disease from chemicals using Opa2Vec vectors
<b> Author: </b> Ian Coleman <br>
<b> Purpose: </b> Take the vectors created in the opa2vec notebook. This took chemical go functions
    and disease go function, creating vectors for the chemicals. Train a NN to predict diseases from these chemical
    vectors

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. Import and split into training, validation, test

In [38]:
# Import vec file
with open('AllVectorResults.lst', 'r') as file:
    text = file.read()

In [39]:
text = text.replace('\n', '')
text = text.split(']')

In [40]:
text = [item.strip().split(' [') for item in text]
text[0:2]

[['D015032',
  '-0.01185622 -0.31878912 -0.89908963  0.07175528  0.20856386 -0.6810764  0.7155862   0.3877636  -0.45872363 -0.14578499 -0.3104885  -0.3812815  0.10305603  0.06099862  0.3993434   0.56654865 -0.569487   -0.56566286  0.08605032  0.6462419   0.01758292  0.07233136  0.34907985  0.19823638 -0.05432747 -0.03480773  0.03869971 -0.04352272  0.55715096  0.57840073  0.05536235  0.0179637  -0.22523978 -0.57956624 -0.17768218  0.16944307  0.16660923  0.06673377 -0.07830229 -0.04351297  0.17825207 -0.05874934  0.159374    0.5286587  -0.17912982  0.48476198 -0.0611092  -0.00952586  0.40587026  0.09002941  0.0289922   0.03674132 -0.5670638   0.7046182 -0.21693063  0.5323985  -0.31173742  0.05067689  0.05639684 -0.2973395  0.21779567  0.60137844 -0.06182181  0.6540956   0.90607023  0.14218462 -0.63414097 -0.30269045  0.48181444 -0.20776002 -0.60399675  0.12005788  0.47047248  0.59953636 -0.19428143 -0.19011453 -0.5184687  -0.0859951  0.6290439  -0.07862689 -0.1550973  -0.01180639 -0.28

In [118]:
# pd.read_csv('AllVectorResults.lst', delimiter=' ')
df = pd.DataFrame(text)

In [119]:
df.columns = ['ChemicalID', 'Vector']
df.head()

Unnamed: 0,ChemicalID,Vector
0,D015032,-0.01185622 -0.31878912 -0.89908963 0.0717552...
1,C085514,0.0223429 0.1116555 0.02859181 -0.1335976...
2,C104536,4.91102971e-02 1.35097533e-01 -2.54380330e-0...
3,C088658,-1.5123323e-02 -3.2596567e-01 -1.0544300e+00 ...
4,D014635,-6.37703110e-03 -4.31791008e-01 -1.22665536e+0...


In [104]:
# df.Vector.replace(' ', ', ').head()

In [123]:
df = df.dropna()
df['Vector'] = df.Vector.map(lambda x: x.rstrip().lstrip().replace('    ', ' ').replace('   ', ' ').replace('  ', ' ').replace(' ', ','))
# df['Vector'] = df.Vector.replace('   ', ' ').replace('  ', ' ').replace(' ', ',')

In [107]:
df.describe()

Unnamed: 0,ChemicalID,Vector
count,412,412
unique,412,412
top,C009006,"1.61842685e-02,1.09440275e-01,3.32038254e-02,-..."
freq,1,1


In [124]:
df['Vector'].str.split(',', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,201
0,-0.01185622,-0.31878912,-0.89908963,0.07175528,0.20856386,-0.6810764,0.7155862,0.3877636,-0.45872363,-0.14578499,...,-0.40308106,0.440478,-0.14763047,-0.33496988,0.21969438,0.49740723,-0.08506261,0.07757662,,
1,0.0223429,0.1116555,0.02859181,-0.1335976,-0.23470162,0.034631,0.12097855,0.00488628,0.03462547,-0.07215885,...,-0.05138162,0.11950745,-0.02516304,-0.10212526,0.08233052,0.1476164,-0.0013641,0.07410253,,
2,4.91102971e-02,1.35097533e-01,-2.54380330e-03,-1.39362305e-01,-2.44591922e-01,4.78408448e-02,1.43908396e-01,-2.37005409e-02,6.46571368e-02,-6.19980693e-02,...,-3.15188430e-02,1.46263734e-01,-4.74830456e-02,-1.13526262e-01,9.72839370e-02,1.53202027e-01,1.04252342e-02,9.23839062e-02,,
3,-1.5123323e-02,-3.2596567e-01,-1.0544300e+00,2.0313551e-01,3.5050040e-01,-8.2078874e-01,8.6272287e-01,3.4606051e-01,-5.7727689e-01,-2.8629407e-01,...,-4.5179743e-01,3.4801915e-01,-2.1653381e-01,-2.9522970e-01,2.7476230e-01,5.2579957e-01,2.4951532e-02,8.6974278e-02,,
4,-6.37703110e-03,-4.31791008e-01,-1.22665536e+00,1.68801114e-01,4.97640580e-01,-9.30020690e-01,8.01523268e-01,5.06105006e-01,-3.26853752e-01,-1.91961050e-01,...,-3.92865539e-01,5.26886761e-01,-2.12535933e-01,-3.57861817e-01,3.22705418e-01,4.33798850e-01,-1.67800769e-01,7.85325244e-02,,
5,-0.05451952,-0.39623424,-0.77553004,0.01988446,0.07864953,-0.5806129,0.7094759,0.35100234,-0.33569843,-0.08201764,...,-0.41541043,0.40357396,-0.14640635,-0.3340158,0.11955252,0.42888513,-0.11255865,0.07083637,,
6,0.10511223,-0.38449043,-1.1118854,0.11648989,0.21659392,-0.7763645,0.8429269,0.46082917,-0.42588428,-0.2066469,...,-0.5071475,0.4659734,-0.16432147,-0.24225369,0.2217176,0.57947755,0.02706927,0.11450217,,
7,0.04836849,0.05923012,-0.09087747,-0.1132914,-0.20736475,-0.00518869,0.22266796,0.04801345,0.00056334,-0.12582675,...,-0.04886024,0.19957873,-0.04112303,-0.15335204,0.10230027,0.19650498,0.00689218,0.10336098,,
8,-0.00628643,0.06187131,-0.1688884,-0.05538355,-0.2882366,-0.09844729,0.30789605,0.03863276,-0.05183575,-0.0903343,...,-0.10394672,0.28886896,-0.08028729,-0.16200067,0.03578192,0.29697332,-0.01626065,0.10562278,,
9,-0.03668509,-0.1457927,-0.555698,0.01622644,-0.01939054,-0.3135555,0.5917505,0.19063814,-0.30101514,-0.19161965,...,-0.15314558,0.3084693,-0.13758318,-0.25040042,0.09308326,0.37959248,-0.02348905,0.00665588,,


In [116]:
# pd.concat([pd.Series(row['ChemicalID'], row['Vector'].split(' '))              
#                     for _, row in df.iterrows()]).reset_index()

In [122]:
df.head()

Unnamed: 0,ChemicalID,Vector
0,D015032,"-0.01185622,-0.31878912,-0.89908963,0.07175528..."
1,C085514,"0.0223429,0.1116555,0.02859181,-0.1335976,-0.2..."
2,C104536,"4.91102971e-02,1.35097533e-01,-2.54380330e-03,..."
3,C088658,"-1.5123323e-02,-3.2596567e-01,-1.0544300e+00,2..."
4,D014635,"-6.37703110e-03,-4.31791008e-01,-1.22665536e+0..."


In [15]:
df = pd.DataFrame(text)

In [None]:
pd.concat([Series(row['var2'], row['var1'].split(','))

In [16]:
b = pd.DataFrame(df['0'].str.split(' ').tolist()).stack()

KeyError: '0'