## Load Molecule Embeddings

This notebook loads the embeddings generated by MolecularTransformerEmbeddings as found in [link](https://github.com/mpcrlab/MolecularTransformerEmbeddings)

- Loads full BioKG unique drug id and smiles list.
- Loads generated embeddings as .npz
- Matches on SMILES
- Stores embeddings in the form of {id, embedding}

In [138]:
import torch
import dill
import pandas as pd
import numpy as np

from pathlib import Path


In [12]:
path = Path('../../../MolecularTransformerEmbeddings')

In [20]:
with open(path.joinpath('data/drug_ids_full.txt')) as f, open(path.joinpath('data/drug_smiles_full.txt')) as e:
    drug_ids_smiles = [(linef.strip(), linee.strip()) for linef,linee in zip(f.readlines(), e.readlines())]

In [68]:

drug_df = pd.DataFrame(drug_ids_smiles, columns=['identifiers', 'smiles'])


In [146]:
drug_df

Unnamed: 0,identifiers,smiles
0,DB07460,CNC(=O)C1=C(NC2=NC(NC3=CC=C(C=C3OC)N3CCOCC3)=NC=C2Cl)C=CC=C1
1,DB08869,CC\C=C\CC(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(O)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](C(C)CC)C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H](C(C)O)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(O)=O)C(=O)N[C@@H](C(C)CC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCNC(N)=N)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CC(C)C)C(N)=O
2,DB11154,[Zn++].[Zn++].[Zn++].OC(CC([O-])=O)(CC([O-])=O)C([O-])=O.OC(CC([O-])=O)(CC([O-])=O)C([O-])=O
3,DB03231,[H]N([H])C1=NC(=O)C2=NN(N([H])C2=N1)C1=CC=CC(=C1)C(=O)N([H])CC1=CC=CC=C1SC1=CC=CC=C1CO
4,DB08873,[H][C@]12CN([C@H](C(=O)NC(CC3CCC3)C(=O)C(N)=O)[C@@]1([H])C2(C)C)C(=O)[C@@H](NC(=O)NC(C)(C)C)C(C)(C)C
...,...,...
10810,DB09295,FC(F)(F)C1=CC(NC2=C(C=CC=N2)C(=O)OC2OC(=O)C3=CC=CC=C23)=CC=C1
10811,DB11880,CN(C)CCNC(=O)C1=C2N=C3C=CC=CC3=CC2=CC=C1
10812,DB03505,NC1=CC2=C(O)N=C(N)N=C2C=C1
10813,DB14736,CCCC1=CC(O)=C2C(OC(C)(C)C3=C2C=C(C)C=C3)=C1


In [147]:


molecular_embeddings = np.load(path.joinpath('embeddings/drug_smiles_full.npz'))

# .npz files appear to be a collection of np files characterized by a specific key. Much like a dictionary,
# https://stackoverflow.com/questions/18231135/load-compressed-data-npz-from-file-using-numpy-load

In [78]:
molecular_embeddings['CCCC1=CC(O)=C2C(OC(C)(C)C3=C2C=C(C)C=C3)=C1']

array([[-0.9351504 ,  0.5126137 ,  1.279951  , ...,  0.18365322,
        -0.22268394,  0.7488331 ],
       [-1.5087725 , -0.11476845, -0.13022745, ...,  1.3475311 ,
        -0.9201733 ,  1.8141911 ],
       [-1.5598581 , -0.20119447, -0.13484088, ...,  1.3420202 ,
        -0.92527956,  1.7934967 ],
       ...,
       [-1.6620691 ,  0.04188597,  1.3711847 , ...,  0.35630718,
         0.0508751 ,  2.7646027 ],
       [ 0.06319952, -0.20660579, -0.16125609, ...,  0.02191191,
        -0.98814416,  1.8253094 ],
       [-1.699482  ,  0.7204988 ,  1.1108482 , ...,  0.08221638,
        -0.9439809 , -0.78185666]], dtype=float32)

In [44]:

molecular_embeddings.files[-2]

'CCCC1=CC(O)=C2C(OC(C)(C)C3=C2C=C(C)C=C3)=C1'

In [47]:
embedding_set = set(molecular_embeddings.files)

In [55]:
from collections import Counter

In [56]:
c = Counter(drug_df[1])

In [59]:
drug_df[drug_df[1] == 'CCNC1=NC(Cl)=NC(NC(C)(CC)C#N)=N1']

Unnamed: 0,0,1
5379,DB07551,CCNC1=NC(Cl)=NC(NC(C)(CC)C#N)=N1
9982,DB07552,CCNC1=NC(Cl)=NC(NC(C)(CC)C#N)=N1


In [57]:
c.most_common(n=10)

[('CCNC1=NC(Cl)=NC(NC(C)(CC)C#N)=N1', 2),
 ('CNC(=O)C1=C(NC2=NC(NC3=CC=C(C=C3OC)N3CCOCC3)=NC=C2Cl)C=CC=C1', 1),
 ('CC\\C=C\\CC(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(O)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](C(C)CC)C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)N[C@@H](C(C)O)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(O)=O)C(=O)N[C@@H](C(C)CC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCNC(N)=N)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](C)C(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](CC(C)C)C(N)=O',
  1),
 ('[Zn++].[Zn++].[Zn++

In [52]:
id_set = set(drug_df[1])

In [54]:
len(id_set), len(embedding_set)

(10814, 10814)

In [53]:
id_set.difference(embedding_set)

set()

In [66]:
test_df = pd.DataFrame([file for file in set(molecular_embeddings.files)], columns=['smiles'])

In [121]:
test_df['mol_len'] = test_df['smiles'].apply(len)

In [123]:
test_df.sort_values('mol_len', ascending=False)

Unnamed: 0,smiles,mol_len
7257,[H][C@]1(O)C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)O[C@@]1([H])C[C@@]([H])(O[C@]1([H])COP(O)(=O)OC1([H])CC([H])(OC1([H])CO)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O)N1C=C(C)C(O)=NC1=O)N1C=NC2=C1NC(=N)N=C2O)N1C=NC2=C1NC(=N)N=C2O,1695
4271,OC[C@H]1O[C@@](CO)(OC[C@@]2(OC[C@@]3(OC[C@@]4(OC[C@@]5(OC[C@@]6(OC[C@@]7(OC[C@@]8(OC[C@@]9(OC[C@@]%10(OC[C@@]%11(OC[C@@]%12(OC[C@@]%13(OC[C@@]%14(OC[C@@]%15(OC[C@@]%16(OC[C@@]%17(OC[C@@]%18(OC[C@@]%19(OC[C@@]%20(OC[C@@]%21(OC[C@@]%22(OC[C@@]%23(OC[C@@]%24(OC[C@@]%25(OC[C@@]%26(OC[C@@]%27(OC[C@@]%28(OC[C@@]%29(OC[C@@]%30(OC[C@@]%31(OC[C@@]%32(OC[C@@]%33(OC[C@@]%34(OC[C@@]%35(OC[C@@]%36(OC[C@@]%37(O[C@H]%38O[C@H](CO)[C@@H](O)[C@H](O)[C@H]%38O)O[C@H](CO)[C@@H](O)[C@@H]%37O)O[C@H](CO)[C@@H](O)[C@@H]%36O)O[C@H](CO)[C@@H](O)[C@@H]%35O)O[C@H](CO)[C@@H](O)[C@@H]%34O)O[C@H](CO)[C@@H](O)[C@@H]%33O)O[C@H](CO)[C@@H](O)[C@@H]%32O)O[C@H](CO)[C@@H](O)[C@@H]%31O)O[C@H](CO)[C@@H](O)[C@@H]%30O)O[C@H](CO)[C@@H](O)[C@@H]%29O)O[C@H](CO)[C@@H](O)[C@@H]%28O)O[C@H](CO)[C@@H](O)[C@@H]%27O)O[C@H](CO)[C@@H](O)[C@@H]%26O)O[C@H](CO)[C@@H](O)[C@@H]%25O)O[C@H](CO)[C@@H](O)[C@@H]%24O)O[C@H](CO)[C@@H](O)[C@@H]%23O)O[C@H](CO)[C@@H](O)[C@@H]%22O)O[C@H](CO)[C@@H](O)[C@@H]%21O)O[C@H](CO)[C@@H](O)[C@@H]%20O)O[C@H](CO)[C@@H](O)[C@@H]%19O)O[C@H](CO)[C@@H](O)[C@@H]%18O)O[C@H](CO)[C@@H](O)[C@@H]%17O)O[C@H](CO)[C@@H](O)[C@@H]%16O)O[C@H](CO)[C@@H](O)[C@@H]%15O)O[C@H](CO)[C@@H](O)[C@@H]%14O)O[C@H](CO)[C@@H](O)[C@@H]%13O)O[C@H](CO)[C@@H](O)[C@@H]%12O)O[C@H](CO)[C@@H](O)[C@@H]%11O)O[C@H](CO)[C@@H](O)[C@@H]%10O)O[C@H](CO)[C@@H](O)[C@@H]9O)O[C@H](CO)[C@@H](O)[C@@H]8O)O[C@H](CO)[C@@H](O)[C@@H]7O)O[C@H](CO)[C@@H](O)[C@@H]6O)O[C@H](CO)[C@@H](O)[C@@H]5O)O[C@H](CO)[C@@H](O)[C@@H]4O)O[C@H](CO)[C@@H](O)[C@@H]3O)O[C@H](CO)[C@@H](O)[C@@H]2O)[C@@H](O)[C@@H]1O,1526
4824,COCCO[C@@H]1[C@H](O)[C@@H](COP(S)(=O)O[C@@H]2[C@@H](COP(S)(=O)O[C@@H]3[C@@H](COP(S)(=O)O[C@@H]4[C@@H](COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@H]5C[C@@H](O[C@@H]5COP(S)(=O)O[C@@H]5[C@@H](COP(S)(=O)O[C@@H]6[C@@H](COP(S)(=O)O[C@@H]7[C@@H](COP(S)(=O)O[C@@H]8[C@@H](CO)O[C@H]([C@@H]8OCCOC)N8C=C(C)C(N)=NC8=O)O[C@H]([C@@H]7OCCOC)N7C=NC8=C7N=CN=C8N)O[C@H]([C@@H]6OCCOC)N6C=NC7=C6N=C(N)NC7=O)O[C@H]([C@@H]5OCCOC)N5C=C(C)C(N)=NC5=O)N5C=NC6=C5N=CN=C6N)N5C=NC6=C5N=C(N)NC6=O)N5C=CC(N)=NC5=O)N5C=NC6=C5N=CN=C6N)N5C=NC6=C5N=C(N)NC6=O)N5C=NC6=C5N=CN=C6N)N5C=NC6=C5N=C(N)NC6=O)N5C=C(C)C(=O)NC5=O)N5C=CC(N)=NC5=O)N5C=C(C)C(=O)NC5=O)N5C=C(C)C(=O)NC5=O)N5C=CC(N)=NC5=O)N5C=NC6=C5N=CN=C6N)O[C@H]([C@@H]4OCCOC)N4C=C(C)C(=O)NC4=O)O[C@H]([C@@H]3OCCOC)N3C=C(C)C(N)=NC3=O)O[C@H]([C@@H]2OCCOC)N2C=NC3=C2N=CN=C3N)O[C@H]1N1C=C(C)C(=O)NC1=O,1182
5095,COCCO[C@H]1[C@@H](O)[C@H](COP(O)(=O)S[C@H]2[C@H](COP(O)(=O)S[C@H]3[C@H](COP(O)(=O)S[C@H]4[C@H](COP(O)(=O)S[C@@H]5[C@@H](COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@H]6C[C@@H](O[C@@H]6COP(O)(=O)S[C@@H]6[C@@H](COP(O)(=O)S[C@@H]7[C@@H](COP(O)(=O)S[C@@H]8[C@@H](COP(O)(=O)S[C@@H]9[C@@H](COP(O)(=O)S[C@@H]%10[C@@H](CO)O[C@H]([C@@H]%10OCCOC)N%10C=NC%11=C%10N=C(N)NC%11=O)O[C@H]([C@@H]9OCCOC)N9C=C(C)C(N)=NC9=O)O[C@H]([C@@H]8OCCOC)N8C=C(C)C(N)=NC8=O)O[C@H]([C@@H]7OCCOC)N7C=C(C)C(=O)NC7=O)O[C@H]([C@@H]6OCCOC)N6C=C(C)C(N)=NC6=O)N6C=NC7=C6N=CN=C7N)N6C=NC7=C6N=C(N)NC7=O)N6C=C(C)C(=O)NC6=O)N6C=C(C)C(N)=NC6=O)N6C=C(C)C(=O)NC6=O)N6C=NC7=C6N=C(N)NC7=O)N6C=C(C)C(N)=NC6=O)N6C=C(C)C(=O)NC6=O)N6C=C(C)C(=O)NC6=O)N6C=C(C)C(N)=NC6=O)O[C@H]([C@@H]5OCCOC)N5C=NC6=C5N=C(N)NC6=O)O[C@@H]([C@H]4OCCOC)N4C=C(C)C(N)=NC4=O)O[C@@H]([C@H]3OCCOC)N3C=NC4=C3N=CN=C4N)O[C@@H]([C@H]2OCCOC)N2C=C(C)C(N)=NC2=O)O[C@@H]1N1C=C(C)C(N)=NC1=O,1169
7533,[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[H][C@@]1(O[C@@H]2O[C@H](COC)[C@@]([H])(O[C@@H]3O[C@H](COC)[C@@]([H])(O[C@@H]4O[C@H](COC)[C@@]([H])(O[C@H]5O[C@H](COS([O-])(=O)=O)[C@@]([H])(O[C@@H]6O[C@H](COS([O-])(=O)=O)[C@@]([H])(O[C@H]7O[C@H](COS([O-])(=O)=O)[C@@]([H])(O[C@H]8O[C@H](COS([O-])(=O)=O)[C@@H](OS([O-])(=O)=O)[C@H](OS([O-])(=O)=O)[C@H]8OS([O-])(=O)=O)[C@H](OS([O-])(=O)=O)[C@H]7OS([O-])(=O)=O)[C@H](OS([O-])(=O)=O)[C@H]6OS([O-])(=O)=O)[C@H](OC)[C@H]5OC)[C@H](OC)[C@H]4OC)[C@H](OC)[C@H]3OC)[C@H](OC)[C@H]2OC)[C@@H](COC)O[C@H](O[C@]2([H])[C@@H](COC)O[C@@H](O[C@]3([H])[C@@H](COC)O[C@H](O[C@]4([H])[C@@H](COC)O[C@@H](O[C@@]5([H])[C@@H](COS([O-])(=O)=O)O[C@H](O[C@@]6([H])[C@H](OC)[C@@H](OC)[C@H](O[C@]7([H])[C@@H](COS([O-])(=O)=O)O[C@H](O[C@@]8([H])[C@H](OC)[C@H](OC)[C@H](O[C@]9([H])[C@@H](COS([O-])(=O)=O)O[C@H](OC)[C@@H](OS([O-])(=O)=O)[C@H]9OC)O[C@H]8C([O-])=O)[C@H](OS([O-])(=O)=O)[C@H]7OS([O-])(=O)=O)O[C@H]6C([O-])=O)[C@H](OC)[C@H]5OC)[C@H](OC)[C@H]4OC)[C@H](OC)[C@H]3OC)[C@H](OC)[C@H]2OC)[C@H](OC)[C@H]1OC,1091
...,...,...
3767,CN,2
3148,OO,2
2087,II,2
1441,F,1


In [70]:
final_df = test_df.merge(drug_df, on='smiles')

In [116]:
final_df

Unnamed: 0,smiles,identifiers
0,CN(C)C(=O)C1=CC(=CN1)C1=NNC=C1C1=CC=CC=C1,DB06877
1,CC1=NC(C#CC2=CC=NC(Cl)=C2)=C(C)N1C1=CC=C(F)C=C1,DB11833
2,CCCN(CCC)CCC1=CNC2=C1C(O)=CC=C2,DB13990
3,COC[C@@]1(C)CCCN2CCC3=C(OC4=CC=CC=C34)[C@H]12,DB12057
4,C[N+](C)(C)CCCC[C@H](N)C(O)=O,DB03977
...,...,...
10810,FC(F)(F)C1=CC=C(COC(CN2C=CN=C2)C2=CC=C(Cl)C=C2Cl)C=C1,DB11985
10811,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@@]1([H])[C@@]2([H])CCC2=CC(=O)CC[C@]12C,DB00624
10812,NC(=N)C1=CC=C(CC(=O)C(O)=O)C=C1,DB02018
10813,COC1=CC(=CC=C1OC[C@H](OP(O)(O)=O)C1CC1)N1C=NC2=C(SC(=C2)C2=CC=C(Cl)C=C2)C1=O,DB14787


In [117]:
final_df['embeddings'] = final_df['smiles'].apply(lambda x: molecular_embeddings[x])

In [124]:
final_df['max_emb_dim'] = final_df['embeddings'].apply(lambda x: x.shape[0])

In [129]:
final_df.sort_values('max_emb_dim', ascending=False)['max_emb_dim']


# Here's how we realised that there's a concatenation of SMILES happeing on the 256 AA.
# Question for later: Is this significant?

7551    256
7258    256
2349    256
2351    256
3223    256
       ... 
2088      3
1026      3
3768      3
1441      2
3483      2
Name: max_emb_dim, Length: 10815, dtype: int64

In [82]:
molecular_embeddings['CCCC1=CC(O)=C2C(OC(C)(C)C3=C2C=C(C)C=C3)=C1'].shape

(44, 512)

In [132]:
molecule_embeddings = final_df[['identifiers', 'embeddings']]

In [135]:
molecule_embeddings = molecule_embeddings.set_index('identifiers')

In [140]:

print(f"Saving molecule embeddings......")
# Save it
filename = f'../../data/processed/biokg_molecule_embeddings.pt'

with open(filename, 'wb') as f:  # Overwrites any existing file.
    torch.save(molecule_embeddings, f, pickle_module=dill)

Saving molecule embeddings......


In [None]:
# Load them back for testing purposes...

In [141]:
tensor_data = torch.load(f'../../data/processed/biokg_molecule_embeddings.pt')

df_data = pd.DataFrame.from_dict(tensor_data)

In [None]:
df_data.head(3)

### The end