In [16]:
import pandas as pd
import numpy as np
import pickle
import time

# Vector Representation

In [9]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import FastText

# For Glove
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [31]:
def train_w2v(chems, fingerprint):
    fingerprints = [f.split() for f in chems[fingerprint].tolist()]

    dim_embedding = 100

    start = time.time()
    model = Word2Vec(fingerprints, size=dim_embedding, sg=1, window=5, min_count=1, workers=4)
    model.train(fingerprints, total_examples=model.corpus_count, epochs=10)

    print('Done in', (time.time() - start) / 60, 'min')
    
    return model


def train_fasttext(genes, window=4):
    return FastText(size=30, window=window, min_count=1, sentences=genes, iter=10)
   

# Have to calculate glove model seperately with docker as of now
def load_glove():
    glove_file = 'docker_glove/vectors.txt'
    tmp_file = "docker_glove/glove_word2vec.txt"

    _ = glove2word2vec(glove_file, tmp_file)

    model = KeyedVectors.load_word2vec_format(tmp_file)
    
    return model
    
    
def calc_fingerprint_vector(fingerprint, model=None):
    fingerprints = fingerprint.split()
    
    fingerprint_substructures = [model.wv[f] for f in fingerprints]
    
    fingerprint_vector = np.mean(fingerprint_substructures, axis=0)
    
    fingerprint_vector = ' '.join([str(i) for i in fingerprint_vector])
    
    return fingerprint_vector


def make_fingerprint_vectors(chems, suffix='', use_fasttext=False, use_glove=False):
    if use_fasttext:
        model = train_fasttext(chems)
        col_access = 'upac'
        col_assign = 'upac'
    elif use_glove:
        model = load_glove()
        col_access = 'fingerprint'
        col_assign = 'glove'
    else:
        col_access = f'fingerprint{suffix}'
        col_assign = f'fingerprint{suffix}'
        model = train_w2v(chems, col_access)
    
    start = time.time()
    for idx, row in chems.iterrows():
        chems.at[idx, f'{col_assign}_vector'] = calc_fingerprint_vector(row[f'{col_access}'], model=model)

#         if not idx % 1000:
#             print(idx, 'rows completed in', (time.time() - start)/60, 'min')
            
    return chems

In [22]:
data = pd.read_pickle('drugbank_fingerprinted (1).pkl')
data = data[data.target_gene.notnull()]
data.head()

Unnamed: 0,drug_cas,drug_id,drug_name,target_actions,target_amino,target_gene,target_id,target_name,target_organism,pubchem_id,pubchem_name,target_gene_fingerprint,drug_fingerprint
31,128270-60-0,DB00006,Bivalirudin,[inhibitor],>lcl|BSEQ0016004|Prothrombin\nMAHVRGLQLPGCLALA...,>lcl|BSEQ0016005|Prothrombin (F2)\nATGGCGCACGT...,BE0000048,Prothrombin,Human,16129704,Bivalirudin,[1.2739266e-05 -2.7748918e-05 0.00095751503 0....,[0.10044218 0.18559723 -0.12865402 -0.0650997 ...
53,65807-02-5,DB00014,Goserelin,[agonist],>lcl|BSEQ0036957|Lutropin-choriogonadotropic h...,>lcl|BSEQ0010174|Lutropin-choriogonadotropic h...,BE0000134,Lutropin-choriogonadotropic hormone receptor,Human,5311128,goserelin,[1.2739266e-05 -2.7748918e-05 0.00095751503 0....,[0.1331214 0.2391842 -0.14361311 -0.14107014 0...
54,65807-02-5,DB00014,Goserelin,[agonist],>lcl|BSEQ0000405|Gonadotropin-releasing hormon...,>lcl|BSEQ0018926|Gonadotropin-releasing hormon...,BE0000203,Gonadotropin-releasing hormone receptor,Human,5311128,goserelin,[1.2739266e-05 -2.7748918e-05 0.00095751503 0....,[0.1331214 0.2391842 -0.14361311 -0.14107014 0...
131,16679-58-6,DB00035,Desmopressin,[agonist],>lcl|BSEQ0000583|Vasopressin V2 receptor\nMLMA...,>lcl|BSEQ0010065|Vasopressin V2 receptor (AVPR...,BE0000293,Vasopressin V2 receptor,Human,16051933,DESMOPRESSIN,[1.2739266e-05 -2.7748918e-05 0.00095751503 0....,[0.051722348 0.24374117 -0.16236018 -0.1723347...
132,16679-58-6,DB00035,Desmopressin,,>lcl|BSEQ0009979|Vasopressin V1a receptor\nMRL...,>lcl|BSEQ0009980|Vasopressin V1a receptor (AVP...,BE0000165,Vasopressin V1a receptor,Human,16051933,DESMOPRESSIN,[1.2739266e-05 -2.7748918e-05 0.00095751503 0....,[0.051722348 0.24374117 -0.16236018 -0.1723347...


In [54]:
t = data.drop_duplicates(['target_name', 'target_gene'])
genes = t.target_gene.apply(lambda x: ''.join(x.split('\n')[1:])).tolist()

In [60]:
genes[0]

'ATGGCGCACGTCCGAGGCTTGCAGCTGCCTGGCTGCCTGGCCCTGGCTGCCCTGTGTAGCCTTGTGCACAGCCAGCATGTGTTCCTGGCTCCTCAGCAAGCACGGTCGCTGCTCCAGCGGGTCCGGCGAGCCAACACCTTCTTGGAGGAGGTGCGCAAGGGCAACCTGGAGCGAGAGTGCGTGGAGGAGACGTGCAGCTACGAGGAGGCCTTCGAGGCTCTGGAGTCCTCCACGGCTACGGATGTGTTCTGGGCCAAGTACACAGCTTGTGAGACAGCGAGGACGCCTCGAGATAAGCTTGCTGCATGTCTGGAAGGTAACTGTGCTGAGGGTCTGGGTACGAACTACCGAGGGCATGTGAACATCACCCGGTCAGGCATTGAGTGCCAGCTATGGAGGAGTCGCTACCCACATAAGCCTGAAATCAACTCCACTACCCATCCTGGGGCCGACCTACAGGAGAATTTCTGCCGCAACCCCGACAGCAGCACCACGGGACCCTGGTGCTACACTACAGACCCCACCGTGAGGAGGCAGGAATGCAGCATCCCTGTCTGTGGCCAGGATCAAGTCACTGTAGCGATGACTCCACGCTCCGAAGGCTCCAGTGTGAATCTGTCACCTCCATTGGAGCAGTGTGTCCCTGATCGGGGGCAGCAGTACCAGGGGCGCCTGGCGGTGACCACACATGGGCTCCCCTGCCTGGCCTGGGCCAGCGCACAGGCCAAGGCCCTGAGCAAGCACCAGGACTTCAACTCAGCTGTGCAGCTGGTGGAGAACTTCTGCCGCAACCCAGACGGGGATGAGGAGGGCGTGTGGTGCTATGTGGCCGGGAAGCCTGGCGACTTTGGGTACTGCGACCTCAACTATTGTGAGGAGGCCGTGGAGGAGGAGACAGGAGATGGGCTGGATGAGGACTCAGACAGGGCCATCGAAGGGCGTACCGCCACCAGTGAGTACCAGACTTTCTTCAATCCGAGGACCTTTGGCTCGGGAGAG

In [41]:
model = train_fasttext(genes, window=5)

In [42]:
gene_vecs = [calc_fingerprint_vector(gene, model=model) for gene in genes]

In [65]:
t['target_embd'] = gene_vecs

data.merge(t[['target_id', 'target_embd']], how='left', on='target_id')
# data.to_pickle('drugbank_embds.pkl')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Vector Visualization

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as clr
from collections import Counter

import plotly.express as px

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap

from sklearn.cluster import KMeans, DBSCAN

### Drugs

In [72]:
d = data.drop_duplicates('drug_name')
embeddings1 = [[float(value) for value in v[0].split()] for v in d.drug_fingerprint.tolist()]

In [88]:
reducer = umap.UMAP(n_components=2)
umap_fit = reducer.fit_transform(embeddings1)


[1m
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "..\..\..\..\..\..\software\Anaconda3\envs\advanced\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m



In [89]:
kmeans = KMeans(n_clusters=6, random_state=0).fit(embeddings1)

In [95]:
viz_data = pd.DataFrame(umap_fit, columns = ['umap_0', 'umap_1'])
viz_data['drug'] = d.drug_name.tolist()
viz_data['cluster'] = kmeans.labels_
viz_data['cluster'] = viz_data['cluster'].apply(str)

xcol = "umap_0"
ycol = "umap_1"
px.scatter(viz_data, x=xcol, y=ycol, hover_name="drug", color='cluster')

### Targets

In [56]:
embeddings = [[float(value) for value in v.split()] for v in gene_vecs]
# embeddings = [[float(value) for value in v[0].split()] for v in data.target_gene_fingerprint.tolist()]

In [44]:
tsne = TSNE(n_components=2)
tsne_fit = tsne.fit_transform(embeddings)

In [46]:
viz_data = pd.DataFrame(tsne_fit, columns = ['tsne_0', 'tsne_1'])
viz_data['target'] = t.target_name.tolist()

xcol = "tsne_0"
ycol = "tsne_1"
px.scatter(viz_data, x=xcol, y=ycol, hover_name="target")

In [79]:
reducer = umap.UMAP(n_components=2)
umap_fit = reducer.fit_transform(embeddings)

In [80]:
kmeans = KMeans(n_clusters=6, random_state=0).fit(embeddings)

In [81]:
viz_data = pd.DataFrame(umap_fit, columns = ['umap_0', 'umap_1'])
viz_data['target'] = t.target_name.tolist()
viz_data['target_gene'] = genes
viz_data['gene_len'] = viz_data['target_gene'].apply(len)
viz_data['cluster'] = kmeans.labels_
viz_data['cluster'] = viz_data['cluster'].apply(str)

xcol = "umap_0"
ycol = "umap_1"
px.scatter(viz_data, x=xcol, y=ycol, hover_name="target", hover_data=['gene_len'], color='cluster')

In [30]:
viz_data = pd.DataFrame(umap_fit, columns = ['umap_0', 'umap_1'])
viz_data['target'] = data.target_name.tolist()


xcol = "umap_0"
ycol = "umap_1"
px.scatter(viz_data, x=xcol, y=ycol, hover_name="target")