# Clustering on SeqVec Embeddings
Haerang Lee

SeqVec embedding files are in `embeddings/SeqVec` in the GCS bucket.



In [12]:
from google.cloud import storage
import argparse
import gzip
import os
import sys
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from utils import gcs_utils as gcs

import pickle
import io

In [2]:
# List every object key in the project's GCS storage through the local gcs_utils helper.
# NOTE(review): list_keys() takes no arguments here, so the bucket is presumably
# configured inside utils.gcs_utils — confirm.
allkeys = gcs.list_keys()

In [3]:
# Collect the distinct leading path components (at most two per key) in
# first-seen order, to get an overview of the bucket layout.
seen_prefixes = set()
unique_dir = []
for key in allkeys:
    top_levels = key.split("/")[0:2]
    # Lists are unhashable, so track what has been seen as tuples.
    marker = tuple(top_levels)
    if marker in seen_prefixes:
        continue
    seen_prefixes.add(marker)
    unique_dir.append(top_levels)

unique_dir

[['', 'annotations'],
 ['UP000005640_9606_HUMAN.tar'],
 ['UP000005640_9606_HUMAN', 'cif'],
 ['UP000005640_9606_HUMAN', 'pdb'],
 ['alphafold_dbs', 'mgnify'],
 ['alphafold_dbs', 'params'],
 ['alphafold_dbs', 'pdb70'],
 ['alphafold_dbs', 'small_bfd'],
 ['alphafold_dbs', 'uniclust30'],
 ['alphafold_dbs', 'uniref90'],
 ['annotations', ''],
 ['annotations', 'blast_annotations.csv'],
 ['annotations', 'blast_annotations_legacy_do_not_use.csv'],
 ['c5.go.mf.v7.4.symbols.gmt'],
 ['clusters', ''],
 ['clusters', 'random_control_clusters.csv'],
 ['embeddings', ''],
 ['embeddings', 'DeepFold'],
 ['embeddings', 'SeqVec'],
 ['run_stats.csv'],
 ['samples', 'clusters_with_few_matches.csv'],
 ['structure_files', 'atom_sites'],
 ['structure_files', 'sequences']]

In [4]:
# Restrict the listing to the objects stored under the SeqVec embeddings prefix.
prefix = 'embeddings/SeqVec'
# [1:] discards the first path returned by the helper.
# NOTE(review): presumably that entry is the bare 'embeddings/SeqVec/' folder
# placeholder — confirm it is always listed first.
keys = gcs.list_file_paths(prefix)[1:]

In [5]:
# Sanity check: number of paths found under the prefix, followed by the paths themselves.
print(len(keys))
keys

21


['gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_0.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_1.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_10.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_11.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_12.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_13.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_14.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_15.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_16.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_17.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_18.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_19.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqvec_vectors_2.pkl',
 'gs://capstone-fall21-protein/embeddings/SeqVec/seqve

In [89]:
import importlib

In [95]:
# Re-import utils.gcs_utils into the running kernel.
# NOTE(review): presumably run after editing gcs_utils.py on disk. Cells executed
# before this reload may have used an older version of the helpers, so the
# notebook should be re-checked with Restart & Run All.
importlib.reload(gcs)

<module 'utils.gcs_utils' from '/Users/haeranglee/Documents/pss/utils/gcs_utils.py'>

In [101]:
# Load only the first file in `keys` so its contents can be inspected
# before everything is downloaded.
# Element [1] of the helper's result is what download_pkl receives
# (presumably the object key, going by the helper's name).
first_shard_key = gcs.uri_to_bucket_and_key(keys[0])[1]
seqvec_df = gcs.download_pkl(first_shard_key)

In [103]:
# Preview the first two rows of the loaded frame.
seqvec_df.head(2)

Unnamed: 0,pdbx_db_accession,db_code,db_name,pdbx_seq_one_letter_code,protein_filename,protein_id,seqvec
0,A0A024R1R8,A0A024R1R8_HUMAN,UNP,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,"[-0.0093454495, 0.032306828, -0.18556054, -0.0..."
1,A0A024RBG1,NUD4B_HUMAN,UNP,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1,A0A024RBG1,"[0.021576513, -0.014022889, -0.06683804, -0.03..."


In [18]:
# Column names of the loaded frame.
seqvec_df.columns

Index(['pdbx_db_accession', 'db_code', 'db_name', 'pdbx_seq_one_letter_code',
       'protein_filename', 'protein_id', 'seqvec'],
      dtype='object')

In [19]:
# (rows, columns) of the loaded frame.
seqvec_df.shape

(1000, 7)

# Download and parse SeqVec

In [21]:
# Download every SeqVec file listed in `keys` and combine them into one DataFrame.
seqvec_columns = ['pdbx_db_accession', 'db_code', 'db_name', 'pdbx_seq_one_letter_code',
                  'protein_filename', 'protein_id', 'seqvec']

shard_frames = []
for key in keys:
    sample_emb_pkl = gcs.download_pkl(gcs.uri_to_bucket_and_key(key)[1])

    # The single-file cell earlier in this notebook uses the return value of
    # download_pkl directly as a DataFrame, whereas this cell was written
    # against a return value of raw pickle bytes. Accept both so the cell
    # works regardless of which helper version is loaded.
    if isinstance(sample_emb_pkl, (bytes, bytearray)):
        # SECURITY: pickle.load can execute arbitrary code; only use it on
        # objects from a bucket you trust.
        seqvec_df_key = pickle.load(io.BytesIO(sample_emb_pkl))
    else:
        seqvec_df_key = sample_emb_pkl

    shard_frames.append(seqvec_df_key)

# Concatenate once at the end: DataFrame.append re-copied the accumulated frame
# on every iteration and no longer exists in pandas 2.x.
if shard_frames:
    seqvec_df = pd.concat(shard_frames, ignore_index=True)
else:
    # No files found: keep the empty, correctly-labelled frame the loop started from.
    seqvec_df = pd.DataFrame(columns=seqvec_columns)

seqvec_df.shape

(20504, 7)

In [29]:
# Per-column summary (count / unique / top / freq) of the combined frame.
seqvec_df.describe()

Unnamed: 0,pdbx_db_accession,db_code,db_name,pdbx_seq_one_letter_code,protein_filename,protein_id,seqvec
count,20504,20504,20504,20504,20504,20504,20504
unique,20504,20504,1,20435,20504,20504,20504
top,A0A024R1R8,A0A024R1R8_HUMAN,UNP,MEEPRPSKRLRSMAPNQASGGPPPEPGCCVADPEGSVEADGPAQPA...,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,"[-0.0093454495, 0.032306828, -0.18556054, -0.0..."
freq,1,1,20504,5,1,1,1


In [35]:
# Show every field of the row whose index label is 0.
seqvec_df.loc[0]

pdbx_db_accession                                                  A0A024R1R8
db_code                                                      A0A024R1R8_HUMAN
db_name                                                                   UNP
pdbx_seq_one_letter_code    MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...
protein_filename                                    AF-A0A024R1R8-F1-model_v1
protein_id                                                         A0A024R1R8
seqvec                      [-0.0093454495, 0.032306828, -0.18556054, -0.0...
Name: 0, dtype: object

# AA sequence metrics for research paper

In [28]:
# Length statistics of the one-letter amino-acid sequences.
# The per-row lengths are computed once and reused for every statistic.
seq_lengths = seqvec_df.pdbx_seq_one_letter_code.str.len()
length_stats = (
    ("min:", seq_lengths.min()),
    ("mean:", seq_lengths.mean()),
    ("median:", seq_lengths.median()),
    ("max:", seq_lengths.max()),
)
for label, value in length_stats:
    print(label, value)

min: 16
mean: 724.2615587202497
median: 412.0
max: 232350


In [52]:
# Turn the per-protein embedding column into a single 2-D numpy array.
# Rows whose `seqvec` entry has length 3 are excluded first.
# NOTE(review): presumably those are malformed embeddings — confirm why 3.
# Because rows are dropped here, row i of `seqvec_np` is not guaranteed to be
# row i of `seqvec_df`; map results back through the filtered frame.
keep_row = seqvec_df["seqvec"].str.len() != 3
seqvec_np = np.stack(seqvec_df.loc[keep_row, "seqvec"])

In [54]:
# Shape of the stacked embedding matrix.
seqvec_np.shape

(20503, 1024)

In [55]:
# Peek at the first ten rows of the embedding matrix.
seqvec_np[0:10]

array([[-9.34544951e-03,  3.23068276e-02, -1.85560539e-01, ...,
        -1.43279582e-01,  1.97943285e-01,  1.21675484e-01],
       [ 2.15765126e-02, -1.40228886e-02, -6.68380409e-02, ...,
        -8.51393938e-02,  4.25157137e-02,  8.40931572e-03],
       [ 1.51273096e-02,  2.13520341e-02, -3.44568044e-02, ...,
         8.32618680e-03, -2.26041477e-04, -1.20919093e-03],
       ...,
       [ 1.06032556e-02, -1.74569979e-01, -2.68086880e-01, ...,
        -2.52510719e-02, -1.26974611e-02, -7.81473666e-02],
       [ 6.51418939e-02, -6.31700233e-02, -3.16974461e-01, ...,
        -7.32864141e-02, -5.47289662e-02, -5.08046485e-02],
       [ 1.00569814e-01, -5.66286892e-02, -1.80323139e-01, ...,
         3.62861389e-03, -1.69136431e-02, -8.72481391e-02]], dtype=float32)

In [58]:
# how sparse is it?
# Count the entries that are exactly 0 in each embedding row, then print the
# average count per row. A single vectorised reduction replaces the Python-level
# sum() over every element of every row.
# `filter_arr` is now a 1-D integer array with one count per row.
filter_arr = np.count_nonzero(seqvec_np == 0, axis=1)

print(np.mean(filter_arr))

0.0


In [59]:
# how many negative numbers?
# Count the strictly negative entries in each embedding row, then print the
# average count per row. A single vectorised reduction replaces the Python-level
# sum() over every element of every row.
# `filter_arr` is now a 1-D integer array with one count per row.
filter_arr = np.count_nonzero(seqvec_np < 0, axis=1)

print(np.mean(filter_arr))

517.7067258449983


# HDBSCAN on SeqVec

In [36]:
import hdbscan

In [57]:
# Cluster the embeddings with HDBSCAN using cosine distance.
# All keyword arguments are listed one per line so each setting is easy to tune.
hdbscan_kwargs = dict(
    algorithm='generic',
    alpha=1.0,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
    leaf_size=40,
    metric='cosine',
    min_cluster_size=5,
    min_samples=None,
    p=None,
)
clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)

# The embeddings are float32; they are cast to float64 before fitting.
clusterer.fit(seqvec_np.astype(np.float64))

# Largest label value assigned to any point.
clusterer.labels_.max()

336

In [65]:
# Number of distinct label values produced by the clusterer.
# The label listing in the next cell's output includes -1, so this count is
# not the same as labels_.max().
np.unique(clusterer.labels_).size

338

In [60]:
# Each distinct label value together with how many points received it.
np.unique(clusterer.labels_, return_counts=True)

(array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
         38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
         51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
         64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
         77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
         90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
        142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176