In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# %%bash
# MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
# MINICONDA_PREFIX=/usr/local
# wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
# chmod +x $MINICONDA_INSTALLER_SCRIPT
# ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [4]:
import argparse
import gzip
import os
import sys
import time
import io
# from multiprocessing import Pool
import multiprocessing

import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
from tqdm.auto import tqdm
import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [5]:
import numpy as np
from bio_embeddings.embed import SeqVecEmbedder #, ProtTransBertBFDEmbedder  

In [6]:
sequences_df = pd.read_parquet("/home/jupyter/pss/structure_files/sequences/sequences.parquet")

In [7]:
sequences_df.head()

Unnamed: 0,pdbx_db_accession,db_code,db_name,pdbx_seq_one_letter_code,protein_filename,protein_id
0,A0A024R1R8,A0A024R1R8_HUMAN,UNP,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1,A0A024R1R8
1,A0A024RBG1,NUD4B_HUMAN,UNP,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1,A0A024RBG1
2,A0A024RCN7,A0A024RCN7_HUMAN,UNP,MERSFVWLSCLDSDSCNLTFRLGEVESHACSPSLLWNLLTQYLPPG...,AF-A0A024RCN7-F1-model_v1,A0A024RCN7
3,A0A075B6H5,A0A075B6H5_HUMAN,UNP,METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVIC...,AF-A0A075B6H5-F1-model_v1,A0A075B6H5
4,A0A075B6H7,KV37_HUMAN,UNP,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...,AF-A0A075B6H7-F1-model_v1,A0A075B6H7


## Embedding the sequences with SeqVec and ProtTrans
Distribute with spark

In [9]:
import time

In [10]:
def seqvectorize(x, seqvec): 
    if isinstance(x, str):
        return seqvec.reduce_per_protein(seqvec.embed(x))
    else:
        return [seqvec.reduce_per_protein(emb) for emb in seqvec.embed_many(x)]
# seqvec_udf = F.udf(seqvectorize, ArrayType(ArrayType(FloatType())))

In [11]:
print("initializing model")
seqvec_model = SeqVecEmbedder()

initializing model


In [15]:
names, vectors = [], []
batch_size = 100
checkpoint_size = 1000

print(f"creating embedding vectors, batch size {batch_size}")
for i in tqdm(range(15*checkpoint_size, sequences_df.shape[0], checkpoint_size)):    
    print(i)
    sub_df = sequences_df.iloc[i:i+checkpoint_size,].copy()
    vectors = []
    for j in tqdm(range(0, sub_df.shape[0], batch_size)):
        start = time.time()
        batch = sub_df.iloc[j:j+batch_size,]
        embs = seqvectorize(list(batch.pdbx_seq_one_letter_code), seqvec_model)
        vectors.extend(embs)
        print(f"{j} to {j+batch_size-1} finished in {time.time()-start//1}")
    sub_df["seqvec"] = vectors
    sub_df.to_pickle(f"~/pss/vectorize/SeqVec/seqvec_vectors_{i//checkpoint_size}.pkl")

creating embedding vectors, batch size 100


  0%|          | 0/6 [00:00<?, ?it/s]

15000


  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 411.20957827568054
100 to 199 finished in 271.186429977417
200 to 299 finished in 264.38778281211853
300 to 399 finished in 276.5215756893158
400 to 499 finished in 808.8707845211029
500 to 599 finished in 374.82429552078247
600 to 699 finished in 562.596595287323
700 to 799 finished in 371.99365425109863
800 to 899 finished in 643.8899238109589
900 to 999 finished in 660.5490741729736
16000


  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 495.3925771713257
100 to 199 finished in 546.2355153560638
200 to 299 finished in 381.493305683136
300 to 399 finished in 280.0316517353058
400 to 499 finished in 236.52947282791138
500 to 599 finished in 247.61623358726501
600 to 699 finished in 316.8467628955841
700 to 799 finished in 431.9063642024994
800 to 899 finished in 331.3508496284485
900 to 999 finished in 391.0558168888092
17000


  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 578.3194725513458
100 to 199 finished in 311.57104659080505
200 to 299 finished in 413.7402150630951
300 to 399 finished in 341.6683351993561
400 to 499 finished in 322.30323457717896
500 to 599 finished in 505.5489311218262
600 to 699 finished in 309.78535532951355
700 to 799 finished in 331.6617662906647
800 to 899 finished in 596.1102843284607
900 to 999 finished in 510.7164692878723
18000


  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 261.43460965156555
100 to 199 finished in 594.2385849952698
200 to 299 finished in 515.291285276413
300 to 399 finished in 601.2114601135254
400 to 499 finished in 306.8615906238556
500 to 599 finished in 260.70225977897644
600 to 699 finished in 821.8063094615936
700 to 799 finished in 617.1864590644836
800 to 899 finished in 305.3815903663635
900 to 999 finished in 1172.096801996231
19000


  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 308.39139127731323
100 to 199 finished in 504.8575084209442
200 to 299 finished in 342.30324840545654
300 to 399 finished in 348.4340810775757
400 to 499 finished in 664.201943397522
500 to 599 finished in 633.1881601810455
600 to 699 finished in 475.62344574928284
700 to 799 finished in 1009.9329454898834
800 to 899 finished in 366.2454721927643
900 to 999 finished in 452.95703196525574
20000


  0%|          | 0/6 [00:00<?, ?it/s]

0 to 99 finished in 237.67015290260315
100 to 199 finished in 996.0906887054443
200 to 299 finished in 398.6745812892914
300 to 399 finished in 329.6600489616394
400 to 499 finished in 787.0666515827179
500 to 599 finished in 4.087510585784912


for part 14, contains longest protein TITIN which causes memory issues, here we will iterate through proteins 1 at a time as well as evaluate TITIN in parts

In [None]:
sequences_df.pdbx_seq_one_letter_code.apply(len).sort_values().tail()

In [None]:
tqdm.pandas()

In [23]:
def seqvectorize_parts(x, seqvec_model): 
    if len(x)<100000:
        return seqvectorize(x, seqvec_model)
    else:
        vectors = []
        n_parts = len(x) // 100000 + 1
        part_len = len(x) // n_parts
        for i in range(n_parts):
            vectors.append(seqvectorize(x[i*part_len:(i+1)*part_len], seqvec_model))
        return np.mean(np.vstack(vectors), axis=0)

In [None]:
checkpoint_size = 1000
print(f"creating embedding vectors, batch size {batch_size}")
for i in tqdm(range(14*checkpoint_size, 15*checkpoint_size, checkpoint_size)):    
    print(i)
    sub_df = sequences_df.iloc[i:i+checkpoint_size,].copy()
    sub_df["seqvec"] = sub_df.pdbx_seq_one_letter_code.progress_apply(lambda x: seqvectorize_parts(x, seqvec_model))
    sub_df.to_pickle(f"~/pss/vectorize/SeqVec/seqvec_vectors_{i//checkpoint_size}.pkl")