In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# %%bash
# MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
# MINICONDA_PREFIX=/usr/local
# wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
# chmod +x $MINICONDA_INSTALLER_SCRIPT
# ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

In [65]:
import argparse
import gzip
import os
import sys
import time
import io
# from multiprocessing import Pool
import multiprocessing

import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
from tqdm.auto import tqdm
import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [38]:
import numpy as np
from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder  

In [42]:
# local_dir = "/Users/skyler.roh/Downloads/UP000005640_9606_HUMAN"
# files = sorted([f"{local_dir}/{f}" for f in os.listdir(local_dir) if f.endswith(".cif.gz")])
files = gcs.list_keys("UP000005640_9606_HUMAN/cif")
fn = get_protein_sequence_from_cif

In [47]:
names = pd.read_pickle("/home/jupyter/pss/vectorize/protein_names.pkl")

In [45]:
series = []
for file in tqdm(files):
    content = gcs.download_gzip_to_string(file)
    sequence = get_protein_sequence_from_cif(content)
    sequence['protein_name'] = file.strip(".cif.gz").split("/")[-1]
    series.append(sequence)
pd.concat(series)

In [154]:
sequences_df = reduce_sequence_df(pd.concat(series))

In [83]:
sequences_df.to_pickle("/home/jupyter/pss/vectorize/sequences.pkl")

In [84]:
sequences_df = pd.read_pickle("/home/jupyter/pss/vectorize/sequences.pkl")

In [86]:
sequences_df.head()

Unnamed: 0,pdbx_db_accession,db_code,db_name,pdbx_seq_one_letter_code,protein_name
0,A0A024R1R8,A0A024R1R8_HUMAN,UNP,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1
1,A0A024RBG1,NUD4B_HUMAN,UNP,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,AF-A0A024RBG1-F1-model_v1
2,A0A024RCN7,A0A024RCN7_HUMAN,UNP,MERSFVWLSCLDSDSCNLTFRLGEVESHACSPSLLWNLLTQYLPPG...,AF-A0A024RCN7-F1-model_v1
3,A0A075B6H5,A0A075B6H5_HUMAN,UNP,METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVIC...,AF-A0A075B6H5-F1-model_v1
4,A0A075B6H7,KV37_HUMAN,UNP,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...,AF-A0A075B6H7-F1-model_v1


## Embedding the sequences with SeqVec and ProtTrans
Distribute with spark

In [11]:
# from pyspark.sql.types import *
# import pyspark.sql.functions as F
# import dask.dataframe as dd
# from dask.diagnostics import ProgressBar
# from dask.multiprocessing import get as dget
import time

In [12]:
# sparkSession = sprk.get_local_spark_session()

In [13]:
# sequences_spark_df = sparkSession.createDataFrame(
#     sequences_df[["db_code", "pdbx_seq_one_letter_code"]]
# )

In [87]:
# sequences_dask = dd.from_pandas(sequences_df, 400)

In [88]:
def seqvectorize(x, seqvec): 
    if isinstance(x, str):
        return seqvec.reduce_per_protein(seqvec.embed(x))
    else:
        return [seqvec.reduce_per_protein(emb) for emb in seqvec.embed_many(x)]
# seqvec_udf = F.udf(seqvectorize, ArrayType(ArrayType(FloatType())))

In [89]:
print("initializing model")
seqvec_model = SeqVecEmbedder()

initializing model


In [None]:
names, vectors = [], []
batch_size = 100
checkpoint_size = 1000

print(f"creating embedding vectors, batch size {batch_size}")
for i in tqdm(range(0, sequences_df.shape[0], checkpoint_size)):
    sub_df = sequences_df.iloc[i:i+checkpoint_size,].copy()
    vectors = []
    for j in tqdm(range(0, sub_df.shape[0], batch_size)):
        start = time.time()
        batch = sub_df.iloc[j:j+batch_size,]
        embs = seqvectorize(list(batch.pdbx_seq_one_letter_code), seqvec_model)
        vectors.extend(embs)
        print(f"{j} to {j+batch_size-1} finished in {time.time()-start//1}")
    sub_df["seqvec"] = vectors
    sub_df.to_pickle(f"~/pss/vectorize/SeqVec/seqvec_vectors_{i//checkpoint_size}.pkl")

creating embedding vectors, batch size 100


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

0 to 99 finished in 192.43411922454834
100 to 199 finished in 74.98020577430725
200 to 299 finished in 132.2045156955719
300 to 399 finished in 121.71075367927551
400 to 499 finished in 157.79058170318604
500 to 599 finished in 186.30310249328613
600 to 699 finished in 566.0692126750946
700 to 799 finished in 495.47400188446045
800 to 899 finished in 303.3445870876312
900 to 999 finished in 295.8824882507324


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df["seqvec"] = vectors


  0%|          | 0/10 [00:00<?, ?it/s]

In [174]:
print(vectors[0])

[-0.00934542  0.03230684 -0.18556052 ... -0.14327957  0.19794334
  0.12167553]


In [None]:
with ProgressBar():
    res = sequences_dask \
        .map_partitions(lambda df: df.assign(seqvec=seqvectorize(df.pdbx_seq_one_letter_code.tolist(), seqvec_model)), meta={"pdbx_seq_one_letter_code": str}) \
        .compute(scheduler='threads')

[                                        ] | 0% Completed |  7hr 30min 29.8s

In [None]:
res.to_parquet('seqvec.parquet')

In [None]:
# sequences_spark_df \
#     .repartition(1000, "db_code") \
#     .withColumn("seqvec", seqvec_udf(F.col("pdbx_seq_one_letter_code"))).show(10)