# Protein embeddings on the cloud (Google Colab, Kaggle)

### 0 - Installation and libraries

In [2]:
!pip install bio_embeddings==0.2.2

Collecting bio_embeddings==0.2.2
  Downloading bio_embeddings-0.2.2-py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 528 kB/s eta 0:00:01
Collecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 29.4 MB/s eta 0:00:01
[?25hCollecting ruamel.yaml<0.18.0,>=0.17.10
  Downloading ruamel.yaml-0.17.16-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 62.3 MB/s eta 0:00:01
Collecting gensim<4.0.0,>=3.8.2
  Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 481 kB/s  eta 0:00:01
Collecting atomicwrites<2.0.0,>=1.4.0
  Downloading atomicwrites-1.4.0-py2.py3-none-any.whl (6.8 kB)
Collecting lock<2019.0.0,>=2018.3.25
  Downloading lock-2018.3.25.2110.tar.gz (3.0 kB)
Collecting torch<=1.10.0,>=1.8.0
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 M

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from datetime import date
from bio_embeddings.embed import ProtTransBertBFDEmbedder

### 1 - Compute protein embeddings

In [None]:
def protein_embeddings(rbp_file_names, data_dir, results_dir):
    """
    Compute ProtTransBERT embeddings for phage RBPs stored as fasta files.

    Every fasta file is parsed with BioPython, its sequence is embedded with the
    bio-embeddings ProtTransBertBFDEmbedder, and the result is passed through the
    embedder's reduce_per_protein before being collected.

    WARNING: running this on a personal laptop without GPU is very slow and not advised. Best to start a Google
    Colab notebook and run the computations in the cloud.

    Input:
    - rbp_file_names: list of fasta filenames of phage RBPs. One embedding is produced per file;
        if a file holds several records, only the last record is embedded.
    - data_dir: location of the sequence file(s) to loop over
    - results_dir: directory to store the resulting embeddings in

    Output:
    - None. The embeddings are written to <results_dir>/protein_embeddings.csv, one row per
        file: the file name followed by its reduced embedding.

    Raises:
    - ValueError: if a fasta file contains no sequence record.
    """
    embedder = ProtTransBertBFDEmbedder()
    embeddings_list = []

    for name in tqdm(rbp_file_names):
        # Reset for every file: otherwise an empty/unparsable fasta would silently reuse the
        # sequence of the previous file (or hit a NameError on the very first file), and the
        # wrong embedding would be paired with this file name in the output table.
        rbp_sequence = None
        for record in SeqIO.parse(data_dir+'/'+name, 'fasta'):
            rbp_sequence = str(record.seq)
        if rbp_sequence is None:
            raise ValueError("No fasta record found in '%s'" % (data_dir+'/'+name))

        embedding = embedder.embed(rbp_sequence)
        reduced_embedding = embedder.reduce_per_protein(embedding)
        embeddings_list.append(reduced_embedding)

    # Side-by-side concat: first column holds the file names, the rest the embedding values.
    embeddings_df = pd.concat([pd.DataFrame(rbp_file_names), pd.DataFrame(embeddings_list)], axis=1)
    embeddings_df.to_csv(results_dir+'/protein_embeddings.csv', index=False)

    return

In [11]:
def protein_embeddings_df(dataframe, results_dir, prefix='annotated_RBPs'):
    """
    Compute ProtTransBERT embeddings for the protein sequences in a dataframe.

    Every sequence in the ProteinSeq column is embedded with the bio-embeddings
    ProtTransBertBFDEmbedder and passed through the embedder's reduce_per_protein.

    WARNING: running this on a personal laptop without GPU is very slow and not advised. Best to start a Google
    Colab notebook and run the computations in the cloud.

    Input:
    - dataframe: with protein sequences under ProteinSeq and columns NCBI_id, UniProt_id
    - results_dir: directory to store the resulting embeddings in
    - prefix: prefix of the output file name; the file is written as <prefix>_embeddings.csv

    Output:
    - embeddings_list: list with one reduced embedding per row of the dataframe, in row order.
        As a side effect, a .csv file with the NCBI_id and UniProt_id columns followed by the
        embedding values is written to results_dir.
    """
    import os  # local import so this cell does not depend on an edit to the imports cell

    embedder = ProtTransBertBFDEmbedder()
    embeddings_list = []

    # Letting tqdm wrap the iterable guarantees the progress bar is closed even when
    # embedding a sequence raises.
    for sequence in tqdm(dataframe['ProteinSeq'], total=dataframe.shape[0]):
        embedding = embedder.embed(sequence)
        embeddings_list.append(embedder.reduce_per_protein(embedding))

    # reset_index aligns the id columns with the freshly built (0..n-1 indexed) embedding
    # table, which matters when the dataframe is a sampled subset with a shuffled index.
    embeddings_df = pd.concat([dataframe['NCBI_id'].reset_index(drop=True), dataframe['UniProt_id'].reset_index(drop=True),
                               pd.DataFrame(embeddings_list)], axis=1)
    # os.path.join inserts the separator only when needed, so both './' and 'results' work;
    # plain concatenation wrote to 'results<prefix>_embeddings.csv' for the latter.
    embeddings_df.to_csv(os.path.join(results_dir, prefix+'_embeddings.csv'), index=False)

    return embeddings_list

In [5]:
# Load the annotated RBP and non-RBP tables; both files are tab-separated despite
# their .csv extension.
rbps, nonrbps = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (
        '../input/protein-sequences/annotated_RBPs.csv',
        '../input/protein-sequences/annotated_nonRBPs.csv',
    )
)

In [39]:
# NOTE: `embeddings_df` is never created at notebook level (it only exists as a local
# variable inside the embedding functions), so on a fresh kernel this cell used to fail
# with a NameError. protein_embeddings_df(..., prefix='annotated_nonRBPs') already writes
# this very file; the export below is therefore only performed when a notebook-level
# `embeddings_df` is actually available from an interactive session.
if 'embeddings_df' in globals():
    embeddings_df.to_csv('./annotated_nonRBPs_embeddings.csv', index=False)

In [12]:
# Number of non-RBPs sampled per RBP, and the seed that makes the sampled subset reproducible
# across kernel restarts.
NONRBPS_PER_RBP = 10
SAMPLE_SEED = 42

# compute RBP embeddings
protein_embeddings_df(rbps, './')

# make nonRBP subset; the size is capped at the number of available non-RBPs because
# sampling without replacement raises a ValueError when n exceeds the number of rows
n_nonrbps = min(rbps.shape[0]*NONRBPS_PER_RBP, nonrbps.shape[0])
nonrbps_sub = nonrbps.sample(n=n_nonrbps, axis=0, random_state=SAMPLE_SEED)

# compute nonRBP embeddings
emb_list = protein_embeddings_df(nonrbps_sub, './', prefix='annotated_nonRBPs')

Some weights of the model checkpoint at /root/.cache/bio_embeddings/prottrans_bert_bfd/model_directory were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 46910/46910 [55:05<00:00, 14.19it/s]  
