# Protein embeddings on the cloud (Google Colab, Kaggle)

### 0 - Installation and libraries

In [2]:
!pip install bio_embeddings==0.2.2



In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from datetime import date
from bio_embeddings.embed import ProtTransBertBFDEmbedder

### 1 - Compute protein embeddings

In [4]:
def protein_embeddings(rbp_file_names, data_dir, results_dir):
    """
    Compute ProtTransBertBFD embeddings for phage RBPs and save them as a .csv file.

    Each fasta file is parsed with BioPython, its sequence is embedded with the
    bio-embeddings package and the embedding is reduced to a single per-protein
    vector with the embedder's `reduce_per_protein`.

    WARNING: running this on a personal laptop without GPU is very slow and not advised. Best to start a
    Google Colab (or Kaggle) notebook and run the computations in the cloud.

    Input:
    - rbp_file_names: list of fasta filenames of phage RBPs (one RBP per file; if a file holds
        several records, only the LAST record is embedded)
    - data_dir: location of the sequence file(s) to loop over
    - results_dir: directory to store the resulting embeddings in

    Output:
    - None. The embeddings are written to `<results_dir>/protein_embeddings.csv`: one row per file,
        the first column holding the file name and the remaining columns the embedding values
        (columns keep pandas' default integer labels).

    Raises:
    - ValueError: if a fasta file contains no record or only an empty sequence. All files are checked
        before the model is loaded, so a bad input fails fast instead of after a long GPU run.
    """
    file_names = list(rbp_file_names)

    # Phase 1: read all sequences. `rbp_sequence` is reset for every file so that an
    # empty file can never silently inherit the sequence of the previous file (which
    # would store a wrong embedding under this file's name).
    sequences = []
    for name in file_names:
        fasta_path = data_dir+'/'+name
        rbp_sequence = None
        for record in SeqIO.parse(fasta_path, 'fasta'):
            rbp_sequence = str(record.seq)  # last record wins
        if not rbp_sequence:
            raise ValueError('No protein sequence found in fasta file: ' + fasta_path)
        sequences.append(rbp_sequence)

    # Phase 2: embed every sequence and reduce it to one vector per protein.
    embedder = ProtTransBertBFDEmbedder()
    embeddings_list = []
    for rbp_sequence in tqdm(sequences):
        embedding = embedder.embed(rbp_sequence)
        reduced_embedding = embedder.reduce_per_protein(embedding)
        embeddings_list.append(reduced_embedding)

    # Rows of both frames are aligned by position: file name first, embedding after.
    embeddings_df = pd.concat([pd.DataFrame(file_names), pd.DataFrame(embeddings_list)], axis=1)
    embeddings_df.to_csv(results_dir+'/protein_embeddings.csv', index=False)

    return

In [9]:
# Read the table of RBP fasta file names and keep its 'names' column as a plain list.
rbp_names_table = pd.read_csv('../input/klebsiellanames/rbp_file_names.csv')
rbp_files = list(rbp_names_table['names'])

# Embed every listed fasta file; the resulting CSV is written to the working directory.
protein_embeddings(rbp_files, data_dir='../input/klebsiella', results_dir='./')

Some weights of the model checkpoint at /root/.cache/bio_embeddings/prottrans_bert_bfd/model_directory were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1503/1503 [02:57<00:00,  8.45it/s]
