# Protein embeddings on the cloud (Google Colab, Kaggle)

### Installation and libraries

In [None]:
# Install the bio_embeddings package, pinned to version 0.2.2, into the notebook environment.
!pip install bio_embeddings==0.2.2

In [3]:
import os
from datetime import date

import numpy as np
import pandas as pd
from Bio import SeqIO
from bio_embeddings.embed import ProtTransBertBFDEmbedder
from tqdm import tqdm

### Define function and compute embeddings

In Kaggle, upload the FASTA file you want to compute embeddings for using the 'Add data' button in the upper-right corner. The file will then be located at '../input/a_folder_you_designate'. The results can be stored in the output directory, which is simply './'. The computed embeddings can then be downloaded from that folder for later use.

In [11]:
def compute_protein_embeddings(fasta_file, results_dir, prefix=''):
    """
    Compute ProtTransBERT embeddings for the protein sequences in a FASTA file.

    Each record is parsed with BioPython, embedded with the bio-embeddings
    ProtTransBertBFDEmbedder, and reduced to a single per-protein embedding with
    `reduce_per_protein`. The results are written to a CSV file.

    WARNING: running this on a personal laptop without GPU is very slow and not advised. Best to start a Google
    Colab or Kaggle notebook and run the computations in the cloud.

    Input:
    - fasta_file: location of the FASTA file (string) with the sequences you want to compute embeddings for.
    - results_dir: directory to store the resulting embeddings in.
    - prefix: optional string prepended to the output file name, which is '<prefix>_embeddings.csv'.

    Output:
    - None. As a side effect, a .csv file is written to results_dir with one row per sequence:
      an 'ID' column holding the record id, followed by the columns of the reduced embedding.
    """
    embedder = ProtTransBertBFDEmbedder()
    embeddings_list = []
    names_list = []

    # The number of records is not known before parsing, so the progress bar
    # wraps the record iterator directly and reports a running count.
    for record in tqdm(SeqIO.parse(fasta_file, 'fasta'), unit='seq'):
        sequence = str(record.seq)
        embedding = embedder.embed(sequence)
        embeddings_list.append(embedder.reduce_per_protein(embedding))
        names_list.append(record.id)

    embeddings_df = pd.concat(
        [pd.DataFrame({'ID': names_list}), pd.DataFrame(embeddings_list)],
        axis=1,
    )
    # Join the path components so the file lands inside results_dir whether or
    # not the caller included a trailing path separator.
    output_path = os.path.join(results_dir, prefix + '_embeddings.csv')
    embeddings_df.to_csv(output_path, index=False)

    return

In [None]:
# Compute embeddings for the uploaded FASTA file and write the resulting CSV
# to the notebook's working directory ('./').
compute_protein_embeddings('../input/your_folder/your_fasta_file.fasta', results_dir='./')