# Protein embeddings on the cloud (Google Colab, Kaggle)

### Installation and libraries

Note: pip's dependency-resolver ERROR can be safely ignored, as long as the output below the error states that all packages were installed successfully.

In [None]:
# Install software for Kaggle
# Both packages are pinned to exact versions (bio_embeddings 0.2.2, torchvision 0.10.1).
# NOTE(review): the torchvision pin presumably matches the torch version that
# bio_embeddings 0.2.2 pulls in — confirm before changing either pin.
!pip install bio_embeddings==0.2.2
!pip install torchvision==0.10.1

In [None]:
# Install software for Google Colab
# The first two commands redirect pip's standard output to /dev/null, so their
# normal progress/success messages are not shown in the cell output.
# NOTE(review): bio_embeddings[all] is installed unpinned here (latest release),
# unlike the Kaggle cell which pins 0.2.2 — confirm this is intended.
!pip3 install -U pip > /dev/null
!pip3 install -U bio_embeddings[all] > /dev/null
# scikit_learn and pyyaml are pinned to exact versions after the main install.
# NOTE(review): presumably these pins work around incompatibilities with the
# versions pulled in above — confirm.
!pip install scikit_learn==1.0.2
!pip install pyyaml==5.4.1

In [3]:
import os
from datetime import date

import numpy as np
import pandas as pd
from Bio import SeqIO
from bio_embeddings.embed import ProtTransBertBFDEmbedder
from tqdm import tqdm

### Define function and compute embeddings

In Kaggle, upload the FASTA file you want to compute embeddings for using the 'Add data' button in the upper-right corner. The file will then be located at '../input/a_folder_you_designate'. The results can be stored in the output directory, which is simply './'. The computed embeddings can then be downloaded from this folder for later use.

In [11]:
def compute_protein_embeddings(fasta_file, results_dir, prefix=''):
    """
    Compute ProtTransBERT embeddings for the protein sequences in a FASTA file and save them as CSV.

    It uses the bio-embeddings package together with BioPython: every sequence is embedded with
    ProtTransBertBFDEmbedder and reduced to a single per-protein vector.

    WARNING: running this on a personal laptop without GPU is very slow and not advised. Best to start a Google
    Colab or Kaggle notebook and run the computations in the cloud.

    Input:
    - fasta_file: location of the FASTA file (string) with the sequences you want to compute embeddings for.
    - results_dir: directory to store the resulting embeddings in. A trailing path separator is optional;
        the directory is created if it does not exist yet.
    - prefix: optional string prepended to the output file name (default: '').

    Output:
    - Nothing is returned. A .csv file named '<prefix>_embeddings.csv' is written to results_dir, with one row
        per protein: an 'ID' column holding the record id, followed by the embedding values.
    """
    # Read the FASTA file once so that IDs and sequences are guaranteed to stay aligned.
    # Parsing happens before the model is loaded so that a wrong input path fails fast.
    records = [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_file, 'fasta')]
    names = [name for name, _ in records]
    sequences = [sequence for _, sequence in records]

    # Resolve (and create) the output location up front: embedding can take a long time on large
    # inputs, and a missing directory should not be discovered only when the result is written.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    # os.path.join inserts the separator only when needed, so both './' and '.' work as results_dir.
    output_path = os.path.join(results_dir, prefix + '_embeddings.csv')

    embedder = ProtTransBertBFDEmbedder()
    # embed() yields per-residue embeddings; reduce_per_protein() collapses them to one vector per protein.
    embeddings = [embedder.reduce_per_protein(embedder.embed(sequence)) for sequence in tqdm(sequences)]
    embeddings_df = pd.concat([pd.DataFrame({'ID': names}), pd.DataFrame(embeddings)], axis=1)
    embeddings_df.to_csv(output_path, index=False)

    return

In [None]:
# Embed every sequence in the uploaded FASTA file and write the CSV to the working directory.
compute_protein_embeddings(
    fasta_file='../input/your_folder/your_fasta_file.fasta',
    results_dir='./',
)