# Protein embeddings on the cloud (Google Colab, Kaggle)

### Installation and libraries

Note: the ERROR reported by pip's dependency resolver can be safely ignored, as long as the output below the error states that all packages were installed successfully.

In [None]:
# Install software for Kaggle.
# Every package is pinned to an exact version so that a fresh session installs the same versions.
# NOTE(review): torchvision and setuptools are presumably pinned for compatibility with
# bio_embeddings 0.2.2 -- confirm before changing any of these versions.
!pip install bio_embeddings==0.2.2
!pip install torchvision==0.10.1
!pip install setuptools==59.5.0

In [2]:
import shutil
from datetime import date

import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
#from distutils.version import LooseVersion
from bio_embeddings.embed import ProtTransBertBFDEmbedder

### Define function and compute embeddings

In Kaggle, upload the phage_genes.csv file you want to compute embeddings for, using the 'Add data' button in the upper-right corner. This file will be located at '../input/a_folder_you_designate'. The results can be stored in the output directory, which is simply './'. The computed embeddings can then be downloaded from that folder for later use.

In [None]:
def compute_protein_embeddings(general_path, data_suffix='', add=False, results_path=None):
    """
    Compute one embedding vector per protein and store them in a CSV file.
    SLOW ON CPU!

    The gene table is read from `<general_path>/phage_genes<data_suffix>.csv` and must
    contain the columns 'gene_ID' and 'gene_sequence'. Every gene sequence is translated
    with Bio.Seq, embedded with ProtTransBertBFDEmbedder and reduced to a single
    per-protein vector. The result is written to
    `<results_path>/phage_protein_embeddings<data_suffix>.csv` with an 'ID' column
    followed by one column per embedding dimension.

    Parameters
    ----------
    general_path : str
        Folder that holds the gene table (and, when `add` is True, the existing
        embeddings file).
    data_suffix : str, default ''
        Suffix appended to the base names of the input and output files.
    add : bool, default False
        If True, read the existing embeddings file from `general_path`, embed only the
        genes whose ID is not in it yet, and append the new rows to the old ones.
    results_path : str or None, default None
        Folder the embeddings file is written to. Defaults to `general_path`; useful
        when the input folder is read-only.

    Returns
    -------
    None
    """
    if results_path is None:
        results_path = general_path
    embeddings_name = '/phage_protein_embeddings' + data_suffix + '.csv'

    genebase = pd.read_csv(general_path + '/phage_genes' + data_suffix + '.csv')

    old_embeddings_df = None
    known_ids = set()
    if add:
        old_embeddings_df = pd.read_csv(general_path + embeddings_name)
        # a set keeps the membership test below O(1) per gene
        known_ids = set(old_embeddings_df['ID'])

    names = []
    sequences = []
    # zip pairs IDs and sequences by position, independent of the frame's index labels
    for gene_id, gene_sequence in zip(genebase['gene_ID'], genebase['gene_sequence']):
        if gene_id in known_ids:
            continue
        names.append(gene_id)
        # [:-1] drops the last translated symbol
        # NOTE(review): assumes every gene ends with a stop codon ('*') -- confirm for new data
        sequences.append(str(Seq(gene_sequence).translate())[:-1])

    embeddings = []
    if sequences:
        # only instantiate the embedder when there is something to embed
        embedder = ProtTransBertBFDEmbedder()
        embeddings = [embedder.reduce_per_protein(embedder.embed(sequence))
                      for sequence in tqdm(sequences)]

    embeddings_df = pd.concat([pd.DataFrame({'ID': names}), pd.DataFrame(embeddings)], axis=1)
    # Embedding columns are labelled 0, 1, ... (ints) here, but come back as '0', '1', ...
    # (strings) when an embeddings file is read from CSV. Use strings throughout so that
    # old and new rows line up column by column in the concat below.
    embeddings_df.columns = [str(column) for column in embeddings_df.columns]

    if add:
        if names:
            embeddings_df = pd.concat([old_embeddings_df, embeddings_df], axis=0, ignore_index=True)
        else:
            # nothing new to add: keep the existing embeddings unchanged
            embeddings_df = old_embeddings_df

    embeddings_df.to_csv(results_path + embeddings_name, index=False)
    return

In [None]:
# compute_protein_embeddings expects a FOLDER: it reads '<folder>/phage_genes<suffix>.csv'
# and writes '<folder>/phage_protein_embeddings<suffix>.csv' to that same folder.
# Copy the uploaded gene table into the working directory './' and run there, so that the
# embeddings file ends up in the output directory instead of the uploaded-data folder.
path_to_genes = '../input/phage_data/phage_genes.csv'
working_dir = '.'
shutil.copy(path_to_genes, working_dir + '/phage_genes.csv')
compute_protein_embeddings(working_dir, data_suffix='', add=False)