# Protein embeddings on the cloud (Google Colab, Kaggle)

### Installation and libraries

Note: pip's dependency-resolver ERROR can be safely ignored, as long as the output below the error states that all packages were installed successfully.

In [None]:
# Install pinned software versions (written for Kaggle).
# Keep these as three separate, sequential commands: the torchvision and
# setuptools pins run after bio_embeddings is installed, so the versions pinned
# here are the ones left in the environment.
!pip install bio_embeddings==0.2.2
# NOTE(review): pin presumably chosen to match the torch version that bio_embeddings installs - confirm.
!pip install torchvision==0.10.1
# NOTE(review): pin presumably avoids a distutils-related incompatibility in newer setuptools - confirm.
!pip install setuptools==59.5.0

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
from datetime import date
#from distutils.version import LooseVersion
from bio_embeddings.embed import ProtTransBertBFDEmbedder

### Define function and compute embeddings

### Instructions for use:

**For Kaggle:**
1. Upload the `phage_genes.csv` file (or `phage_genesinference.csv` if using inference suffix) using the 'Add data' button in the upper-right corner
2. The file will be located at `../input/a_folder_you_designate/phage_genes.csv`
3. Update the `path_to_genes` variable in the cell below to match your file location
4. Set `data_suffix` to match your file (e.g., `'inference'` for `phage_genesinference.csv`)
5. Run all cells - results will be saved to `./phage_protein_embeddings{suffix}.csv`
6. Download the output file from the output directory

**For Google Colab:**
1. Upload the `phage_genes.csv` file using the file upload button
2. Update the `path_to_genes` to point to your uploaded file location
3. Set `data_suffix` appropriately
4. Run all cells and download the output file

In [None]:
def _strip_terminal_stop(protein):
    """Return `protein` without its final '*' (translated stop codon), if it has one."""
    return protein[:-1] if protein.endswith('*') else protein


def compute_protein_embeddings(path_to_genes, data_suffix=''):
    """
    Compute one ProtTransBertBFD embedding per gene and save them to a CSV file.
    -> SLOW ON CPU! Use a GPU-enabled runtime if one is available.

    Every DNA sequence in the 'gene_sequence' column is translated to protein, a
    trailing stop-codon symbol ('*') is removed when present, and the protein is
    embedded and reduced to a single per-protein vector.

    INPUTS:
    - path_to_genes: full path to the phage_genes CSV file (e.g., '../input/phage_data/phage_genes.csv').
      The file must contain the columns 'gene_ID' and 'gene_sequence' (DNA).
    - data_suffix: suffix appended verbatim to the output filename
      (e.g., 'inference' for phage_protein_embeddingsinference.csv)

    OUTPUT:
    - None. Writes './phage_protein_embeddings<data_suffix>.csv' to the current working
      directory: an 'ID' column followed by one column per embedding dimension.

    RAISES:
    - KeyError if the input CSV lacks the 'gene_ID' or 'gene_sequence' column.
    """
    print(f'Loading genes from: {path_to_genes}')
    genebase = pd.read_csv(path_to_genes)
    print(f'Number of genes to process: {len(genebase)}')

    # Validate the input before loading the model, so a wrong file fails immediately
    # instead of after the (slow) embedder initialization.
    missing_columns = [col for col in ('gene_ID', 'gene_sequence') if col not in genebase.columns]
    if missing_columns:
        raise KeyError(f'{path_to_genes} is missing required column(s): {missing_columns}')

    names = list(genebase['gene_ID'])
    print('Translating DNA sequences to protein sequences...')
    # Only drop the last character when it really is the stop symbol; an unconditional
    # [:-1] would cut a genuine amino acid off genes that lack a terminal stop codon.
    sequences = [_strip_terminal_stop(str(Seq(sequence).translate())) for sequence in genebase['gene_sequence']]

    print('Initializing ProtTransBertBFDEmbedder...')
    embedder = ProtTransBertBFDEmbedder()
    print('Embedder initialized')

    print('Computing embeddings (this may take a while)...')
    embeddings = [embedder.reduce_per_protein(embedder.embed(sequence)) for sequence in tqdm(sequences, desc='Embedding proteins')]

    print('Creating embeddings dataframe...')
    # Both frames carry a fresh 0..n-1 index, so the column-wise concat pairs each ID
    # with its own embedding row.
    embeddings_df = pd.concat([pd.DataFrame({'ID': names}), pd.DataFrame(embeddings)], axis=1)

    output_file = './phage_protein_embeddings' + data_suffix + '.csv'
    print(f'Saving embeddings to: {output_file}')
    embeddings_df.to_csv(output_file, index=False)
    print('Done!')
    return

In [None]:
# --- Configuration: adjust both values to your environment before running ---
# Kaggle: the file lives under '../input/your_folder/', e.g. '../input/your_folder/phage_genesinference.csv'
# Colab:  typically '/content/phage_genesinference.csv', or wherever you uploaded the file
path_to_genes = '../input/phage_data/phage_genesinference.csv'  # Update this path!
# Appended verbatim to the output filename; set to '' when using phage_genes.csv without suffix
data_suffix = 'inference'

# Run the embedding pipeline with the settings above
compute_protein_embeddings(path_to_genes, data_suffix)