In [38]:
!pip install esm
!pip install biopython
!pip install py3Dmol



In [39]:
# Import libraries
# Standard libraries
import pandas as pd
import numpy as np

from Bio import SeqIO
import matplotlib.pyplot as plt

# ML libraries
import torch
from huggingface_hub import login

# ESMC and batching libraries
import esm
from esm.sdk.api import (
    ESM3InferenceClient, 
    ESMProtein, 
    GenerationConfig, 
    ESMProteinError, 
    ProteinType
)
from concurrent.futures import ThreadPoolExecutor
from typing import Sequence

In [40]:
## Connect to the hosted ESM3 model on Forge
from getpass import getpass

# Ask for the Forge API token interactively instead of hard-coding it.
token = getpass("Token from Forge console: ")

from esm.sdk import client

# Remote inference client; every later cell sends its requests through `model`.
model: ESM3InferenceClient = client(
    model="esm3-medium-2024-08",
    url="https://forge.evolutionaryscale.ai",
    token=token,
)

In [41]:
## Read all sequences from a FASTA file into a DataFrame
def read_sequences(
    fasta_path: str) -> pd.DataFrame:
    """Load every record of a FASTA file into a DataFrame.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file; handed to ``SeqIO.parse`` with the
        ``"fasta"`` format.

    Returns
    -------
    pd.DataFrame
        One row per record, in file order, with the columns
        ``description`` (``record.description``) and ``sequence``
        (``str(record.seq)``). A file without records yields an empty
        frame that still has both columns.
    """
    # Gather all rows first and build the frame once: concatenating onto a
    # growing DataFrame inside the loop re-copies every earlier row on each
    # iteration (quadratic) and concatenates with an empty frame.
    records = [
        (record.description, str(record.seq))
        for record in SeqIO.parse(fasta_path, "fasta")
    ]
    return pd.DataFrame(records, columns=["description", "sequence"])

In [42]:
## Predict the structure of a single input sequence
def structure_sequence(
    model: ESM3InferenceClient, sequence: str):
    """Run structure generation for one sequence string.

    The sequence is wrapped in an ``ESMProtein`` and passed to
    ``model.generate`` together with a ``GenerationConfig`` for the
    ``'structure'`` track (16 steps, temperature 0.2).

    Parameters
    ----------
    model : ESM3InferenceClient
        Client whose ``generate`` method performs the prediction.
    sequence : str
        Sequence used to build the ``ESMProtein`` query.

    Returns
    -------
    Whatever ``model.generate`` returns for this query.
    """
    query = ESMProtein(sequence=sequence)
    structure_config = GenerationConfig(
        track='structure',
        num_steps=16,
        temperature=0.2,
    )
    return model.generate(query, structure_config)

## Predict structures for a batch of sequences using a thread pool
def batch_structure(
    model: ESM3InferenceClient, inputs: Sequence[str], max_workers=None):
    """Predict structures for several sequences concurrently.

    Each element of ``inputs`` is submitted to ``structure_sequence`` on a
    ``ThreadPoolExecutor``; results are collected in submission order, so
    ``results[i]`` corresponds to ``inputs[i]``.

    Parameters
    ----------
    model : ESM3InferenceClient
        Client forwarded to every ``structure_sequence`` call.
    inputs : Sequence[str]
        Sequence strings to predict structures for. (They are passed as the
        ``sequence: str`` argument of ``structure_sequence``.)
    max_workers : int or None, optional
        Upper bound on the number of worker threads. ``None`` (the default)
        keeps ``ThreadPoolExecutor``'s own default pool size.

    Returns
    -------
    list
        One entry per input: the value returned by ``structure_sequence``,
        or an ``ESMProteinError(500, <message>)`` if that call raised.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(structure_sequence, model, sequence)
            for sequence in inputs
        ]
        results = []
        # Iterate in submission order (not completion order) so outputs stay
        # aligned with inputs.
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                # Best effort: record the failure in place of the result so a
                # single bad sequence does not abort the whole batch.
                results.append(ESMProteinError(500, str(e)))
    return results

In [43]:
## Run the pipeline
# Location of the input FASTA file
fasta_path = '/home/azureuser/cloudfiles/code/Users/jc62/projects/direct_sequence_analysis/data/embeddings/ncbi_search/ads_core_gene_sequences_10/ads_test_human_dozen.fasta'

# Parse the FASTA records into a description/sequence table
fasta_df = read_sequences(fasta_path=fasta_path)

# Send every sequence through structure prediction
outputs = batch_structure(model=model, inputs=fasta_df["sequence"].to_list())

In [44]:
## Write the list of prediction results to disk
torch.save(
    obj=outputs,
    f='/home/azureuser/cloudfiles/code/Users/jc62/projects/direct_sequence_analysis/data/structures/ads_test_human_dozen_structure_tensors.pt',
)

In [None]:
## Optional: to export a predicted structure to PDB format, uncomment the line below
## outputs[0].to_pdb("/home/azureuser/cloudfiles/code/Users/jc62/projects/direct_sequence_analysis/data/structures/ads_hCG.pdb")

