In [2]:
# Install the SentencePiece library using pip
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   - ------------------------------------- 41.0/991.5 kB 653.6 kB/s eta 0:00:02
   ----- ---------------------------------- 133.1/991.5 kB 1.3 MB/s eta 0:00:01
   ------------- -------------------------- 327.7/991.5 kB 2.5 MB/s eta 0:00:01
   -------------- ------------------------- 368.6/991.5 kB 2.5 MB/s eta 0:00:01
   --------------------------- ------------ 686.1/991.5 kB 3.3 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 3.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [3]:
# Update the torch library using pip
!pip install --upgrade torch

Collecting torch
  Downloading torch-2.3.0-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading tbb-2021.12.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Downloading torch-2.3.0-cp311-cp311-win_amd64.whl (159.8 MB)
   ---------------------------------------- 0.0/159.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/159.8 MB 1.7 MB/s eta 0:01:37
   ---------------------------------------- 0.3/159.8 MB 3.2 MB/s eta 0:00:51
   ---------------------------------------- 0.6/159.8 MB 4.5 MB/s eta 0:00:36
   ---------------------------------------- 0.6/159.8 MB 4.5 MB/s eta 0:00:36
   ---------------------------------------- 1.1/159.8 MB 5.5 MB/s 

ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re
import h5py

# Initialize the ProtT5-XL-BFD tokenizer and encoder-only model.
# The checkpoint name is defined once so the tokenizer and the model are
# guaranteed to come from the same checkpoint.
MODEL_NAME = 'Rostlab/prot_t5_xl_bfd'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
model = T5EncoderModel.from_pretrained(MODEL_NAME)

# Set device: first CUDA GPU when available, otherwise CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# The model is only used for inference, so put it in eval mode explicitly
# (disables dropout) after moving it to the selected device.
model = model.to(device).eval()

# Read peptide sequences from Excel file
file_path = 'Final_non_redundant_sequences.xlsx'
df = pd.read_excel(file_path)

print(f"Number of peptides to embed: {len(df)}")

  torch.utils._pytree._register_pytree_node(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Number of peptides to embed: 5479


In [2]:
# Preprocess sequences
def preprocess_sequence(sequence):
    """Prepare one amino-acid string for the tokenizer.

    Every occurrence of the letters U, Z, O or B is replaced by 'X', and the
    resulting characters are returned separated by single spaces.
    """
    cleaned = re.sub(r"[UZOB]", "X", sequence)
    # Joining a string iterates it character by character, so each residue
    # ends up as its own space-delimited token.
    return " ".join(cleaned)

df['processed_sequence'] = df['sequence'].apply(preprocess_sequence)

# Maximum number of tokens the tokenizer may emit per sequence
max_length = 512

# One mean-pooled embedding (numpy array) per input sequence, in DataFrame order
embeddings = []

with torch.no_grad():  # inference only: no gradients needed
    for idx, sequence in enumerate(df['processed_sequence']):
        # The processed sequence is space-separated ("M K T ..."), so it must
        # NOT be sliced by character count: taking the first 512 characters
        # would keep only 256 residues. Truncation is left entirely to the
        # tokenizer, which counts tokens against `max_length`.
        inputs = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
        output = model(**inputs)
        # Mean pooling over the sequence-length axis, moved to CPU as numpy.
        # NOTE(review): the mean covers every token position the tokenizer
        # returned, including any special token it appends -- confirm that
        # this is the intended pooling.
        sequence_embedding = output.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(sequence_embedding)

        print(f"{idx + 1} sequences embedded...", end='\r')  # Print progress

5479 sequences embedded...

In [3]:
# Collapse size-1 axes so each entry becomes a plain per-protein vector
per_protein_embeddings = []
for pooled in embeddings:
    per_protein_embeddings.append(pooled.squeeze())

h5_path = 'prot_t5_xl_bfd_per_protein_embeddings.h5'

# Persist every vector as its own dataset, named by its position in the list
with h5py.File(h5_path, 'w') as h5_out:
    for position, vector in enumerate(per_protein_embeddings):
        h5_out.create_dataset(str(position), data=vector)

# Re-open the file read-only and report how much was written
with h5py.File(h5_path, 'r') as h5_in:
    num_proteins = len(h5_in)
    first_emb_shape = h5_in['0'].shape

    print(f"Number of proteins: {num_proteins}")
    print(f"Shape of the first embedding: {first_emb_shape}")


Number of proteins: 5479
Shape of the first embedding: (1024,)


In [39]:
import pandas as pd
import h5py
import numpy as np

# Read embeddings from .h5 file.
# Datasets are looked up by the string form of their position ("0", "1", ...),
# so rows come back in numeric order rather than in the file's key order.
with h5py.File('prot_t5_xl_bfd_per_protein_embeddings.h5', 'r') as hf:
    embeddings_list = [hf[str(i)][:] for i in range(len(hf))]

# Convert list of embeddings to numpy array (one row per protein)
embeddings_array = np.array(embeddings_list)

# Create DataFrame from embeddings array.
# A dedicated name is used instead of `df` so the peptide table loaded from
# the Excel file is not overwritten and the embedding cell stays re-runnable.
embeddings_df = pd.DataFrame(embeddings_array)

# Add a column for protein identifiers.
# Note: identifiers are 1-based, while the .h5 dataset names are 0-based.
embeddings_df.insert(0, 'Protein_ID', range(1, len(embeddings_df) + 1))

# Write DataFrame to CSV (without the pandas index column)
embeddings_df.to_csv('prot_t5_xl_bfd_per_protein_embeddings.csv', index=False)


In [40]:
import pandas as pd
# Load the exported per-protein embeddings table back from disk
csv_path = "prot_t5_xl_bfd_per_protein_embeddings.csv"
df = pd.read_csv(csv_path)

# Preview the top rows as a sanity check (last expression -> rich display)
df.head()

Unnamed: 0,Protein_ID,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1,0.179155,-0.023847,0.146136,-0.01627,-0.048133,-0.067123,0.013085,-0.13215,-0.150027,...,0.00416,0.047082,-0.011203,-0.158335,-0.052468,0.081036,-0.007796,-0.119145,0.05292,0.070175
1,2,-0.085285,0.095489,0.116315,-0.082521,-0.006077,-0.107044,-0.064592,-0.092938,-0.007258,...,0.077627,0.051403,0.052251,-0.186223,-0.134966,-0.033184,-0.054681,0.13904,-0.102972,0.015033
2,3,-0.079044,0.013451,0.02485,-0.100146,0.004376,-0.168924,-0.084987,-0.105355,0.020152,...,0.160928,0.098804,0.069223,-0.121971,-0.115141,-0.017345,0.030208,0.152145,-0.099492,0.05248
3,4,-0.109819,0.012157,0.08293,-0.187901,0.057573,-0.278307,-0.123186,-0.14172,-0.084983,...,-0.057178,0.217865,-0.019142,-0.184611,-0.121482,0.060979,-0.089398,0.141999,-0.073714,-0.115901
4,5,-0.129034,0.013506,0.113015,-0.119856,0.056415,-0.274226,-0.079424,-0.141707,-0.063319,...,-0.029855,0.184172,0.01326,-0.168951,-0.160576,0.086253,-0.112513,0.065789,-0.130252,-0.129067
