## Fine-Tuning ProtGPT2

In [None]:
# Fine-tune the distilled ProtGPT2 checkpoint as a causal language model.
# Original model:  https://huggingface.co/nferruz/ProtGPT2
# Distilled model: https://huggingface.co/littleworth/protgpt2-distilled-tiny
# NOTE(review): run_clm.py is presumably the Hugging Face `transformers`
# causal-LM example script and must be present in the working directory - confirm.
# Inputs:  training.txt (train) and validation_M6.txt (evaluation, once per epoch).
# Output:  ./output_tiny/ (overwritten on each run); this is the directory the
#          generation cell loads through `model_path`.
# NOTE(review): with a per-device batch of 64 and 8 gradient-accumulation steps,
# each optimizer step presumably covers 64 * 8 = 512 blocks per device - confirm.
!python run_clm.py --model_name_or_path littleworth/protgpt2-distilled-tiny \
--train_file training.txt \
--validation_file validation_M6.txt \
--tokenizer_name littleworth/protgpt2-distilled-tiny \
--do_train --do_eval \
--eval_strategy epoch \
--output_dir ./output_tiny/ \
--learning_rate 1e-4 \
--overwrite_output_dir \
--per_device_train_batch_size 64 \
--per_device_eval_batch_size 64 \
--block_size 512 \
--max_steps 4000 \
--gradient_accumulation_steps 8

## TPS Sequence Generation

In [None]:
from transformers import pipeline, GPT2LMHeadModel, GPT2TokenizerFast
import torch
import math

# https://huggingface.co/nferruz/ProtGPT2
# https://huggingface.co/nferruz/ProtGPT2/discussions/38

# Number of sequences to generate for each max_length setting.
TOTAL_SEQUENCES = 1000
# Number of sequences requested from the pipeline in a single call.
NUM_SEQ_PER_GENERATION = 50

# Directory containing the fine-tuned checkpoint (model and tokenizer files).
model_path = "./output_tiny/"
# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA instead of failing when the model is moved to the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def calculatePerplexity(sequence, model, tokenizer):
    """Return the perplexity of ``sequence`` under a causal language model.

    The sequence is tokenized and scored in one forward pass, with the token
    ids passed as their own labels; the loss the model returns is then
    exponentiated.

    Args:
        sequence (str): Text to score.
        model: Model called as ``model(input_ids, labels=input_ids)``. Its
            first output must be the scalar loss, and it must expose a
            ``device`` attribute naming where its inputs should live.
        tokenizer: Object whose ``encode(sequence)`` returns the token ids.

    Returns:
        float: ``exp(loss)`` for the sequence.

    Raises:
        ValueError: If the tokenizer produces no tokens for ``sequence``.
    """
    token_ids = tokenizer.encode(sequence)
    if len(token_ids) == 0:
        raise ValueError("cannot compute perplexity of an empty token sequence")
    # Build a batch of one directly on the model's own device, so the function
    # does not rely on a module-level `device` variable matching the model.
    input_ids = torch.tensor([token_ids], device=model.device)
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    # Only the loss (first output) is needed; the logits are ignored.
    loss = outputs[0]
    return math.exp(loss.item())

def sort_protein_sequences(sequences, scores, top_n=1000):
    """
    Rank protein sequences by ascending score and keep the first N.

    Sequences and scores are paired positionally; if the two inputs differ
    in length, the extra items of the longer one are ignored. Pairs with
    equal scores keep their original relative order.

    Args:
        sequences (list): Protein sequences.
        scores (list): Score for each sequence, in the same order.
        top_n (int, optional): Number of leading (lowest-score) pairs to
            return. Defaults to 1000.

    Returns:
        list: ``(sequence, score)`` tuples, lowest score first, at most
        ``top_n`` of them.
    """
    def _score_of(pair):
        return pair[1]

    ranked = [(seq, score) for seq, score in zip(sequences, scores)]
    # In-place stable sort, lowest score first.
    ranked.sort(key=_score_of)
    return ranked[:top_n]

# Fine-tuned ProtGPT2 model, loaded from model_path and moved to the target device
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
# Tokenizer loaded from the same checkpoint directory
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
# Text-generation pipeline used to sample artificial TPS sequences from the
# fine-tuned model. The already-loaded model and tokenizer are passed in
# (rather than model_path) so the checkpoint is not loaded a second time.
protgpt2 = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)

 # integer division
# Number of pipeline calls per max_length setting; integer division, so any
# remainder of TOTAL_SEQUENCES / NUM_SEQ_PER_GENERATION is dropped.
num_iter = TOTAL_SEQUENCES // NUM_SEQ_PER_GENERATION
# generated TPS proteins (each entry ends with a newline)
tps_proteins = []
# perplexity scores, index-aligned with tps_proteins
ppl_scores = []
# Sweep max_length over 14 settings: 70, 80, ..., 200 tokens.
for max_len_var in range(70, 210, 10):
    # The batch index itself is unused; a throwaway name avoids shadowing
    # any other loop variable.
    for _ in range(num_iter):
        # generate sequences given the <|endoftext|> prompt
        sequences = protgpt2(
            "<|endoftext|>",
            max_length=max_len_var,  # length is expressed in tokens, where each token has an average length of 4 amino acids.
            do_sample=True,
            top_k=950,
            repetition_penalty=1.2,
            num_return_sequences=NUM_SEQ_PER_GENERATION,
            pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id
            eos_token_id=0,
            truncation=True)
        # compute perplexity scores for each generated sequence and store them
        for seq in sequences:
            print(seq)
            tmp_seq = seq['generated_text'].rstrip()  # remove trailing whitespace
            tps_proteins.append(tmp_seq + '\n')
            # Score the sequence with an explicit end-of-text marker appended.
            modified_seq_padded = tmp_seq + '<|endoftext|>'
            ppl = calculatePerplexity(modified_seq_padded, model, tokenizer)
            print("Generated ", str(len(tps_proteins)), " sequences")
            ppl_scores.append(ppl)

# Keep the 10% of generated sequences with the lowest perplexity. The cutoff
# is derived from the number actually generated so it stays at 10% if the
# generation settings change.
top_n = len(tps_proteins) // 10
top_sequences = sort_protein_sequences(tps_proteins, ppl_scores, top_n=top_n)

# Split the (sequence, score) pairs into two index-aligned lists;
# scores are rounded to two decimals for output.
output_seq = [sequence for sequence, _ in top_sequences]
output_ppl_scores = [round(score, 2) for _, score in top_sequences]

# Write the selected sequences: trailing whitespace stripped, then one
# newline appended per entry.
output_file_path = "top_tps_sequences.txt"
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(item.rstrip() + '\n' for item in output_seq)

# Write the matching perplexity scores, one per line, in the same order.
output_file_path = "perplexity_scores.txt"
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(str(item) + '\n' for item in output_ppl_scores)