# In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "7"
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F
import random 
import pandas as pd
import csv
from glob import glob
from tqdm import tqdm
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model weights and the tokenizer are loaded from the same checkpoint
# name, both with trust_remote_code=True.
checkpoint = "hugohrban/progen2-large"
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Reuse the end-of-sequence token for padding when the tokenizer defines no
# padding token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Folder holding the input CSV files (all of them are assumed to be saved here).
input_folder = "/geniusland/home/wanglijuan/sci_proj/ESM2_opt/AMP_Methods_Compare/perplexity/MCTS/mc"
# Folder that receives one result file per input CSV.
output_folder = "/geniusland/home/wanglijuan/sci_proj/ESM2_opt/AMP_Methods_Compare/perplexity/MCTS/ply"
# Collect every *.csv located directly inside the input folder.
csv_pattern = os.path.join(input_folder, "*.csv")
csv_files = glob(csv_pattern)

# Score every sequence of every input CSV with the language model and write
# one "<input name>_perplexities.csv" file per input into output_folder.

# Create the destination up front so a missing folder is not discovered only
# after a whole file has been scored.
os.makedirs(output_folder, exist_ok=True)

for input_file in csv_files:
    df = pd.read_csv(input_file)
    # Empty cells are read by pandas as NaN floats, which cannot be tokenized;
    # such rows are skipped rather than aborting the whole run.
    sequences = df['Sequence'].dropna().tolist()

    # Rows of [sequence, perplexity] for the current input file.
    results = []
    for sequence in tqdm(sequences):
        input_ids = tokenizer.encode(
            sequence,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=2048,
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            # Passing the inputs as labels makes the model return a loss
            # (presumably the mean next-token cross-entropy - confirm against
            # the checkpoint's model code); its exponential is recorded as
            # the sequence perplexity.
            outputs = model(input_ids, labels=input_ids)
            perplexity = torch.exp(outputs.loss).item()

        results.append([sequence, perplexity])

    # Derive the output name from the input stem; splitext only strips the
    # final extension, unlike str.replace which rewrites every ".csv" match.
    stem = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_folder, f"{stem}_perplexities.csv")

    # The context manager closes the file; no explicit close() is required.
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Sequence", "Perplexity"])
        writer.writerows(results)
    
