<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/notebooks/Perplexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!gdown 1StjZP25ROcexajtVHml-9LkfECkVsb3s

Downloading...
From: https://drive.google.com/uc?id=1StjZP25ROcexajtVHml-9LkfECkVsb3s
To: /content/sample-5000-100.zip
  0% 0.00/216k [00:00<?, ?B/s]100% 216k/216k [00:00<00:00, 60.0MB/s]


In [None]:
!unzip -q sample-5000-100.zip

In [10]:
import os
import pandas as pd
import torch
import math
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from tqdm import tqdm
all_perplexities = []
def calc_perplexity(text, tokenizer, model, device):
        """Tính perplexity cho 1 đoạn text"""
        if not isinstance(text, str) or text.strip() == "":
            return None
        try:
            encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
            input_ids = encodings.input_ids.to(device)
            with torch.no_grad():
                outputs = model(input_ids, labels=input_ids)
                loss = outputs.loss
            return math.exp(loss.item())
        except Exception as e:
            print(f"⚠️ Lỗi khi tính perplexity: {e}, text={text[:50]}...")
            return None

def compute_avg_perplexity(folder_path, model_name="gpt2"):
    """
    Tính perplexity trung bình trên tất cả các file CSV trong folder.
    Các file CSV phải có cột 'text'.
    """
    # Load model & tokenizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

    # Duyệt tất cả CSV trong folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
    for file in tqdm(csv_files, desc="Processing CSV files"):
        file_path = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(file_path)
            if "text" not in df.columns:
                print(f"⚠️ File {file} không có cột 'text', bỏ qua.")
                continue
            # Ép cột text về string và fill NaN
            df["text"] = df["text"].astype(str).fillna("")
            for text in df["text"]:
                ppl = calc_perplexity(text, tokenizer, model, device)
                if ppl is not None:
                    all_perplexities.append(ppl)
        except Exception as e:
            print(f"❌ Lỗi khi xử lý file {file}: {e}")
    if len(all_perplexities) == 0:
      print('bug len 0')
    if not all_perplexities:
        print("⚠️ Không có text hợp lệ để tính perplexity.")
        return None

    return all_perplexities
# ------------------- Example usage -------------------
folder_path = "output_parts"
save_ppl = compute_avg_perplexity(folder_path)


Processing CSV files: 100%|██████████| 50/50 [00:55<00:00,  1.10s/it]

Average Perplexity: [15.941623751183634, 59.00931835885497, 33.929459456691056, 596.2797147468302, 7800.8834087294135, 393.64221363470057, 723.1331500857772, 196.66396774574926, 109.83244889863731, 406.0765663837029, 166.91744543883047, 223.33232050890723, 218.75269328256337, 41.06420290494081, 1096.075349569691, 82.92272461251372, 172.928182302183, 397.14940080163433, 287.7746080255359, 39.829239641474274, 31.22137578582142, 16.73047477972259, 52.296163250306876, 512.0759421425053, 105.14127360650401, 413631.46481062286, 109343.24369686516, 179.25825199393037, 101.58194593238167, 515.4367850461144, 165.37960459831987, 1061.9839379015539, 101.41289266221145, 220.27000047579142, 614.0924652184461, 115.15002547192557, 98.15630950861974, 93.41034381992434, 80.96011785236037, 55.694172942690784, 75.185306968304, 74.4142932495477, 41.12110533666653, 1265.7929906752156, 88.19248465629357, 860.7732357697013, 105.14127360650401, 110.98625622849875, 146.51429676609786, 95.39001054414592, 185.42




In [14]:
import math

all_ppl_notnan = [x for x in all_perplexities if not math.isnan(x)]

In [15]:
sum(all_ppl_notnan)/len(all_ppl_notnan)

2044.6754818523486

In [16]:
max(all_ppl_notnan)

1013257.5463451869

In [17]:
import numpy as np

data = all_ppl_notnan

# Tính percentile 25% và 75%
p25 = np.percentile(data, 25)
p75 = np.percentile(data, 75)

# Lọc các giá trị nằm trong khoảng 25% - 75%
filtered = [x for x in data if p25 <= x <= p75]

print("25th percentile:", p25)
print("75th percentile:", p75)
print("Filtered data:", filtered)


25th percentile: 64.7877877223338
75th percentile: 314.93054412422475
Filtered data: [196.66396774574926, 109.83244889863731, 166.91744543883047, 223.33232050890723, 218.75269328256337, 82.92272461251372, 172.928182302183, 287.7746080255359, 105.14127360650401, 179.25825199393037, 101.58194593238167, 165.37960459831987, 101.41289266221145, 220.27000047579142, 115.15002547192557, 98.15630950861974, 93.41034381992434, 80.96011785236037, 75.185306968304, 74.4142932495477, 88.19248465629357, 105.14127360650401, 110.98625622849875, 146.51429676609786, 95.39001054414592, 185.42217813897497, 83.15753240611039, 155.6593378255558, 289.43040399415275, 193.9975695022154, 74.53172975295831, 124.81870633467578, 186.21732503639305, 91.576459283256, 70.67095768484283, 140.82417829274894, 171.87248478233462, 87.9101281422636, 110.86380853568082, 243.1278639135852, 153.8815817952273, 64.97683233803383, 132.60760489455737, 108.33144318804013, 111.80559330712583, 72.01579784707884, 141.66876441396425, 91

In [18]:
sum(filtered)/len(filtered)

145.94755091975208