In [2]:
import time
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import torch.nn as nn
import pandas as pd

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
def preprocess_dataset(dataset, tokenizer, text_field, label_field, max_length=512):
    """
    Токенизирует датасет и добавляет поле "labels".
    """
    def tokenize_fn(example):
        tokenized = tokenizer(example[text_field], 
                              padding="max_length", 
                              truncation=True, 
                              max_length=max_length)
        tokenized["labels"] = example[label_field]
        return tokenized
    return dataset.map(tokenize_fn, batched=False)

In [5]:
def evaluate_model(model, dataset, batch_size=8, is_multiclass=False):
    """
    Производит предсказание на всем датасете и вычисляет runtime, accuracy и F1.
    """
    # Приводим датасет к формату torch
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    all_preds = []
    all_labels = []
    start_time = time.time()
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            # Перемещаем данные на нужное устройство
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], 
                            attention_mask=batch["attention_mask"])
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    runtime = time.time() - start_time
    accuracy = accuracy_score(all_labels, all_preds)
    # Для бинарной классификации используем average="binary", для многоклассовой — "macro"
    f1 = f1_score(all_labels, all_preds, average="macro" if is_multiclass else "binary")
    return runtime, accuracy, f1


In [6]:
def evaluate_imdb(model, tokenizer, batch_size=8):
    """
    Загружает и оценивает модель на датасете IMDB (бинарная классификация).
    Поле текста — "text", поле меток — "label".
    """
    print("Оценка на датасете IMDB...")
    dataset = load_dataset("imdb", split="test")
    dataset = preprocess_dataset(dataset, tokenizer, text_field="text", label_field="label")
    runtime, accuracy, f1 = evaluate_model(model, dataset, batch_size=batch_size, is_multiclass=False)
    return runtime, accuracy, f1

In [7]:
tokenizer_imdb = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer_imdb.pad_token = tokenizer_imdb.eos_token
model_imdb = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model_imdb.config.pad_token_id = tokenizer_imdb.pad_token_id
model_imdb.to(device)
imdb_runtime, imdb_accuracy, imdb_f1 = evaluate_imdb(model_imdb, tokenizer_imdb, batch_size=1)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Оценка на датасете IMDB...


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

                                                                 

In [8]:
torch.cuda.empty_cache()

In [9]:
results = {
        "Dataset": ["IMDB"],
        "Runtime (s)": [imdb_runtime],
        "Accuracy": [imdb_accuracy],
        "F1 Score": [imdb_f1]
    }
df = pd.DataFrame(results)
df

Unnamed: 0,Dataset,Runtime (s),Accuracy,F1 Score
0,IMDB,513.216463,0.49984,0.665794
