In [1]:
import torch
import sklearn
from sklearn.model_selection import train_test_split
import datasets
from transformers import AutoTokenizer, GPT2Tokenizer,  GPT2ForSequenceClassification, Trainer, TrainingArguments
import random
import numpy as np
from datasets import load_dataset


In [2]:
# Load the cleaned prompt/essay/label CSV; with a single file and no split name the rows
# end up under the "train" split of the returned DatasetDict.
# Forward slashes are used on purpose: the previous backslash-separated literal contained
# invalid escape sequences (\d, \p, \c) and only resolved on Windows.
dataset = load_dataset('csv', data_files='../data/dataset/processed/clean_data_gpt2.csv')

In [3]:
# Sanity check: show the available splits, their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'essay', 'label'],
        num_rows: 9766
    })
})


In [4]:
from transformers import DistilBertTokenizer

# Tokenizer for the same 'distilbert-base-uncased' checkpoint that the classifier is built from.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize a single dataset row as one prompt-plus-essay sequence.

    Args:
        examples: One row of the dataset (not a batch) with string fields
            "prompt" and "essay".

    Returns:
        The tokenizer output for the text "<prompt>\\n<essay>", padded to the
        tokenizer's maximum length and truncated when it is longer.
    """
    prompt_and_essay = "\n".join((examples["prompt"], examples["essay"]))
    return tokenizer(prompt_and_essay, truncation=True, padding="max_length")


# batched=False: the function is called once per row, receiving plain strings.
tokenized_datasets = dataset.map(tokenize_function, batched=False)

In [5]:
# Rename the target column to "labels" and expose only the model inputs as torch tensors.
# The raw text columns remain part of the dataset; they are just not returned when indexing.
encoded_dataset = (
    tokenized_datasets
    .rename_column("label", "labels")
    .with_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
)

In [6]:
# Shuffle once with a fixed seed and carve all three splits out of that single permutation,
# so the splits are disjoint by construction and the shuffle is not recomputed per split.
shuffled_train = encoded_dataset["train"].shuffle(seed=42)

# Fail early with a readable message if the CSV shrinks below what the fixed split sizes need.
assert len(shuffled_train) > 7000, "dataset too small for the 5000 / 2000 / remainder split"

small_train_dataset = shuffled_train.select(range(5000))
small_eval_dataset = shuffled_train.select(range(5000, 7000))
# The test split runs to the end of the data instead of a hard-coded row count, so rows are
# neither silently dropped nor indexed out of range when the dataset size changes.
small_test_dataset = shuffled_train.select(range(7000, len(shuffled_train)))


In [19]:
# Sanity check: confirm the training split's columns and row count.
print(small_train_dataset)

Dataset({
    features: ['prompt', 'essay', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5000
})


In [7]:
# Number of output classes for the classification head; set this to match your dataset.
num_labels = 12

from transformers import DistilBertConfig, DistilBertForSequenceClassification

# Checkpoint name shared by the configuration and the weights.
checkpoint = 'distilbert-base-uncased'

# Start from the checkpoint's configuration, overriding the head size and both dropout rates.
config = DistilBertConfig.from_pretrained(
    checkpoint,
    attention_dropout=0.3,
    dropout=0.3,
    num_labels=num_labels,
)

# Encoder weights are loaded from the checkpoint; the classifier layers are newly initialised
# (see the warning printed below) and only become useful after fine-tuning.
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the fine-tuning run; outputs are written under ./results.
training_args = TrainingArguments(
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",  # evaluate on the eval split once per epoch
    output_dir="./results",
)

# No compute_metrics callback is passed, so each evaluation reports loss and throughput only.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [13]:
# Fine-tune the model; the eval split is scored at the end of each epoch.
# NOTE(review): in the recorded run eval_loss rises from 4.71 (epoch 1) to 5.77 (epoch 10)
# while the logged training loss falls from 0.50 to 0.13, i.e. the model does not generalise.
# NOTE(review): the very first logged training loss is already ~0.50, and the execution counts
# jump from 7 (model creation) to 12 -- possibly `model` had already been trained by an
# earlier run in the same kernel. TODO confirm by re-creating the model right before training.
trainer.train()


  0%|          | 0/3130 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 4.707424640655518, 'eval_runtime': 10.1945, 'eval_samples_per_second': 196.183, 'eval_steps_per_second': 12.261, 'epoch': 1.0}
{'loss': 0.4999, 'grad_norm': 36.477108001708984, 'learning_rate': 1.6805111821086264e-05, 'epoch': 1.6}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 4.86761999130249, 'eval_runtime': 10.2701, 'eval_samples_per_second': 194.741, 'eval_steps_per_second': 12.171, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 4.901609420776367, 'eval_runtime': 10.2934, 'eval_samples_per_second': 194.299, 'eval_steps_per_second': 12.144, 'epoch': 3.0}
{'loss': 0.4073, 'grad_norm': 39.29037857055664, 'learning_rate': 1.3610223642172523e-05, 'epoch': 3.19}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 4.948614597320557, 'eval_runtime': 10.2945, 'eval_samples_per_second': 194.279, 'eval_steps_per_second': 12.142, 'epoch': 4.0}
{'loss': 0.3119, 'grad_norm': 53.777618408203125, 'learning_rate': 1.0415335463258786e-05, 'epoch': 4.79}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.257485389709473, 'eval_runtime': 10.3058, 'eval_samples_per_second': 194.065, 'eval_steps_per_second': 12.129, 'epoch': 5.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.541043758392334, 'eval_runtime': 10.3173, 'eval_samples_per_second': 193.85, 'eval_steps_per_second': 12.116, 'epoch': 6.0}
{'loss': 0.2358, 'grad_norm': 31.87915802001953, 'learning_rate': 7.220447284345049e-06, 'epoch': 6.39}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.524259567260742, 'eval_runtime': 10.1931, 'eval_samples_per_second': 196.211, 'eval_steps_per_second': 12.263, 'epoch': 7.0}
{'loss': 0.1749, 'grad_norm': 25.299697875976562, 'learning_rate': 4.02555910543131e-06, 'epoch': 7.99}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.483597755432129, 'eval_runtime': 10.1864, 'eval_samples_per_second': 196.341, 'eval_steps_per_second': 12.271, 'epoch': 8.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.670151710510254, 'eval_runtime': 10.2984, 'eval_samples_per_second': 194.205, 'eval_steps_per_second': 12.138, 'epoch': 9.0}
{'loss': 0.1342, 'grad_norm': 4.636297225952148, 'learning_rate': 8.306709265175719e-07, 'epoch': 9.58}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 5.767679691314697, 'eval_runtime': 10.2736, 'eval_samples_per_second': 194.674, 'eval_steps_per_second': 12.167, 'epoch': 10.0}
{'train_runtime': 885.878, 'train_samples_per_second': 56.441, 'train_steps_per_second': 3.533, 'train_loss': 0.2862178232723151, 'epoch': 10.0}


TrainOutput(global_step=3130, training_loss=0.2862178232723151, metrics={'train_runtime': 885.878, 'train_samples_per_second': 56.441, 'train_steps_per_second': 3.533, 'total_flos': 6624551116800000.0, 'train_loss': 0.2862178232723151, 'epoch': 10.0})

In [14]:
from sklearn.metrics import accuracy_score

# Score the held-out test split; the predicted class of each example is the index of its
# largest logit.
predictions = trainer.predict(small_test_dataset)
preds = predictions.predictions.argmax(-1)

# Take the true labels from the prediction output itself: `label_ids` is a NumPy array
# aligned row-for-row with `predictions.predictions`, so scikit-learn receives two NumPy
# arrays rather than a NumPy array plus a torch tensor read back out of the dataset.
labels = predictions.label_ids
accuracy = accuracy_score(labels, preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

  0%|          | 0/173 [00:00<?, ?it/s]

Test Accuracy: 17.25%


In [15]:
from sklearn.metrics import accuracy_score
import numpy as np

# Reuse the test-set predictions computed in the previous cell instead of running a second,
# identical inference pass over the whole test split.
preds = predictions.predictions.argmax(-1)

# True labels as a NumPy array, aligned row-for-row with `preds`.
labels = predictions.label_ids

# A prediction counts as correct when it is at most one class away from the true label.
tolerance_correct = np.abs(preds - labels) <= 1
accuracy_with_tolerance = tolerance_correct.mean()  # fraction of predictions within ±1

print(f"Test Accuracy with Tolerance (±1): {accuracy_with_tolerance * 100:.2f}%")

  0%|          | 0/173 [00:00<?, ?it/s]

Test Accuracy with Tolerance (±1): 41.25%
