In [None]:
import os
import random

import numpy as np
import pandas as pd

import torch
import transformers

In [7]:
# Token budget per example; used when loading the tokenizer and when encoding text.
MAX_LENGTH = 2048
# Single seed reused for every random number generator in this notebook.
SEED = 42

# Expose only the GPU with index 1 to CUDA. This has to be set before CUDA is
# first initialised in this process to have any effect.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [8]:
# Seed the Python, NumPy and PyTorch (CPU) generators with the same value, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Seed the generator of every visible GPU as well when CUDA can be used.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [9]:
# Checkpoint shared by the model and its tokenizer.
MODEL_NAME = "allenai/longformer-base-4096"

# Loads the pretrained encoder; the classification head is newly initialised
# (see the warning printed below) and must be trained before use.
model = transformers.LongformerForSequenceClassification.from_pretrained(MODEL_NAME)

# `model_max_length` is the tokenizer-level setting for the maximum input length.
# (`max_length` is a per-call encoding argument and is passed explicitly in
# `tokenize_function`, so it does not belong in the constructor.)
tokenizer = transformers.LongformerTokenizerFast.from_pretrained(
    MODEL_NAME, model_max_length=MAX_LENGTH
)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, seed=None):
    """Randomly partition the rows of ``df`` into train, validation and test frames.

    Args:
        df: DataFrame to split. Any index is accepted; rows are selected by position.
        train_percent: Fraction of rows (0..1) assigned to the training frame.
        validate_percent: Fraction of rows (0..1) assigned to the validation frame.
            Whatever remains after train and validation becomes the test frame.
        seed: If given, the shuffle uses a private generator seeded with this value
            and NumPy's global random state is left untouched. If ``None``, the
            shuffle draws from NumPy's global random state *without reseeding it*,
            so a preceding ``np.random.seed(...)`` makes the split reproducible.

    Returns:
        Tuple ``(train, validate, test)`` of DataFrames. Rows keep their original
        index labels and appear in shuffled order.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if (
        train_percent < 0
        or validate_percent < 0
        or train_percent + validate_percent > 1 + 1e-9  # tolerance for float round-off
    ):
        raise ValueError(
            "train_percent and validate_percent must be non-negative and sum to at most 1"
        )

    # Never call np.random.seed(None): that reseeds the global generator from OS
    # entropy and silently discards any seeding done earlier in the notebook.
    rng = np.random if seed is None else np.random.RandomState(seed)

    m = len(df)
    # Permute positions rather than index labels, because the slices below use .iloc.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test


# Load the corpus and keep only the rows belonging to the two selected datasets.
df = pd.read_csv("../pefk.csv")
df = df[df.dataset.isin(["wikidisputes", "wikitactics"])]
# Renumber rows 0..n-1 after filtering (the previous index is kept as an "index" column).
df = df.reset_index()
# The classifier target must be an integer label.
df["is_moderator"] = df["is_moderator"].astype(int)

# Pass the notebook seed explicitly: with seed=None the helper reseeds NumPy from
# OS entropy, so the 70/15/15 split would differ on every run.
# NOTE(review): the split is per row, so comments from one conversation can end up
# in different splits - confirm this is intended before comparing scores.
train_df, val_df, test_df = train_validate_test_split(
    df, train_percent=0.7, validate_percent=0.15, seed=SEED
)

In [11]:
def build_contextual_dataset(df, context_window=3, sort_index=True):
    """Turn a frame of comments into (text, label) pairs with preceding-turn context.

    For every row, the input text is the row's own comment preceded by up to
    ``context_window`` earlier comments of the same conversation, each rendered
    as ``"<TURN> User {user} posted: {text}"`` and joined with single spaces.

    Args:
        df: DataFrame with the columns ``conv_id``, ``user``, ``text`` and
            ``is_moderator``.
        context_window: Maximum number of preceding comments to prepend.
        sort_index: When True (default), the rows of each conversation are ordered
            by index label before the context is built. This matters for frames
            whose rows were shuffled (e.g. by a random split): without it the
            "previous" comments are whatever rows happen to precede the current
            one in the frame. Pass False to use the frame's row order as-is.
            Assumes ascending index order is the conversation order - TODO confirm
            against how the source file is laid out.

    Returns:
        Tuple ``(inputs, outputs)``: a list of input strings and the parallel list
        of ``is_moderator`` labels, grouped by ``conv_id`` in sorted key order.

    Note:
        Only rows present in ``df`` can serve as context; comments of the same
        conversation that are missing from ``df`` are skipped silently.
    """
    inputs = []
    outputs = []

    # Group by conversation
    for _, group in df.groupby("conv_id"):
        if sort_index:
            # Stable sort keeps the relative order of rows with equal index labels.
            group = group.sort_index(kind="stable")

        # Render every turn once; each one is reused as context for later turns.
        turns = [
            f"<TURN> User {user} posted: {text}"
            for user, text in zip(group["user"].tolist(), group["text"].tolist())
        ]
        labels = group["is_moderator"].tolist()

        for i, label in enumerate(labels):
            # Up to `context_window` previous turns, followed by the current one.
            start = max(0, i - context_window)
            context = turns[start:i]
            inputs.append(" ".join(context + [turns[i]]))
            outputs.append(label)

    return inputs, outputs


# Build (texts, labels) for each split, in train/val/test order, with two turns of context.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_contextual_dataset(split_df, context_window=2)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
from datasets import Dataset


def torch_dataset(x, y):
    """Wrap texts ``x`` and labels ``y`` in a tokenised Hugging Face ``Dataset``.

    The texts are encoded in batches with ``tokenize_function``; the returned
    dataset yields the ``input_ids``, ``attention_mask`` and ``label`` columns
    as torch tensors.
    """
    tensor_columns = ["input_ids", "attention_mask", "label"]
    encoded = Dataset.from_dict({"text": x, "label": y}).map(
        tokenize_function, batched=True
    )
    encoded.set_format(type="torch", columns=tensor_columns)
    return encoded


def tokenize_function(example):
    """Encode ``example["text"]`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,  # For Longformer models
    }
    return tokenizer(example["text"], **encode_options)


# Tokenise each split (train, then validation, then test) into a Hugging Face dataset.
train_dataset, val_dataset, test_dataset = (
    torch_dataset(texts, labels)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

Map:   0%|          | 0/72356 [00:00<?, ? examples/s]

Map:   0%|          | 0/15505 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Number of batches whose gradients are accumulated into one optimizer update.
GRAD_ACC_STEPS = 2
# Evaluation / checkpoint / logging interval, expressed in batches.
EVAL_STEPS = 200

# The Trainer counts these intervals in optimizer updates, so convert from batches.
# Integer division keeps the value an int: true division yields a float, and
# TrainingArguments rejects step counts above 1 that are not whole numbers,
# which happens whenever EVAL_STEPS is not a multiple of GRAD_ACC_STEPS.
OPTIMIZER_STEPS_PER_EVAL = max(1, EVAL_STEPS // GRAD_ACC_STEPS)

training_args = TrainingArguments(
    output_dir="../results/training",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    # Evaluate, save and log on the same schedule; load_best_model_at_end needs
    # the save interval to line up with the evaluation interval.
    eval_strategy="steps",
    eval_steps=OPTIMIZER_STEPS_PER_EVAL,
    save_strategy="steps",
    save_steps=OPTIMIZER_STEPS_PER_EVAL,
    logging_strategy="steps",
    logging_dir="../logs",
    logging_steps=OPTIMIZER_STEPS_PER_EVAL,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    # Tie the Trainer's own seeding to the notebook-wide seed.
    seed=SEED,
)


def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the argmax of its logits along axis 1.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)  # numpy operation
    metrics = {}
    metrics["accuracy"] = accuracy_score(labels, predicted)
    metrics["f1"] = f1_score(labels, predicted)
    return metrics


# Wire the model, training arguments, train/validation datasets and the metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# Fine-tune; evaluation on the validation set follows the schedule in `training_args`.
trainer.train()

# Score the test split once training has finished and show the metrics.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

# `training_args` sets load_best_model_at_end=True, so the model held by the trainer
# at this point is presumably the best checkpoint by F1 - verify against the run logs.
# Both artifacts go to ./best_model, relative to the current working directory.
trainer.save_model("./best_model")  # Save the best model
tokenizer.save_pretrained("./best_model")  # Also save the tokenizer

Initializing global attention on CLS token...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 