In [13]:
import os

import random
import numpy as np
import pandas as pd

from transformers import AutoTokenizer

# from transformers import XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (
    AutoModelForSequenceClassification,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

import json


import torch
import torch.nn as nn
from torch.optim import AdamW, Adam
from torch.utils.data import Dataset, DataLoader

In [12]:
# Run configuration: RNG seed, output directory for checkpoints, and the
# Hugging Face identifiers of the base model and its tokenizer.
seed = 1
model_checkpoint = "distilbert-base-uncased-full-training-gpt2-data"
model_name = tokenizer_name = "distilbert-base-uncased"

In [4]:
# Seed PyTorch before the model is built: the classification head is not part
# of the pretrained checkpoint and gets randomly initialised here (see the
# warning printed below), so without a seed it differs from run to run.
torch.manual_seed(seed)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Alternative: resume from a previously saved fine-tuning checkpoint instead.
# model = AutoModelForSequenceClassification.from_pretrained(
#     f"{model_checkpoint}/checkpoint-2589", num_labels=2
# )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load the tokenizer identified by `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [6]:
# Display the module tree of the loaded model as the cell output.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [7]:
def freeze_params(model, trainable_prefixes=("classifier",)):
    """Disable gradients for every parameter outside the given name prefixes.

    Parameters whose name (as reported by ``model.named_parameters()``) starts
    with one of ``trainable_prefixes`` are left untouched; all others get
    ``requires_grad = False``. The model is modified in place.

    Args:
        model: object exposing ``named_parameters()``.
        trainable_prefixes: a prefix or iterable of prefixes to keep
            trainable. The default keeps only names starting with
            ``"classifier"``; pass ``("pre_classifier", "classifier")`` to
            also keep a ``pre_classifier`` layer trainable.

    Returns:
        None.
    """
    # A bare string would otherwise be split into single-character prefixes.
    if isinstance(trainable_prefixes, str):
        trainable_prefixes = (trainable_prefixes,)
    # str.startswith accepts a tuple and matches any of its entries.
    keep = tuple(trainable_prefixes)

    for name, param in model.named_parameters():
        if not name.startswith(keep):
            param.requires_grad = False

In [8]:
def seed_everything(seed):
    """Seed the global RNGs used in this notebook with the same value.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds, in order, Python's ``random`` module, NumPy's global generator and
    PyTorch.

    Args:
        seed: value passed unchanged to each seeding function.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

In [9]:
# Use the shared `seed` constant (also passed to TrainingArguments) rather
# than a separate literal, so the two cannot drift apart.
seed_everything(seed)

In [10]:
# Optional: call freeze_params(model) here to stop gradients for every
# parameter whose name does not start with "classifier".
# freeze_params(model)

# Move the model to the GPU when one is available; otherwise leave it as is.
model = model.to("cuda") if torch.cuda.is_available() else model

In [15]:
# Read the three CSV splits stored under data/original.
train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/original/train.csv",
        "data/original/test.csv",
        "data/original/validation.csv",
    )
)

In [16]:
# Integer class ids: 0 marks real text, 1 marks fake text.
LABEL_REAL, LABEL_FAKE = 0, 1


def load_texts(data_file: str) -> list[str]:
    """Read the ``"text"`` field from every record of a JSON Lines file.

    Args:
        data_file: path to a file holding one JSON object per line; each
            object must contain a ``"text"`` key.

    Returns:
        The ``"text"`` values in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``"text"`` key.
    """
    texts = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(data_file, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            texts.append(json.loads(line)["text"])

    return texts


def build_pandas(
    fake: list[str],
    real: list[str],
    label_fake: int = LABEL_FAKE,
    label_real: int = LABEL_REAL,
) -> pd.DataFrame:
    """Combine fake and real texts into one labelled DataFrame.

    Args:
        fake: texts to tag with ``label_fake``.
        real: texts to tag with ``label_real``.
        label_fake: label assigned to every entry of ``fake``.
        label_real: label assigned to every entry of ``real``.

    Returns:
        A DataFrame with columns ``"text"`` and ``"label"``; the rows for
        ``fake`` come first, followed by the rows for ``real``.
    """
    # Plain list concatenation keeps the labels as given. np.concatenate would
    # promote them to float64 whenever one of the two inputs is empty.
    labels = [label_fake] * len(fake) + [label_real] * len(real)
    return pd.DataFrame({"text": [*fake, *real], "label": labels})

In [19]:
# Every JSONL split lives in the same directory; name it once instead of
# repeating it in six literals. NOTE(review): the "ouput" spelling is kept
# as-is because it presumably matches the on-disk directory name — confirm
# before renaming.
gpt2_data_dir = "data/gpt-2-ouput-dataset"

# Generated ("fake") texts.
train_fake = load_texts(os.path.join(gpt2_data_dir, "medium-345M-k40.train.jsonl"))
validation_fake = load_texts(
    os.path.join(gpt2_data_dir, "medium-345M-k40.valid.jsonl")
)
test_fake = load_texts(os.path.join(gpt2_data_dir, "medium-345M-k40.test.jsonl"))

# WebText ("real") texts.
train_real = load_texts(os.path.join(gpt2_data_dir, "webtext.train.jsonl"))
validation_real = load_texts(os.path.join(gpt2_data_dir, "webtext.valid.jsonl"))
test_real = load_texts(os.path.join(gpt2_data_dir, "webtext.test.jsonl"))

# One labelled frame per split (fake rows first, then real rows).
train_gpt = build_pandas(train_fake, train_real)
valid_gpt = build_pandas(validation_fake, validation_real)
test_gpt = build_pandas(test_fake, test_real)

In [20]:
# Integer class id for each value of the "account.type" column; the ids match
# LABEL_REAL (0) and LABEL_FAKE (1) used for the GPT-2 data.
mapping = {"human": 0, "bot": 1}

In [21]:
# Add an integer "label" column to each split by looking up "account.type"
# in `mapping`; an unknown account type raises KeyError.
for frame in (train, test, valid):
    frame["label"] = frame["account.type"].apply(
        lambda account_type: mapping[account_type]
    )

In [24]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenises one DataFrame row per lookup.

    Args:
        data: frame accessed positionally through ``.iloc``; each row must
            expose ``text`` and ``label`` attributes.
        tokenizer: callable applied to the row text.
        max_len: forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenise row ``index`` and return it as a dict.

        Returns:
            A dict with the raw ``text``, the flattened ``input_ids`` and
            ``attention_mask`` produced by the tokenizer, and ``label`` as a
            ``torch.long`` tensor.
        """
        row = self.data.iloc[index]

        # Pad/truncate to a fixed length and ask for PyTorch tensors.
        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_len,
            "return_token_type_ids": False,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(row.text, **tokenizer_options)

        return {
            "text": row.text,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "label": torch.tensor(row.label, dtype=torch.long),
        }

In [25]:
# Per-device batch size and optimizer learning rate used by both runs.
batch_size, lr = 8, 2e-6

In [26]:
# Wrap every split in a TweetDataset with the shared tokenizer and a
# 512-token maximum length.
ds_train, ds_test, ds_valid = (
    TweetDataset(frame, tokenizer=tokenizer, max_len=512)
    for frame in (train, test, valid)
)
ds_train_gpt, ds_test_gpt, ds_valid_gpt = (
    TweetDataset(frame, tokenizer=tokenizer, max_len=512)
    for frame in (train_gpt, test_gpt, valid_gpt)
)

In [32]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: two-element iterable of logits and labels; the predicted
            class is the argmax of the logits over the last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a Python float.
    """
    logits, labels = eval_pred
    is_correct = np.argmax(logits, axis=-1) == labels
    accuracy = is_correct.astype(np.float32).mean()
    return {"accuracy": accuracy.item()}

In [30]:
# Training configuration for the first run; checkpoints are written to the
# `model_checkpoint` directory.
args = TrainingArguments(
    output_dir=model_checkpoint,
    seed=seed,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate and save on the same 2,000-step cadence.
    evaluation_strategy="steps",
    eval_steps=2_000,
    save_strategy="steps",
    save_steps=2_000,
    save_total_limit=2,  # save best and last
    load_best_model_at_end=True,
    # metric_for_best_model=metric_name,
)

In [33]:
# Trainer for the GPT-2 run: trains on ds_train_gpt and evaluates on
# ds_valid_gpt, stopping early after 5 evaluations without improvement.
# NOTE(review): an explicit AdamW instance is supplied, so presumably
# `args.weight_decay` is not what configures it — confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train_gpt,
    eval_dataset=ds_valid_gpt,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # (optimizer, scheduler); None leaves the scheduler choice to the Trainer.
    optimizers=(AdamW(model.parameters(), lr=lr), None),
    # To customise the loss, create a custom Trainer subclass and override
    # compute_loss(self, model, inputs, return_outputs=False).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [34]:
# First training run: train on ds_train_gpt, evaluating on ds_valid_gpt as
# configured in the Trainer above.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.5969,0.422342,0.8182
1000,0.3474,0.324807,0.8674
1500,0.291,0.348252,0.872
2000,0.2509,0.426761,0.8644
2500,0.2554,0.323371,0.9019
3000,0.256,0.396047,0.8869
3500,0.2255,0.462577,0.8759
4000,0.2474,0.466738,0.8772
4500,0.2216,0.224425,0.9392
5000,0.2386,0.410707,0.9015


TrainOutput(global_step=10500, training_loss=0.23855736650739398, metrics={'train_runtime': 4774.5676, 'train_samples_per_second': 104.722, 'train_steps_per_second': 13.09, 'total_flos': 1.1127261487104e+16, 'train_loss': 0.23855736650739398, 'epoch': 0.17})

In [35]:
# Run inference on the GPT-2 test split; the result exposes `.predictions`
# and `.metrics`, both used in the cells below.
test_results = trainer.predict(ds_test_gpt)

In [36]:
# Show the metrics gathered during prediction on the GPT-2 test split.
test_results.metrics

{'test_loss': 0.24656160175800323,
 'test_accuracy': 0.9412999749183655,
 'test_runtime': 101.8383,
 'test_samples_per_second': 98.195,
 'test_steps_per_second': 12.274}

In [37]:
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)


def proba_to_pred(proba):
    """Threshold probabilities at 0.5 into integer class predictions.

    Args:
        proba: array-like supporting ``>`` comparison and ``.astype``.

    Returns:
        An integer array holding 1 where ``proba`` is strictly greater than
        0.5 and 0 elsewhere.
    """
    above_threshold = proba > 0.5
    return above_threshold.astype(int)


def calculate_metrics(y_true, y_pred):
    """Score predictions with four scikit-learn classification metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        A dict with the keys ``"balanced_accuracy"``, ``"f1_score"``,
        ``"precision"`` and ``"recall"``, each computed by the scikit-learn
        function of the matching name.
    """
    scorers = (
        ("balanced_accuracy", balanced_accuracy_score),
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}

In [39]:
# Classification metrics on the GPT-2 test split; argmax over axis 1 turns
# the per-class scores in `predictions` into predicted label ids.
calculate_metrics(test_gpt.label.values, np.argmax(test_results.predictions, axis=1))

{'balanced_accuracy': 0.9413,
 'f1_score': 0.9435956567694821,
 'precision': 0.9080821157758461,
 'recall': 0.982}

In [40]:
# Training configuration for the second run; checkpoints go to a separate
# "-with-twitter" directory. Evaluation and saving happen once per epoch.
args = TrainingArguments(
    output_dir=f"{model_checkpoint}-with-twitter",
    seed=seed,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # save best and last
    load_best_model_at_end=True,
    # metric_for_best_model=metric_name,
)

In [41]:
# Trainer for the second run: continues from the current `model`, training on
# ds_train and evaluating on ds_valid, with an early-stopping patience of 2.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # (optimizer, scheduler); None leaves the scheduler choice to the Trainer.
    optimizers=(AdamW(model.parameters(), lr=lr), None),
    # To customise the loss, create a custom Trainer subclass and override
    # compute_loss(self, model, inputs, return_outputs=False).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [42]:
# Baseline: predict on ds_test before this trainer's train() call, which
# happens in a later cell.
test_results = trainer.predict(ds_test)

In [43]:
# Show the baseline metrics on ds_test.
test_results.metrics

{'test_loss': 2.1782612800598145,
 'test_accuracy': 0.512900710105896,
 'test_runtime': 23.9969,
 'test_samples_per_second': 106.597,
 'test_steps_per_second': 13.335}

In [44]:
# Baseline classification metrics on the `test` frame, using the argmax of
# the predicted scores as the label.
calculate_metrics(test.label.values, np.argmax(test_results.predictions, axis=1))

{'balanced_accuracy': 0.5131742713223787,
 'f1_score': 0.2512019230769231,
 'precision': 0.5442708333333334,
 'recall': 0.16328125}

In [45]:
# Second training run: train on ds_train, evaluating on ds_valid each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3485,0.354791,0.845352
2,0.3177,0.341916,0.858384
3,0.313,0.325295,0.852737
4,0.285,0.336917,0.866203
5,0.2664,0.393071,0.851434


TrainOutput(global_step=12945, training_loss=0.32389511043567737, metrics={'train_runtime': 3226.5319, 'train_samples_per_second': 64.193, 'train_steps_per_second': 8.024, 'total_flos': 1.371832380481536e+16, 'train_loss': 0.32389511043567737, 'epoch': 5.0})

In [46]:
# Predict on ds_test again, now after the second training run.
test_results = trainer.predict(ds_test)

In [47]:
# Show the post-training metrics on ds_test.
test_results.metrics

{'test_loss': 0.3325382471084595,
 'test_accuracy': 0.8530101776123047,
 'test_runtime': 23.9339,
 'test_samples_per_second': 106.878,
 'test_steps_per_second': 13.37}

In [48]:
# Post-training classification metrics on the `test` frame, using the argmax
# of the predicted scores as the label.
calculate_metrics(test.label.values, np.argmax(test_results.predictions, axis=1))

{'balanced_accuracy': 0.8529929577464789,
 'f1_score': 0.8562691131498471,
 'precision': 0.8383233532934131,
 'recall': 0.875}