In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict
from transformers import (
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate

In [None]:
import os

import gdown
import pandas as pd

# Google Drive locations of the three JSON-lines splits.
train_data_url = "https://drive.google.com/uc?id=1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI6OG"
dev_data_url = "https://drive.google.com/uc?id=1e_G-9a66AryHxBOwGWhriePYCCa4_29e"
test_data_url = "https://drive.google.com/uc?id=1-TN7sfSK1BuYHXlqxHHfwjEIE0JfarPk"

# Local file names get their own variables so that `train_data`, `dev_data`
# and `test_data` only ever refer to DataFrames (they used to be bound to a
# file-name string first and a DataFrame afterwards).
train_data_path = "train_data.jsonl"
dev_data_path = "dev_data.jsonl"
test_data_path = "test_data.jsonl"

for _url, _path in (
    (train_data_url, train_data_path),
    (dev_data_url, dev_data_path),
    (test_data_url, test_data_path),
):
    # Skip files that are already present so re-running the cell is cheap.
    # Delete the local file to force a fresh download.
    if not os.path.exists(_path):
        gdown.download(_url, _path, quiet=False)
    # Check the working directory instead of listing the Colab-only
    # "/content" path: fail right here, with a clear message, when a download
    # did not produce the expected file.
    if not os.path.exists(_path):
        raise FileNotFoundError(f"Download of {_path!r} from {_url} did not produce a file")

# Keep only the two columns the rest of the notebook uses.
train_data = pd.read_json(train_data_path, lines=True)[["text", "label"]]
dev_data = pd.read_json(dev_data_path, lines=True)[["text", "label"]]
test_data = pd.read_json(test_data_path, lines=True)[["text", "label"]]

In [None]:
# Show the (rows, columns) shape of the train, dev and test frames on one line.
print(*(frame.shape for frame in (train_data, dev_data, test_data)))

def label_distribution(df, name):
    """Print the percentage share of each value in ``df["label"]``.

    Args:
        df: DataFrame with a ``label`` column.
        name: Split name used in the printed heading.

    The shares are ordered by label value and rounded to two decimals.
    Nothing is returned; the result is only printed.
    """
    shares = df["label"].value_counts(normalize=True).mul(100)
    shares = shares.sort_index().round(2)
    print(f"\n{name} label distribution (%)")
    print(shares)

# Print the class balance of every split, in train / dev / test order.
for split_name, split_frame in (
    ("Train", train_data),
    ("Dev", dev_data),
    ("Test", test_data),
):
    label_distribution(split_frame, split_name)


from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the training frame for evaluation during training.
# Stratifying on the label keeps the class balance the same in both parts,
# and the fixed random_state makes the split repeatable.
train_df, eval_df = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
    stratify=train_data["label"],
)

# Wrap each frame as a Dataset without carrying the pandas index along.
train_dataset, eval_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, eval_df, dev_data, test_data)
)

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer for the uncased BERT base checkpoint; the classifier loaded later
# in the notebook uses the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-uncased',
)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch for the model.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings,
        truncated to at most 512 tokens.

    Padding is deliberately not applied here. With ``padding=True`` every row
    was padded to the longest text in its ``map`` batch, so training batches
    carried that extra length even though the notebook already builds a
    ``DataCollatorWithPadding`` that pads each training batch on the fly.
    """
    return tokenizer(batch["text"], truncation=True, max_length=512)


# Tokenize every split with `tokenize`. The tuple of source datasets is built
# before any name is rebound, so each split is mapped exactly once. Note that
# the tokenized hold-out split is stored as `val_dataset`, while
# `eval_dataset` keeps the untokenized version.
train_dataset, val_dataset, dev_dataset, test_dataset = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, eval_dataset, dev_dataset, test_dataset)
)

# Label vocabulary for the two-class head; label2id is the inverse of id2label.
id2label = {0: "human", 1: "machine"}
label2id = {label: idx for idx, label in id2label.items()}
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=2,
)

# Expose only these columns from each split, formatted for torch.
model_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, val_dataset, dev_dataset, test_dataset):
    # Pass a fresh list per dataset so the splits do not share one object.
    tokenized_split.set_format("torch", columns=list(model_columns))

from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer; handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


import evaluate

# Accuracy metric object used by the training-time compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        ``{"Accuracy": value}`` where ``value`` is the accuracy reported by
        the module-level ``accuracy`` metric, rounded to three decimals.
    """
    scores, references = eval_pred

    # The predicted class is the index of the highest score in each row.
    hard_labels = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=hard_labels, references=references)

    return {"Accuracy": np.round(result["accuracy"], 3)}

import inspect

from transformers import (TrainingArguments,Trainer, EarlyStoppingCallback)

# Hyperparameters a reader is most likely to tune.
lr = 2e-5
batch_size = 8
num_epochs = 3

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 4 steps before each optimizer update.
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    logging_strategy="steps",
    logging_steps=500,
    # Evaluate and checkpoint once per epoch so the best checkpoint (lowest
    # eval loss) can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword this installation's Trainer accepts so the cell runs
# on both sides of the rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric fails to improve for one evaluation.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision, recall and F1.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array with the
            class scores on its last axis; ``labels`` holds the reference
            class ids.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.

    Note:
        This reuses the name of the accuracy-only ``compute_metrics`` defined
        in the training cell. The ``Trainer`` created there keeps a reference
        to that earlier function; only direct calls made after this cell use
        this version.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # zero_division=0 spells out the value scikit-learn falls back to when a
    # class has no predicted (or no true) samples, so a model that collapses
    # onto a single class yields the same scores without a warning per call.
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average="weighted", zero_division=0)
    recall = recall_score(labels, predictions, average="weighted", zero_division=0)
    f1 = f1_score(labels, predictions, average="weighted", zero_division=0)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Run the trained model over the test split and score its predictions with
# the compute_metrics defined just above.
predictions = trainer.predict(test_dataset)
logits, labels = predictions.predictions, predictions.label_ids

metrics = compute_metrics((logits, labels))
print(metrics)


In [None]:
# Same evaluation as for the test split, this time on the dev split.
predictions = trainer.predict(dev_dataset)
logits, labels = predictions.predictions, predictions.label_ids

metrics = compute_metrics((logits, labels))
print(metrics)