### Please fill in the following paths before running the notebook.

In [None]:
# Path to the folder with the test dataset, containing "digi.json" and "libertatea.json"
TEST_PATH = ""

# Path to the folder with the training dataset, containing "protv.json", "cancan.json" and "wowbiz.json"
TRAIN_PATH = ""

# Path where the best-accuracy checkpoint is saved
CHECKPOINT_PATH =  ""

# Path to the folder in which the model and other necessary tools are saved.
# File names are appended to it directly (FOLDER_PATH + "trainer"), so it
# must end with a path separator.
FOLDER_PATH = ""

# Path to the folder in which the logs of the trainer are saved
LOGS_PATH = ""

# Path to the folder in which the results of the trainer are saved
RESULTS_PATH = ""

### Imports

In [None]:
! pip install transformers==4.28.0

In [None]:
! pip install datasets

In [None]:
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import json
import pandas as pd
import os
from datasets import load_dataset
from datasets import Dataset

import torch.nn.functional as F
from torch import nn
from transformers import AutoModel

from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from transformers import TrainerCallback, EarlyStoppingCallback

### Read data

In [None]:
def read_file(path, name):
    """Load one JSON news file and convert it to a list of labelled records.

    The file must contain a JSON array of objects with the keys "title",
    "content" and "category".

    Args:
        path: Folder that contains the file.
        name: File name inside ``path``.

    Returns:
        A list of dicts with the keys "title", "content" and "category",
        where "category" is 0 for "nonclickbait" and 1 for any other value
        (i.e. clickbait).
    """
    file_path = os.path.join(path, name)

    # The context manager closes the handle even if parsing fails; the
    # encoding is explicit so the Romanian diacritics do not depend on the
    # platform's default locale.
    with open(file_path, encoding="utf-8") as reader:
        json_array = json.load(reader)

    # Label convention: nonclickbait = 0, clickbait = 1.
    return [
        {
            "title": element["title"],
            "content": element["content"],
            "category": 0 if element["category"] == "nonclickbait" else 1,
        }
        for element in json_array
    ]

In [None]:
def read_raw_data(folder_path):
    """Read every file in ``folder_path`` and return all records in one list.

    Directory entries are processed in sorted name order; each name is
    printed before it is passed to ``read_file``.
    """
    entries = os.listdir(folder_path)
    entries.sort()

    collected = []
    for entry in entries:
        print(entry)
        collected += read_file(folder_path, entry)

    return collected

In [None]:
# Load the raw articles from both folders; read_raw_data prints each file
# name as it is read.
print('Test files:')
test_raw_data  = read_raw_data(TEST_PATH)
print("---------------------")
print('Train files:')
train_raw_data = read_raw_data(TRAIN_PATH)
print("---------------------")

In [None]:
# One row per article, using the "title", "content" and "category" keys
# built by read_file.
df_train = pd.DataFrame(train_raw_data)
df_test = pd.DataFrame(test_raw_data)

In [None]:
# Split the training articles 70% / 30% into train and validation sets.
# NOTE(review): no seed is passed to train_test_split, so the split is
# presumably different on every run — pass seed= if reproducibility matters.
dataset =  Dataset.from_pandas(df_train)
hf_dataset_splits = dataset.train_test_split(test_size=0.3)
train_dataset = hf_dataset_splits['train']
validation_dataset =  hf_dataset_splits['test']

# The held-out test articles come from different source files (TEST_PATH).
test_dataset = Dataset.from_pandas(df_test)

### Model

In [None]:
# Romanian BERT checkpoint, loaded with a 2-class sequence-classification
# head (labels: 0 = nonclickbait, 1 = clickbait).
model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
def preprocess_dataset(dataset, tokenizer):
    """Tokenize a news dataset and expose it as torch tensors.

    Each example is encoded from its "title" and "content" columns, passed
    to the tokenizer as a (title, content) text pair so that the tokenizer
    inserts its own separator between them. If the dataset instead carries
    a single pre-merged "Line" column, that column is tokenized as-is.

    Args:
        dataset: Dataset providing ``map`` and ``set_format``, with a
            "category" column and either "title"/"content" or "Line".
        tokenizer: Tokenizer providing ``batch_encode_plus``.

    Returns:
        The mapped dataset, formatted as torch tensors restricted to the
        "input_ids", "attention_mask" and "labels" columns.
    """
    def _as_text(value):
        # Missing fields (JSON null) become empty strings instead of
        # crashing the tokenizer.
        return "" if value is None else str(value)

    def tokenize_and_merge_title_content(examples):
        if "Line" in examples:
            # Legacy layout: one pre-merged text column.
            texts = [f"{line}" for line in examples["Line"]]
        else:
            # Tuples (not lists) are what the tokenizer treats as text pairs.
            texts = [
                (_as_text(title), _as_text(content))
                for title, content in zip(examples["title"], examples["content"])
            ]

        tokenized = tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
        )
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": examples["category"],
        }

    dataset = dataset.map(tokenize_and_merge_title_content, batched=True)
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels']
    )
    return dataset

In [None]:
# Tokenize the train and validation splits with the shared tokenizer.
tokenized_train_dataset = preprocess_dataset(train_dataset, tokenizer)
tokenized_validation_dataset = preprocess_dataset(validation_dataset, tokenizer)

Map:   0%|          | 0/4764 [00:00<?, ? examples/s]

Map:   0%|          | 0/2042 [00:00<?, ? examples/s]

In [None]:
class BestModelCheckpointCallback(TrainerCallback):
    """Save the model each time validation accuracy improves.

    The callback watches the "eval_accuracy" entry of the logged metrics.
    When the value beats the best one seen so far, the model is saved to
    ``output_dir`` through ``trainer.save_model``. After
    ``early_stopping_patience`` consecutive evaluations without improvement
    it asks the trainer to stop.

    Args:
        trainer: Object whose ``save_model(path)`` writes the checkpoint.
        early_stopping_patience: Number of non-improving evaluations
            tolerated before training is stopped.
        output_dir: Destination of the best checkpoint; ``None`` disables
            saving.
    """

    def __init__(self, trainer, early_stopping_patience, output_dir):
        self.trainer = trainer
        self.early_stopping_patience = early_stopping_patience
        self.output_dir = output_dir
        self.best_val_acc = None  # best "eval_accuracy" seen so far
        self.patience_counter = 0  # evaluations since the last improvement

    def on_log(self, args, state, control, logs=None, **kwargs):
        """React to a log event; only logs carrying "eval_accuracy" matter."""
        # `logs` defaults to None, so guard before looking the metric up.
        val_acc = (logs or {}).get("eval_accuracy")
        if val_acc is None:
            return

        # Higher accuracy is better: checkpoint only on a strict improvement.
        if self.best_val_acc is None or val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.patience_counter = 0
            self.save_best_model_checkpoint()
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.early_stopping_patience:
                control.should_training_stop = True

    def save_best_model_checkpoint(self):
        """Write the current model to ``output_dir`` (no-op when it is None)."""
        if self.output_dir is not None:
            self.trainer.save_model(self.output_dir)

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    ``pred`` must expose ``label_ids`` and ``predictions``; the predicted
    class is the argmax over the last axis of ``predictions``. Precision,
    recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training ends.
# NOTE(review): metric_for_best_model is not set, so "best" presumably
# falls back to the Trainer default rather than accuracy — confirm.
training_args = TrainingArguments(
    output_dir=RESULTS_PATH,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir=LOGS_PATH,
    learning_rate=2e-5,
    save_strategy="epoch",
    load_best_model_at_end=True
)

# The Trainer reports the metrics returned by compute_metrics on the
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Register both the built-in early stopping and the custom best-checkpoint
# callback, then fine-tune.
# NOTE(review): training runs for 3 epochs with one evaluation per epoch,
# so a patience of 3 looks unreachable — confirm the intended values.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
checkpoint_path = CHECKPOINT_PATH

best_model_checkpoint_callback = BestModelCheckpointCallback(early_stopping_patience = 3,
                                                             output_dir = checkpoint_path,
                                                             trainer = trainer)

trainer.add_callback(early_stopping_callback)
trainer.add_callback(best_model_checkpoint_callback)

trainer.train()

In [None]:
# Persist the fine-tuned model. The path is built by plain concatenation,
# so FOLDER_PATH must already end with a path separator.
trainer_path = FOLDER_PATH + "trainer"
trainer.save_model(trainer_path)

In [None]:
# NOTE: this rebinds `trainer` from the Trainer object to the reloaded
# BertForSequenceClassification model; the cells below rely on that when
# they read `trainer.base_model`.
trainer = BertForSequenceClassification.from_pretrained(trainer_path)

In [None]:
def evaluate_custom_classifier(model, dataset, device):
    """Return the classification accuracy of ``model`` on ``dataset``.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this between
    training epochs does not leave dropout disabled for the next epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        dataset: Dataset whose items provide "input_ids", "attention_mask"
            and "labels" tensors.
        device: Device the batches are moved to; the model is expected to
            already live on it.

    Returns:
        Fraction of correctly classified examples, or 0.0 for an empty
        dataset.
    """
    was_training = model.training
    model.eval()

    correct_predictions = 0
    dataloader = DataLoader(dataset, batch_size=4)
    progress_bar = tqdm(dataloader, desc="Evaluate")

    try:
        with torch.no_grad():
            for batch in progress_bar:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs, dim=1)
                correct_predictions += (preds == labels).sum().item()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    total = len(dataset)
    return correct_predictions / total if total else 0.0

In [None]:
def train_custom_classifier(model, train_dataset, val_dataset, optimizer, device, num_epochs):
    """Train ``model`` with cross-entropy and report metrics after each epoch.

    For every epoch the function prints the mean per-example training loss
    and the training accuracy, then the validation accuracy computed by
    ``evaluate_custom_classifier``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        train_dataset: Dataset whose items provide "input_ids",
            "attention_mask" and "labels" tensors.
        val_dataset: Dataset with the same layout, used for validation.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device on which the model and the batches are placed.
        num_epochs: Number of passes over ``train_dataset``.
    """
    model.to(device)

    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    num_samples = len(train_dataset)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Re-enter training mode every epoch: the validation pass at the end
        # of the previous epoch may have left the model in eval mode, which
        # would silently disable dropout.
        model.train()

        total_loss = 0.0
        correct_predictions = 0

        progress_bar = tqdm(train_dataloader, desc="Train")

        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = torch.nn.functional.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            # cross_entropy returns the batch mean, so weight it by the
            # batch size to obtain a true per-example average below.
            total_loss += loss.item() * labels.size(0)

            preds = torch.argmax(outputs, dim=1)
            correct_predictions += (preds == labels).sum().item()

        epoch_loss = total_loss / num_samples if num_samples else 0.0
        accuracy = correct_predictions / num_samples if num_samples else 0.0

        print(f"Train Loss: {epoch_loss:.5f}, Train Accuracy: {accuracy:.5f}")
        val_accuracy = evaluate_custom_classifier(model, val_dataset, device)
        print(f"Validation Accuracy: {val_accuracy:.5f}")

In [None]:
class CustomClassifier(nn.Module):
    """Two-layer classification head on top of a pretrained encoder.

    The encoder output at the first token position is passed through
    dropout (p=0.2), a linear layer of width 128 with ReLU, and a final
    linear layer producing ``num_classes`` scores.

    Args:
        pretrained_model: Encoder exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
        num_classes: Number of output classes.
    """

    def __init__(self, pretrained_model, num_classes):
        super().__init__()
        encoder_width = pretrained_model.config.hidden_size
        self.base_model = pretrained_model
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(encoder_width, 128)
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class scores for a batch."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use only the representation at the first token position.
        first_token = encoded.last_hidden_state[:, 0, :]
        features = torch.relu(self.dense(self.dropout(first_token)))
        return self.classifier(features)

    def predict(self, input_ids, attention_mask):
        """Return the index of the highest-scoring class for each example."""
        scores = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        return torch.argmax(scores, dim=1)

In [None]:
# Build the custom head on top of the fine-tuned encoder.
# `trainer` is the reloaded BertForSequenceClassification model here;
# `.base_model` is presumably its bare BERT encoder without the
# classification head — confirm.
custom_classifier = CustomClassifier(trainer.base_model, 2)

# All parameters are optimized, including the encoder's.
optimizer = torch.optim.AdamW(custom_classifier.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_custom_classifier(custom_classifier, tokenized_train_dataset, tokenized_validation_dataset, optimizer, device, num_epochs=5)

In [None]:
# Save only the weights (state dict). As with the trainer path, FOLDER_PATH
# must end with a path separator because the file name is appended directly.
custom_classifier_path = FOLDER_PATH + "custom_classifier.pth"
torch.save(custom_classifier.state_dict(), custom_classifier_path)

In [None]:
# Rebuild the classifier and restore the saved weights (loaded onto CPU
# first, then moved to the active device).
# NOTE(review): this wraps the same `trainer.base_model` object that
# custom_classifier uses, so the two classifiers share encoder weights.
classifier_model = CustomClassifier(trainer.base_model, 2)
classifier_model.load_state_dict(torch.load(custom_classifier_path, map_location=torch.device('cpu')))
classifier_model.to(device)

In [None]:
# Tokenize the held-out test articles and measure accuracy on them.
# NOTE(review): this evaluates `custom_classifier`, not the reloaded
# `classifier_model`, and `test_accuracy` is never displayed — confirm
# which model is meant to be reported.
tokenized_test_dataset = preprocess_dataset(test_dataset, tokenizer)
test_accuracy = evaluate_custom_classifier(custom_classifier, tokenized_test_dataset, device)

In [None]:
def predict_labels(model, dataset, device):
    """Return the predicted class of every example in ``dataset``.

    The model is put in eval mode and queried through ``model.predict`` in
    batches of 8 with gradients disabled. The result is a flat list with
    one NumPy scalar per example, in dataset order.
    """
    model.eval()
    collected = []
    loader = DataLoader(dataset, batch_size=8)

    with torch.no_grad():
        for batch in loader:
            batch_preds = model.predict(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            # Move to CPU before converting; one entry per example.
            collected += list(batch_preds.cpu().numpy())

    return collected

In [None]:
# Collect per-example predictions and the matching ground-truth labels for
# the test set.
# NOTE(review): as above, predictions come from `custom_classifier` rather
# than the reloaded `classifier_model`.
predictions = predict_labels(custom_classifier, tokenized_test_dataset, device)
true_labels = tokenized_test_dataset["labels"]