In [None]:
import pandas as pd
import numpy as np
import csv
import torch
import random
import nltk
from tqdm import tqdm

from transformers import set_seed, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn.utils import clip_grad_norm_
from sklearn.metrics import classification_report
from ftfy import fix_text

import os
# os.getcwd()

huggingface_cache_dir = 'model'

In [None]:
df = pd.read_excel('actors_training_df_Robbert.xlsx', engine="openpyxl")

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)

In [None]:
df["input_text"] = df["input_text"].apply(fix_text)

In [9]:
# see some examples of input text
df['input_text'].values[:34]
# df["input_text"] = df["input_text"].replace({"√∂": "ö", "√´": "é", "√º": "ü"}, regex=True)

# see some examples of inpyut text where string contains word patienten with e with dots
df[df['input_text'].str.contains('FP')]['input_text'].values[:10]

array(['FPÖ: De heftigste kritiek uit politieke hoek kwam van Herbert Kickl, voorzitter van de rechts-populistische oppositiepartij FPÖ. \nVorige week riep de FPÖ al op tot een demonstratie, morgen.',
       'Herbert Kickl: De heftigste kritiek uit politieke hoek kwam van Herbert Kickl, voorzitter van de rechts-populistische oppositiepartij FPÖ.',
       'FPÖ: Tot de demonstratie was mede opgeroepen door de rechtse politieke partij FPÖ.',
       'AFP: Kluge, die directeur Europa van de WHO is, zegt in een interview met het Franse persbureau AFP dat "wanneer omikron eenmaal verdwijnt er voor heel wat weken en maanden een periode van wijdverbreide immuniteit zal zijn".',
       'Hans Kluge: Dat zegt de Belg Hans Kluge van de Wereldgezondheidsorganisatie (WHO) over de coronapandemie in Europa. \nKluge, die directeur Europa van de WHO is, zegt in een interview met het Franse persbureau AFP dat "wanneer omikron eenmaal verdwijnt er voor heel wat weken en maanden een periode van wijdverbreid

In [None]:
# replace the first : with :\n
df["input_text"] = df["input_text"].str.replace(":", ":\n", n=1)

In [None]:
# read the reliability df
reliability_df = pd.read_csv('NOS/nos_analysis/actor_analysis/coref_resolution/reliability_actors_final_cleaned_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)
reliability_df = reliability_df[reliability_df['coder'] == 'Elif Kilik']

In [None]:
train_df = df[~df['article_id'].isin(reliability_df['article_id'])]
test_df = df[df['article_id'].isin(reliability_df['article_id'])]

In [15]:
train_df.quoted.value_counts()

quoted
1    1326
0     813
Name: count, dtype: int64

In [17]:
# calculate the token size for input_text values
train_df['token_size'] = train_df['input_text'].apply(lambda x: len(x.split()))
train_df['token_size'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['token_size'] = train_df['input_text'].apply(lambda x: len(x.split()))


count    2139.000000
mean       51.587190
std        45.841842
min         5.000000
25%        23.000000
50%        37.000000
75%        64.000000
max       428.000000
Name: token_size, dtype: float64

In [None]:
X = train_df[['input_text']]
y = train_df[['quoted']]


X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1, shuffle=True, random_state=42)

print(X_train.shape, X_val.shape)  # Shapes of the input texts
print(y_train.shape, y_val.shape)    # Shapes of the binary labels

(1925, 1) (214, 1)
(1925, 1) (214, 1)


In [19]:
y_train.quoted.value_counts()

quoted
1    1191
0     734
Name: count, dtype: int64

In [20]:
y_val.head()
y_val.quoted.value_counts()

quoted
1    135
0     79
Name: count, dtype: int64

In [None]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

In [None]:
torch.cuda.empty_cache()

In [None]:
# Set seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Hyperparameters
learning_rates = [1e-5, 1e-6, 5e-6]
accumulation_steps = 2  # This simulates a batch size of 16 by accumulating over 2 steps with batch size 8
best_val_loss = float('inf')
best_model_state = None
patience = 3  # Number of epochs to wait before stopping if no improvement
epochs_to_test = [3, 5]  # Epochs to test

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", cache_dir=huggingface_cache_dir)

# Encode data
def encode(docs):
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, padding='max_length',
                                                return_attention_mask=True, truncation=True, return_tensors='pt')
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

train_input_ids, train_att_masks = encode(X_train['input_text'].tolist())
valid_input_ids, valid_att_masks = encode(X_val['input_text'].tolist())
train_y = torch.LongTensor(y_train.values.squeeze())
valid_y = torch.LongTensor(y_val.values.squeeze())

train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)  

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=8)

save_directory = "your_save_directory"  # Specify your save directory here
os.makedirs(save_directory, exist_ok=True)

# Loop through specified learning rates
for learning_rate in learning_rates:
    print(f"\nTraining with learning rate: {learning_rate}")
    
    # Initialize the model for each learning rate
    model = AutoModelForSequenceClassification.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", num_labels=2,  # Binary classification
                                                               cache_dir=huggingface_cache_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    # Loop through specified epochs
    for epochs in epochs_to_test:
        print(f"\nTraining for {epochs} epochs...")
        patience_counter = 0  # To track epochs without improvement

        for epoch_num in range(epochs):
            print(f"Epoch: {epoch_num + 1}/{epochs}")

            # Training
            model.train()
            train_loss = 0
            for step_num, batch_data in enumerate(tqdm(train_dataloader, desc='Training')):
                input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

                loss = output.loss
                train_loss += loss.item()

                loss = loss / accumulation_steps  # Scale the loss
                loss.backward()  # Backpropagate the loss

                # Gradient accumulation
                if (step_num + 1) % accumulation_steps == 0:
                    clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    optimizer.zero_grad()

            # Average training loss for this epoch
            train_loss /= len(train_dataloader)
            print(f"Train loss: {train_loss:.4f}")

            # Validation
            model.eval()
            valid_loss = 0
            valid_pred = []
            with torch.no_grad():
                for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
                    input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                    output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
                    loss = output.loss
                    valid_loss += loss.item()
                    valid_pred.append(output.logits.cpu().detach().numpy())

            # Average validation loss for this epoch
            valid_loss /= len(valid_dataloader)
            print(f"Validation loss: {valid_loss:.4f}")

            # Early stopping check
            if valid_loss < best_val_loss:
                best_val_loss = valid_loss
                best_model_state = model.state_dict()
                patience_counter = 0  # Reset patience counter if we have improvement
                print("New best model found! Saving...")

                # Save the model components
                torch.save(best_model_state, os.path.join(save_directory, "best_model_state.bin"))
                tokenizer.save_pretrained(save_directory)
                model.config.save_pretrained(save_directory)
                torch.save(optimizer.state_dict(), os.path.join(save_directory, "optimizer_state.bin"))
                print("Model and optimizer states saved.")

            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break  # Stop training if no improvement

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training with learning rate: 1e-05





Training for 3 epochs...
Epoch: 1/3


Training: 100%|██████████| 241/241 [01:19<00:00,  3.04it/s]


Train loss: 0.4474


Validation: 100%|██████████| 27/27 [00:03<00:00,  8.78it/s]


Validation loss: 0.2248
New best model found! Saving...
Model and optimizer states saved.
Epoch: 2/3


Training: 100%|██████████| 241/241 [01:19<00:00,  3.02it/s]


Train loss: 0.2258


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.23it/s]


Validation loss: 0.2318
Epoch: 3/3


Training: 100%|██████████| 241/241 [01:20<00:00,  3.00it/s]


Train loss: 0.1625


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.11it/s]


Validation loss: 0.3120

Training for 5 epochs...
Epoch: 1/5


Training: 100%|██████████| 241/241 [01:19<00:00,  3.02it/s]


Train loss: 0.1185


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.27it/s]


Validation loss: 0.4241
Epoch: 2/5


Training: 100%|██████████| 241/241 [01:22<00:00,  2.92it/s]


Train loss: 0.0961


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.08it/s]


Validation loss: 0.4166
Epoch: 3/5


Training: 100%|██████████| 241/241 [01:20<00:00,  3.01it/s]


Train loss: 0.0451


Validation: 100%|██████████| 27/27 [00:03<00:00,  7.76it/s]


Validation loss: 0.8014
Early stopping triggered.

Training with learning rate: 1e-06


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training for 3 epochs...
Epoch: 1/3


Training: 100%|██████████| 241/241 [01:19<00:00,  3.03it/s]


Train loss: 0.7177


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.18it/s]


Validation loss: 0.5787
Epoch: 2/3


Training: 100%|██████████| 241/241 [01:19<00:00,  3.02it/s]


Train loss: 0.5572


Validation: 100%|██████████| 27/27 [00:03<00:00,  8.89it/s]


Validation loss: 0.4444
Epoch: 3/3


Training: 100%|██████████| 241/241 [01:20<00:00,  2.99it/s]


Train loss: 0.4614


Validation: 100%|██████████| 27/27 [00:02<00:00,  9.93it/s]


Validation loss: 0.3396
Early stopping triggered.

Training for 5 epochs...
Epoch: 1/5


Training: 100%|██████████| 241/241 [01:19<00:00,  3.02it/s]


Train loss: 0.3917


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.24it/s]


Validation loss: 0.3454
Epoch: 2/5


Training: 100%|██████████| 241/241 [01:22<00:00,  2.93it/s]


Train loss: 0.3415


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.17it/s]


Validation loss: 0.3189
Epoch: 3/5


Training: 100%|██████████| 241/241 [01:20<00:00,  2.98it/s]


Train loss: 0.3031


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.17it/s]


Validation loss: 0.2679
Early stopping triggered.

Training with learning rate: 5e-06


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training for 3 epochs...
Epoch: 1/3


Training: 100%|██████████| 241/241 [01:22<00:00,  2.90it/s]


Train loss: 0.5450


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.12it/s]


Validation loss: 0.2995
Epoch: 2/3


Training: 100%|██████████| 241/241 [01:21<00:00,  2.96it/s]


Train loss: 0.3166


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.23it/s]


Validation loss: 0.2624
Epoch: 3/3


Training: 100%|██████████| 241/241 [01:19<00:00,  3.05it/s]


Train loss: 0.2530


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.13it/s]


Validation loss: 0.2361
Early stopping triggered.

Training for 5 epochs...
Epoch: 1/5


Training: 100%|██████████| 241/241 [01:22<00:00,  2.92it/s]


Train loss: 0.2004


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.26it/s]


Validation loss: 0.2471
Epoch: 2/5


Training: 100%|██████████| 241/241 [01:19<00:00,  3.04it/s]


Train loss: 0.1392


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.00it/s]


Validation loss: 0.3327
Epoch: 3/5


Training: 100%|██████████| 241/241 [01:20<00:00,  2.99it/s]


Train loss: 0.1210


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.18it/s]

Validation loss: 0.3585
Early stopping triggered.





In [None]:
# Validation
model.eval()
valid_loss = 0
valid_pred = []

with torch.no_grad():
    for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        
        # During validation, no need to pass labels to the model
        output = model(input_ids=input_ids, attention_mask=att_mask)
        logits = output.logits
        
        # Compute validation loss manually (if needed)
        # loss = criterion(logits, labels)
        valid_loss += loss.item()

        # Store the logits for all validation examples
        valid_pred.append(logits.cpu().detach().numpy())

# Average validation loss
valid_loss /= len(valid_dataloader)
print(f"Validation loss: {valid_loss:.4f}")

# Check if valid_pred has any entries before concatenation
if valid_pred:
    valid_pred = np.concatenate(valid_pred)  # Concatenate predictions from batches

    # Apply softmax to get class probabilities
    valid_pred_softmax = torch.softmax(torch.tensor(valid_pred), dim=1).numpy()

    # Get the class predictions (0 or 1) based on the higher probability
    valid_pred_labels = np.argmax(valid_pred_softmax, axis=1)

    # Flatten ground truth for classification report
    y_true = valid_y.numpy()

    print(classification_report(y_true, valid_pred_labels, 
                                zero_division=0))


Validation: 100%|██████████| 27/27 [00:02<00:00, 10.68it/s]

Validation loss: 0.0170
              precision    recall  f1-score   support

           0       0.96      0.86      0.91        79
           1       0.92      0.98      0.95       135

    accuracy                           0.93       214
   macro avg       0.94      0.92      0.93       214
weighted avg       0.94      0.93      0.93       214






In [None]:
# Ensure reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    
# Load the model architecture
model = AutoModelForSequenceClassification.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", num_labels=2, cache_dir=huggingface_cache_dir)

# Load the saved model state (weights)
model.load_state_dict(torch.load(os.path.join(save_directory, "best_model_state.bin")))

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Load the model configuration
model.config.from_pretrained(save_directory)

# Load the optimizer state (if you plan to resume training)
optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer.load_state_dict(torch.load(os.path.join(save_directory, "optimizer_state.bin")))

# Save the model
model.save_pretrained(save_directory)  # Saves the model architecture and weights

# Save the tokenizer
tokenizer.save_pretrained(save_directory)  # Saves the tokenizer

# Optionally, save the optimizer state if you're resuming training
torch.save(optimizer.state_dict(), os.path.join(save_directory, "optimizer_state.bin"))

# Set model to evaluation mode for inference
model.eval()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(os.path.join(save_directory, "best_model_state.bin")))
  optimizer.load_state_dict(torch.load(os.path.join(save_directory, "optimizer_state.bin")))


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Save the model
model.save_pretrained(save_directory)  # Saves the model architecture and weights

# Save the tokenizer
tokenizer.save_pretrained(save_directory)  # Saves the tokenizer

# Test Results

In [None]:
# Ensure that you have the device set up
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ensure reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

model.to(device)
# Set the model to evaluation mode
model.eval()

# encode input text for the test set
test_input_ids, test_att_masks = encode(test_df['input_text'].tolist())
test_y = torch.LongTensor(test_df['quoted'].values.squeeze())

test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)

test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

# Initialize loss and predictions
test_loss = 0
test_pred = []

# Evaluate on the test set
with torch.no_grad():
    for batch_data in tqdm(test_dataloader, desc='Testing'):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]

        # Get model output
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
        loss = output.loss
        test_loss += loss.item()

        # Store logits for prediction
        test_pred.append(output.logits.cpu().detach().numpy())

# Compute average loss
test_loss /= len(test_dataloader)
print(f"Test loss: {test_loss:.4f}")

# Ensure there are predictions to evaluate
if test_pred:
    test_pred = np.concatenate(test_pred)

    # Apply softmax for class probabilities
    test_pred_softmax = torch.softmax(torch.tensor(test_pred).cpu(), dim=1).numpy()

    # Get predicted labels
    test_pred_labels = np.argmax(test_pred_softmax, axis=1)

    # Get true labels
    y_true = test_y.numpy()

    # Print classification report
    print(classification_report(y_true, test_pred_labels, zero_division=0))

Testing: 100%|██████████| 61/61 [00:06<00:00,  9.96it/s]

Test loss: 0.2689
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       214
           1       0.91      0.92      0.92       271

    accuracy                           0.91       485
   macro avg       0.90      0.90      0.90       485
weighted avg       0.91      0.91      0.91       485






In [None]:
# append the predictions to the df_elif
test_df['predicted'] = test_pred_labels

In [None]:
# save the df_elif
test_df.to_csv('path_to_RobBERT_quote_classifier.csv',
               sep = ';', encoding = 'utf-8', index = False, quoting=csv.QUOTE_NONNUMERIC)