In [None]:
import pandas as pd
import numpy as np
import os
import csv
import torch
import random
from tqdm import tqdm

from transformers import set_seed, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv('path_to_actors_training_data_created_in_step3', sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding = 'utf-8') # csv
# change article_id to integer
df['article_id'] = df['article_id'].astype(int)
# select only quoted actors
df = df[df['quoted'] == 1]

In [None]:
df['input_text'].values[:5] # check input text created in step 3

array(['Buitenlandse Zaken:\n Het ministerie van Buitenlandse Zaken zegt dat de ambassade in Peking de situatie op de voet volgt.',
       'Wereldgezondheidsorganisatie:\n Het ministerie adviseert reizigers de adviezen van het Landelijk Coordinatiecentrum Reizigersadvisering (LCR) en de Wereldgezondheidsorganisatie (WHO) in de gaten te houden. \nDe Wereldgezondheidsorganisatie beschouwt de uitbraak van het virus als een Chinese aangelegenheid.',
       'Bruins:\n Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. \nBruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. \nBruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer. \nBruins is niet van plan om evenementen en attracties te sluiten die veel Chinese toeristen trekken, zoals de Keukenhof.',
       'GroenLinks-Kamerlid Ellemeet:\n Bruins reageerde op vragen van o

In [None]:
# read the annotations df with researcher codings
reliability_df = pd.read_csv('reliability_actors_final_cleaned_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)
reliability_df = reliability_df[reliability_df['coder'] == 'Elif Kilik']
train_df = df[~df['article_id'].isin(reliability_df['article_id'])]
test_df = df[df['article_id'].isin(reliability_df['article_id'])]

In [56]:
train_df.talks_covid_measures.value_counts()

talks_covid_measures
0.0    752
1.0    477
Name: count, dtype: int64

In [57]:
test_df.talks_covid_measures.value_counts()

talks_covid_measures
0.0    138
1.0    102
Name: count, dtype: int64

In [None]:
X = train_df[['input_text']]
y = train_df[['talks_covid_measures']]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)

print(X_train.shape, X_val.shape)  # Shapes of the input texts
print(y_train.shape, y_val.shape)    # Shapes of the binary labels

(983, 1) (246, 1)
(983, 1) (246, 1)


# Fine-tune RobBERT Dutch

In [60]:
# get the nr of 1 labels in y_train and y_val
y_train.talks_covid_measures.value_counts()

talks_covid_measures
0.0    601
1.0    382
Name: count, dtype: int64

In [61]:
y_val.talks_covid_measures.value_counts()

talks_covid_measures
0.0    151
1.0     95
Name: count, dtype: int64

In [62]:
set_seed(42)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

In [63]:
torch.cuda.empty_cache()

In [None]:
# Define the path to save the best model
save_directory = "path_to_save_best_model"  # Replace with your desired path
# Create the directory if it does not exist
os.makedirs(save_directory, exist_ok=True)

In [None]:
# Set seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Hyperparameters
learning_rates = [1e-5, 2e-5, 5e-5]
accumulation_steps = 2  # This simulates a batch size of 16 by accumulating over 2 steps with batch size 8
best_val_loss = float('inf')
best_model_state = None
patience = 3  # Number of epochs to wait before stopping if no improvement
epochs_to_test = [3, 5]  # Epochs to test

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base")

# Encode data
def encode(docs):
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, padding='max_length',
                                                return_attention_mask=True, truncation=True, return_tensors='pt')
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

train_input_ids, train_att_masks = encode(X_train['input_text'].tolist())
valid_input_ids, valid_att_masks = encode(X_val['input_text'].tolist())
train_y = torch.LongTensor(y_train.values.squeeze())
valid_y = torch.LongTensor(y_val.values.squeeze())

train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)  

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=8)

# Loop through specified learning rates
for learning_rate in learning_rates:
    print(f"\nTraining with learning rate: {learning_rate}")
    
    # Initialize the model for each learning rate
    model = AutoModelForSequenceClassification.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", num_labels=2)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    # Loop through specified epochs
    for epochs in epochs_to_test:
        print(f"\nTraining for {epochs} epochs...")
        patience_counter = 0  # To track epochs without improvement

        for epoch_num in range(epochs):
            print(f"Epoch: {epoch_num + 1}/{epochs}")

            # Training
            model.train()
            train_loss = 0
            for step_num, batch_data in enumerate(tqdm(train_dataloader, desc='Training')):
                input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

                loss = output.loss
                train_loss += loss.item()

                loss = loss / accumulation_steps  # Scale the loss
                loss.backward()  # Backpropagate the loss

                # Gradient accumulation
                if (step_num + 1) % accumulation_steps == 0:
                    clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    optimizer.zero_grad()

            # Average training loss for this epoch
            train_loss /= len(train_dataloader)
            print(f"Train loss: {train_loss:.4f}")

            # Validation
            model.eval()
            valid_loss = 0
            valid_pred = []
            with torch.no_grad():
                for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
                    input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                    output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
                    loss = output.loss
                    valid_loss += loss.item()
                    valid_pred.append(output.logits.cpu().detach().numpy())

            # Average validation loss for this epoch
            valid_loss /= len(valid_dataloader)
            print(f"Validation loss: {valid_loss:.4f}")

            # Early stopping check
            if valid_loss < best_val_loss:
                best_val_loss = valid_loss
                best_model_state = model.state_dict()
                patience_counter = 0  # Reset patience counter if we have improvement
                print("New best model found! Saving...")
                torch.save(best_model_state, os.path.join(save_directory, "best_model_state.bin"))
                tokenizer.save_pretrained(save_directory)
                model.config.save_pretrained(save_directory)
                torch.save(optimizer.state_dict(), os.path.join(save_directory, "optimizer_state.bin"))
                print("Model and optimizer states saved.")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break  # Stop training if no improvement

In [None]:
# Validation
model.eval()
valid_loss = 0
valid_pred = []

with torch.no_grad():
    for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        
        # During validation, no need to pass labels to the model
        output = model(input_ids=input_ids, attention_mask=att_mask)
        logits = output.logits
        
        # Compute validation loss manually (if needed)
        # loss = criterion(logits, labels)
        valid_loss += loss.item()

        # Store the logits for all validation examples
        valid_pred.append(logits.cpu().detach().numpy())

# Average validation loss
valid_loss /= len(valid_dataloader)
print(f"Validation loss: {valid_loss:.4f}")

# Check if valid_pred has any entries before concatenation
if valid_pred:
    valid_pred = np.concatenate(valid_pred)  # Concatenate predictions from batches

    # Apply softmax to get class probabilities
    valid_pred_softmax = torch.softmax(torch.tensor(valid_pred), dim=1).numpy()

    # Get the class predictions (0 or 1) based on the higher probability
    valid_pred_labels = np.argmax(valid_pred_softmax, axis=1)

    # Flatten ground truth for classification report
    y_true = valid_y.numpy()

    print(classification_report(y_true, valid_pred_labels, 
                                zero_division=0))


Validation: 100%|██████████| 31/31 [00:01<00:00, 16.17it/s]

Validation loss: 0.6006
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       151
           1       0.83      0.84      0.84        95

    accuracy                           0.87       246
   macro avg       0.87      0.87      0.87       246
weighted avg       0.87      0.87      0.87       246






In [78]:
pd.crosstab(y_val['talks_covid_measures'], valid_pred_labels, rownames=['True'], colnames=['Predicted'])

Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,135,16
1.0,15,80


# Test Results

In [None]:
test_df = pd.read_csv('path_to_test_data_researcher_codings',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC) #csv file

# change article_id to integer
test_df['article_id'] = test_df['article_id'].astype(int)
# drop if colnames has unnamed 
test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
# keep only if directly_quoted or indirectly_quoted is 1
test_df = test_df[(test_df['directly_quoted'] == 1) | (test_df['indirectly_quoted'] == 1)]

In [71]:
# get article_id, actor_name, talks_covid_measures, relevant_sentences_string
test_df = test_df[['article_id', 'actor_name', 'talks_covid_measures', 'input_text_corrected',
       'talks_covid_corrected', 'measures_positive_corrected',
       'measures_negative_corrected', 'measures_neutral_corrected']]

In [None]:
test_df.talks_covid_corrected.value_counts() # these are manually corrected labels

talks_covid_corrected
0.0    181
1.0    114
Name: count, dtype: int64

In [None]:
# encode input text for the test set
test_input_ids, test_att_masks = encode(test_df['input_text_corrected'].tolist())
test_y = torch.LongTensor(test_df['talks_covid_corrected'].values.squeeze())

test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)

test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

# the model is loaded

model.eval()
test_loss = 0

test_pred = []

with torch.no_grad():
    for step_num_e, batch_data in enumerate(tqdm(test_dataloader, desc='Testing')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
        loss = output.loss
        test_loss += loss.item()
        test_pred.append(output.logits.cpu().detach().numpy())

# Average test loss
test_loss /= len(test_dataloader)
print(f"Test loss: {test_loss:.4f}")

# Check if test_pred has any entries before concatenation

if test_pred:
    test_pred = np.concatenate(test_pred)  # Concatenate predictions from batches

    # Apply softmax to get class probabilities
    test_pred_softmax = torch.softmax(torch.tensor(test_pred), dim=1).numpy()

    # Get the class predictions
    test_pred_labels = np.argmax(test_pred_softmax, axis=1)

    # Flatten ground truth for classification report
    y_true = test_y.numpy()

print(classification_report(y_true, test_pred_labels, zero_division=0))

Testing: 100%|██████████| 37/37 [00:02<00:00, 17.69it/s]

Test loss: 0.5945
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       181
           1       0.86      0.82      0.84       114

    accuracy                           0.88       295
   macro avg       0.87      0.87      0.87       295
weighted avg       0.88      0.88      0.88       295






In [None]:
# append the predictions to the talks_covid_corrected
test_df['talks_covid_pred_robbert'] = test_pred_labels

# save the test_df with predictions
test_df.to_csv('path_to_save_test_df_with_predictions/actors_discusses_measures_RobBERTpreds.csv', sep=';', index=False, quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8')