In [None]:
import pandas as pd
import numpy as np
import os
import csv
import torch
import random
from tqdm import tqdm

from transformers import set_seed
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn.utils import clip_grad_norm_

In [5]:
# go one level up in the directory
os.chdir("/data/500gbstorage/actor_classification")


huggingface_cache_dir = 'model'

# change huggingface cache
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

In [6]:
set_seed(42)
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)

# Load & Prep Data

In [None]:
# load the manually annotated dataset
df = pd.read_csv('data/coded_df_topics_full.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)

topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h', 
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n']

# change all topic vars to int
for i in topic_vars:
    df[i] = df[i].astype(int)

In [None]:
# articles df includes all article texts, categories and keywords
articles_df = pd.read_csv('data/final_nosarticles.csv',
                          sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
print(articles_df.shape)

# get article text, category, keywords and page_id
articles_df = articles_df[['page_id', 'Title', 'Text', 'Category', 'Keywords']].drop_duplicates()
# make page id integer
articles_df['page_id'] = articles_df['page_id'].astype(int)
# change page_id to article_id
articles_df.rename(columns = {'page_id': 'article_id'}, inplace = True)
# change LINE BREAK to \n
articles_df['Text'] = articles_df['Text'].str.replace('[LINE_BREAK]', '\n')

In [10]:
# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
articles_df['Category'] = articles_df['Category'].fillna('')
articles_df['Keywords'] = articles_df['Keywords'].fillna('')
articles_df['input_text'] = articles_df['Text'] + '\n' + 'Categories: ' + articles_df['Category'] + ' ' + 'Keywords: ' + articles_df['Keywords']

In [None]:
# merge articles_df with df
df = pd.merge(df, articles_df, on='article_id', how = 'left')

In [None]:
# split the reliability annotations/test data
train_df = df[df['reliability_article'] == 0]
test_df = df[df['reliability_article'] == 1]

In [14]:
train_df.about_covid.value_counts()

about_covid
1    410
0    266
Name: count, dtype: int64

In [16]:
# Select input text
X = train_df[['input_text']]

# Select target labels directly from the binary topic columns
y = train_df[['about_covid']]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

print(X_train.shape, X_val.shape)  # Shapes of the input texts
print(y_train.shape, y_val.shape)    # Shapes of the binary labels


(540, 1) (136, 1)
(540, 1) (136, 1)


In [18]:
# calculate the number of tokens in the input text
def count_tokens(text):
    return len(text.split())

train_df['num_tokens'] = train_df['input_text'].apply(count_tokens)
train_df['num_tokens'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['num_tokens'] = train_df['input_text'].apply(count_tokens)


count     676.000000
mean      450.272189
std       210.423411
min        91.000000
25%       276.000000
50%       404.000000
75%       611.000000
max      1106.000000
Name: num_tokens, dtype: float64

# Fine-Tuned RobBERT Classifier

In [None]:
# Set seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Hyperparameters
learning_rates = [2e-5, 2e-6]
accumulation_steps = 4  # This simulates a batch size of 16 by accumulating over 2 steps with batch size 8
best_val_loss = float('inf')
best_model_state = None
patience = 4  # Number of epochs to wait before stopping if no improvement
epochs_to_test = [5, 10]  # Epochs to test

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", cache_dir=huggingface_cache_dir)

# Encode data
def encode(docs):
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, padding='max_length',
                                                return_attention_mask=True, truncation=True, return_tensors='pt')
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

train_input_ids, train_att_masks = encode(X_train['input_text'].tolist())
valid_input_ids, valid_att_masks = encode(X_val['input_text'].tolist())
train_y = torch.LongTensor(y_train.values.squeeze())
valid_y = torch.LongTensor(y_val.values.squeeze())

train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=4)  # Small batch size

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=4)

# Loop through specified learning rates
for learning_rate in learning_rates:
    print(f"\nTraining with learning rate: {learning_rate}")
    
    # Initialize the model for each learning rate
    model = AutoModelForSequenceClassification.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base", 
                                                                num_labels=2, 
                                                                cache_dir=huggingface_cache_dir)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    # Loop through specified epochs
    for epochs in epochs_to_test:
        print(f"\nTraining for {epochs} epochs...")
        patience_counter = 0  # To track epochs without improvement

        for epoch_num in range(epochs):
            print(f"Epoch: {epoch_num + 1}/{epochs}")

            # Training
            model.train()
            train_loss = 0
            for step_num, batch_data in enumerate(tqdm(train_dataloader, desc='Training')):
                input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                
                # CrossEntropyLoss automatically applies softmax, so logits are passed directly
                output = model(input_ids=input_ids, attention_mask=att_mask)
                logits = output.logits  # Get logits directly
                
                loss = criterion(logits, labels)  # Compute loss
                train_loss += loss.item()

                loss = loss / accumulation_steps  # Scale the loss
                loss.backward()  # Backpropagate the loss

                # Gradient accumulation
                if (step_num + 1) % accumulation_steps == 0:
                    clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    optimizer.zero_grad()

            # Average training loss for this epoch
            train_loss /= len(train_dataloader)
            print(f"Train loss: {train_loss:.4f}")

            # Validation
            model.eval()
            valid_loss = 0
            valid_pred = []
            with torch.no_grad():
                for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
                    input_ids, att_mask, labels = [data.to(device) for data in batch_data]
                    output = model(input_ids=input_ids, attention_mask=att_mask)
                    logits = output.logits
                    
                    loss = criterion(logits, labels)  # Compute loss
                    valid_loss += loss.item()
                    valid_pred.append(logits.cpu().detach().numpy())

            # Average validation loss for this epoch
            valid_loss /= len(valid_dataloader)
            print(f"Validation loss: {valid_loss:.4f}")

            # Early stopping check
            if valid_loss < best_val_loss:
                best_val_loss = valid_loss
                best_model_state = model.state_dict()
                patience_counter = 0  # Reset patience counter if we have improvement
                print("New best model found! Saving...")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break  # Stop training if no improvement

In [23]:
# After training, save the best model
torch.save(best_model_state, 'results/about_covid_robbert_base_best_model.pt')
print(f"Best model saved with Val loss: {best_val_loss:.4f}")

Best model saved with Val loss: 0.2772


In [24]:
# Load the best model state
model = AutoModelForSequenceClassification.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-base",
                                                              num_labels=2, cache_dir=huggingface_cache_dir)

model.load_state_dict(torch.load('results/about_covid_robbert_base_best_model.pt'))
model.eval()  # Set the model to evaluation mode

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Validation
model.eval()
valid_loss = 0
valid_pred = []

with torch.no_grad():
    for step_num_e, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        
        # During validation, no need to pass labels to the model
        output = model(input_ids=input_ids, attention_mask=att_mask)
        logits = output.logits
        
        # Compute validation loss manually (if needed)
        loss = criterion(logits, labels)
        valid_loss += loss.item()

        # Store the logits for all validation examples
        valid_pred.append(logits.cpu().detach().numpy())

# Average validation loss
valid_loss /= len(valid_dataloader)
print(f"Validation loss: {valid_loss:.4f}")

# Check if valid_pred has any entries before concatenation
if valid_pred:
    valid_pred = np.concatenate(valid_pred)  # Concatenate predictions from batches

    # Apply softmax to get class probabilities
    valid_pred_softmax = torch.softmax(torch.tensor(valid_pred), dim=1).numpy()

    # Get the class predictions (0 or 1) based on the higher probability
    valid_pred_labels = np.argmax(valid_pred_softmax, axis=1)

    # Flatten ground truth for classification report
    y_true = valid_y.numpy()

    print(classification_report(y_true, valid_pred_labels, 
                                target_names=['not_about_covid', 'about_covid'],
                                zero_division=0))


Validation: 100%|██████████| 34/34 [00:01<00:00, 31.81it/s]


Validation loss: 0.4954
                 precision    recall  f1-score   support

not_about_covid       0.96      0.84      0.90        57
    about_covid       0.90      0.97      0.93        79

       accuracy                           0.92       136
      macro avg       0.93      0.91      0.92       136
   weighted avg       0.92      0.92      0.92       136



In [None]:
# read the test annotations annotated by the researcher
reliability_df = pd.read_csv('data/reliability_topics_elif.csv',
                             sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

reliability_df['article_id'] = reliability_df['article_id'].astype(int)

reliability_df.head()

# merge reliability_df with articles_df
reliability_df = pd.merge(reliability_df, articles_df, on='article_id', how = 'left')
print(reliability_df.shape)

# make an input text, combining Title, Text, Category and Keywords and before Category add string 'Categories: ' and before Keywords add string 'Keywords: ' if Category and Keywords are empty skip them
reliability_df['Category'] = reliability_df['Category'].fillna('')
reliability_df['Keywords'] = reliability_df['Keywords'].fillna('')
reliability_df['input_text'] = reliability_df['Text'] + '\n' + 'Categories: ' + reliability_df['Category'] + ' ' + 'Keywords: ' + reliability_df['Keywords']

# make a test_df for all coders separately
test_df_elif = reliability_df[reliability_df['coder'] == 'Elif Kilik']

In [None]:
# run model on the test data

X_test = test_df_elif[['input_text']]
# Select target labels directly from the binary topic columns
y_test = test_df_elif[['about_covid']]
# Encode data
test_input_ids, test_att_masks = encode(X_test['input_text'].tolist())
test_y = torch.LongTensor(y_test.values.squeeze())
test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=4)

# predict on test data
model.eval()
test_loss = 0
test_pred = []

with torch.no_grad():
    for step_num_e, batch_data in enumerate(tqdm(test_dataloader, desc='Testing')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        
        # During validation, no need to pass labels to the model
        output = model(input_ids=input_ids, attention_mask=att_mask)
        logits = output.logits
        
        # Compute validation loss manually (if needed)
        loss = criterion(logits, labels)
        test_loss += loss.item()

        # Store the logits for all validation examples
        test_pred.append(logits.cpu().detach().numpy())

# get predictions to compute classification report

# Average validation loss
test_loss /= len(test_dataloader)

# Check if valid_pred has any entries before concatenation

if test_pred:
    test_pred = np.concatenate(test_pred)  # Concatenate predictions from batches

    # Apply softmax to get class probabilities
    test_pred_softmax = torch.softmax(torch.tensor(test_pred), dim=1).numpy()

    # Get the class predictions
    test_pred_labels = np.argmax(test_pred_softmax, axis=1)

    # Flatten ground truth for classification report
    y_true = test_y.numpy()
    
    print(classification_report(y_true, test_pred_labels,
                                target_names=['not_about_covid', 'about_covid'],
                                zero_division=0))

Testing:   0%|          | 0/30 [00:00<?, ?it/s]

Testing: 100%|██████████| 30/30 [00:00<00:00, 31.64it/s]

                 precision    recall  f1-score   support

not_about_covid       0.89      0.81      0.85        42
    about_covid       0.90      0.95      0.93        78

       accuracy                           0.90       120
      macro avg       0.90      0.88      0.89       120
   weighted avg       0.90      0.90      0.90       120




