In [4]:
import json

In [5]:
with open("labels_dict.json", "r") as f:
    dict_labels = json.load(f)

In [6]:
dict_labels

{'CC: Amplifying Climate Fears': ['Amplifying existing fears of global warming',
  'Doomsday scenarios for humans',
  'Earth will be uninhabitable soon',
  'Whatever we do it is already too late',
  'Other'],
 'CC: Climate change is beneficial': ['CO2 is beneficial',
  'Temperature increase is beneficial',
  'Other'],
 'CC:  Controversy about green technologies': ['Nuclear energy is not climate friendly',
  'Renewable energy is costly',
  'Renewable energy is dangerous',
  'Renewable energy is unreliable',
  'Other'],
 'CC: Criticism of climate movement': ['Ad hominem attacks on key activists',
  'Climate movement is alarmist',
  'Climate movement is corrupt',
  'Other'],
 'CC: Criticism of climate policies': ['Climate policies are ineffective',
  'Climate policies are only for profit',
  'Climate policies have negative impact on the economy',
  'Other'],
 'CC: Criticism of institutions and authorities': ['Criticism of international entities',
  'Criticism of national governments',
  '

In [7]:
def split_second_occurrence(s):
    first = s.find(': ')
    if first == -1:
        return [s]  # no colon found
    second = s.find(': ', first + 1)
    if second == -1:
        return [s]  # only one colon found
    return [s[:second], s[second+2:]]


text = 'CC: Criticism of climate movement: Ad hominem attacks on key activists'
split_second_occurrence(text)

['CC: Criticism of climate movement', 'Ad hominem attacks on key activists']

In [8]:
text.find(': ')

2

In [10]:
def create_label_indices(hierarchy_dict):
    # Create narrative index mapping
    narrative_indices = {narrative: idx 
                        for idx, narrative in enumerate(hierarchy_dict.keys())}
    
    # Create subnarrative index mappings for each narrative
    subnarrative_indices = {
        narrative: {subnarr: idx 
                   for idx, subnarr in enumerate(subnarratives)}
        for narrative, subnarratives in hierarchy_dict.items()
    }
    
    return narrative_indices, subnarrative_indices

def encode_labels(label_string, hierarchy_dict, narrative_indices, subnarrative_indices):
    # Initialize encoding vectors
    narrative_encoding = [0] * len(hierarchy_dict)
    subnarrative_encodings = {
        narrative: [0] * len(subnarratives)
        for narrative, subnarratives in hierarchy_dict.items()
    }
    
    # Process each label
    for label in label_string.split(';'):
        try:
            # narrative, subnarrative = label.split(':')
            if label == "Other":
                narrative = "Other"
                subnarrative = "Other"
            else:
                narrative, subnarrative = split_second_occurrence(label)
            
            # Validate against hierarchy
            if narrative in hierarchy_dict:
                # Encode narrative
                narr_idx = narrative_indices[narrative]
                narrative_encoding[narr_idx] = 1
                
                # Encode subnarrative if valid
                if subnarrative in hierarchy_dict[narrative]:
                    sub_idx = subnarrative_indices[narrative][subnarrative]
                    subnarrative_encodings[narrative][sub_idx] = 1
        except ValueError:
            print(f"Skipping malformed label: {label}")
    
    return narrative_encoding, subnarrative_encodings

In [11]:
# Create index mappings once
narrative_indices, subnarrative_indices = create_label_indices(dict_labels)

# Process labels
def process_dataset(df, hierarchy_dict, narrative_indices, subnarrative_indices):
    encoded_data = []
    for label_string in df['labels']:
        narrative_enc, subnarrative_enc = encode_labels(
            label_string,
            hierarchy_dict,
            narrative_indices,
            subnarrative_indices
        )
        encoded_data.append({
            'narrative_encoding': narrative_enc,
            'subnarrative_encodings': subnarrative_enc
        })
    return encoded_data

# Model

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaModel, XLMRobertaTokenizer

from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import f1_score

In [13]:
class HierarchicalClassifier(nn.Module):
    def __init__(self, hierarchy_dict, model_name='xlm-roberta-base'):
        super().__init__()
        # Load base model
        self.bert = XLMRobertaModel.from_pretrained(model_name)
        self.hidden_size = self.bert.config.hidden_size
        
        # Store hierarchy information
        self.hierarchy_dict = hierarchy_dict
        self.num_narratives = len(hierarchy_dict)
        
        # Narrative classifier
        self.narrative_classifier = nn.Linear(self.hidden_size, self.num_narratives)
        
        # Subnarrative classifiers - one for each narrative
        self.subnarrative_classifiers = nn.ModuleDict({
            narrative: nn.Linear(self.hidden_size, len(subnarratives))
            for narrative, subnarratives in hierarchy_dict.items()
        })
        
        # Activation functions
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids, attention_mask):
        # Get BERT embeddings
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        # Narrative predictions
        narrative_logits = self.narrative_classifier(pooled_output)
        narrative_probs = self.sigmoid(narrative_logits)
        
        # Subnarrative predictions for each narrative
        subnarrative_outputs = {}
        for narrative, classifier in self.subnarrative_classifiers.items():
            subnarrative_logits = classifier(pooled_output)
            subnarrative_probs = self.sigmoid(subnarrative_logits)
            subnarrative_outputs[narrative] = subnarrative_probs
        
        return narrative_probs, subnarrative_outputs

# Training functions
class HierarchicalLoss(nn.Module):
    def __init__(self, hierarchy_dict):
        super().__init__()
        self.hierarchy_dict = hierarchy_dict
        self.bce = nn.BCELoss()
    
    def forward(self, narrative_preds, subnarrative_preds, narrative_labels, subnarrative_labels):
        # Narrative loss
        narrative_loss = self.bce(narrative_preds, narrative_labels)
        
        # Subnarrative loss - only for positive narratives
        subnarrative_loss = 0
        batch_size = narrative_labels.size(0)
        
        for narrative, pred in subnarrative_preds.items():
            narrative_idx = list(self.hierarchy_dict.keys()).index(narrative)
            # Get mask for positive narratives
            narrative_mask = narrative_labels[:, narrative_idx].bool()
            
            if narrative_mask.any():
                # Only calculate loss for positive narratives
                sub_pred = pred[narrative_mask]
                sub_label = subnarrative_labels[narrative][narrative_mask]
                subnarrative_loss += self.bce(sub_pred, sub_label)
        
        # Combine losses
        total_loss = narrative_loss + subnarrative_loss
        return total_loss

# # Training loop
# def train_model(model, train_dataloader, optimizer, criterion, device):
#     model.train()
    
#     for batch in train_dataloader:
#         # Move batch to device
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         narrative_labels = batch['narrative_labels'].to(device)
#         subnarrative_labels = {k: v.to(device) for k, v in batch['subnarrative_labels'].items()}
        
#         # Forward pass
#         narrative_preds, subnarrative_preds = model(input_ids, attention_mask)
        
#         # Calculate loss
#         loss = criterion(narrative_preds, subnarrative_preds, 
#                         narrative_labels, subnarrative_labels)
        
#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

In [14]:
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, device, num_epochs):
    """
    Train the hierarchical classifier
    """
    best_val_f1 = 0
    early_stopping_rounds = 3
    no_improve = 0
    
    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        train_losses = []
        narrative_preds_list = []
        narrative_labels_list = []
        
        # Training loop with progress bar
        train_loop = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs} [Train]')
        for batch in train_loop:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            narrative_labels = batch['narrative_labels'].to(device)
            subnarrative_labels = {k: v.to(device) 
                                 for k, v in batch['subnarrative_labels'].items()}
            
            # Forward pass
            narrative_preds, subnarrative_preds = model(input_ids, attention_mask)
            
            # Calculate loss
            loss = criterion(narrative_preds, subnarrative_preds, 
                           narrative_labels, subnarrative_labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Track metrics
            train_losses.append(loss.item())
            narrative_preds_list.append(narrative_preds.detach().cpu().numpy())
            narrative_labels_list.append(narrative_labels.detach().cpu().numpy())
            
            # Update progress bar
            train_loop.set_postfix({
                'loss': f'{np.mean(train_losses[-100:]):.4f}'
            })
        
        # Calculate training metrics
        train_narrative_preds = np.concatenate(narrative_preds_list)
        train_narrative_labels = np.concatenate(narrative_labels_list)
        train_narrative_preds_binary = (train_narrative_preds > 0.5).astype(int)
        train_f1 = f1_score(train_narrative_labels, 
                           train_narrative_preds_binary, 
                           average='macro')
        
        # Validation Phase
        model.eval()
        val_losses = []
        val_narrative_preds_list = []
        val_narrative_labels_list = []
        
        with torch.no_grad():
            val_loop = tqdm(val_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs} [Val]')
            for batch in val_loop:
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                narrative_labels = batch['narrative_labels'].to(device)
                subnarrative_labels = {k: v.to(device) 
                                     for k, v in batch['subnarrative_labels'].items()}
                
                # Forward pass
                narrative_preds, subnarrative_preds = model(input_ids, attention_mask)
                
                # Calculate loss
                loss = criterion(narrative_preds, subnarrative_preds, 
                               narrative_labels, subnarrative_labels)
                
                # Track metrics
                val_losses.append(loss.item())
                val_narrative_preds_list.append(narrative_preds.cpu().numpy())
                val_narrative_labels_list.append(narrative_labels.cpu().numpy())
                
                # Update progress bar
                val_loop.set_postfix({
                    'loss': f'{np.mean(val_losses[-100:]):.4f}'
                })
        
        # Calculate validation metrics
        val_narrative_preds = np.concatenate(val_narrative_preds_list)
        val_narrative_labels = np.concatenate(val_narrative_labels_list)
        val_narrative_preds_binary = (val_narrative_preds > 0.5).astype(int)
        val_f1 = f1_score(val_narrative_labels, 
                         val_narrative_preds_binary, 
                         average='macro')
        
        # Print epoch metrics
        print(f'\nEpoch {epoch + 1}/{num_epochs}:')
        print(f'Train Loss: {np.mean(train_losses):.4f}, Train F1: {train_f1:.4f}')
        print(f'Val Loss: {np.mean(val_losses):.4f}, Val F1: {val_f1:.4f}')
        
        # Model checkpointing
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_f1': val_f1,
            }, 'best_model.pt')
            print('New best model saved!')
            no_improve = 0
        else:
            no_improve += 1
            
        # Early stopping
        # if no_improve >= early_stopping_rounds:
        #     print(f'\nEarly stopping triggered after {epoch + 1} epochs')
        #     break
            
        print('-' * 50)
    
    # Load best model
    checkpoint = torch.load('best_model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"\nTraining completed. Best validation F1: {best_val_f1:.4f}")
    
    return model

In [14]:
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, device, num_epochs):
    """
    Train the hierarchical classifier with in-place tqdm progress bars
    """
    best_val_f1 = 0
    early_stopping_rounds = 3
    no_improve = 0
    
    # Main epoch progress bar
    epoch_bar = tqdm(range(num_epochs), desc='Training Progress', position=0)
    
    for epoch in epoch_bar:
        # Training Phase
        model.train()
        train_losses = []
        narrative_preds_list = []
        narrative_labels_list = []
        
        # Training loop with progress bar
        train_loop = tqdm(train_dataloader, 
                         desc=f'Epoch {epoch + 1}/{num_epochs} [Train]',
                         position=1, 
                         leave=False)  # leave=False ensures bar is removed after completion
        
        for batch in train_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            narrative_labels = batch['narrative_labels'].to(device)
            subnarrative_labels = {k: v.to(device) 
                                 for k, v in batch['subnarrative_labels'].items()}
            
            narrative_preds, subnarrative_preds = model(input_ids, attention_mask)
            loss = criterion(narrative_preds, subnarrative_preds, 
                           narrative_labels, subnarrative_labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_losses.append(loss.item())
            narrative_preds_list.append(narrative_preds.detach().cpu().numpy())
            narrative_labels_list.append(narrative_labels.detach().cpu().numpy())
            
            # Update inner progress bar
            train_loop.set_postfix({
                'loss': f'{np.mean(train_losses[-100:]):.4f}'
            })
        
        train_loop.close()  # Explicitly close training bar
        
        # Calculate training metrics
        train_narrative_preds = np.concatenate(narrative_preds_list)
        train_narrative_labels = np.concatenate(narrative_labels_list)
        train_narrative_preds_binary = (train_narrative_preds > 0.5).astype(int)
        train_f1 = f1_score(train_narrative_labels, 
                           train_narrative_preds_binary, 
                           average='macro', 
                           zero_division=0
                           )
        
        # Validation Phase
        model.eval()
        val_losses = []
        val_narrative_preds_list = []
        val_narrative_labels_list = []
        
        with torch.no_grad():
            # Validation loop with progress bar
            val_loop = tqdm(val_dataloader, 
                          desc=f'Epoch {epoch + 1}/{num_epochs} [Val]',
                          position=1, 
                          leave=False)  # leave=False ensures bar is removed after completion
            
            for batch in val_loop:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                narrative_labels = batch['narrative_labels'].to(device)
                subnarrative_labels = {k: v.to(device) 
                                     for k, v in batch['subnarrative_labels'].items()}
                
                narrative_preds, subnarrative_preds = model(input_ids, attention_mask)
                loss = criterion(narrative_preds, subnarrative_preds, 
                               narrative_labels, subnarrative_labels)
                
                val_losses.append(loss.item())
                val_narrative_preds_list.append(narrative_preds.cpu().numpy())
                val_narrative_labels_list.append(narrative_labels.cpu().numpy())
                
                # Update inner progress bar
                val_loop.set_postfix({
                    'loss': f'{np.mean(val_losses[-100:]):.4f}'
                })
        
        val_loop.close()  # Explicitly close validation bar
        
        # Calculate validation metrics
        val_narrative_preds = np.concatenate(val_narrative_preds_list)
        val_narrative_labels = np.concatenate(val_narrative_labels_list)
        val_narrative_preds_binary = (val_narrative_preds > 0.5).astype(int)
        val_f1 = f1_score(val_narrative_labels, 
                         val_narrative_preds_binary, 
                         average='macro',
                         zero_division=0)
        
        # Update main epoch bar with metrics
        epoch_bar.set_postfix({
            'train_f1': f'{train_f1:.4f}',
            'val_f1': f'{val_f1:.4f}',
            'train_loss': f'{np.mean(train_losses):.4f}',
            'val_loss': f'{np.mean(val_losses):.4f}'
        })
        
        # Model checkpointing
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_f1': val_f1,
            }, 'best_model.pt')
            no_improve = 0
        else:
            no_improve += 1
    
    # Load best model
    checkpoint = torch.load('best_model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"\nTraining completed. Best validation F1: {best_val_f1:.4f}")
    
    return model

In [15]:
# Initialize tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = HierarchicalClassifier(dict_labels)

# Prepare data
def prepare_batch(texts, labels, tokenizer, max_length=512):
    # Tokenize texts
    encodings = tokenizer(texts, 
                         truncation=True, 
                         padding=True, 
                         max_length=max_length,
                         return_tensors='pt')
    
    # Encode labels
    narrative_encodings = []
    subnarrative_encodings = {narrative: [] for narrative in hierarchy_dict}
    
    for label in labels:
        narr_enc, sub_enc = encode_labels(label, hierarchy_dict, 
                                        narrative_indices, subnarrative_indices)
        narrative_encodings.append(narr_enc)
        for narrative, encoding in sub_enc.items():
            subnarrative_encodings[narrative].append(encoding)
    
    # Convert to tensors
    batch = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'narrative_labels': torch.tensor(narrative_encodings),
        'subnarrative_labels': {k: torch.tensor(v) 
                               for k, v in subnarrative_encodings.items()}
    }
    
    return batch

In [16]:

def prepare_data(df, hierarchy_dict, test_size=0.2, random_state=42):
    """
    Prepare data from DataFrame format for training
    """
    # Split into train and validation
    train_df, val_df = train_test_split(
        df, 
        test_size=test_size, 
        random_state=random_state
    )
    
    print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")
    
    return train_df, val_df
    

def create_label_indices(hierarchy_dict):
    """Create indices for narratives and subnarratives"""
    # Create narrative index mapping
    narrative_indices = {narrative: idx 
                        for idx, narrative in enumerate(hierarchy_dict.keys())}
    
    # Create subnarrative index mappings for each narrative
    subnarrative_indices = {
        narrative: {subnarr: idx 
                   for idx, subnarr in enumerate(subnarratives)}
        for narrative, subnarratives in hierarchy_dict.items()
    }
    
    return narrative_indices, subnarrative_indices



class HierarchicalDataset(Dataset):
    def __init__(self, df, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices, max_length=512):
        self.texts = df['text'].values
        self.labels = df['labels'].values
        self.tokenizer = tokenizer
        self.hierarchy_dict = hierarchy_dict
        self.narrative_indices = narrative_indices
        self.subnarrative_indices = subnarrative_indices
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = str(self.labels[idx])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        
        narrative_enc, subnarrative_enc = encode_labels(
            label, 
            self.hierarchy_dict, 
            self.narrative_indices, 
            self.subnarrative_indices
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'narrative_labels': torch.tensor(narrative_enc, dtype=torch.float),
            'subnarrative_labels': {k: torch.tensor(v, dtype=torch.float) 
                                  for k, v in subnarrative_enc.items()}
        }


def train_hierarchical_classifier(df, hierarchy_dict, 
                                model_name='xlm-roberta-base',
                                batch_size=16,
                                num_epochs=5,
                                learning_rate=2e-5,
                                max_length=512):
    """Main training function"""
    # Create indices first
    narrative_indices, subnarrative_indices = create_label_indices(hierarchy_dict)
    
    # Prepare data
    train_df, val_df = prepare_data(df, hierarchy_dict)
    
    # Initialize tokenizer and model
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    model = HierarchicalClassifier(hierarchy_dict, model_name)
    
    # Create datasets with indices
    train_dataset = HierarchicalDataset(
        train_df, tokenizer, hierarchy_dict, 
        narrative_indices, subnarrative_indices, max_length
    )
    val_dataset = HierarchicalDataset(
        val_df, tokenizer, hierarchy_dict,
        narrative_indices, subnarrative_indices, max_length
    )
    
    train_dataloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True,
        num_workers=2
    )
    val_dataloader = DataLoader(
        val_dataset, 
        batch_size=batch_size,
        num_workers=2
    )
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = HierarchicalLoss(hierarchy_dict)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = model.to(device)
    
    # Train model with updated parameters
    train_model(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        criterion,
        device,
        num_epochs
    )
    
    return model, tokenizer, narrative_indices, subnarrative_indices




# Load data

In [26]:
train_df = pd.read_csv('../csv_data/combined_train.csv', header=None, names=['file', 'narrative', 'labels', 'text'])


In [27]:
train_df.sample(5)

Unnamed: 0,file,narrative,labels,text
1758,PT_52.txt,CC: Downplaying climate change;CC: Criticism o...,CC: Downplaying climate change: Other;CC: Crit...,Mudança climática: o tema que não é bem-vindo ...
367,EN_UA_012234.txt,URW: Amplifying war-related fears;URW: Amplify...,URW: Amplifying war-related fears: Russia will...,Putin’s allies call for London to be ‘turned t...
772,A8_CC_BG_8732.txt,CC: Criticism of climate policies;CC: Criticis...,CC: Criticism of climate policies: Climate pol...,
955,BG_663.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes...",Европа се натресе с двата крака в блатото/Пора...
1594,PT_374.txt,URW: Praise of Russia;URW: Discrediting Ukrain...,URW: Praise of Russia: Praise of Russian Presi...,"Apenas uma nova ""Yalta"". Putin repetiu mais um..."


In [28]:

# Usage example:
def main():
    # Your DataFrame and hierarchy dictionary
    # df = pd.DataFrame({'text': [...], 'label': [...]})
    # hierarchy_dict = {...}
    
    # Train the model
    model, tokenizer , narrative_indices, subnarrative_indices = train_hierarchical_classifier(
        df=train_df,
        hierarchy_dict=dict_labels,
        batch_size=16,
        num_epochs=100
    )
    save_model(
        model=model,
        tokenizer=tokenizer,
        hierarchy_dict=dict_labels,
        narrative_indices=narrative_indices,
        subnarrative_indices=subnarrative_indices,
        save_dir='model'
    )
    
    # # Save the trained model
    # torch.save({
    #     'model_state_dict': model.state_dict(),
    #     'hierarchy_dict': dict_labels,
 
    # }, 'hierarchical_classifier.pt')

# Example of loading and using the model for prediction
def predict(text, model, tokenizer, hierarchy_dict, device):
    model.eval()
    
    # Tokenize
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    ).to(device)
    
    with torch.no_grad():
        narrative_probs, subnarrative_probs = model(
            inputs['input_ids'],
            inputs['attention_mask']
        )
        
        # Get narrative predictions
        narrative_preds = (narrative_probs > 0.5).cpu().numpy()[0]
        
        # Get predicted narratives
        predicted_narratives = [
            list(hierarchy_dict.keys())[i] 
            for i in range(len(narrative_preds)) 
            if narrative_preds[i]
        ]
        
        # Get subnarrative predictions for predicted narratives
        predictions = {}
        for narrative in predicted_narratives:
            sub_probs = subnarrative_probs[narrative].cpu().numpy()[0]
            sub_preds = sub_probs > 0.5
            predicted_subs = [
                hierarchy_dict[narrative][i] 
                for i in range(len(sub_preds)) 
                if sub_preds[i]
            ]
            predictions[narrative] = predicted_subs
            
    return predictions

In [None]:
main()

In [31]:
save_model(
    model=model,
    tokenizer=tokenizer,
    hierarchy_dict=dict_labels,
    narrative_indices=narrative_indices,
    subnarrative_indices=subnarrative_indices,
    save_dir='model'
)

Model and components saved to model


In [17]:
dev_data = pd.read_csv('../csv_data/dev_with_text.csv')

In [18]:
next(model.parameters()).is_cuda

False

In [19]:
model

HierarchicalClassifier(
  (bert): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [34]:
def pred( model, tokenizer,hierarchy_dict, text, device="cuda:0"):   
    model.to(device)
    model.eval()
    
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    ).to(device)
    
    # Make prediction
    with torch.no_grad():
        narrative_probs, subnarrative_probs = model(
            inputs['input_ids'],
            inputs['attention_mask']
        )
        
        # Get narrative predictions
        narrative_preds = (narrative_probs > 0.5).cpu().numpy()[0]
        
        # Create reverse mapping for indices
        rev_narrative_indices = {v: k for k, v in narrative_indices.items()}
        
        # Get predicted narratives with probabilities
        results = {}
        for idx, is_predicted in enumerate(narrative_preds):
            if is_predicted:
                narrative = rev_narrative_indices[idx]
                narrative_prob = float(narrative_probs[0][idx].cpu())
                
                # Get subnarrative predictions for this narrative
                sub_probs = subnarrative_probs[narrative].cpu().numpy()[0]
                sub_preds = sub_probs > 0.5
                
                # Get predicted subnarratives with probabilities
                sub_results = {}
                for sub_idx, sub_pred in enumerate(sub_preds):
                    if sub_pred:
                        subnarrative = hierarchy_dict[narrative][sub_idx]
                        sub_prob = float(sub_probs[sub_idx])
                        sub_results[subnarrative] = sub_prob
                
                results[narrative] = {
                    'probability': narrative_prob,
                    'subnarratives': sub_results
                }
    
    return results


In [35]:
pred(model, tokenizer, dict_labels, device= "cuda:0",text=dev_data['text'][0])

{'CC: Climate change is beneficial': {'probability': 0.5171463489532471,
  'subnarratives': {}},
 'CC:  Controversy about green technologies': {'probability': 0.5582747459411621,
  'subnarratives': {'Renewable energy is costly': 0.5833089351654053,
   'Renewable energy is unreliable': 0.5185739398002625,
   'Other': 0.5054910182952881}},
 'CC: Criticism of institutions and authorities': {'probability': 0.6473690271377563,
  'subnarratives': {'Criticism of international entities': 0.5140891075134277}},
 'CC: Downplaying climate change': {'probability': 0.5262501239776611,
  'subnarratives': {'Climate cycles are natural': 0.569594144821167,
   'Human activities do not impact climate change': 0.5489768385887146,
   'Humans and nature will adapt to the changes': 0.5431116819381714,
   'Temperature increase does not have significant impact': 0.5608167052268982,
   'Weather suggests the trend is global cooling': 0.5263381004333496}},
 'CC: Green policies are geopolitical instruments': {'prob

In [22]:
import os
from transformers import XLMRobertaTokenizer
import json

def save_model(model, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices, save_dir='model'):
    """
    Save the model and all necessary components
    """
    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Save model state
    torch.save(model.state_dict(), os.path.join(save_dir, 'model.pt'))
    
    # Save tokenizer
    tokenizer.save_pretrained(save_dir)
    
    # Save hierarchy and indices
    metadata = {
        'hierarchy_dict': hierarchy_dict,
        'narrative_indices': narrative_indices,
        'subnarrative_indices': subnarrative_indices
    }
    
    with open(os.path.join(save_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)
        
    print(f"Model and components saved to {save_dir}")

def load_model(model_dir='model'):
    """
    Load the model and all components
    """
    # Load metadata
    with open(os.path.join(model_dir, 'metadata.json'), 'r') as f:
        metadata = json.load(f)
    
    hierarchy_dict = metadata['hierarchy_dict']
    narrative_indices = metadata['narrative_indices']
    subnarrative_indices = metadata['subnarrative_indices']
    
    # Initialize model
    model = HierarchicalClassifier(hierarchy_dict)
    model.load_state_dict(torch.load(os.path.join(model_dir, 'model.pt')))
    
    # Load tokenizer
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
    
    return model, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices

def predict(text, model_dir='model', device=None):
    """
    Make predictions using the saved model
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Load model and components
    model, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices = load_model(model_dir)
    model = model.to(device)
    model.eval()
    
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    ).to(device)
    
    # Make prediction
    with torch.no_grad():
        narrative_probs, subnarrative_probs = model(
            inputs['input_ids'],
            inputs['attention_mask']
        )
        
        # Get narrative predictions
        narrative_preds = (narrative_probs > 0.5).cpu().numpy()[0]
        
        # Create reverse mapping for indices
        rev_narrative_indices = {v: k for k, v in narrative_indices.items()}
        
        # Get predicted narratives with probabilities
        results = {}
        for idx, is_predicted in enumerate(narrative_preds):
            if is_predicted:
                narrative = rev_narrative_indices[idx]
                narrative_prob = float(narrative_probs[0][idx].cpu())
                
                # Get subnarrative predictions for this narrative
                sub_probs = subnarrative_probs[narrative].cpu().numpy()[0]
                sub_preds = sub_probs > 0.5
                
                # Get predicted subnarratives with probabilities
                sub_results = {}
                for sub_idx, sub_pred in enumerate(sub_preds):
                    if sub_pred:
                        subnarrative = hierarchy_dict[narrative][sub_idx]
                        sub_prob = float(sub_probs[sub_idx])
                        sub_results[subnarrative] = sub_prob
                
                results[narrative] = {
                    'probability': narrative_prob,
                    'subnarratives': sub_results
                }
    
    return results

# Example usage:
def save_and_predict_example():
    # After training:
    save_model(
        model=trained_model,
        tokenizer=tokenizer,
        hierarchy_dict=hierarchy_dict,
        narrative_indices=narrative_indices,
        subnarrative_indices=subnarrative_indices,
        save_dir='model'
    )
    
    # Later, to make predictions:
    text = "Your input text here"
    predictions = predict(text, model_dir='model')
    
    # Print predictions in a formatted way
    print("\nPredictions:")
    for narrative, data in predictions.items():
        print(f"\nNarrative: {narrative} (probability: {data['probability']:.3f})")
        print("Subnarratives:")
        for subnarr, prob in data['subnarratives'].items():
            print(f"  - {subnarr} (probability: {prob:.3f})")

# Example output format:
"""
Predictions:

Narrative: Politics (probability: 0.945)
Subnarratives:
  - International (probability: 0.876)
  - Elections (probability: 0.654)

Narrative: Technology (probability: 0.823)
Subnarratives:
  - AI (probability: 0.912)
"""

'\nPredictions:\n\nNarrative: Politics (probability: 0.945)\nSubnarratives:\n  - International (probability: 0.876)\n  - Elections (probability: 0.654)\n\nNarrative: Technology (probability: 0.823)\nSubnarratives:\n  - AI (probability: 0.912)\n'

# Prediction

In [23]:
def predict(text, model_dir='model', device=None, return_probs=False):
    """
    Make predictions and return in original label format
    Args:
        text: Input text
        model_dir: Directory containing saved model
        device: torch device
        return_probs: If True, return probabilities along with labels
    Returns:
        if return_probs=False: string in format "Narrative1:Subnarrative1;Narrative2:Subnarrative2"
        if return_probs=True: tuple of (label_string, probabilities_dict)
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # # Load model and components
    model, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices = load_model(model_dir)
    model = model.to(device)
    model.eval()
    
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    ).to(device)
    
    # Make prediction
    with torch.no_grad():
        narrative_probs, subnarrative_probs = model(
            inputs['input_ids'],
            inputs['attention_mask']
        )
        
        # Get narrative predictions
        narrative_preds = (narrative_probs > 0.5).cpu().numpy()[0]
        
        # Create reverse mapping for indices
        rev_narrative_indices = {v: k for k, v in narrative_indices.items()}
        
        # Store predictions in original format
        predictions = []
        probabilities = {}
        
        for idx, is_predicted in enumerate(narrative_preds):
            if is_predicted:
                narrative = rev_narrative_indices[idx]
                narrative_prob = float(narrative_probs[0][idx].cpu())
                
                # Get subnarrative predictions for this narrative
                sub_probs = subnarrative_probs[narrative].cpu().numpy()[0]
                sub_preds = sub_probs > 0.5
                
                # Get predicted subnarratives
                for sub_idx, sub_pred in enumerate(sub_preds):
                    if sub_pred:
                        subnarrative = hierarchy_dict[narrative][sub_idx]
                        sub_prob = float(sub_probs[sub_idx])
                        
                        # Create label in original format
                        label = f"{narrative}:{subnarrative}"
                        predictions.append(label)
                        
                        # Store probabilities
                        probabilities[label] = {
                            'narrative_prob': narrative_prob,
                            'subnarrative_prob': sub_prob
                        }
    
    # Join predictions with semicolon
    final_prediction = ";".join(predictions)
    
    if return_probs:
        return final_prediction, probabilities
    return final_prediction

# Example usage:
def prediction_example(text):
    
    # Get only labels
    labels = predict(text, model_dir='model')
    print("\nPredicted Labels:")
    print(labels)
    # Output example: "Politics:International;Technology:AI"
    
    # Get labels and probabilities
    labels, probs = predict(text, model_dir='model', return_probs=True)
    print("\nDetailed Predictions:")
    print(f"Labels: {labels}")
    print("\nProbabilities:")
    for label, probs in probs.items():
        print(f"{label}:")
        print(f"  Narrative probability: {probs['narrative_prob']:.3f}")
        print(f"  Subnarrative probability: {probs['subnarrative_prob']:.3f}")


In [25]:
prediction_example(dev_data['text'][1])

  model.load_state_dict(torch.load(os.path.join(model_dir, 'model.pt')))



Predicted Labels:
CC:  Controversy about green technologies:Renewable energy is costly;CC:  Controversy about green technologies:Renewable energy is unreliable;CC:  Controversy about green technologies:Other;CC: Criticism of institutions and authorities:Criticism of international entities;CC: Downplaying climate change:Climate cycles are natural;CC: Downplaying climate change:Human activities do not impact climate change;CC: Downplaying climate change:Humans and nature will adapt to the changes;CC: Downplaying climate change:Temperature increase does not have significant impact;CC: Downplaying climate change:Weather suggests the trend is global cooling;CC: Green policies are geopolitical instruments:Green activities are a form of neo-colonialism;CC: Green policies are geopolitical instruments:Other;CC: Hidden plots by secret schemes of powerful groups:Blaming global elites;URW: Amplifying war-related fears:Russia will also attack other countries;URW: Amplifying war-related fears:There

NameError: name 'labels' is not defined

In [32]:



# # Load model and components
model, tokenizer, hierarchy_dict, narrative_indices, subnarrative_indices = load_model("model/")
model = model.to(device)
model.eval()

  model.load_state_dict(torch.load(os.path.join(model_dir, 'model.pt')))


HierarchicalClassifier(
  (bert): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [61]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results = []
for text in dev_data['text']:

    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt'
    ).to(device)

    # Make prediction
    with torch.no_grad():
        narrative_probs, subnarrative_probs = model(
            inputs['input_ids'],
            inputs['attention_mask']
        )
        
        # Get narrative predictions
        narrative_preds = (narrative_probs > 0.5).cpu().numpy()[0]
        
        # Create reverse mapping for indices
        rev_narrative_indices = {v: k for k, v in narrative_indices.items()}
        
        # Store predictions in original format
        predictions = []
        probabilities = {}
        
        for idx, is_predicted in enumerate(narrative_preds):
            if is_predicted:
                narrative = rev_narrative_indices[idx]
                narrative_prob = float(narrative_probs[0][idx].cpu())
                
                # Get subnarrative predictions for this narrative
                sub_probs = subnarrative_probs[narrative].cpu().numpy()[0]
                sub_preds = sub_probs > 0.5
                
                # Get predicted subnarratives
                for sub_idx, sub_pred in enumerate(sub_preds):
                    if sub_pred:
                        subnarrative = hierarchy_dict[narrative][sub_idx]
                        sub_prob = float(sub_probs[sub_idx])
                        
                        # Create label in original format
                        label = f"{narrative}: {subnarrative}"
                        predictions.append(label)
                        
                        # Store probabilities
                        probabilities[label] = {
                            'narrative_prob': narrative_prob,
                            'subnarrative_prob': sub_prob
                        }

    # Join predictions with semicolon
    final_prediction = ";".join(predictions)
    results.append(final_prediction)



In [62]:
results

['CC:  Controversy about green technologies: Renewable energy is costly;CC:  Controversy about green technologies: Renewable energy is unreliable;CC:  Controversy about green technologies: Other;CC: Criticism of institutions and authorities: Criticism of international entities;CC: Downplaying climate change: Climate cycles are natural;CC: Downplaying climate change: Human activities do not impact climate change;CC: Downplaying climate change: Humans and nature will adapt to the changes;CC: Downplaying climate change: Temperature increase does not have significant impact;CC: Downplaying climate change: Weather suggests the trend is global cooling;CC: Green policies are geopolitical instruments: Green activities are a form of neo-colonialism;CC: Green policies are geopolitical instruments: Other;CC: Hidden plots by secret schemes of powerful groups: Blaming global elites;URW: Amplifying war-related fears: Russia will also attack other countries;URW: Amplifying war-related fears: There is

In [43]:
narrative_probs

tensor([[0.4775, 0.5205, 0.5584, 0.4411, 0.4433, 0.6486, 0.5231, 0.5323, 0.5227,
         0.4837, 0.5221, 0.4721, 0.5580, 0.5044, 0.5405, 0.4981, 0.4749, 0.5344,
         0.5508, 0.4958, 0.5095, 0.5207]], device='cuda:0')

In [52]:
dict_labels['CC: Amplifying Climate Fears']


['Amplifying existing fears of global warming',
 'Doomsday scenarios for humans',
 'Earth will be uninhabitable soon',
 'Whatever we do it is already too late',
 'Other']

In [54]:
narrative_indices['CC: Amplifying Climate Fears']

0

In [56]:
rev_narrative_indices

{0: 'CC: Amplifying Climate Fears',
 1: 'CC: Climate change is beneficial',
 2: 'CC:  Controversy about green technologies',
 3: 'CC: Criticism of climate movement',
 4: 'CC: Criticism of climate policies',
 5: 'CC: Criticism of institutions and authorities',
 6: 'CC: Downplaying climate change',
 7: 'CC: Green policies are geopolitical instruments',
 8: 'CC: Hidden plots by secret schemes of powerful groups',
 9: 'CC: Questioning the measurements and science',
 10: 'URW: Amplifying war-related fears',
 11: 'URW: Blaming the war on others rather than the invader',
 12: 'URW: Discrediting Ukraine',
 13: 'URW: Discrediting the West, Diplomacy',
 14: 'URW: Distrust towards Media',
 15: 'URW: Hidden plots by secret schemes of powerful groups',
 16: 'URW: Negative Consequences for the West',
 17: 'URW: Overpraising the West',
 18: 'URW: Praise of Russia',
 19: 'URW: Russia is the Victim',
 20: 'URW: Speculating war outcomes',
 21: 'Other'}

In [59]:
'CC: Amplifying Climate Fears'.find("A")

4