In [1]:
import torch
import torch.nn as nn
import  torch.nn.functional as F
from typing import Tuple, Dict, List,Optional
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import json
from tqdm import tqdm
import numpy as np
import pandas as pd


from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:

class HierarchicalAttention(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )
    
    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # hidden_states: [batch_size, seq_len, hidden_size]
        attention_weights = self.attention(hidden_states)
        attention_weights = attention_weights.squeeze(-1)
        attention_weights = attention_weights.masked_fill(~attention_mask.bool(), float('-inf'))
        attention_weights = F.softmax(attention_weights, dim=1)
        attended = torch.bmm(attention_weights.unsqueeze(1), hidden_states)
        return attended.squeeze(1)  # [batch_size, hidden_size]

class LevelSpecificFeatures(nn.Module):
    def __init__(self, hidden_size: int, dropout_rate: float = 0.1):
        super().__init__()
        self.attention = HierarchicalAttention(hidden_size)
        self.feature_extractor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, hidden_size)
        )
    
    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        attended = self.attention(hidden_states, attention_mask)
        features = self.feature_extractor(attended)
        return features

class HierarchicalClassifier(nn.Module):
    def __init__(
        self,
        pretrained_model: AutoModel,
        num_top: int,
        num_narratives: int,
        num_subnarratives: int,
        dropout_rate: float = 0.1
    ):
        super().__init__()
        self.bert = pretrained_model
        hidden_size = self.bert.config.hidden_size
        
        # Level-specific feature extractors
        self.top_features = LevelSpecificFeatures(hidden_size, dropout_rate)
        self.narrative_features = LevelSpecificFeatures(hidden_size, dropout_rate)
        self.subnarrative_features = LevelSpecificFeatures(hidden_size, dropout_rate)
        
        # Classifiers with hierarchical dimensions
        self.top_classifier = nn.Linear(hidden_size, num_top)
        self.narrative_classifier = nn.Linear(hidden_size + num_top, num_narratives)
        self.subnarrative_classifier = nn.Linear(hidden_size + num_narratives, num_subnarratives)
        
        self.dropout = nn.Dropout(dropout_rate)
        # Level weights for loss calculation
        self.register_buffer('level_weights', torch.tensor([1.0, 1.2, 1.5]))
    
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[Dict[str, torch.Tensor]] = None
    ) -> Dict[str, torch.Tensor]:
        # Get BERT outputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        
        # Top level classification
        top_features = self.top_features(hidden_states, attention_mask)
        top_logits = self.top_classifier(top_features)
        top_probs = F.softmax(top_logits, dim=-1)
        
        # Narrative level classification with top level information
        narrative_features = self.narrative_features(hidden_states, attention_mask)
        narrative_logits = self.narrative_classifier(torch.cat([narrative_features, top_probs], dim=-1))
        narrative_probs = F.softmax(narrative_logits, dim=-1)
        
        # Subnarrative level classification with narrative level information
        subnarrative_features = self.subnarrative_features(hidden_states, attention_mask)
        subnarrative_logits = self.subnarrative_classifier(
            torch.cat([subnarrative_features, narrative_probs], dim=-1)
        )
        
        outputs = {
            'narrative_logits': top_logits,
            'subnarrative_logits': narrative_logits,
            'subsubnarrative_logits': subnarrative_logits
        }
        
        if labels is not None:
            loss = self.calculate_hierarchical_loss(outputs, labels)
            outputs['loss'] = loss
            
        return outputs
    
    def calculate_hierarchical_loss(
        self,
        outputs: Dict[str, torch.Tensor],
        labels: Dict[str, torch.Tensor]
    ) -> torch.Tensor:
        # Calculate individual level losses
        narrative_loss = F.cross_entropy(outputs['narrative_logits'], labels['narrative'])
        subnarrative_loss = F.cross_entropy(outputs['subnarrative_logits'], labels['subnarrative'])
        subsubnarrative_loss = F.cross_entropy(outputs['subsubnarrative_logits'], labels['subsubnarrative'])
        
        # Combine losses with level weights
        total_loss = (
            self.level_weights[0] * narrative_loss +
            self.level_weights[1] * subnarrative_loss +
            self.level_weights[2] * subsubnarrative_loss
        )
        
        # Add consistency regularization
        consistency_loss = self.calculate_consistency_loss(outputs, labels)
        total_loss += 0.1 * consistency_loss
        
        return total_loss
    
    def calculate_consistency_loss(
        self,
        outputs: Dict[str, torch.Tensor],
        labels: Dict[str, torch.Tensor]
    ) -> torch.Tensor:
        # Get probabilities for each level
        narrative_probs = F.softmax(outputs['narrative_logits'], dim=-1)
        subnarrative_probs = F.softmax(outputs['subnarrative_logits'], dim=-1)
        subsubnarrative_probs = F.softmax(outputs['subsubnarrative_logits'], dim=-1)
        
        # Calculate consistency loss using negative log likelihood
        consistency_loss = -(
            torch.log(narrative_probs[torch.arange(len(labels['narrative'])), labels['narrative']] + 1e-10) +
            torch.log(subnarrative_probs[torch.arange(len(labels['subnarrative'])), labels['subnarrative']] + 1e-10) +
            torch.log(subsubnarrative_probs[torch.arange(len(labels['subsubnarrative'])), labels['subsubnarrative']] + 1e-10)
        ).mean()
        
        return consistency_loss
    
    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        """
        Make predictions without computing loss
        """
        outputs = self.forward(input_ids, attention_mask)
        predictions = {
            'narrative': torch.argmax(outputs['narrative_logits'], dim=-1),
            'subnarrative': torch.argmax(outputs['subnarrative_logits'], dim=-1),
            'subsubnarrative': torch.argmax(outputs['subsubnarrative_logits'], dim=-1)
        }
        return predictions

In [28]:
class HierarchicalDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        tokenizer: AutoTokenizer,
        max_length: int = 512,
        is_training: bool = True
    ):
        # Load data
        self.df = pd.read_csv(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_training = is_training
        
        # Initialize label encoders
        self.label_encoders = {
            'narrative': LabelEncoder(),
            'subnarrative': LabelEncoder(),
            'subsubnarrative': LabelEncoder()
        }
        
        # Fit label encoders and transform labels
        self.encoded_labels = {}
        for level in self.label_encoders:
            self.label_encoders[level].fit(self.df[level].unique())
            self.encoded_labels[level] = self.label_encoders[level].transform(self.df[level])
            
        # Store number of classes for each level
        self.num_classes = {
            level: len(self.label_encoders[level].classes_)
            for level in self.label_encoders
        }
        
        # Save label mappings
        self.label_mappings = {
            level: dict(enumerate(self.label_encoders[level].classes_))
            for level in self.label_encoders
        }
        
    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        text = self.df.iloc[idx]['text']
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Get encoded labels
        labels = {
            'narrative': torch.tensor(self.encoded_labels['narrative'][idx]),
            'subnarrative': torch.tensor(self.encoded_labels['subnarrative'][idx]),
            'subsubnarrative': torch.tensor(self.encoded_labels['subsubnarrative'][idx])
        }
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }
    
    def save_label_mappings(self, path: str):
        """Save label encoders and mappings"""
        with open(path, 'w') as f:
            json.dump(self.label_mappings, f)
    
    @classmethod
    def load_label_mappings(cls, path: str) -> Dict:
        """Load label mappings"""
        with open(path, 'r') as f:
            return json.load(f)

class HierarchicalTrainer:
    def __init__(
        self,
        model: torch.nn.Module,
        train_dataloader: DataLoader,
        val_dataloader: DataLoader,
        optimizer: torch.optim.Optimizer,
        device: torch.device,
        num_epochs: int = 10,
        patience: int = 3
    ):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.optimizer = optimizer
        self.device = device
        self.num_epochs = num_epochs
        self.patience = patience
        
        self.best_val_loss = float('inf')
        self.patience_counter = 0
        
    def train_epoch(self) -> Dict[str, float]:
        self.model.train()
        total_loss = 0
        all_predictions = {
            'narrative': [], 'subnarrative': [], 'subsubnarrative': []
        }
        all_labels = {
            'narrative': [], 'subnarrative': [], 'subsubnarrative': []
        }
        
        for batch in tqdm(self.train_dataloader, desc="Training"):
            self.optimizer.zero_grad()
            
            # Move batch to device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = {k: v.to(self.device) for k, v in batch['labels'].items()}
            
            # Forward pass
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            # Backward pass
            loss = outputs['loss']
            loss.backward()
            self.optimizer.step()
            
            total_loss += loss.item()
            
            # Store predictions and labels for metrics
            with torch.no_grad():
                for level in ['narrative', 'subnarrative', 'subsubnarrative']:
                    preds = torch.argmax(outputs[f'{level}_logits'], dim=1)
                    all_predictions[level].extend(preds.cpu().numpy())
                    all_labels[level].extend(labels[level].cpu().numpy())
        
        # Calculate metrics
        metrics = self.calculate_metrics(all_predictions, all_labels)
        metrics['loss'] = total_loss / len(self.train_dataloader)
        
        return metrics
    
    @torch.no_grad()
    def validate(self) -> Dict[str, float]:
        self.model.eval()
        total_loss = 0
        all_predictions = {
            'narrative': [], 'subnarrative': [], 'subsubnarrative': []
        }
        all_labels = {
            'narrative': [], 'subnarrative': [], 'subsubnarrative': []
        }
        
        for batch in tqdm(self.val_dataloader, desc="Validation"):
            # Move batch to device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = {k: v.to(self.device) for k, v in batch['labels'].items()}
            
            # Forward pass
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            total_loss += outputs['loss'].item()
            
            # Store predictions and labels
            for level in ['narrative', 'subnarrative', 'subsubnarrative']:
                preds = torch.argmax(outputs[f'{level}_logits'], dim=1)
                all_predictions[level].extend(preds.cpu().numpy())
                all_labels[level].extend(labels[level].cpu().numpy())
        
        # Calculate metrics
        metrics = self.calculate_metrics(all_predictions, all_labels)
        metrics['loss'] = total_loss / len(self.val_dataloader)
        
        return metrics
    
    def calculate_metrics(
        self,
        predictions: Dict[str, List],
        labels: Dict[str, List]
    ) -> Dict[str, float]:
        metrics = {}
        
        for level in predictions:
            level_metrics = classification_report(
                labels[level],
                predictions[level],
                output_dict=True
            )
            metrics[f'{level}_f1'] = level_metrics['weighted avg']['f1-score']
            metrics[f'{level}_accuracy'] = level_metrics['accuracy']
        
        return metrics
    
    def train(self) -> Dict[str, List[float]]:
        history = {
            'train_loss': [], 'val_loss': [],
            'train_metrics': [], 'val_metrics': []
        }
        
        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")
            
            # Training
            train_metrics = self.train_epoch()
            history['train_loss'].append(train_metrics['loss'])
            history['train_metrics'].append(train_metrics)
            
            # Validation
            val_metrics = self.validate()
            history['val_loss'].append(val_metrics['loss'])
            history['val_metrics'].append(val_metrics)
            
            # Print metrics
            print(f"Train Loss: {train_metrics['loss']:.4f}")
            print(f"Val Loss: {val_metrics['loss']:.4f}")
            for level in ['narrative', 'subnarrative', 'subsubnarrative']:
                print(f"{level} - Train F1: {train_metrics[f'{level}_f1']:.4f}, "
                      f"Val F1: {val_metrics[f'{level}_f1']:.4f}")
            
            # Early stopping
            if val_metrics['loss'] < self.best_val_loss:
                self.best_val_loss = val_metrics['loss']
                self.patience_counter = 0
                # Save best model
                torch.save(self.model.state_dict(), 'model/best_model.pt')
            # else:
            #     self.patience_counter += 1
            #     if self.patience_counter >= self.patience:
            #         print("Early stopping triggered")
            #         break
        
        return history

# Usage example:
def create_dataloaders(
    train_path: str,
    val_path: str,
    tokenizer: AutoTokenizer,
    batch_size: int = 16
) -> Tuple[DataLoader, DataLoader, HierarchicalDataset]:
    # Create datasets
    train_dataset = HierarchicalDataset(train_path, tokenizer, is_training=True)
    val_dataset = HierarchicalDataset(val_path, tokenizer, is_training=False)
    
    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4
    )
    
    return train_dataloader, val_dataloader, train_dataset

In [46]:

def train_hierarchical_classifier(
    train_path: str,
    val_path: str,
    model_name: str = "XLM-RoBERTa-base",
    batch_size: int = 16,
    learning_rate: float = 2e-5,
    num_epochs: int = 10
):
    # Initialize tokenizer and base model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModel.from_pretrained(model_name)
    
    # Create dataloaders
    train_dataloader, val_dataloader, train_dataset = create_dataloaders(
        train_path,
        val_path,
        tokenizer,
        batch_size
    )
    
    # Initialize model
    model = HierarchicalClassifier(
        pretrained_model=base_model,
        num_top=train_dataset.num_classes['narrative'],
        num_narratives=train_dataset.num_classes['subnarrative'],
        num_subnarratives=train_dataset.num_classes['subsubnarrative']
    )
    
    # Save label mappings
    train_dataset.save_label_mappings('label_mappings.json')
    
    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=10)
    
    # Initialize trainer
    trainer = HierarchicalTrainer(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        device=device,
        num_epochs=num_epochs
    )
    
    # Train model
    history = trainer.train()
    
    return model, history, train_dataset.label_mappings

# Example usage
model, history, label_mappings = train_hierarchical_classifier(
    train_path="train.csv",
    val_path="val.csv",
    num_epochs=20
)

# Save final model
torch.save({
    'model_state_dict': model.state_dict(),
    'label_mappings': label_mappings,
    'history': history
}, 'final_model.pt')


Epoch 1/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 9.2191
Val Loss: 11.3127
narrative - Train F1: 0.8234, Val F1: 0.8925
subnarrative - Train F1: 0.1302, Val F1: 0.2093
subsubnarrative - Train F1: 0.0837, Val F1: 0.0008

Epoch 2/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 7.7371
Val Loss: 11.8080
narrative - Train F1: 0.8991, Val F1: 0.9259
subnarrative - Train F1: 0.2271, Val F1: 0.1932
subsubnarrative - Train F1: 0.1307, Val F1: 0.0234

Epoch 3/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 7.3212
Val Loss: 11.9439
narrative - Train F1: 0.9248, Val F1: 0.9252
subnarrative - Train F1: 0.2725, Val F1: 0.2557
subsubnarrative - Train F1: 0.1559, Val F1: 0.0248

Epoch 4/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 6.9261
Val Loss: 12.3851
narrative - Train F1: 0.9436, Val F1: 0.9270
subnarrative - Train F1: 0.3227, Val F1: 0.2832
subsubnarrative - Train F1: 0.1903, Val F1: 0.0216

Epoch 5/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 6.6143
Val Loss: 12.7214
narrative - Train F1: 0.9583, Val F1: 0.9272
subnarrative - Train F1: 0.3584, Val F1: 0.3208
subsubnarrative - Train F1: 0.2058, Val F1: 0.0238

Epoch 6/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 6.3312
Val Loss: 12.1949
narrative - Train F1: 0.9755, Val F1: 0.9356
subnarrative - Train F1: 0.3733, Val F1: 0.3292
subsubnarrative - Train F1: 0.2266, Val F1: 0.0306

Epoch 7/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 6.0185
Val Loss: 13.3025
narrative - Train F1: 0.9861, Val F1: 0.9241
subnarrative - Train F1: 0.4080, Val F1: 0.3198
subsubnarrative - Train F1: 0.2396, Val F1: 0.0284

Epoch 8/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.9546
Val Loss: 13.2262
narrative - Train F1: 0.9796, Val F1: 0.9370
subnarrative - Train F1: 0.4162, Val F1: 0.3446
subsubnarrative - Train F1: 0.2405, Val F1: 0.0283

Epoch 9/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.7101
Val Loss: 13.6006
narrative - Train F1: 0.9846, Val F1: 0.9349
subnarrative - Train F1: 0.4337, Val F1: 0.3593
subsubnarrative - Train F1: 0.2607, Val F1: 0.0292

Epoch 10/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.5729
Val Loss: 14.1193
narrative - Train F1: 0.9926, Val F1: 0.9422
subnarrative - Train F1: 0.4457, Val F1: 0.3537
subsubnarrative - Train F1: 0.2724, Val F1: 0.0285

Epoch 11/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.4050
Val Loss: 13.8437
narrative - Train F1: 0.9926, Val F1: 0.9295
subnarrative - Train F1: 0.4657, Val F1: 0.3419
subsubnarrative - Train F1: 0.2822, Val F1: 0.0275

Epoch 12/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.2338
Val Loss: 14.2413
narrative - Train F1: 0.9954, Val F1: 0.9317
subnarrative - Train F1: 0.4889, Val F1: 0.3503
subsubnarrative - Train F1: 0.2944, Val F1: 0.0241

Epoch 13/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.1395
Val Loss: 14.3028
narrative - Train F1: 0.9935, Val F1: 0.9329
subnarrative - Train F1: 0.5025, Val F1: 0.3587
subsubnarrative - Train F1: 0.3066, Val F1: 0.0291

Epoch 14/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 5.0295
Val Loss: 14.9266
narrative - Train F1: 0.9933, Val F1: 0.9394
subnarrative - Train F1: 0.5068, Val F1: 0.3475
subsubnarrative - Train F1: 0.3153, Val F1: 0.0296

Epoch 15/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.8611
Val Loss: 15.0166
narrative - Train F1: 0.9945, Val F1: 0.9401
subnarrative - Train F1: 0.5301, Val F1: 0.3619
subsubnarrative - Train F1: 0.3265, Val F1: 0.0273

Epoch 16/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.7521
Val Loss: 15.0338
narrative - Train F1: 0.9969, Val F1: 0.9352
subnarrative - Train F1: 0.5351, Val F1: 0.3561
subsubnarrative - Train F1: 0.3403, Val F1: 0.0270

Epoch 17/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.6272
Val Loss: 15.4342
narrative - Train F1: 0.9976, Val F1: 0.9353
subnarrative - Train F1: 0.5437, Val F1: 0.3549
subsubnarrative - Train F1: 0.3513, Val F1: 0.0325

Epoch 18/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.5039
Val Loss: 15.4117
narrative - Train F1: 0.9976, Val F1: 0.9360
subnarrative - Train F1: 0.5595, Val F1: 0.3589
subsubnarrative - Train F1: 0.3670, Val F1: 0.0310

Epoch 19/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.4327
Val Loss: 15.3291
narrative - Train F1: 0.9954, Val F1: 0.9437
subnarrative - Train F1: 0.5562, Val F1: 0.3539
subsubnarrative - Train F1: 0.3634, Val F1: 0.0325

Epoch 20/20


Training:   0%|          | 0/205 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Train Loss: 4.3292
Val Loss: 15.8359
narrative - Train F1: 0.9982, Val F1: 0.9337
subnarrative - Train F1: 0.5661, Val F1: 0.3441
subsubnarrative - Train F1: 0.3900, Val F1: 0.0292


In [12]:
# val_df = pd.read_csv('val.csv')
# text = val_df.sample(1)['text'].values[0]

# # model.predict(input_ids, attention_mask)
# text

"Russia Turns Missiles on Ukraine's Farm Storage \n\n Russian cruise missiles, flying low and hugging the terrain to dodge Ukrainian air defenses, destroyed farm storage buildings in the Odesa region early Friday, Ukrainian officials said, as the Kremlin’s forces expanded their targets following three days of bombardment of the region’s Black Sea port infrastructure.\n\nTwo missiles struck the storage facility, starting a fire, and while workers fought to put it out another missile hit, destroying farm and firefighting equipment, the southern Odesa region’s Gov. Oleh Kiper said.\n\nThe attack injured two people and destroyed 100 metric tons of peas and 20 metric tons of barley, according to Kiper.\n\nThe attack was small-scale in comparison with barrages in recent days that put Odesa in Russia’s crosshairs after Moscow tore up a wartime deal that allowed Ukraine to send grain through the key Black Sea port.\n\nRussia targeted Ukrainian critical grain export infrastructure after vowing 

In [47]:
model_name = "XLM-RoBERTa-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# base_model = AutoModel.from_pretrained(model_name)

In [48]:
def predict_text(text: str, model: HierarchicalClassifier, tokenizer: AutoTokenizer, label_mappings: dict) -> dict:
    """
    Predict using the model's built-in predict method with CUDA support
    
    Args:
        text: Input text to classify
        model: Trained HierarchicalClassifier model (already loaded)
        tokenizer: BERT tokenizer
        label_mappings: Dictionary mapping indices to label names
    """
    # Ensure model is on CUDA
    device = torch.device('cuda')
    model = model.to(device)
    
    # Tokenize input
    inputs = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    # Move inputs to CUDA
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get predictions
    model.eval()
    with torch.no_grad():
        predictions = model.predict(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )
    
    # Convert prediction indices to labels
    results = {}
    for level in ['narrative', 'subnarrative', 'subsubnarrative']:
        pred_idx = predictions[level][0].cpu().item()  # Move to CPU before getting item
        # Try both string and integer keys
        try:
            results[level] = label_mappings[level][str(pred_idx)]
        except KeyError:
            try:
                results[level] = label_mappings[level][pred_idx]
            except KeyError:
                print(f"Available keys for {level}: {list(label_mappings[level].keys())}")
                print(f"Predicted index for {level}: {pred_idx}")
                results[level] = f"Unknown-{pred_idx}"
    
    return results

# Example usage with debug info
def predict_with_debug(text: str, model: HierarchicalClassifier, tokenizer: AutoTokenizer, label_mappings: dict):
    print("\nLabel mapping structure:")
    for level in ['narrative', 'subnarrative', 'subsubnarrative']:
        print(f"\n{level} mapping keys type:", type(next(iter(label_mappings[level].keys()))))
        print(f"First few {level} mappings:", dict(list(label_mappings[level].items())[:3]))
    
    predictions = predict_text(text, model, tokenizer, label_mappings)
    
    print("\nInput text:")
    print(text)
    print("\nPredictions:")
    for level, pred in predictions.items():
        print(f"{level}: {pred}")
    
    return predictions

# # Test with a sample sentence
# test = data_exploded.sample(1)
# test_sentence = test['text'].values[0]

test_sentence = "I am a pacifist and I believe in non-violence."
print(test_sentence, "\n**************************************")
# test_sentence = "Climate change policies are just a way for governments to control industries while China keeps polluting."
predictions = predict_with_debug(test_sentence, model, tokenizer, label_mappings)

I am a pacifist and I believe in non-violence. 
**************************************

Label mapping structure:

narrative mapping keys type: <class 'int'>
First few narrative mappings: {0: 'CC', 1: 'Other', 2: 'URW'}

subnarrative mapping keys type: <class 'int'>
First few subnarrative mappings: {0: 'Amplifying Climate Fears', 1: 'Amplifying war-related fears', 2: 'Blaming the war on others rather than the invader'}

subsubnarrative mapping keys type: <class 'int'>
First few subsubnarrative mappings: {0: 'Ad hominem attacks on key activists', 1: 'Amplifying existing fears of global warming', 2: 'Blaming global elites'}

Input text:
I am a pacifist and I believe in non-violence.

Predictions:
narrative: URW
subnarrative: Discrediting Ukraine
subsubnarrative: Other


In [49]:
import json
with open('label_mappings.json', 'r') as f:
    label_mappings = json.load(f)

In [50]:
label_mappings

{'narrative': {'0': 'CC', '1': 'Other', '2': 'URW'},
 'subnarrative': {'0': 'Amplifying Climate Fears',
  '1': 'Amplifying war-related fears',
  '2': 'Blaming the war on others rather than the invader',
  '3': 'Climate change is beneficial',
  '4': 'Controversy about green technologies',
  '5': 'Criticism of climate movement',
  '6': 'Criticism of climate policies',
  '7': 'Criticism of institutions and authorities',
  '8': 'Discrediting Ukraine',
  '9': 'Discrediting the West, Diplomacy',
  '10': 'Distrust towards Media',
  '11': 'Downplaying climate change',
  '12': 'Green policies are geopolitical instruments',
  '13': 'Hidden plots by secret schemes of powerful groups',
  '14': 'Negative Consequences for the West',
  '15': 'Overpraising the West',
  '16': 'Praise of Russia',
  '17': 'Questioning the measurements and science',
  '18': 'Russia is the Victim',
  '19': 'Speculating war outcomes',
  '20': nan},
 'subsubnarrative': {'0': 'Ad hominem attacks on key activists',
  '1': 'Amp

In [51]:
dev_df = pd.read_csv("../csv_data/dev_with_text.csv")

In [86]:
en_dev = pd.read_csv("/home/ka55gawy/semeval2025/task_data/dev/EN/subtask-2-annotations.txt", sep="\t", header=None, names=["filepath",  "subnarrative", "subsubnarrative"])
bg_dev = pd.read_csv("/home/ka55gawy/semeval2025/task_data/dev/BG/subtask-2-annotations.txt", sep="\t", header=None, names=["filepath",  "subnarrative", "subsubnarrative"])
pt_dev = pd.read_csv("/home/ka55gawy/semeval2025/task_data/dev/PT/subtask-2-annotations.txt", sep="\t", header=None, names=["filepath",  "subnarrative", "subsubnarrative"])
hi_dev = pd.read_csv("/home/ka55gawy/semeval2025/task_data/dev/HI/subtask-2-annotations.txt", sep="\t", header=None, names=["filepath",  "subnarrative", "subsubnarrative"])
ru_dev = pd.read_csv("/home/ka55gawy/semeval2025/task_data/dev/RU/subtask-2-annotations.txt", sep="\t", header=None, names=["filepath",  "subnarrative", "subsubnarrative"])

In [87]:
en_dev

Unnamed: 0,filepath,subnarrative,subsubnarrative
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes..."
1,EN_CC_200053.txt,Other,Other
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...
5,EN_CC_200049.txt,CC: Questioning the measurements and science;C...,CC: Questioning the measurements and science: ...
6,EN_UA_DEV_100003.txt,Other,Other
7,EN_UA_DEV_100033.txt,URW: Speculating war outcomes,URW: Speculating war outcomes: Russian army is...
8,EN_CC_200036.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Ad hominem ...
9,EN_CC_200079.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...


In [88]:
def read_file_content(filename):
    if not filename:
        return None
    if filename.startswith("EN"):
        path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/EN/subtask-2-documents"
    elif filename.startswith("BG"):    
        path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/BG/subtask-2-documents"
    elif filename.startswith("PT"):
        path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/PT/subtask-2-documents"
    elif filename.startswith("RU"):
        path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/RU/subtask-2-documents"
    elif filename.startswith("HI"):
        path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/HI/subtask-2-documents"
    full_path = os.path.join(path_prefix, filename)
    encoding = 'utf-8'
    try:
        with open(full_path, 'r', encoding=encoding) as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file {full_path}: {e}")
        return None


def bg_file_content(filename):
    path_prefix = "/home/ka55gawy/semeval2025/task_data/dev/BG/subtask-2-documents"
    full_path = os.path.join(path_prefix, filename)
    encoding = 'utf-8'
    try:
        with open(full_path, 'r', encoding=encoding) as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file {full_path}: {e}")
        return None





In [80]:
import os

In [89]:
en_dev['text'] = en_dev.filepath.apply(read_file_content)
bg_dev["text"] = bg_dev.filepath.apply(bg_file_content)
pt_dev["text"] = pt_dev.filepath.apply(read_file_content)
hi_dev["text"] = hi_dev.filepath.apply(read_file_content)
ru_dev["text"] = ru_dev.filepath.apply(read_file_content)

In [91]:
en_dev

Unnamed: 0,filepath,subnarrative,subsubnarrative,text
0,EN_UA_DEV_100012.txt,"URW: Discrediting the West, Diplomacy;URW: Dis...","URW: Discrediting the West, Diplomacy: The Wes...",Ukraine's Minerals: What the West is Fighting ...
1,EN_CC_200053.txt,Other,Other,UK’s Sunak Reverses Decision to Skip COP27 Cli...
2,EN_CC_200040.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Other;CC: C...,Climate Protesters Out Of Control As They Atta...
3,EN_CC_200070.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Wat? L.A. Mayor Garcetti Flies to Argentina to...
4,EN_UA_DEV_100034.txt,URW: Overpraising the West,URW: Overpraising the West: The West belongs i...,Opinion: Restructuring Ukrainian debt is a ste...
5,EN_CC_200049.txt,CC: Questioning the measurements and science;C...,CC: Questioning the measurements and science: ...,Alarmists Warn of U.S. ‘Heat Dome’ Tied to Hum...
6,EN_UA_DEV_100003.txt,Other,Other,"Medvedev: Russia Seeks More in Ukraine, 'Proba..."
7,EN_UA_DEV_100033.txt,URW: Speculating war outcomes,URW: Speculating war outcomes: Russian army is...,Wild Kremlin TV hosts threaten the U.S. with n...
8,EN_CC_200036.txt,CC: Criticism of climate movement;CC: Criticis...,CC: Criticism of climate movement: Ad hominem ...,Climate cultists push bizarre scare language a...
9,EN_CC_200079.txt,CC: Criticism of institutions and authorities;...,CC: Criticism of institutions and authorities:...,Klaus Schwab Wants To Ban People Washing Their...


In [82]:
en_text = en_dev_processed["text"].tolist()
bg_text = bg_dev_processed["text"].tolist()
pt_text = pt_dev_processed["text"].tolist()
hi_text = hi_dev_processed["text"].tolist()
ru_text = ru_dev_processed["text"].tolist()


0       Ukraine's Minerals: What the West is Fighting ...
1       UK’s Sunak Reverses Decision to Skip COP27 Cli...
2       Climate Protesters Out Of Control As They Atta...
3       Wat? L.A. Mayor Garcetti Flies to Argentina to...
4       Opinion: Restructuring Ukrainian debt is a ste...
5       Alarmists Warn of U.S. ‘Heat Dome’ Tied to Hum...
6       Medvedev: Russia Seeks More in Ukraine, 'Proba...
7       Wild Kremlin TV hosts threaten the U.S. with n...
8       Climate cultists push bizarre scare language a...
9       Klaus Schwab Wants To Ban People Washing Their...
10      General Milley: Russian military stocks rapidl...
11      There Are Only Downsides To Prolonging The War...
12      New Sarah Westall & Scott Ritter: Russia is Wi...
13      Greta Thunberg Calls For 'Overthrow of Whole C...
14      Biden’s green policies are making housing no l...
15      CO2 is the GREENING molecule: New study shows ...
16      Link to Major Banks Bend The Knee To Climate A...
17      How Gr

In [94]:
# Define your list of languages
languages = ["en", "bg", "hi", "ru", "pt"]

# Iterate over each language
for lang in languages:
    # Load the appropriate dataset for each language
    lang_dev = eval(f"{lang}_dev")  # Assuming you have variables like en_dev, bg_dev, etc.
    
    # Get the text data for the current language
    lang_text = lang_dev["text"].tolist()
    
    # Process each sentence in this language
    res = []
    for sentence in lang_text:
        res.append(predict_with_debug(sentence, model, tokenizer, label_mappings))
    
    # Create and format the results DataFrame
    res_df = pd.DataFrame(res)
    res_df['filename'] = lang_dev['filepath']
    res_df["top"] = res_df["narrative"] + ": " + res_df["subnarrative"]
    res_df["nar"] = res_df["top"] + ": " + res_df["subsubnarrative"]
    res_df = res_df[["filename", "top", "nar"]]
    
    # Save results to a language-specific file
    res_df.to_csv(f"results/{lang}_dev_results.txt", index=False, sep="\t", header=False)
    
    print(f"Processed {lang} language data")


Label mapping structure:

narrative mapping keys type: <class 'str'>
First few narrative mappings: {'0': 'CC', '1': 'Other', '2': 'URW'}

subnarrative mapping keys type: <class 'str'>
First few subnarrative mappings: {'0': 'Amplifying Climate Fears', '1': 'Amplifying war-related fears', '2': 'Blaming the war on others rather than the invader'}

subsubnarrative mapping keys type: <class 'str'>
First few subsubnarrative mappings: {'0': 'Ad hominem attacks on key activists', '1': 'Amplifying existing fears of global warming', '2': 'Blaming global elites'}

Input text:
Ukraine's Minerals: What the West is Fighting For 

Washington “cannot afford” to allow Russia to achieve victory in the Ukraine conflict as this would mean losing direct access to vast mineral assets. That was the view of US Senator Lindsey Graham* in an interview with ‘Face the Nation’ on CBS News in June. “They’re sitting on 10 to $12 trillion of critical minerals in Ukraine. They could be the richest country in all of E

In [None]:
en_text = en_dev["text"].tolist()

res = []
for sentence in en_text:
    res.append(predict_with_debug(sentence, model, tokenizer, label_mappings))
res_df = pd.DataFrame(res)
res_df['filename'] = en_dev['filepath']
res_df["top"] = res_df["narrative"] + ": " + res_df["subnarrative"]
res_df["nar"] = res_df["top"] + ": " + res_df["subsubnarrative"]
res_df = res_df[["filename", "top", "nar"]]
res_df.to_csv("results/en_dev_results.txt", index=False, sep="\t", header=False)


Label mapping structure:

narrative mapping keys type: <class 'str'>
First few narrative mappings: {'0': 'CC', '1': 'Other', '2': 'URW'}

subnarrative mapping keys type: <class 'str'>
First few subnarrative mappings: {'0': 'Amplifying Climate Fears', '1': 'Amplifying war-related fears', '2': 'Blaming the war on others rather than the invader'}

subsubnarrative mapping keys type: <class 'str'>
First few subsubnarrative mappings: {'0': 'Ad hominem attacks on key activists', '1': 'Amplifying existing fears of global warming', '2': 'Blaming global elites'}

Input text:
Ukraine's Minerals: What the West is Fighting For 

Washington “cannot afford” to allow Russia to achieve victory in the Ukraine conflict as this would mean losing direct access to vast mineral assets. That was the view of US Senator Lindsey Graham* in an interview with ‘Face the Nation’ on CBS News in June. “They’re sitting on 10 to $12 trillion of critical minerals in Ukraine. They could be the richest country in all of E

In [54]:
res_df = pd.DataFrame(res)

In [55]:
res_df['filename'] = dev_df['filename']
res_df

Unnamed: 0,narrative,subnarrative,subsubnarrative,filename
0,Other,,,EN_UA_DEV_100012.txt
1,CC,Criticism of institutions and authorities,,EN_CC_200053.txt
2,CC,Criticism of climate movement,Other,EN_CC_200040.txt
3,Other,,,EN_CC_200070.txt
4,URW,Discrediting Ukraine,,EN_UA_DEV_100034.txt
5,CC,Amplifying Climate Fears,Amplifying existing fears of global warming,EN_CC_200049.txt
6,URW,Amplifying war-related fears,Other,EN_UA_DEV_100003.txt
7,URW,Amplifying war-related fears,There is a real possibility that nuclear weapo...,EN_UA_DEV_100033.txt
8,CC,Hidden plots by secret schemes of powerful groups,Other,EN_CC_200036.txt
9,Other,,,EN_CC_200079.txt


In [56]:
res_df["top"] = res_df["narrative"] + ": " + res_df["subnarrative"]
res_df["nar"] = res_df["top"] + ": " + res_df["subsubnarrative"]
res_df = res_df[["filename", "top", "nar"]]

In [57]:
res_df.to_csv("results/res1.txt", index=False, sep="\t", header=False)

# Predict with loaded model

In [40]:
num_top = 3
num_subnarratives = 21
num_subsubnarratives = 68

In [41]:
train_path = "legacy/train.csv"
val_path = "legacy/val.csv"

In [42]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# base_model = AutoModel.from_pretrained("bert-base-uncased")

In [43]:
# Initialize model
loaded_model = HierarchicalClassifier(
    pretrained_model=base_model,
    num_top=num_top,
    num_narratives=num_subnarratives,
    num_subnarratives=num_subsubnarratives
)

NameError: name 'base_model' is not defined

In [92]:
state_dict = torch.load('final_model.pt')
loaded_model.load_state_dict(state_dict)
loaded_model.eval()

  state_dict = torch.load('final_model.pt')


RuntimeError: Error(s) in loading state_dict for HierarchicalClassifier:
	Missing key(s) in state_dict: "level_weights", "bert.embeddings.word_embeddings.weight", "bert.embeddings.position_embeddings.weight", "bert.embeddings.token_type_embeddings.weight", "bert.embeddings.LayerNorm.weight", "bert.embeddings.LayerNorm.bias", "bert.encoder.layer.0.attention.self.query.weight", "bert.encoder.layer.0.attention.self.query.bias", "bert.encoder.layer.0.attention.self.key.weight", "bert.encoder.layer.0.attention.self.key.bias", "bert.encoder.layer.0.attention.self.value.weight", "bert.encoder.layer.0.attention.self.value.bias", "bert.encoder.layer.0.attention.output.dense.weight", "bert.encoder.layer.0.attention.output.dense.bias", "bert.encoder.layer.0.attention.output.LayerNorm.weight", "bert.encoder.layer.0.attention.output.LayerNorm.bias", "bert.encoder.layer.0.intermediate.dense.weight", "bert.encoder.layer.0.intermediate.dense.bias", "bert.encoder.layer.0.output.dense.weight", "bert.encoder.layer.0.output.dense.bias", "bert.encoder.layer.0.output.LayerNorm.weight", "bert.encoder.layer.0.output.LayerNorm.bias", "bert.encoder.layer.1.attention.self.query.weight", "bert.encoder.layer.1.attention.self.query.bias", "bert.encoder.layer.1.attention.self.key.weight", "bert.encoder.layer.1.attention.self.key.bias", "bert.encoder.layer.1.attention.self.value.weight", "bert.encoder.layer.1.attention.self.value.bias", "bert.encoder.layer.1.attention.output.dense.weight", "bert.encoder.layer.1.attention.output.dense.bias", "bert.encoder.layer.1.attention.output.LayerNorm.weight", "bert.encoder.layer.1.attention.output.LayerNorm.bias", "bert.encoder.layer.1.intermediate.dense.weight", "bert.encoder.layer.1.intermediate.dense.bias", "bert.encoder.layer.1.output.dense.weight", "bert.encoder.layer.1.output.dense.bias", "bert.encoder.layer.1.output.LayerNorm.weight", "bert.encoder.layer.1.output.LayerNorm.bias", "bert.encoder.layer.2.attention.self.query.weight", "bert.encoder.layer.2.attention.self.query.bias", "bert.encoder.layer.2.attention.self.key.weight", "bert.encoder.layer.2.attention.self.key.bias", "bert.encoder.layer.2.attention.self.value.weight", "bert.encoder.layer.2.attention.self.value.bias", "bert.encoder.layer.2.attention.output.dense.weight", "bert.encoder.layer.2.attention.output.dense.bias", "bert.encoder.layer.2.attention.output.LayerNorm.weight", "bert.encoder.layer.2.attention.output.LayerNorm.bias", "bert.encoder.layer.2.intermediate.dense.weight", "bert.encoder.layer.2.intermediate.dense.bias", "bert.encoder.layer.2.output.dense.weight", "bert.encoder.layer.2.output.dense.bias", "bert.encoder.layer.2.output.LayerNorm.weight", "bert.encoder.layer.2.output.LayerNorm.bias", "bert.encoder.layer.3.attention.self.query.weight", "bert.encoder.layer.3.attention.self.query.bias", "bert.encoder.layer.3.attention.self.key.weight", "bert.encoder.layer.3.attention.self.key.bias", "bert.encoder.layer.3.attention.self.value.weight", "bert.encoder.layer.3.attention.self.value.bias", "bert.encoder.layer.3.attention.output.dense.weight", "bert.encoder.layer.3.attention.output.dense.bias", "bert.encoder.layer.3.attention.output.LayerNorm.weight", "bert.encoder.layer.3.attention.output.LayerNorm.bias", "bert.encoder.layer.3.intermediate.dense.weight", "bert.encoder.layer.3.intermediate.dense.bias", "bert.encoder.layer.3.output.dense.weight", "bert.encoder.layer.3.output.dense.bias", "bert.encoder.layer.3.output.LayerNorm.weight", "bert.encoder.layer.3.output.LayerNorm.bias", "bert.encoder.layer.4.attention.self.query.weight", "bert.encoder.layer.4.attention.self.query.bias", "bert.encoder.layer.4.attention.self.key.weight", "bert.encoder.layer.4.attention.self.key.bias", "bert.encoder.layer.4.attention.self.value.weight", "bert.encoder.layer.4.attention.self.value.bias", "bert.encoder.layer.4.attention.output.dense.weight", "bert.encoder.layer.4.attention.output.dense.bias", "bert.encoder.layer.4.attention.output.LayerNorm.weight", "bert.encoder.layer.4.attention.output.LayerNorm.bias", "bert.encoder.layer.4.intermediate.dense.weight", "bert.encoder.layer.4.intermediate.dense.bias", "bert.encoder.layer.4.output.dense.weight", "bert.encoder.layer.4.output.dense.bias", "bert.encoder.layer.4.output.LayerNorm.weight", "bert.encoder.layer.4.output.LayerNorm.bias", "bert.encoder.layer.5.attention.self.query.weight", "bert.encoder.layer.5.attention.self.query.bias", "bert.encoder.layer.5.attention.self.key.weight", "bert.encoder.layer.5.attention.self.key.bias", "bert.encoder.layer.5.attention.self.value.weight", "bert.encoder.layer.5.attention.self.value.bias", "bert.encoder.layer.5.attention.output.dense.weight", "bert.encoder.layer.5.attention.output.dense.bias", "bert.encoder.layer.5.attention.output.LayerNorm.weight", "bert.encoder.layer.5.attention.output.LayerNorm.bias", "bert.encoder.layer.5.intermediate.dense.weight", "bert.encoder.layer.5.intermediate.dense.bias", "bert.encoder.layer.5.output.dense.weight", "bert.encoder.layer.5.output.dense.bias", "bert.encoder.layer.5.output.LayerNorm.weight", "bert.encoder.layer.5.output.LayerNorm.bias", "bert.encoder.layer.6.attention.self.query.weight", "bert.encoder.layer.6.attention.self.query.bias", "bert.encoder.layer.6.attention.self.key.weight", "bert.encoder.layer.6.attention.self.key.bias", "bert.encoder.layer.6.attention.self.value.weight", "bert.encoder.layer.6.attention.self.value.bias", "bert.encoder.layer.6.attention.output.dense.weight", "bert.encoder.layer.6.attention.output.dense.bias", "bert.encoder.layer.6.attention.output.LayerNorm.weight", "bert.encoder.layer.6.attention.output.LayerNorm.bias", "bert.encoder.layer.6.intermediate.dense.weight", "bert.encoder.layer.6.intermediate.dense.bias", "bert.encoder.layer.6.output.dense.weight", "bert.encoder.layer.6.output.dense.bias", "bert.encoder.layer.6.output.LayerNorm.weight", "bert.encoder.layer.6.output.LayerNorm.bias", "bert.encoder.layer.7.attention.self.query.weight", "bert.encoder.layer.7.attention.self.query.bias", "bert.encoder.layer.7.attention.self.key.weight", "bert.encoder.layer.7.attention.self.key.bias", "bert.encoder.layer.7.attention.self.value.weight", "bert.encoder.layer.7.attention.self.value.bias", "bert.encoder.layer.7.attention.output.dense.weight", "bert.encoder.layer.7.attention.output.dense.bias", "bert.encoder.layer.7.attention.output.LayerNorm.weight", "bert.encoder.layer.7.attention.output.LayerNorm.bias", "bert.encoder.layer.7.intermediate.dense.weight", "bert.encoder.layer.7.intermediate.dense.bias", "bert.encoder.layer.7.output.dense.weight", "bert.encoder.layer.7.output.dense.bias", "bert.encoder.layer.7.output.LayerNorm.weight", "bert.encoder.layer.7.output.LayerNorm.bias", "bert.encoder.layer.8.attention.self.query.weight", "bert.encoder.layer.8.attention.self.query.bias", "bert.encoder.layer.8.attention.self.key.weight", "bert.encoder.layer.8.attention.self.key.bias", "bert.encoder.layer.8.attention.self.value.weight", "bert.encoder.layer.8.attention.self.value.bias", "bert.encoder.layer.8.attention.output.dense.weight", "bert.encoder.layer.8.attention.output.dense.bias", "bert.encoder.layer.8.attention.output.LayerNorm.weight", "bert.encoder.layer.8.attention.output.LayerNorm.bias", "bert.encoder.layer.8.intermediate.dense.weight", "bert.encoder.layer.8.intermediate.dense.bias", "bert.encoder.layer.8.output.dense.weight", "bert.encoder.layer.8.output.dense.bias", "bert.encoder.layer.8.output.LayerNorm.weight", "bert.encoder.layer.8.output.LayerNorm.bias", "bert.encoder.layer.9.attention.self.query.weight", "bert.encoder.layer.9.attention.self.query.bias", "bert.encoder.layer.9.attention.self.key.weight", "bert.encoder.layer.9.attention.self.key.bias", "bert.encoder.layer.9.attention.self.value.weight", "bert.encoder.layer.9.attention.self.value.bias", "bert.encoder.layer.9.attention.output.dense.weight", "bert.encoder.layer.9.attention.output.dense.bias", "bert.encoder.layer.9.attention.output.LayerNorm.weight", "bert.encoder.layer.9.attention.output.LayerNorm.bias", "bert.encoder.layer.9.intermediate.dense.weight", "bert.encoder.layer.9.intermediate.dense.bias", "bert.encoder.layer.9.output.dense.weight", "bert.encoder.layer.9.output.dense.bias", "bert.encoder.layer.9.output.LayerNorm.weight", "bert.encoder.layer.9.output.LayerNorm.bias", "bert.encoder.layer.10.attention.self.query.weight", "bert.encoder.layer.10.attention.self.query.bias", "bert.encoder.layer.10.attention.self.key.weight", "bert.encoder.layer.10.attention.self.key.bias", "bert.encoder.layer.10.attention.self.value.weight", "bert.encoder.layer.10.attention.self.value.bias", "bert.encoder.layer.10.attention.output.dense.weight", "bert.encoder.layer.10.attention.output.dense.bias", "bert.encoder.layer.10.attention.output.LayerNorm.weight", "bert.encoder.layer.10.attention.output.LayerNorm.bias", "bert.encoder.layer.10.intermediate.dense.weight", "bert.encoder.layer.10.intermediate.dense.bias", "bert.encoder.layer.10.output.dense.weight", "bert.encoder.layer.10.output.dense.bias", "bert.encoder.layer.10.output.LayerNorm.weight", "bert.encoder.layer.10.output.LayerNorm.bias", "bert.encoder.layer.11.attention.self.query.weight", "bert.encoder.layer.11.attention.self.query.bias", "bert.encoder.layer.11.attention.self.key.weight", "bert.encoder.layer.11.attention.self.key.bias", "bert.encoder.layer.11.attention.self.value.weight", "bert.encoder.layer.11.attention.self.value.bias", "bert.encoder.layer.11.attention.output.dense.weight", "bert.encoder.layer.11.attention.output.dense.bias", "bert.encoder.layer.11.attention.output.LayerNorm.weight", "bert.encoder.layer.11.attention.output.LayerNorm.bias", "bert.encoder.layer.11.intermediate.dense.weight", "bert.encoder.layer.11.intermediate.dense.bias", "bert.encoder.layer.11.output.dense.weight", "bert.encoder.layer.11.output.dense.bias", "bert.encoder.layer.11.output.LayerNorm.weight", "bert.encoder.layer.11.output.LayerNorm.bias", "bert.pooler.dense.weight", "bert.pooler.dense.bias", "top_features.attention.attention.0.weight", "top_features.attention.attention.0.bias", "top_features.attention.attention.2.weight", "top_features.attention.attention.2.bias", "top_features.feature_extractor.0.weight", "top_features.feature_extractor.0.bias", "top_features.feature_extractor.3.weight", "top_features.feature_extractor.3.bias", "narrative_features.attention.attention.0.weight", "narrative_features.attention.attention.0.bias", "narrative_features.attention.attention.2.weight", "narrative_features.attention.attention.2.bias", "narrative_features.feature_extractor.0.weight", "narrative_features.feature_extractor.0.bias", "narrative_features.feature_extractor.3.weight", "narrative_features.feature_extractor.3.bias", "subnarrative_features.attention.attention.0.weight", "subnarrative_features.attention.attention.0.bias", "subnarrative_features.attention.attention.2.weight", "subnarrative_features.attention.attention.2.bias", "subnarrative_features.feature_extractor.0.weight", "subnarrative_features.feature_extractor.0.bias", "subnarrative_features.feature_extractor.3.weight", "subnarrative_features.feature_extractor.3.bias", "top_classifier.weight", "top_classifier.bias", "narrative_classifier.weight", "narrative_classifier.bias", "subnarrative_classifier.weight", "subnarrative_classifier.bias". 
	Unexpected key(s) in state_dict: "model_state_dict", "label_mappings", "history". 

In [22]:
model.to('cuda')

HierarchicalClassifier(
  (bert): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [23]:
res = []
for sentence in dev_sentences:
    res.append(predict_text(sentence, model, tokenizer, label_mappings))


# predict_with_debug("I am a pacifist and I believe in non-violence.", model, tokenizer, label_mappings)

In [24]:
res_df = pd.DataFrame(res)

In [25]:
res_df['filename'] = dev_df['filename']
res_df

Unnamed: 0,narrative,subnarrative,subsubnarrative,filename
0,Other,Other,Other,EN_UA_DEV_100012.txt
1,Other,Other,Other,EN_CC_200053.txt
2,CC,Criticism of institutions and authorities,Other,EN_CC_200040.txt
3,Other,Other,Other,EN_CC_200070.txt
4,URW,Other,Other,EN_UA_DEV_100034.txt
5,CC,Criticism of climate policies,Other,EN_CC_200049.txt
6,URW,Other,Other,EN_UA_DEV_100003.txt
7,Other,Other,Other,EN_UA_DEV_100033.txt
8,CC,Criticism of institutions and authorities,Other,EN_CC_200036.txt
9,CC,Criticism of institutions and authorities,Other,EN_CC_200079.txt


In [26]:
res_df["top"] = res_df["narrative"] + ": " + res_df["subnarrative"]
res_df["nar"] = res_df["top"] + ": " + res_df["subsubnarrative"]
res_df = res_df[["filename", "top", "nar"]]

In [27]:
res_df.to_csv("results/res_model_load.txt", index=False, sep="\t", header=False)