# Semeval 2025 Task 10
### Subtask 1: Entity Framing -- Multilingual Model

Given a news article and a list of mentions of named entities (NEs) in the article, assign for each such mention one or more roles using a predefined taxonomy of fine-grained roles covering three main type of roles: protagonists, antagonists, and innocent. This is a multi-label multi-class text-span classification task.

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import seaborn as sns
import os

In [2]:
data = []
ignore_folders = ['.DS_Store']

base_dir_documents = 'data/semeval_data/raw-documents'

for language_folder in os.listdir(base_dir_documents):
    
    if language_folder in ignore_folders:
        continue

    language_path = os.path.join(base_dir_documents, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    
                    article_id = file
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    
                    data.append({
                        'language': language_folder,
                        'article_id': article_id,
                        'content': content
                    })

documents_df = pd.DataFrame(data)

In [3]:
documents_df.shape

(726, 3)

In [4]:
documents_df.sample(5)

Unnamed: 0,language,article_id,content
610,EN,EN_UA_102655.txt,BIOWARFARE ON AMERICANS: RFK Jr. says CIA was ...
317,BG,A9_BG_3762.txt,"Американски професор: Зеленски е убиец, Русия ..."
407,BG,BG_1390.txt,Шефове на най-големите световни петролни компа...
445,HI,HI_138.txt,Climate Change: सीएम विष्णुदेव साय ने किसे बता...
486,HI,HI_6.txt,- Hindi News\n- International\n- Russia Ukrain...


In [5]:
documents_df['article_id'].unique

<bound method Series.unique of 0             PT_53.txt
1             PT_47.txt
2             PT_90.txt
3             PT_84.txt
4            PT_166.txt
             ...       
721    EN_CC_100076.txt
722    EN_UA_103251.txt
723    EN_UA_002991.txt
724    EN_UA_008072.txt
725    EN_UA_015962.txt
Name: article_id, Length: 726, dtype: object>

In [6]:
base_dir_labels = 'data/semeval_data/labels'

raw_annotation_data = []

for language_folder in os.listdir(base_dir_labels):
    
    if language_folder in ignore_folders:
        continue
    
    language_path = os.path.join(base_dir_labels, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            label_file = 'subtask-1-annotations.txt'
            file_path = os.path.join(root, label_file)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    article_id = parts[0]
                    entity_mention = parts[1]
                    start_offset = int(parts[2])
                    end_offset = int(parts[3])
                    main_role = parts[4]
                                
                    sub_roles = parts[5:] 
                    raw_annotation_data.append({
                        "article_id": article_id,
                        "entity_mention": entity_mention,
                        "start_offset": start_offset,
                        "end_offset": end_offset,
                        "main_role": main_role,
                        "sub_roles": sub_roles,
                    })
   
annotations_df = pd.DataFrame(raw_annotation_data)

In [7]:
annotations_df.head()

Unnamed: 0,article_id,entity_mention,start_offset,end_offset,main_role,sub_roles
0,PT_161.txt,Portugal,377,384,Innocent,[Victim]
1,PT_161.txt,França,1072,1077,Innocent,[Victim]
2,PT_161.txt,IPMA,2158,2161,Protagonist,[Guardian]
3,PT_13.txt,Ucrânia,184,190,Innocent,[Victim]
4,PT_196.txt,Engajamundo,421,431,Protagonist,[Rebel]


In [8]:
annotations_df.shape

(2535, 6)

In [9]:
dataset = pd.merge(documents_df, annotations_df, on='article_id')
dataset.head()

Unnamed: 0,language,article_id,content,entity_mention,start_offset,end_offset,main_role,sub_roles
0,PT,PT_53.txt,Mais de 1600 Cientistas Negam a Emergência Cli...,Ivar Giaever,937,948,Protagonist,[Guardian]
1,PT,PT_53.txt,Mais de 1600 Cientistas Negam a Emergência Cli...,John F. Clauser,952,966,Protagonist,[Guardian]
2,PT,PT_47.txt,Zelensky admite a inevitabilidade da retirada ...,Ucrânia,857,863,Innocent,[Forgotten]
3,PT,PT_84.txt,Zelensky planeia segunda cimeira de paz em nov...,Volodymyr Zelensky,113,130,Protagonist,[Peacemaker]
4,PT,PT_84.txt,Zelensky planeia segunda cimeira de paz em nov...,Rússia,372,377,Antagonist,[Foreign Adversary]


In [10]:
def extract_article_id(filename):
    number_part = filename.split('_')[-1].split('.')[0]
    return number_part

print(extract_article_id('EN_UA_103861.txt'))

103861


In [11]:
dataset['article_id'] = dataset['article_id'].apply(extract_article_id)

In [12]:
dataset.head()

Unnamed: 0,language,article_id,content,entity_mention,start_offset,end_offset,main_role,sub_roles
0,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,Ivar Giaever,937,948,Protagonist,[Guardian]
1,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,John F. Clauser,952,966,Protagonist,[Guardian]
2,PT,47,Zelensky admite a inevitabilidade da retirada ...,Ucrânia,857,863,Innocent,[Forgotten]
3,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Volodymyr Zelensky,113,130,Protagonist,[Peacemaker]
4,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Rússia,372,377,Antagonist,[Foreign Adversary]


In [13]:
dataset.shape

(2535, 8)

In [14]:
dataset['main_role'].value_counts()

main_role
Antagonist     1234
Protagonist     737
Innocent        564
Name: count, dtype: int64

In [15]:
dataset['sub_roles'].explode().value_counts()

sub_roles
Victim               478
Foreign Adversary    363
Guardian             285
Virtuous             218
Instigator           207
Peacemaker           163
Incompetent          144
Tyrant               127
Conspirator          120
Deceiver             112
Terrorist            101
Corrupt               90
Rebel                 60
Saboteur              52
Exploited             49
Underdog              43
Traitor               31
Scapegoat             26
Forgotten             24
Bigot                 23
Martyr                20
Spy                   16
Name: count, dtype: int64

In [16]:
def get_context(row, window=150):
    content = row['content']
    start = int(row['start_offset'])
    end = int(row['end_offset'])

    words = content.split()

    pre_entity_text = content[:start].split()
    post_entity_text = content[end + 1:].split()

    context_before = " ".join(pre_entity_text[-window:])
    context_after = " ".join(post_entity_text[:window])

    return context_before, context_after

dataset['context_before'], dataset['context_after'] = zip(*dataset.apply(get_context, axis=1))
dataset['entity_context'] = dataset['context_before'] + " " + dataset['entity_mention'] + " " + dataset['context_after']

In [17]:
dataset.drop(columns=['context_before', 'context_after'], inplace=True)
dataset.head()

Unnamed: 0,language,article_id,content,entity_mention,start_offset,end_offset,main_role,sub_roles,entity_context
0,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,Ivar Giaever,937,948,Protagonist,[Guardian],Mais de 1600 Cientistas Negam a Emergência Cli...
1,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,John F. Clauser,952,966,Protagonist,[Guardian],Mais de 1600 Cientistas Negam a Emergência Cli...
2,PT,47,Zelensky admite a inevitabilidade da retirada ...,Ucrânia,857,863,Innocent,[Forgotten],Zelensky admite a inevitabilidade da retirada ...
3,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Volodymyr Zelensky,113,130,Protagonist,[Peacemaker],Zelensky planeia segunda cimeira de paz em nov...
4,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Rússia,372,377,Antagonist,[Foreign Adversary],Zelensky planeia segunda cimeira de paz em nov...


In [18]:
!python -m  spacy download xx_ent_wiki_sm
!python -m  spacy download en_core_web_sm
!python -m  spacy download pt_core_news_sm

zsh:1: command not found: python
zsh:1: command not found: python
zsh:1: command not found: python


In [19]:
language_model_map = {
    "BG": "xx_ent_wiki_sm",
    "EN": "en_core_web_sm",
    "HI": "xx_ent_wiki_sm", 
    "PT": "pt_core_news_sm",
}

In [20]:
!pip3 install emoji



In [21]:
import spacy
import re
import emoji

nlp_models = {lang: spacy.load(model) for lang, model in language_model_map.items()}

def clean_article(article_text, language_code):
    nlp = nlp_models.get(language_code, nlp_models["BG"])

    article_text = re.sub(r'http\S+|www\S+|https\S+|[a-zA-Z0-9.-]+\.com', '', article_text, flags=re.MULTILINE)
    doc = nlp(article_text)
    cleaned_tokens = [
        token.text + token.whitespace_ for token in doc
        if not (token.is_space or '@' in token.text or emoji.is_emoji(token.text) or token.like_num)
    ]
    return "".join(cleaned_tokens).strip()

dataset["entity_context"] = dataset.apply(lambda row: clean_article(row["entity_context"], row["language"]), axis=1)

In [22]:
from sklearn.preprocessing import LabelEncoder

main_role_encoder = LabelEncoder()
dataset['main_role_encoded'] = main_role_encoder.fit_transform(dataset['main_role'])

In [23]:
dataset.head()

Unnamed: 0,language,article_id,content,entity_mention,start_offset,end_offset,main_role,sub_roles,entity_context,main_role_encoded
0,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,Ivar Giaever,937,948,Protagonist,[Guardian],Mais de Cientistas Negam a Emergência Climátic...,2
1,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,John F. Clauser,952,966,Protagonist,[Guardian],Mais de Cientistas Negam a Emergência Climátic...,2
2,PT,47,Zelensky admite a inevitabilidade da retirada ...,Ucrânia,857,863,Innocent,[Forgotten],Zelensky admite a inevitabilidade da retirada ...,1
3,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Volodymyr Zelensky,113,130,Protagonist,[Peacemaker],Zelensky planeia segunda cimeira de paz em nov...,2
4,PT,84,Zelensky planeia segunda cimeira de paz em nov...,Rússia,372,377,Antagonist,[Foreign Adversary],Zelensky planeia segunda cimeira de paz em nov...,0


In [24]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")



In [25]:
def tokenize_data(entity_contexts, max_length=512):
    encodings = tokenizer(entity_contexts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    return encodings

In [26]:
from sklearn.model_selection import train_test_split

def split_data(data, train_size=0.8, val_size_ratio=0.5):
    train, temp = train_test_split(data, train_size=train_size, shuffle=True)
    
    val_size = (1 - train_size) * val_size_ratio
    
    test, val = train_test_split(temp, train_size=val_size, shuffle=True)
    
    return train, val, test

In [27]:
dataset_train, dataset_val, dataset_test = split_data(dataset)

In [29]:
train_encodings = tokenize_data(dataset_train['entity_context'].tolist())
val_encodings = tokenize_data(dataset_val['entity_context'].tolist())

In [30]:
train_main_roles_truths = dataset_train['main_role_encoded'].tolist()

train_main_roles_truths[:5]

[0, 0, 0, 0, 1]

In [31]:
val_main_roles_truths = dataset_val['main_role_encoded'].tolist()

val_main_roles_truths[:5]

[2, 2, 0, 0, 0]

In [32]:
from transformers import XLMRobertaModel
import torch.nn as nn

class MultilingualRoleClassifier(nn.Module):
    def __init__(self, main_roles_len):
        super(MultilingualRoleClassifier, self).__init__()
        self.backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.main_role_classifier = nn.Linear(self.backbone.config.hidden_size, main_roles_len)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        main_role_logits = self.main_role_classifier(pooled_output)
        return main_role_logits

In [33]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss

model_config = {
    'batch_size': 8,
    'lr': 5e-5
}

class RoleDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = RoleDataset(input_ids=train_encodings['input_ids'],
                             attention_mask=train_encodings['attention_mask'],
                             labels=train_main_roles_truths
                           )
train_loader = DataLoader(train_dataset, batch_size=model_config['batch_size'], shuffle=True)

val_dataset = RoleDataset(input_ids=val_encodings['input_ids'],
                             attention_mask=val_encodings['attention_mask'],
                             labels=val_main_roles_truths
                           )

val_loader = DataLoader(val_dataset, batch_size=model_config['batch_size'])

In [34]:
model = MultilingualRoleClassifier(main_roles_len=len(main_role_encoder.classes_))
optimizer = optim.AdamW(model.parameters(), lr=model_config['lr'])
loss_fn = CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [35]:
def freeze_roberta_layers(model, num_layers_to_freeze=4):
  for i in range(num_layers_to_freeze):
    for param in model.backbone.encoder.layer[i].parameters():
      param.requires_grad = False

  for param in model.main_role_classifier.parameters():
    param.requires_grad = True

In [36]:
freeze_roberta_layers(model)

In [37]:
def train(train_loader, val_loader, model, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)

            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss (Avg): {train_loss/len(train_loader)}, Val Loss (Avg): {val_loss/len(val_loader)}")

In [None]:
train(train_loader, val_loader, model, optimizer, num_epochs=5)