# Semeval 2025 Task 10
### Subtask 2: Narrative Classification

Given a news article and a [two-level taxonomy of narrative labels](https://propaganda.math.unipd.it/semeval2025task10/NARRATIVE-TAXONOMIES.pdf) (where each narrative is subdivided into subnarratives) from a particular domain, assign to the article all the appropriate subnarrative labels. This is a multi-label multi-class document classification task.

## 1. Importing libraries

We will start by importing the libraries needed.

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import seaborn as sns
import os


In [2]:
!pip install InstructorEmbedding



## 2. Reading our data

In [3]:
raw_annotation_data = []

with open("data/semeval_data/subtask-2-annotations.txt", 'r') as file:
    for line in file:
        parts = line.strip().split('\t')
        article_id = parts[0]
        narrative_to_subnarratives = parts[2].split(';') # second part is the mapping from narrative to subnarrative
        narratives = []
        subnarratives = []

        # look to that narrative to subnarrative mapping
        for nar_to_sub in narrative_to_subnarratives:
          subnarrative_list = nar_to_sub.split(' ')
          if subnarrative_list[0] == 'Other':
            narratives.append('Other')
            subnarratives.append('Other')
            continue

          nar_to_sub = ' '.join(subnarrative_list[1:])
          nar, sub = nar_to_sub.split(':')
          narratives.append(nar.strip())
          subnarratives.append(sub.strip())

        raw_annotation_data.append({
            'article_id': article_id,
            'narratives': narratives,
            'subnarratives': subnarratives
        })

annotations_df = pd.DataFrame(raw_annotation_data)

In [4]:
annotations_df.sample(20)

Unnamed: 0,article_id,narratives,subnarratives
45,EN_UA_102963.txt,[Other],[Other]
21,EN_CC_100122.txt,[Other],[Other]
86,EN_UA_022339.txt,"[Speculating war outcomes, Discrediting Ukrain...","[Ukrainian army is collapsing, Situation in Uk..."
143,EN_UA_103025.txt,"[Criticism of climate policies, Criticism of c...","[Climate policies are ineffective, Climate pol..."
66,EN_UA_010735.txt,[Other],[Other]
195,EN_UA_013257.txt,"[Russia is the Victim, Blaming the war on othe...",[Russia actions in Ukraine are only self-defen...
15,EN_UA_022051.txt,"[Amplifying war-related fears, Blaming the war...","[Russia will also attack other countries, The ..."
63,EN_CC_100243.txt,[Other],[Other]
83,EN_UA_002991.txt,[Other],[Other]
116,EN_UA_029155.txt,"[Discrediting Ukraine, Distrust towards Media]",[Discrediting Ukrainian government and officia...


In [5]:
annotations_df.shape

(200, 3)

In [6]:
annotations_df.iloc[2]

article_id                                        EN_UA_021270.txt
narratives       [Speculating war outcomes, Discrediting Ukrain...
subnarratives    [Other, Situation in Ukraine is hopeless, West...
Name: 2, dtype: object

In [7]:
annotations_df.iloc[2].narratives

['Speculating war outcomes',
 'Discrediting Ukraine',
 'Discrediting the West, Diplomacy',
 'Praise of Russia',
 'Discrediting the West, Diplomacy']

In [8]:
annotations_df.iloc[2].subnarratives

['Other',
 'Situation in Ukraine is hopeless',
 'West is tired of Ukraine',
 'Praise of Russian military might',
 'The West does not care about Ukraine, only about its interests']

In [9]:
def read_file_content(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


In [10]:
folder_path = "data/semeval_data/raw-documents"
documents_df = pd.DataFrame(columns=['article_id', 'content'])
for filename in os.listdir(folder_path):
    if filename.endswith('.txt') and filename.startswith('EN'):
        article_id = (filename)
        file_path = os.path.join(folder_path, filename)
        content = read_file_content(file_path)

        new_row = pd.DataFrame({'article_id': [article_id], 'content': [content]})

        documents_df = pd.concat([documents_df, new_row], ignore_index=True)

documents_df.head()

Unnamed: 0,article_id,content
0,EN_UA_104876.txt,Putin honours army unit blamed for Bucha massa...
1,EN_UA_023211.txt,Europe Putin thanks US journalist Tucker Carls...
2,EN_UA_011260.txt,Russia has a clear plan to resolve the conflic...
3,EN_UA_101067.txt,"First war of TikTok era sees tragedy, humor an..."
4,EN_UA_102963.txt,Ukraine's President Zelenskyy to address Mexic...


In [11]:
documents_df.shape

(200, 2)

In [12]:
dataset = pd.merge(documents_df, annotations_df, on='article_id')
dataset.head()

Unnamed: 0,article_id,content,narratives,subnarratives
0,EN_UA_104876.txt,Putin honours army unit blamed for Bucha massa...,[Other],[Other]
1,EN_UA_023211.txt,Europe Putin thanks US journalist Tucker Carls...,[Other],[Other]
2,EN_UA_011260.txt,Russia has a clear plan to resolve the conflic...,"[Russia is the Victim, Discrediting Ukraine, D...","[UA is anti-RU extremists, Ukraine is a hub fo..."
3,EN_UA_101067.txt,"First war of TikTok era sees tragedy, humor an...",[Other],[Other]
4,EN_UA_102963.txt,Ukraine's President Zelenskyy to address Mexic...,[Other],[Other]


In [13]:
def extract_article_id(filename):
    number_part = filename.split('_')[-1].split('.')[0]
    return number_part

print(extract_article_id('EN_UA_103861.txt'))

103861


In [14]:
dataset['article_id'] = dataset['article_id'].apply(extract_article_id)
dataset.head()

Unnamed: 0,article_id,content,narratives,subnarratives
0,104876,Putin honours army unit blamed for Bucha massa...,[Other],[Other]
1,23211,Europe Putin thanks US journalist Tucker Carls...,[Other],[Other]
2,11260,Russia has a clear plan to resolve the conflic...,"[Russia is the Victim, Discrediting Ukraine, D...","[UA is anti-RU extremists, Ukraine is a hub fo..."
3,101067,"First war of TikTok era sees tragedy, humor an...",[Other],[Other]
4,102963,Ukraine's President Zelenskyy to address Mexic...,[Other],[Other]


In [15]:
dataset.shape

(200, 4)

In [16]:
unique_narratives = dataset['narratives'].explode().unique()
unique_narratives

array(['Other', 'Russia is the Victim', 'Discrediting Ukraine',
       'Blaming the war on others rather than the invader',
       'Discrediting the West, Diplomacy',
       'Criticism of institutions and authorities',
       'Criticism of climate policies', 'Criticism of climate movement',
       'Hidden plots by secret schemes of powerful groups',
       'Controversy about green technologies',
       'Amplifying war-related fears', 'Downplaying climate change',
       'Speculating war outcomes', 'Overpraising the West',
       'Distrust towards Media',
       'Questioning the measurements and science', 'Praise of Russia',
       'Negative Consequences for the West',
       'Climate change is beneficial',
       'Green policies are geopolitical instruments'], dtype=object)

In [17]:
dataset['narratives'].explode().value_counts()

narratives
Other                                                97
Discrediting the West, Diplomacy                     50
Amplifying war-related fears                         43
Discrediting Ukraine                                 29
Criticism of institutions and authorities            24
Blaming the war on others rather than the invader    21
Criticism of climate movement                        18
Russia is the Victim                                 17
Speculating war outcomes                             17
Criticism of climate policies                        16
Hidden plots by secret schemes of powerful groups    16
Praise of Russia                                     12
Overpraising the West                                10
Distrust towards Media                               10
Controversy about green technologies                  9
Questioning the measurements and science              8
Negative Consequences for the West                    7
Downplaying climate change           

In [18]:
unique_subnarratives = dataset['subnarratives'].explode().unique()
unique_subnarratives

array(['Other', 'UA is anti-RU extremists',
       'Ukraine is a hub for criminal activities',
       'Ukraine is associated with nazism', 'Ukraine is the aggressor',
       'The West are the aggressors', 'Ukraine is a puppet of the West',
       'Diplomacy does/will not work',
       'Criticism of political organizations and figures',
       'Climate movement is corrupt',
       'Ad hominem attacks on key activists', 'Blaming global elites',
       'Climate policies are ineffective',
       'By continuing the war we risk WWIII',
       'There is a real possibility that nuclear weapons will be employed',
       'Ice is not melting', 'Climate cycles are natural',
       'Criticism of international entities',
       'Russia will also attack other countries',
       'Criticism of national governments',
       'Discrediting Ukrainian military', 'Ukrainian army is collapsing',
       'Discrediting Ukrainian government and officials and policies',
       'West is tired of Ukraine',
       'C

In [19]:
dataset['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     151
The West are the aggressors                                                18
There is a real possibility that nuclear weapons will be employed          16
Criticism of national governments                                          12
Russia will also attack other countries                                    11
Western media is an instrument of propaganda                                9
The West does not care about Ukraine, only about its interests              8
Ad hominem attacks on key activists                                         8
Ukraine is a puppet of the West                                             8
Criticism of political organizations and figures                            7
Diplomacy does/will not work                                                7
The West belongs in the right side of history                               7
The West is weak                                  

Our initial labels look something like this:

```
data = [
    ["Class A", "Class B"],
    ["Class B", "Class C"],
    ["Class A"]
]
```

The fit step identifies the unique labels across the dataset (Class A, Class B, Class C).
And then we get:

```
[[1 1 0]
 [0 1 1]
 [1 0 0]]
 ```

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_narratives = MultiLabelBinarizer()
mlb_subnarratives = MultiLabelBinarizer()

In [21]:
dataset['narratives']

0                                                [Other]
1                                                [Other]
2      [Russia is the Victim, Discrediting Ukraine, D...
3                                                [Other]
4                                                [Other]
                             ...                        
195    [Criticism of institutions and authorities, Cr...
196                   [Discrediting the West, Diplomacy]
197                                              [Other]
198                       [Amplifying war-related fears]
199                           [Speculating war outcomes]
Name: narratives, Length: 200, dtype: object

In [22]:
narratives_binary = mlb_narratives.fit_transform(dataset['narratives'])
subnarratives_binary = mlb_subnarratives.fit_transform(dataset['subnarratives'])

dataset['narratives_encoded'] = narratives_binary.tolist()
dataset['subnarratives_encoded'] = subnarratives_binary.tolist()

In [23]:
dataset.head()

Unnamed: 0,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,104876,Putin honours army unit blamed for Bucha massa...,[Other],[Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,23211,Europe Putin thanks US journalist Tucker Carls...,[Other],[Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,11260,Russia has a clear plan to resolve the conflic...,"[Russia is the Victim, Discrediting Ukraine, D...","[UA is anti-RU extremists, Ukraine is a hub fo...","[0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,101067,"First war of TikTok era sees tragedy, humor an...",[Other],[Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,102963,Ukraine's President Zelenskyy to address Mexic...,[Other],[Other],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [24]:
len(unique_narratives)

20

In [25]:
len(unique_subnarratives)

58

In [26]:
import spacy
import re
import emoji

nlp = spacy.load("en_core_web_sm")

def clean_article(article_text):
    # Remove URLs
    article_text = re.sub(r'http\S+|www\S+|https\S+|[a-zA-Z0-9.-]+\.com', '', article_text, flags=re.MULTILINE)

    doc = nlp(article_text)
    cleaned_tokens = []

    for token in doc:
        if (token.is_space or '@' in token.text or emoji.is_emoji(token.text)):
            continue

        cleaned_tokens.append(token.text + token.whitespace_)

    cleaned_article = "".join(cleaned_tokens).strip()

    return cleaned_article

In [27]:
dataset['cleaned_content'] = dataset['content'].apply(clean_article)

In [28]:
import warnings
from sklearn.metrics import classification_report, confusion_matrix

def get_classification_report(y_true, y_pred):
  # We will ignore the warnings we get
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        report = classification_report(y_true, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    return report_df

def plot_confusion_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()
    return conf_matrix