# Semeval 2025 Task 10
### Subtask 2: Narrative Baseline Classification -- Multilingual

Given a news article and a [two-level taxonomy of narrative labels](https://propaganda.math.unipd.it/semeval2025task10/NARRATIVE-TAXONOMIES.pdf) (where each narrative is subdivided into subnarratives) from a particular domain, assign to the article all the appropriate subnarrative labels. This is a multi-label multi-class document classification task.

## 1. Setup

### 1.1 Getting and analyzing data

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import seaborn as sns
import os

In [2]:
data = []
ignore_folders = ['.DS_Store']

base_dir_documents = '../data/semeval_data/train/raw-documents'

for language_folder in os.listdir(base_dir_documents):

    if language_folder in ignore_folders:
        continue

    language_path = os.path.join(base_dir_documents, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)

                    article_id = file
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    data.append({
                        'language': language_folder,
                        'article_id': article_id,
                        'content': content
                    })

documents_df = pd.DataFrame(data)

In [3]:
print(documents_df.shape)
documents_df.head()

(1709, 3)


Unnamed: 0,language,article_id,content
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...


In [4]:
base_dir_labels = '../data/semeval_data/train/labels'

raw_annotation_data = []

for language_folder in os.listdir(base_dir_labels):

    if language_folder in ignore_folders:
        continue

    print('Now processing language', language_folder)

    language_path = os.path.join(base_dir_labels, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            label_file = 'subtask-2-annotations.txt'
            file_path = os.path.join(root, label_file)

            with open(file_path, 'r') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    article_id = parts[0]
                    narrative_to_subnarratives = parts[2].split(';')
                    narratives = []
                    subnarratives = []

                    for nar_to_sub in narrative_to_subnarratives:
                      subnarrative_list = nar_to_sub.split(' ')
                      if subnarrative_list[0] == 'Other':
                        narratives.append('Other')
                        subnarratives.append('Other')
                        continue

                      nar_to_sub = ' '.join(subnarrative_list[1:])
                      nar, sub = nar_to_sub.split(':')
                      narratives.append(nar.strip())
                      subnarratives.append(sub.strip())

                    raw_annotation_data.append({
                        'article_id': article_id,
                        'narratives': narratives,
                        'subnarratives': subnarratives
                    })

annotations_df = pd.DataFrame(raw_annotation_data)

Now processing language RU
Now processing language PT
Now processing language BG
Now processing language HI
Now processing language EN


In [5]:
from collections import defaultdict

narrative_to_subnarratives = defaultdict(set)

for record in raw_annotation_data:
    narratives = record['narratives']
    subnarratives = record['subnarratives']

    for nar, sub in zip(narratives, subnarratives):
        narrative_to_subnarratives[nar].add(sub)

narrative_to_subnarratives = {nar: list(subs) for nar, subs in narrative_to_subnarratives.items()}

In [6]:
narrative_to_subnarratives

{'Discrediting Ukraine': ['Discrediting Ukrainian military',
  'Situation in Ukraine is hopeless',
  'Ukraine is a hub for criminal activities',
  'Discrediting Ukrainian nation and society',
  'Ukraine is associated with nazism',
  'Other',
  'Rewriting Ukraine’s history',
  'Discrediting Ukrainian government and officials and policies',
  'Ukraine is a puppet of the West'],
 'Discrediting the West, Diplomacy': ['Diplomacy does/will not work',
  'The West is weak',
  'West is tired of Ukraine',
  'Other',
  'The West does not care about Ukraine, only about its interests',
  'The EU is divided',
  'The West is overreacting'],
 'Praise of Russia': ['Russia is a guarantor of peace and prosperity',
  'Russian invasion has strong national support',
  'Other',
  'Russia has international support from a number of countries and people',
  'Praise of Russian President Vladimir Putin',
  'Praise of Russian military might'],
 'Russia is the Victim': ['The West is russophobic',
  'UA is anti-RU e

In [7]:
annotations_df.head()

Unnamed: 0,article_id,narratives,subnarratives
0,RU-URW-1080.txt,[Discrediting Ukraine],[Discrediting Ukrainian government and officia...
1,RU-URW-1013.txt,"[Discrediting the West, Diplomacy]","[The West does not care about Ukraine, only ab..."
2,RU-URW-1145.txt,[Praise of Russia],[Praise of Russian military might]
3,RU-URW-1048.txt,[Discrediting Ukraine],[Discrediting Ukrainian military]
4,RU-URW-1001.txt,[Praise of Russia],[Russia is a guarantor of peace and prosperity]


In [8]:
annotations_df.tail()

Unnamed: 0,article_id,narratives,subnarratives
1694,EN_CC_200022.txt,"[Criticism of institutions and authorities, Cr...","[Criticism of national governments, Other, Met..."
1695,EN_CC_100028.txt,[Other],[Other]
1696,EN_CC_300010.txt,[Amplifying Climate Fears],[Other]
1697,EN_UA_013257.txt,"[Russia is the Victim, Blaming the war on othe...",[Russia actions in Ukraine are only self-defen...
1698,EN_UA_000104.txt,[Other],[Other]


In [9]:
annotations_df.shape

(1699, 3)

In [10]:
dataset = pd.merge(documents_df, annotations_df, on='article_id')
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[Blaming the war on others rather than the inv...,"[The West are the aggressors, Other, The West ..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[Discrediting the West, Diplomacy, Discreditin...","[The West is weak, Other, The EU is divided]"
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[Distrust towards Media],[Western media is an instrument of propaganda]
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[Discrediting Ukraine, Discrediting Ukraine]","[Ukraine is a puppet of the West, Discrediting..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[Praise of Russia],[Russia is a guarantor of peace and prosperity]


In [11]:
dataset.shape

(1699, 5)

In [12]:
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[Blaming the war on others rather than the inv...,"[The West are the aggressors, Other, The West ..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[Discrediting the West, Diplomacy, Discreditin...","[The West is weak, Other, The EU is divided]"
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[Distrust towards Media],[Western media is an instrument of propaganda]
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[Discrediting Ukraine, Discrediting Ukraine]","[Ukraine is a puppet of the West, Discrediting..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[Praise of Russia],[Russia is a guarantor of peace and prosperity]


In [13]:
row = 5
english_article = dataset[dataset['language'] == 'EN'].iloc[row].content
english_article

'Trump Lawyer Demands Accountability From Intel Chiefs Who Backed Hunter Biden \n\n An attorney for former President Donald Trump wants the 51 former intelligence chiefs held responsible for backing Hunter Biden in the unfolding story of the laptop abandoned in a Delaware repair shop.\n\nLawyer Tim Parlatore\'s goal is to uncover alleged communications between the 51 former senior intel leaders and the Biden 2020 campaign.\n\nPolitico had reported that an Oct. 19, 2020, letter, signed by the former intelligence officials, outlined their assessment that a New York Post disclosure of emails allegedly belonging to Hunter Biden "has all the classic earmarks of a Russia information operation."\n\nThose signing the letter included former CIA Directors Leon Panetta, Mike Hayden and John Brennan, along with former Director of National Intelligence James Clapper.\n\nThe letter offered no evidence, but raised suspicions by the former intel officials.\n\nThe Post had previously reported that duri

In [14]:
dataset.shape

(1699, 5)

In [15]:
dataset['narratives']

0       [Blaming the war on others rather than the inv...
1       [Discrediting the West, Diplomacy, Discreditin...
2                                [Distrust towards Media]
3            [Discrediting Ukraine, Discrediting Ukraine]
4                                      [Praise of Russia]
                              ...                        
1694                       [Amplifying war-related fears]
1695    [Criticism of climate movement, Downplaying cl...
1696    [Criticism of institutions and authorities, Co...
1697                           [Speculating war outcomes]
1698                           [Amplifying Climate Fears]
Name: narratives, Length: 1699, dtype: object

In [16]:
unique_narratives = dataset['narratives'].explode().unique()
unique_narratives

array(['Blaming the war on others rather than the invader',
       'Discrediting the West, Diplomacy',
       'Hidden plots by secret schemes of powerful groups',
       'Discrediting Ukraine', 'Praise of Russia',
       'Distrust towards Media', 'Russia is the Victim',
       'Negative Consequences for the West', 'Speculating war outcomes',
       'Amplifying war-related fears', 'Overpraising the West',
       'Downplaying climate change',
       'Criticism of institutions and authorities',
       'Questioning the measurements and science',
       'Climate change is beneficial', 'Criticism of climate policies',
       'Criticism of climate movement', 'Amplifying Climate Fears',
       'Other', 'Controversy about green technologies',
       'Green policies are geopolitical instruments'], dtype=object)

In [17]:
print(len(dataset['narratives'].explode().value_counts()))
dataset['narratives'].explode().value_counts()

21


narratives
Discrediting Ukraine                                 584
Discrediting the West, Diplomacy                     452
Praise of Russia                                     406
Amplifying Climate Fears                             357
Other                                                324
Amplifying war-related fears                         297
Russia is the Victim                                 229
Criticism of institutions and authorities            216
Blaming the war on others rather than the invader    194
Speculating war outcomes                             132
Criticism of climate policies                        127
Negative Consequences for the West                   104
Criticism of climate movement                         84
Hidden plots by secret schemes of powerful groups     84
Downplaying climate change                            68
Distrust towards Media                                53
Overpraising the West                                 51
Controversy about gr

In [18]:
unique_subnarratives = dataset['subnarratives'].explode().unique()
unique_subnarratives

array(['The West are the aggressors', 'Other', 'The West is weak',
       'Ukraine is a puppet of the West',
       'Ukraine is associated with nazism',
       'Russia is a guarantor of peace and prosperity',
       'The West does not care about Ukraine, only about its interests',
       'The EU is divided',
       'Western media is an instrument of propaganda',
       'Discrediting Ukrainian government and officials and policies',
       'The West is overreacting', 'UA is anti-RU extremists',
       'Discrediting Ukrainian nation and society',
       'Discrediting Ukrainian military',
       'Ukrainian media cannot be trusted',
       'Praise of Russian military might', 'The West is russophobic',
       'Ukrainian army is collapsing',
       'Russia has international support from a number of countries and people',
       'Praise of Russian President Vladimir Putin',
       'By continuing the war we risk WWIII', 'Ukraine is the aggressor',
       'Russia actions in Ukraine are only sel

In [19]:
len(unique_subnarratives)

74

In [20]:
pd.set_option('display.max_rows', 100)

dataset['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     1164
Amplifying existing fears of global warming                                178
Discrediting Ukrainian government and officials and policies               157
Praise of Russian military might                                           145
The West are the aggressors                                                112
Ukraine is a puppet of the West                                            106
Russia is a guarantor of peace and prosperity                              101
Discrediting Ukrainian military                                            100
There is a real possibility that nuclear weapons will be employed           96
Criticism of national governments                                           85
The West does not care about Ukraine, only about its interests              85
Ukraine is the aggressor                                                    78
Russia has international support from 

### 1.2 Encoding classification labels

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_narratives = MultiLabelBinarizer()
mlb_subnarratives = MultiLabelBinarizer()

In [22]:
narratives_binary = mlb_narratives.fit_transform(dataset['narratives'])
subnarratives_binary = mlb_subnarratives.fit_transform(dataset['subnarratives'])

dataset['narratives_encoded'] = narratives_binary.tolist()
dataset['subnarratives_encoded'] = subnarratives_binary.tolist()

In [23]:
# Finding narratives to subnarratives indices mapping
narrative_to_sub_map = {}
narrative_classes = list(mlb_narratives.classes_)
subnarrative_classes = list(mlb_subnarratives.classes_)

for narrative, subnarratives in narrative_to_subnarratives.items():
    narrative_idx = narrative_classes.index(narrative)
    subnarrative_indices = [subnarrative_classes.index(sub) for sub in subnarratives]
    narrative_to_sub_map[narrative_idx] = subnarrative_indices

print(narrative_to_sub_map)

{8: [21, 50, 64, 22, 66, 33, 39, 20, 65], 9: [19, 60, 71, 33, 56, 53, 58], 17: [42, 46, 33, 41, 34, 35], 19: [59, 63, 40, 33], 10: [72, 69, 33], 1: [62, 33, 3, 43, 31], 16: [32, 57, 55, 33], 2: [67, 54, 33], 20: [45, 44, 68, 33], 13: [2, 6, 33], 14: [61, 47, 33], 0: [24, 73, 33, 23, 1], 15: [33], 7: [17, 14, 33, 16, 15], 5: [9, 8, 0, 33], 11: [7, 4, 28, 49, 33, 27, 70, 51, 29], 6: [12, 10, 11, 33], 18: [18, 48, 33, 26, 30], 3: [5, 52, 33], 4: [36, 37, 38, 33], 12: [33, 13, 25]}


In [24]:
# Finding subnarrative to narratives indices mapping
subnarrative_to_narrative_map = {}
for narrative_idx, subnarrative_indices in narrative_to_sub_map.items():
    for subnarrative_idx in subnarrative_indices:
        subnarrative_to_narrative_map[subnarrative_idx] = narrative_idx
        subnarrative_classes[subnarrative_idx] = narrative_classes[narrative_idx]

print(subnarrative_to_narrative_map)

{21: 8, 50: 8, 64: 8, 22: 8, 66: 8, 33: 12, 39: 8, 20: 8, 65: 8, 19: 9, 60: 9, 71: 9, 56: 9, 53: 9, 58: 9, 42: 17, 46: 17, 41: 17, 34: 17, 35: 17, 59: 19, 63: 19, 40: 19, 72: 10, 69: 10, 62: 1, 3: 1, 43: 1, 31: 1, 32: 16, 57: 16, 55: 16, 67: 2, 54: 2, 45: 20, 44: 20, 68: 20, 2: 13, 6: 13, 61: 14, 47: 14, 24: 0, 73: 0, 23: 0, 1: 0, 17: 7, 14: 7, 16: 7, 15: 7, 9: 5, 8: 5, 0: 5, 7: 11, 4: 11, 28: 11, 49: 11, 27: 11, 70: 11, 51: 11, 29: 11, 12: 6, 10: 6, 11: 6, 18: 18, 48: 18, 26: 18, 30: 18, 5: 3, 52: 3, 36: 4, 37: 4, 38: 4, 13: 12, 25: 12}


In [25]:
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[Blaming the war on others rather than the inv...,"[The West are the aggressors, Other, The West ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[Discrediting the West, Diplomacy, Discreditin...","[The West is weak, Other, The EU is divided]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[Distrust towards Media],[Western media is an instrument of propaganda],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[Discrediting Ukraine, Discrediting Ukraine]","[Ukraine is a puppet of the West, Discrediting...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[Praise of Russia],[Russia is a guarantor of peace and prosperity],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [26]:

dataset['narratives'].explode().value_counts()

narratives
Discrediting Ukraine                                 584
Discrediting the West, Diplomacy                     452
Praise of Russia                                     406
Amplifying Climate Fears                             357
Other                                                324
Amplifying war-related fears                         297
Russia is the Victim                                 229
Criticism of institutions and authorities            216
Blaming the war on others rather than the invader    194
Speculating war outcomes                             132
Criticism of climate policies                        127
Negative Consequences for the West                   104
Criticism of climate movement                         84
Hidden plots by secret schemes of powerful groups     84
Downplaying climate change                            68
Distrust towards Media                                53
Overpraising the West                                 51
Controversy about gr

In [27]:
subnarratives_counts = dataset['subnarratives'].explode().value_counts()
print(len(subnarratives_counts))
subnarratives_counts

74


subnarratives
Other                                                                     1164
Amplifying existing fears of global warming                                178
Discrediting Ukrainian government and officials and policies               157
Praise of Russian military might                                           145
The West are the aggressors                                                112
Ukraine is a puppet of the West                                            106
Russia is a guarantor of peace and prosperity                              101
Discrediting Ukrainian military                                            100
There is a real possibility that nuclear weapons will be employed           96
Criticism of national governments                                           85
The West does not care about Ukraine, only about its interests              85
Ukraine is the aggressor                                                    78
Russia has international support from 

### 1.3 Cleaning articles

In [28]:
language_model_map = {
    "BG": "xx_ent_wiki_sm",
    "EN": "en_core_web_sm",
    "HI": "xx_ent_wiki_sm",
    "PT": "pt_core_news_sm",
    "RU": "ru_core_news_sm",
}

!python3 -m spacy download xx_ent_wiki_sm
!python3 -m spacy download pt_core_news_sm
!python3 -m spacy download ru_core_news_sm
!python3 -m spacy download en_core_web_sm

Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.

In [29]:
!pip3 -q install emoji

In [30]:
import spacy
import emoji

nlp_models = {lang: spacy.load(model) for lang, model in language_model_map.items()}

In [31]:
import re

class ArticleCleaner:
    def __init__(self, nlp_models):
        self.nlp_models = nlp_models

    def _clean_paragraph(self, paragraph, nlp):
        """Cleans individual paragraphs by removing links, emails, and normalizing tokens."""
        # Remove URLs, emails, and mentions
        paragraph = re.sub(
            r'http\S+|www\S+|https\S+|[a-zA-Z0-9.-]+\.com|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|@[A-Za-z0-9_]+',
            '',
            paragraph
        )

        doc = nlp(paragraph)
        cleaned_tokens = []
        important_entity_types = ["PERSON", "ORG", "GPE"]

        for token in doc:
            if token.is_space or emoji.is_emoji(token.text):
                continue

            if token.ent_type_ in important_entity_types:
                cleaned_tokens.append(token.text + token.whitespace_)
            else:
                cleaned_tokens.append(token.text.lower() + token.whitespace_)

        return "".join(cleaned_tokens).strip()

    def _preprocess_article_text(self, article_text):
        """Preprocess the article text by splitting into header, body, and footer."""
        parts = re.split(r'\n{2,}', article_text)

        if len(parts) > 2:
            header = parts[0].strip()
            footer = parts[-1].strip()
            body = parts[1:-1]
        else:
            header = parts[0].strip() if len(parts) > 0 else ""
            footer = parts[1].strip() if len(parts) > 1 else ""
            body = []

        return header, body, footer

    def clean_article_with_paragraphs(self, article_text, language_code):
        """Main method to clean the article by processing the header, body, and footer."""
        nlp = self.nlp_models.get(language_code, self.nlp_models["EN"])

        header, body, footer = self._preprocess_article_text(article_text)

        cleaned_header = f"<PARA>{self._clean_paragraph(header, nlp)}</PARA>" if header else ""
        cleaned_footer = f"<PARA>{self._clean_paragraph(footer, nlp)}</PARA>" if footer else ""
        cleaned_body = " ".join([self._clean_paragraph(paragraph, nlp) for paragraph in body])

        combined_text = "\n\n".join(filter(None, [cleaned_header, cleaned_body, cleaned_footer]))
        return combined_text.strip()

In [32]:
article_cleaner = ArticleCleaner(nlp_models)

In [33]:
dataset["content"] = dataset.apply(
    lambda row: article_cleaner.clean_article_with_paragraphs(row["content"], row["language"]),
    axis=1
)

In [34]:
row = 7
english_article = dataset[dataset['language'] == 'EN'].iloc[row].content
english_article



In [35]:
def split_into_sections(content):
    parts = re.split(r'<PARA>|</PARA>', content)
    parts = [p.strip() for p in parts if p.strip()]

    if len(parts) == 1:
        return parts[0], "", ""
    elif len(parts) == 2:
        return parts[0], parts[1], ""
    else:
        header = parts[0]
        footer = parts[-1]
        body = " ".join(parts[1:-1])
        return header, body, footer

In [36]:
header, body, footer = split_into_sections(english_article)
print("Header: ", header)
print("\n\n")
print("Body: ", body)
print("\n\n")
print("Footer: ", footer)

Header:  UN chief Warns of global economic crisis at world economic forum in Davos






Footer:  other tech firms, such as Amazon, Meta, Alphabet, Salesforce, and Twitter, have announced similar moves in recent weeks. Microsoft, based in Redmond, Washington, had 221,000 full-time employees as of june 30, 2022, according to government filings.


In [37]:
!pip install -q iterative-stratification

In [38]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def stratified_train_val_split(data, labels_column, train_size=0.8, splits=5, shuffle=True, min_instances=2):
    if shuffle:
        shuffled_indices = np.arange(len(data))
        np.random.shuffle(shuffled_indices)
        data = data.iloc[shuffled_indices].reset_index(drop=True)

    labels = np.array(data[labels_column].tolist())
    rare_indices = []
    common_indices = []

    class_counts = labels.sum(axis=0)
    rare_classes = np.where(class_counts <= min_instances)[0]

    for idx, label_row in enumerate(labels):
        if any(label_row[rare_classes]):
            rare_indices.append(idx)
        else:
            common_indices.append(idx)

    rare_data = data.iloc[rare_indices]
    rare_labels = labels[rare_indices]
    train_rare = rare_data.iloc[:len(rare_data) // 2].reset_index(drop=True)
    val_rare = rare_data.iloc[len(rare_data) // 2:].reset_index(drop=True)

    common_data = data.iloc[common_indices].reset_index(drop=True)
    common_labels = labels[common_indices]

    mskf = MultilabelStratifiedKFold(n_splits=splits)
    for train_idx, val_idx in mskf.split(np.zeros(len(common_labels)), common_labels):
        train_common = common_data.iloc[train_idx]
        val_common = common_data.iloc[val_idx]
        break

    train_data = pd.concat([train_rare, train_common]).reset_index(drop=True)
    val_data = pd.concat([val_rare, val_common]).reset_index(drop=True)

    return train_data, val_data

(dataset_train), (dataset_val) = stratified_train_val_split(
    dataset,
    labels_column="subnarratives_encoded",
    min_instances=2
)

In [39]:
train_sub_nar_counts = dataset_train['subnarratives'].explode().value_counts()
print(len(train_sub_nar_counts))
train_sub_nar_counts

74


subnarratives
Other                                                                     930
Amplifying existing fears of global warming                               142
Discrediting Ukrainian government and officials and policies              128
Praise of Russian military might                                          116
The West are the aggressors                                                89
Ukraine is a puppet of the West                                            84
Russia is a guarantor of peace and prosperity                              81
Discrediting Ukrainian military                                            80
There is a real possibility that nuclear weapons will be employed          77
The West does not care about Ukraine, only about its interests             68
Criticism of national governments                                          67
Russia has international support from a number of countries and people     62
Ukraine is the aggressor                          

In [40]:
val_sub_nar_counts = dataset_val['subnarratives'].explode().value_counts()
print(len(val_sub_nar_counts))
val_sub_nar_counts

71


subnarratives
Other                                                                     234
Amplifying existing fears of global warming                                36
Discrediting Ukrainian government and officials and policies               29
Praise of Russian military might                                           29
The West are the aggressors                                                23
Ukraine is a puppet of the West                                            22
Discrediting Ukrainian military                                            20
Russia is a guarantor of peace and prosperity                              20
There is a real possibility that nuclear weapons will be employed          19
Criticism of national governments                                          18
The West does not care about Ukraine, only about its interests             17
Ukraine is the aggressor                                                   16
Criticism of political organizations and figures  

### 1.4 Getting embeddings for the articles

#### KaLM

In [42]:
!pip install --upgrade sentence-transformers



In [43]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

kalm = "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5"
kalm_model = SentenceTransformer(kalm)
kalm_max_length = 512 # recommended by the model
kalm_model.max_seq_length = kalm_max_length

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/601k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [44]:
device = "cuda" if torch.cuda.is_available() else "cpu"
kalm_model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: Qwen2Model 
  (1): Pooling({'word_embedding_dimension': 896, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [45]:
texts = [
    "This is a news article about politics.",  # English
    "यह राजनीति के बारे में एक समाचार लेख है।",  # Hindi translation
    "Este é um artigo de notícias sobre política.",  # Portuguese translation
    "Това е новинарска статия за политика.",  # Bulgarian translation
    "Это новость о политике.",  # Russian translation
    "The weather was nice today."  # Unrelated sentence
]

embeddings = kalm_model.encode(texts)

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim_matrix = cosine_similarity(embeddings)

print("Cosine Similarity Matrix:")
print(cos_sim_matrix)

Cosine Similarity Matrix:
[[1.0000002  0.8933759  0.93265355 0.8766835  0.87550384 0.7049875 ]
 [0.8933759  0.9999999  0.9101454  0.90202624 0.9001051  0.7053539 ]
 [0.93265355 0.9101454  1.0000001  0.9168279  0.9119193  0.7264224 ]
 [0.8766835  0.90202624 0.9168279  1.0000001  0.9440486  0.7201146 ]
 [0.87550384 0.9001051  0.9119193  0.9440486  0.99999994 0.7126509 ]
 [0.7049875  0.7053539  0.7264224  0.7201146  0.7126509  1.0000002 ]]


In [47]:
from transformers import AutoTokenizer, AutoModel
import torch

class EmbeddingUtils:
    def __init__(self, model, tokenizer, max_length, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

    def _split_long_paragraph_and_get_embeddings(self, text, encode_fn, strategy="sum"):
        """
        Splits a long paragraph into chunks if it exceeds max_length in tokens,
        calls the provided encoding function on each chunk, then aggregates.
        """
        embeddings = []

        # We'll do a naive approach: check token length, if too big -> chunk it.
        while True:
            # Tokenize
            tokens = self.tokenizer(
                text, truncation=False, return_tensors="pt", add_special_tokens=False
            )

            # Ensure token tensors are on the correct device
            tokens = {key: value.to(self.device) for key, value in tokens.items()}
            num_tokens = tokens["input_ids"].shape[1]

            # If it fits in one chunk, just encode and break
            if num_tokens <= self.max_length:
                emb = encode_fn(text)
                embeddings.append(emb)
                break
            else:
                # If it doesn't fit, let's do a naive split ~ in half by chars
                split_index = len(text) // 2
                chunk = text[:split_index]
                emb = encode_fn(chunk)
                embeddings.append(emb)

                text = text[split_index:].strip()

        aggregated_emb = self._aggregate_embeddings(embeddings, strategy=strategy)
        return aggregated_emb

    def _aggregate_embeddings(self, embedding_list, strategy="sum"):
        """
        Combine multiple chunk embeddings into a single vector.
        """
        if not embedding_list:
            print('[WARNING] Embedding list was empty')
            return None

        # Stack them on the same device
        stacked = torch.stack([emb.to(self.device) for emb in embedding_list], dim=0)

        if strategy == "mean":
            agg = stacked.mean(dim=0)
        elif strategy == "sum":
            agg = stacked.sum(dim=0)
        elif strategy == "concat":
            agg = torch.cat(embedding_list, dim=0)
        elif strategy == "rms":
            squares = stacked**2
            agg = torch.sqrt(squares.mean(dim=0))
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        return agg

In [56]:
# Processor based on KALM, if you want to use another model, change the constructor and the _encode accordingly
class KALMEmbeddingProcessor:
    def __init__(self, model, max_length=512, device='cpu'):
        self.model = model
        self.model.max_seq_length = max_length
        self.device = device
        self.instruction = "Produce an embedding useful for detecting relevant war- or climate-related narratives from a taxonomy."
        self.utils = EmbeddingUtils(self.model, self.model.tokenizer, self.model.max_seq_length, self.device)
        print(f"Max length is set to {self.model.max_seq_length}.")
        print("Using device", device)

    def _encode(self, sentence):
        text_to_encode = f"Instruct: {self.instruction}\nQuery: {sentence}"

        embedding = self.model.encode(
            text_to_encode,
            convert_to_tensor=True,
            normalize_embeddings=False,
            show_progress_bar=False,
            device=self.device
        )
        return embedding

    def get_embeddings(self, content, strategy="mean"):
        """
        Main method that splits into header, body, footer, applies chunking,
        and aggregates into a single doc embedding.
        """
        header, body, footer = split_into_sections(content)

        section_embs = []

        # 1) Header
        if header:
            emb = self.utils._split_long_paragraph_and_get_embeddings(header, self._encode, strategy="sum")
            if emb is not None:
                section_embs.append(emb)

        # 2) Body
        if body:
            emb = self.utils._split_long_paragraph_and_get_embeddings(body, self._encode, strategy="sum")
            if emb is not None:
                section_embs.append(emb)

        # 3) Footer
        if footer:
            emb = self.utils._split_long_paragraph_and_get_embeddings(footer, self._encode, strategy="sum")
            if emb is not None:
                section_embs.append(emb)

        if not header and not body and not footer:
            print("[WARNING] Empty article or no sections found")
            return None

        final_emb = self.utils._aggregate_embeddings(section_embs, strategy=strategy)
        if final_emb is None:
            print("[WARNING] Failed to aggregate embeddings")
            return None

        final_emb_np = final_emb.detach().cpu().numpy()
        return final_emb_np

In [57]:
processor_kalm = KALMEmbeddingProcessor(model=kalm_model, max_length=kalm_max_length, device=device)

Max length is set to 512.
Using device cpu


In [58]:
train_embeddings = dataset_train['content'].apply(
    lambda content: processor_kalm.get_embeddings(content, strategy="sum")
)

In [59]:
train_embeddings = np.array(train_embeddings.tolist())
train_embeddings.shape

(1369, 896)

In [60]:
val_embeddings = dataset_val['content'].apply(
    lambda content: processor_kalm.get_embeddings(content, strategy="sum")
)

In [61]:
val_embeddings = np.array(val_embeddings.tolist())
val_embeddings.shape

(330, 896)

In [71]:
y_train_nar = dataset_train['narratives_encoded'].tolist()
y_val_nar = dataset_val['narratives_encoded'].tolist()

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_logistic_nar = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))

In [73]:
ovr_logistic_nar.fit(train_embeddings, y_train_nar)

In [74]:
import warnings
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold

def get_classification_report(y_true, y_pred):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        report = classification_report(y_true, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    return report_df

def get_cross_val_score(model, x, y, scoring='f1_macro', splits=3):
    """Perform cross-validation and compute scores."""
    cv = StratifiedKFold(n_splits=splits, shuffle=True)
    cross_val_scores = cross_val_score(model, x, y, cv=cv, scoring=scoring)
    print(f"Cross-validation scores: {cross_val_scores}")
    print(f"Mean CV F1 Score: {cross_val_scores.mean()}")

In [75]:
import warnings
from sklearn.metrics import (
    hamming_loss,
)

def evaluate_model(model, x, y_true):
    y_pred = model.predict(x)

    classification_report_df = get_classification_report(y_true, y_pred)
    print("Classification Report:")
    print(classification_report_df)
    print("\n")

    hamming = hamming_loss(y_true, y_pred)
    print(f"Hamming Loss: {hamming:.4f}")
    print("\n")

In [76]:
evaluate_model(ovr_logistic_nar, val_embeddings, y_val_nar)

Classification Report:
              precision    recall  f1-score  support
0              0.800000  0.956522  0.871287     46.0
1              0.409639  0.894737  0.561983     38.0
2              0.274725  0.675676  0.390625     37.0
3              0.100000  1.000000  0.181818      1.0
4              0.210526  0.666667  0.320000      6.0
5              0.300000  0.818182  0.439024     11.0
6              0.340000  0.944444  0.500000     18.0
7              0.508475  0.937500  0.659341     32.0
8              0.587719  0.893333  0.708995     75.0
9              0.490909  0.750000  0.593407     72.0
10             0.192308  0.454545  0.270270     11.0
11             0.346154  0.818182  0.486486     11.0
12             0.000000  0.000000  0.000000      2.0
13             0.371429  0.812500  0.509804     16.0
14             0.185185  0.476190  0.266667     21.0
15             0.436893  0.762712  0.555556     59.0
16             0.105263  0.444444  0.170213      9.0
17             0.524752

In [77]:
y_train_sub_nar = dataset_train['subnarratives_encoded'].tolist()
y_val_sub_nar = dataset_val['subnarratives_encoded'].tolist()

In [78]:
ovr_logistic_sub = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
ovr_logistic_sub.fit(train_embeddings, y_train_sub_nar)

In [79]:
evaluate_model(ovr_logistic_sub, val_embeddings, y_val_sub_nar)

Classification Report:
              precision    recall  f1-score  support
0              0.142857  0.750000  0.240000      4.0
1              0.727273  0.888889  0.800000     36.0
2              0.375000  0.600000  0.461538      5.0
3              0.217391  0.833333  0.344828     12.0
4              0.500000  0.500000  0.500000      2.0
5              0.000000  0.000000  0.000000      1.0
6              0.235294  0.666667  0.347826      6.0
7              0.266667  1.000000  0.421053      4.0
8              0.227273  1.000000  0.370370      5.0
9              0.100000  0.333333  0.153846      3.0
10             0.107143  0.428571  0.171429      7.0
11             0.230769  0.750000  0.352941      4.0
12             0.275862  1.000000  0.432432      8.0
13             0.000000  0.000000  0.000000      1.0
14             0.200000  0.666667  0.307692      6.0
15             0.367347  1.000000  0.537313     18.0
16             0.282609  0.812500  0.419355     16.0
17             0.142857

### Building a simple Neural Network with 2 heads

In [80]:
train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32)
val_embeddings_tensor = torch.tensor(val_embeddings, dtype=torch.float32)

In [81]:
input_size = train_embeddings_tensor.shape[1]
print(input_size)

896


In [82]:
import torch
import torch.nn as nn

class MultiTaskClassifier(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 num_narratives=len(mlb_narratives.classes_),
                 num_subnarratives=len(mlb_subnarratives.classes_),
                 dropout_rate=0.3
                ):

        super(MultiTaskClassifier, self).__init__()

        self.shared_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.BatchNorm1d(hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.narrative_head = nn.Sequential(
            nn.Linear(hidden_size * 2, num_narratives),
            nn.Sigmoid()
        )

        self.subnarrative_head = nn.Sequential(
            nn.Linear(hidden_size * 2, num_subnarratives),
            nn.Sigmoid()
        )

    def forward(self, x):
        shared_output = self.shared_layer(x)
        narratives = self.narrative_head(shared_output)
        subnarratives = self.subnarrative_head(shared_output)
        return narratives, subnarratives

In [83]:
simple_model = MultiTaskClassifier(input_size=input_size, hidden_size=512)
narratives, subnarratives = simple_model(train_embeddings_tensor)
print(narratives.shape, subnarratives.shape)

torch.Size([1369, 21]) torch.Size([1369, 74])


In [84]:
y_train_nar = torch.tensor(y_train_nar, dtype=torch.float32)
y_train_sub_nar = torch.tensor(y_train_sub_nar, dtype=torch.float32)

y_val_nar = torch.tensor(y_val_nar, dtype=torch.float32)
y_val_sub_nar = torch.tensor(y_val_sub_nar, dtype=torch.float32)

In [85]:
import torch
import torch.nn as nn

def compute_class_weights(y_train):
    total_samples = y_train.shape[0]
    class_weights = []
    for label in range(y_train.shape[1]):
        pos_count = y_train[:, label].sum().item()
        neg_count = total_samples - pos_count
        pos_weight = total_samples / (2 * pos_count) if pos_count > 0 else 0
        neg_weight = total_samples / (2 * neg_count) if neg_count > 0 else 0
        class_weights.append((pos_weight, neg_weight))
    return class_weights

class WeightedBCELoss(nn.Module):
    def __init__(self, class_weights):
        super().__init__()
        self.class_weights = class_weights

    def forward(self, probs, targets):
        bce_loss = 0
        epsilon = 1e-7
        for i, (pos_weight, neg_weight) in enumerate(self.class_weights):
            prob = probs[:, i]
            bce = -pos_weight * targets[:, i] * torch.log(prob + epsilon) - \
                  neg_weight * (1 - targets[:, i]) * torch.log(1 - prob + epsilon)
            bce_loss += bce.mean()
        return bce_loss / len(self.class_weights)

class_weights_nar = compute_class_weights(y_train_nar)
narrative_criterion = WeightedBCELoss(class_weights_nar)

In [86]:
class_weights_sub_nar = compute_class_weights(y_train_sub_nar)
subnarrative_criterion = WeightedBCELoss(class_weights_sub_nar)

In [87]:
optimizer = torch.optim.Adam(simple_model.parameters(), lr=0.001)

In [88]:
def train_with_early_stopping(
    model,
    optimizer,
    narrative_criterion,
    subnarrative_criterion,
    train_embeddings=train_embeddings_tensor,
    y_train_nar=y_train_nar,
    y_train_sub_nar=y_train_sub_nar,
    val_embeddings=val_embeddings_tensor,
    y_val_nar=y_val_nar,
    y_val_sub_nar=y_val_sub_nar,
    patience=3,
    num_epochs=100,
):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        narratives, subnarratives = model(train_embeddings)

        narrative_loss = narrative_criterion(narratives, y_train_nar)
        subnarrative_loss = subnarrative_criterion(subnarratives, y_train_sub_nar)
        loss = narrative_loss + subnarrative_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_narratives, val_subnarratives = model(val_embeddings)
            val_narrative_loss = narrative_criterion(val_narratives, y_val_nar)
            val_subnarrative_loss = subnarrative_criterion(val_subnarratives, y_val_sub_nar)
            val_loss = val_narrative_loss + val_subnarrative_loss

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Training Loss: {loss.item():.4f} "
              f"(Narrative: {narrative_loss.item():.4f}, Subnarrative: {subnarrative_loss.item():.4f}), "
              f"Validation Loss: {val_loss.item():.4f} "
              f"(Narrative: {val_narrative_loss.item():.4f}, Subnarrative: {val_subnarrative_loss.item():.4f})")

        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            patience_counter = 0
            best_model = model.state_dict()
        else:
            patience_counter += 1
            print(f"Validation loss did not improve for {patience_counter} epoch(s).")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    if best_model:
        model.load_state_dict(best_model)

    return model

In [89]:
trained_simple_model = train_with_early_stopping(
    model=simple_model,
    optimizer=optimizer,
    narrative_criterion=narrative_criterion,
    subnarrative_criterion=subnarrative_criterion,
)

Epoch 1/100, Training Loss: 1.4299 (Narrative: 0.7117, Subnarrative: 0.7182), Validation Loss: 1.4560 (Narrative: 0.7098, Subnarrative: 0.7461)
Epoch 2/100, Training Loss: 1.1613 (Narrative: 0.5601, Subnarrative: 0.6012), Validation Loss: 1.4469 (Narrative: 0.7044, Subnarrative: 0.7425)
Epoch 3/100, Training Loss: 1.0270 (Narrative: 0.4958, Subnarrative: 0.5312), Validation Loss: 1.4382 (Narrative: 0.6994, Subnarrative: 0.7388)
Epoch 4/100, Training Loss: 0.9273 (Narrative: 0.4476, Subnarrative: 0.4797), Validation Loss: 1.4299 (Narrative: 0.6948, Subnarrative: 0.7351)
Epoch 5/100, Training Loss: 0.8473 (Narrative: 0.4097, Subnarrative: 0.4376), Validation Loss: 1.4219 (Narrative: 0.6905, Subnarrative: 0.7315)
Epoch 6/100, Training Loss: 0.7838 (Narrative: 0.3825, Subnarrative: 0.4013), Validation Loss: 1.4141 (Narrative: 0.6862, Subnarrative: 0.7279)
Epoch 7/100, Training Loss: 0.7329 (Narrative: 0.3609, Subnarrative: 0.3720), Validation Loss: 1.4061 (Narrative: 0.6817, Subnarrative: 

In [90]:
target_names_nar = mlb_narratives.classes_
target_names_sub = mlb_subnarratives.classes_

In [91]:
from sklearn.metrics import classification_report, f1_score

def evaluate_model(
    model,
    embeddings,
    y_nar_true,
    y_sub_nar_true,
    thresholds=np.arange(0.1, 1.0, 0.1),
    target_names_nar=target_names_nar,
    target_names_sub=target_names_sub
):
    best_threshold = 0
    best_f1 = 0
    best_classification_report_nar = None
    best_classification_report_sub = None

    for threshold in thresholds:
        with torch.no_grad():
            nar_pred_logits, sub_nar_pred_logits = model(embeddings)

            nar_predictions = (nar_pred_logits >= threshold).int().cpu().numpy()
            sub_nar_predictions = (sub_nar_pred_logits >= threshold).int().cpu().numpy()

            y_nar_true_np = y_nar_true.cpu().numpy()
            y_sub_nar_true_np = y_sub_nar_true.cpu().numpy()

            classification_rep_nar = classification_report(
                y_nar_true_np, nar_predictions, target_names=target_names_nar, zero_division=0
            )
            classification_rep_sub = classification_report(
                y_sub_nar_true_np, sub_nar_predictions, target_names=target_names_sub, zero_division=0
            )
            f1_nar = f1_score(y_nar_true_np, nar_predictions, average='macro')
            f1_sub = f1_score(y_sub_nar_true_np, sub_nar_predictions, average='macro')

            avg_f1 = (f1_nar + f1_sub) / 2

            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_threshold = threshold
                best_classification_report_nar = classification_rep_nar
                best_classification_report_sub = classification_rep_sub

    print(f"Best Threshold: {best_threshold}, Best F1 Score: {best_f1}")
    print("\nBest Narratives Classification Report:")
    print(best_classification_report_nar)
    print("\nBest Sub-Narratives Classification Report:")
    print(best_classification_report_sub)

In [92]:
evaluate_model(
    model=trained_simple_model,
    embeddings=val_embeddings_tensor,
    y_nar_true=y_val_nar,
    y_sub_nar_true=y_val_sub_nar,
)

Best Threshold: 0.5, Best F1 Score: 0.38129958681441356

Best Narratives Classification Report:
                                                   precision    recall  f1-score   support

                         Amplifying Climate Fears       0.81      0.93      0.87        46
                     Amplifying war-related fears       0.64      0.76      0.70        38
Blaming the war on others rather than the invader       0.31      0.51      0.38        37
                     Climate change is beneficial       0.00      0.00      0.00         1
             Controversy about green technologies       0.40      0.67      0.50         6
                    Criticism of climate movement       0.33      0.73      0.46        11
                    Criticism of climate policies       0.42      0.89      0.57        18
        Criticism of institutions and authorities       0.62      0.91      0.73        32
                             Discrediting Ukraine       0.65      0.85      0.74    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [94]:
import torch
import joblib

def save_model(model, save_path=""):
    torch.save({
        'model_state_dict': model.state_dict(),
        'input_size': input_size,
        'hidden_size': model.hidden_size,
        'num_narratives': len(mlb_narratives.classes_),
        'num_subnarratives': len(mlb_subnarratives.classes_),
        'dropout_rate': model.dropout_rate
    }, save_path)

    joblib.dump(mlb_narratives, 'mlb_narratives.pkl')
    joblib.dump(mlb_subnarratives, 'mlb_subnarratives.pkl')

    print(f"Model saved to {save_path}")

### Changes to the current Neural Network

#### Providing the already classified narrative as an input

In [95]:
class MultiTaskClassifierTweaked(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 num_narratives=len(mlb_narratives.classes_),
                 num_subnarratives=len(mlb_subnarratives.classes_),
                 dropout_rate=0.3
                ):
        super(MultiTaskClassifierTweaked, self).__init__()

        self.shared_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.BatchNorm1d(hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.narrative_head = nn.Sequential(
            nn.Linear(hidden_size * 2, num_narratives),
            nn.Sigmoid()
        )

        self.subnarrative_head = nn.Sequential(
            nn.Linear(hidden_size * 2 + num_narratives, num_subnarratives),
            nn.Sigmoid()
        )

    def forward(self, x):
        shared_output = self.shared_layer(x)
        narratives = self.narrative_head(shared_output)

        masked_input = torch.cat([shared_output, narratives], dim=1)
        subnarratives = self.subnarrative_head(masked_input)

        return narratives, subnarratives

In [96]:
model_tweaked = MultiTaskClassifierTweaked(
    input_size=input_size,
    hidden_size=512
)
narratives, subnarratives = model_tweaked(train_embeddings_tensor)
print(narratives.shape, subnarratives.shape)

torch.Size([1369, 21]) torch.Size([1369, 74])


In [97]:
optimizer_new = torch.optim.Adam(model_tweaked.parameters(), lr=0.001)

In [98]:
trained_model_tweaked = train_with_early_stopping(
    model=model_tweaked,
    optimizer=optimizer_new,
    narrative_criterion=narrative_criterion,
    subnarrative_criterion=subnarrative_criterion,
)

Epoch 1/100, Training Loss: 1.4490 (Narrative: 0.7259, Subnarrative: 0.7232), Validation Loss: 1.4576 (Narrative: 0.7104, Subnarrative: 0.7472)
Epoch 2/100, Training Loss: 1.1693 (Narrative: 0.5679, Subnarrative: 0.6014), Validation Loss: 1.4493 (Narrative: 0.7055, Subnarrative: 0.7438)
Epoch 3/100, Training Loss: 1.0269 (Narrative: 0.5001, Subnarrative: 0.5268), Validation Loss: 1.4410 (Narrative: 0.7008, Subnarrative: 0.7402)
Epoch 4/100, Training Loss: 0.9280 (Narrative: 0.4515, Subnarrative: 0.4765), Validation Loss: 1.4330 (Narrative: 0.6964, Subnarrative: 0.7366)
Epoch 5/100, Training Loss: 0.8523 (Narrative: 0.4144, Subnarrative: 0.4379), Validation Loss: 1.4253 (Narrative: 0.6922, Subnarrative: 0.7331)
Epoch 6/100, Training Loss: 0.7856 (Narrative: 0.3859, Subnarrative: 0.3996), Validation Loss: 1.4175 (Narrative: 0.6878, Subnarrative: 0.7297)
Epoch 7/100, Training Loss: 0.7324 (Narrative: 0.3618, Subnarrative: 0.3706), Validation Loss: 1.4097 (Narrative: 0.6832, Subnarrative: 

In [99]:
evaluate_model(
    model=trained_model_tweaked,
    embeddings=val_embeddings_tensor,
    y_nar_true=y_val_nar,
    y_sub_nar_true=y_val_sub_nar,
)

Best Threshold: 0.6, Best F1 Score: 0.38378793447133747

Best Narratives Classification Report:
                                                   precision    recall  f1-score   support

                         Amplifying Climate Fears       0.91      0.89      0.90        46
                     Amplifying war-related fears       0.72      0.74      0.73        38
Blaming the war on others rather than the invader       0.46      0.32      0.38        37
                     Climate change is beneficial       0.00      0.00      0.00         1
             Controversy about green technologies       0.50      0.67      0.57         6
                    Criticism of climate movement       0.44      0.64      0.52        11
                    Criticism of climate policies       0.64      0.78      0.70        18
        Criticism of institutions and authorities       0.66      0.78      0.71        32
                             Discrediting Ukraine       0.77      0.72      0.74    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Fixing loss to account for hierarchy

In [100]:
def hierarchical_loss_groundtruth_gating(
    narr_probs,
    sub_probs,
    y_narr,
    y_sub,
    parent_of_sub,
):
    """Penalizes subnarratives when the parent is truly active."""
    narr_loss = narrative_criterion(narr_probs, y_narr)

    batch_size, num_subs = sub_probs.size()
    mask = torch.zeros_like(sub_probs)
    # The "target" for those subnarratives that are masked out = 0
    sub_labels_masked = torch.zeros_like(y_sub)

    for s in range(num_subs):
        p = parent_of_sub[s]  # parent narrative index
        # Indices in the batch where parent is 1
        # active_indices = (y_narr[:, p] == 1).nonzero(as_tuple=True)[0]
        active_indices = (y_narr[:, p] == 1)
        # Turn on mask for these subnarratives
        mask[active_indices, s] = 1

        # Also copy the actual sub-label from y_sub for these active samples
        sub_labels_masked[active_indices, s] = y_sub[active_indices, s]

    masked_sub_probs = sub_probs * mask
    sub_loss = subnarrative_criterion(masked_sub_probs, sub_labels_masked)

    total_loss = narr_loss + sub_loss
    return total_loss

In [101]:
def train_with_early_stopping_hierarchical(
    model,
    optimizer,
    narrative_criterion,
    subnarrative_criterion,
    train_embeddings=train_embeddings_tensor,
    y_train_nar=y_train_nar,
    y_train_sub_nar=y_train_sub_nar,
    val_embeddings=val_embeddings_tensor,
    y_val_nar=y_val_nar,
    y_val_sub_nar=y_val_sub_nar,
    patience=3,
    num_epochs=100,
):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        narratives, subnarratives = model(train_embeddings)

        train_loss = hierarchical_loss_groundtruth_gating(narratives, subnarratives, y_train_nar, y_train_sub_nar, subnarrative_to_narrative_map)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_narratives, val_subnarratives = model(val_embeddings)
            val_loss = hierarchical_loss_groundtruth_gating(val_narratives, val_subnarratives, y_val_nar, y_val_sub_nar, subnarrative_to_narrative_map)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Training Loss: {train_loss.item():.4f} "
              f"Validation Loss: {val_loss.item():.4f} ")

        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            patience_counter = 0
            best_model = model.state_dict()
        else:
            patience_counter += 1
            print(f"Validation loss did not improve for {patience_counter} epoch(s).")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    if best_model:
        model.load_state_dict(best_model)

    return model

In [102]:
model_hierarchy_loss = MultiTaskClassifier(input_size=input_size, hidden_size=512)

In [103]:
optimizer = torch.optim.Adam(model_hierarchy_loss.parameters(), lr=0.001)

In [104]:
trained_hierarchy = train_with_early_stopping_hierarchical(
    model=model_hierarchy_loss,
    optimizer=optimizer,
    narrative_criterion=narrative_criterion,
    subnarrative_criterion=subnarrative_criterion,
)

Epoch 1/100, Training Loss: 1.1001 Validation Loss: 1.1275 
Epoch 2/100, Training Loss: 0.8005 Validation Loss: 1.1118 
Epoch 3/100, Training Loss: 0.6576 Validation Loss: 1.0963 
Epoch 4/100, Training Loss: 0.5818 Validation Loss: 1.0820 
Epoch 5/100, Training Loss: 0.5265 Validation Loss: 1.0689 
Epoch 6/100, Training Loss: 0.4954 Validation Loss: 1.0569 
Epoch 7/100, Training Loss: 0.4653 Validation Loss: 1.0460 
Epoch 8/100, Training Loss: 0.4477 Validation Loss: 1.0357 
Epoch 9/100, Training Loss: 0.4318 Validation Loss: 1.0260 
Epoch 10/100, Training Loss: 0.4154 Validation Loss: 1.0168 
Epoch 11/100, Training Loss: 0.3998 Validation Loss: 1.0080 
Epoch 12/100, Training Loss: 0.3869 Validation Loss: 0.9997 
Epoch 13/100, Training Loss: 0.3767 Validation Loss: 0.9921 
Epoch 14/100, Training Loss: 0.3621 Validation Loss: 0.9850 
Epoch 15/100, Training Loss: 0.3534 Validation Loss: 0.9784 
Epoch 16/100, Training Loss: 0.3431 Validation Loss: 0.9719 
Epoch 17/100, Training Loss: 0.33

In [105]:
from sklearn.metrics import classification_report, f1_score
import numpy as np

def evaluate_model_h(
    model,
    embeddings,
    y_nar_true,
    y_sub_nar_true,
    parent_of_sub,
    thresholds=np.arange(0.1, 1.0, 0.1),
    target_names_nar=mlb_narratives.classes_,
    target_names_sub=mlb_subnarratives.classes_,
):
    best_threshold = 0
    best_f1 = 0
    best_classification_report_nar = None
    best_classification_report_sub = None

    y_nar_true_np = y_nar_true.cpu().numpy()
    y_sub_nar_true_np = y_sub_nar_true.cpu().numpy()

    with torch.no_grad():
        nar_pred_logits, sub_nar_pred_logits = model(embeddings)
        nar_pred_logits = nar_pred_logits.cpu().numpy()
        sub_nar_pred_logits = sub_nar_pred_logits.cpu().numpy()

    for threshold in thresholds:
        nar_predictions = (nar_pred_logits >= 0.5).astype(int)

        sub_nar_predictions = (sub_nar_pred_logits >= threshold).astype(int)

        for s in range(sub_nar_predictions.shape[1]):
            p = parent_of_sub[s]
            sub_nar_predictions[:, s] = sub_nar_predictions[:, s] * nar_predictions[:, p]

        classification_rep_nar = classification_report(
            y_nar_true_np, nar_predictions,
            target_names=target_names_nar, zero_division=0
        )
        classification_rep_sub = classification_report(
            y_sub_nar_true_np, sub_nar_predictions,
            target_names=target_names_sub, zero_division=0
        )

        f1_nar = f1_score(y_nar_true_np, nar_predictions, average='macro')
        f1_sub = f1_score(y_sub_nar_true_np, sub_nar_predictions, average='macro')

        avg_f1 = (f1_nar + f1_sub) / 2.0

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_threshold = threshold
            best_classification_report_nar = classification_rep_nar
            best_classification_report_sub = classification_rep_sub

    print(f"Best Threshold: {best_threshold}, Best F1 Score (avg nar/sub): {best_f1:.3f}\n")
    print("\nBest Narratives Classification Report:")
    print(best_classification_report_nar)
    print("\nBest Sub-Narratives Classification Report:")
    print(best_classification_report_sub)

In [106]:
evaluate_model_h(
    model=trained_hierarchy,
    embeddings=val_embeddings_tensor,
    y_nar_true=y_val_nar,
    parent_of_sub=subnarrative_to_narrative_map,
    y_sub_nar_true=y_val_sub_nar,
)

Best Threshold: 0.7000000000000001, Best F1 Score (avg nar/sub): 0.365


Best Narratives Classification Report:
                                                   precision    recall  f1-score   support

                         Amplifying Climate Fears       0.83      0.93      0.88        46
                     Amplifying war-related fears       0.60      0.82      0.69        38
Blaming the war on others rather than the invader       0.26      0.62      0.36        37
                     Climate change is beneficial       0.00      0.00      0.00         1
             Controversy about green technologies       0.36      0.67      0.47         6
                    Criticism of climate movement       0.38      0.73      0.50        11
                    Criticism of climate policies       0.38      0.89      0.53        18
        Criticism of institutions and authorities       0.67      0.88      0.76        32
                             Discrediting Ukraine       0.62      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Building a neural network with multiple heads, one for each narrative hierarchy

In [107]:
dataset['subnarratives_encoded']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
1694    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1695    [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...
1696    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
1697    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1698    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: subnarratives_encoded, Length: 1699, dtype: object

In [108]:
narrative_to_sub_map

{8: [21, 50, 64, 22, 66, 33, 39, 20, 65],
 9: [19, 60, 71, 33, 56, 53, 58],
 17: [42, 46, 33, 41, 34, 35],
 19: [59, 63, 40, 33],
 10: [72, 69, 33],
 1: [62, 33, 3, 43, 31],
 16: [32, 57, 55, 33],
 2: [67, 54, 33],
 20: [45, 44, 68, 33],
 13: [2, 6, 33],
 14: [61, 47, 33],
 0: [24, 73, 33, 23, 1],
 15: [33],
 7: [17, 14, 33, 16, 15],
 5: [9, 8, 0, 33],
 11: [7, 4, 28, 49, 33, 27, 70, 51, 29],
 6: [12, 10, 11, 33],
 18: [18, 48, 33, 26, 30],
 3: [5, 52, 33],
 4: [36, 37, 38, 33],
 12: [33, 13, 25]}

In [109]:
def remap_subnarratives(row, narrative_to_sub_map):
    """Takes in a row and encodes the current subnarrative list to the associated hierarchy based on the narr-subnar map"""
    for narr_idx, sub_indices in narrative_to_sub_map.items():
        sub_labels = [row['subnarratives_encoded'][sub_idx] for sub_idx in sub_indices]
        col_name = f"narrative_hierarchy_{narr_idx}"
        row[col_name] = sub_labels
    return row

dataset_train_cpy = dataset_train.apply(remap_subnarratives, axis=1, args=(narrative_to_sub_map,)).copy()

In [110]:
dataset_val_cpy = dataset_val.apply(remap_subnarratives, axis=1, args=(narrative_to_sub_map,)).copy()

In [111]:
dataset_val_cpy.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded,narrative_hierarchy_8,narrative_hierarchy_9,narrative_hierarchy_17,...,narrative_hierarchy_0,narrative_hierarchy_15,narrative_hierarchy_7,narrative_hierarchy_5,narrative_hierarchy_11,narrative_hierarchy_6,narrative_hierarchy_18,narrative_hierarchy_3,narrative_hierarchy_4,narrative_hierarchy_12
0,EN,EN_CC_200022.txt,<PARA>Denmark to Punish Farmers for cow ‘emiss...,"[Criticism of institutions and authorities, Cr...","[Criticism of national governments, Other, Met...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",...,"[0, 0, 1, 0, 0]",[1],"[0, 1, 1, 0, 1]","[0, 0, 0, 1]","[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 1]","[0, 0, 1, 1, 1]","[0, 0, 1]","[0, 0, 0, 1]","[1, 0, 0]"
1,EN,EN_CC_200221.txt,<PARA>“the hour of decision”</PARA>\n\nshortly...,[Hidden plots by secret schemes of powerful gr...,"[Blaming global elites, Other, Other, CO2 conc...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",...,"[0, 0, 1, 0, 0]",[1],"[0, 0, 1, 1, 0]","[0, 0, 0, 1]","[0, 1, 0, 1, 1, 0, 0, 0, 0]","[0, 0, 0, 1]","[0, 1, 1, 0, 0]","[0, 0, 1]","[0, 0, 0, 1]","[1, 0, 0]"
2,RU,RU-URW-1106.txt,<PARA>российская разведка рассекретила планы ф...,"[Praise of Russia, Distrust towards Media, Bla...","[Praise of Russian military might, Western med...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 1]",...,"[0, 0, 0, 0, 0]",[0],"[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]"
3,EN,EN_UA_300052.txt,<PARA>killing russian culture: ‘public opinion...,[Russia is the Victim],[The West is russophobic],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",...,"[0, 0, 0, 0, 0]",[0],"[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]"
4,EN,EN_UA_011260.txt,<PARA>Russia has a clear plan to resolve the c...,"[Russia is the Victim, Discrediting Ukraine, D...","[UA is anti-RU extremists, Ukraine is a hub fo...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 1, 1, 0, 0, 1]","[1, 0, 0, 1, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",...,"[0, 0, 1, 0, 0]",[1],"[0, 0, 1, 0, 0]","[0, 0, 0, 1]","[0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 0, 1]","[1, 0, 0]"


In [112]:
for narr_idx, sub_indices in narrative_to_sub_map.items():
    column_name = f"narrative_hierarchy_{narr_idx}"
    res = dataset_train_cpy[column_name]
    print(res)

0       [0, 0, 0, 0, 0, 0, 0, 0, 0]
1       [0, 0, 0, 0, 0, 1, 0, 0, 0]
2       [0, 0, 0, 0, 0, 1, 0, 0, 0]
3       [0, 0, 0, 0, 0, 1, 0, 0, 0]
4       [0, 0, 0, 0, 0, 0, 0, 1, 0]
                   ...             
1364    [0, 0, 0, 0, 0, 1, 0, 0, 0]
1365    [0, 0, 0, 0, 1, 1, 0, 1, 0]
1366    [0, 0, 0, 0, 0, 0, 0, 1, 0]
1367    [1, 0, 0, 0, 0, 0, 0, 1, 0]
1368    [0, 0, 0, 0, 0, 1, 0, 1, 0]
Name: narrative_hierarchy_8, Length: 1369, dtype: object
0       [0, 0, 0, 0, 0, 0, 0]
1       [0, 0, 0, 1, 0, 0, 0]
2       [0, 0, 0, 1, 0, 0, 0]
3       [0, 0, 0, 1, 0, 0, 0]
4       [0, 0, 0, 0, 0, 0, 0]
                ...          
1364    [0, 1, 0, 1, 0, 0, 0]
1365    [0, 0, 0, 1, 0, 0, 0]
1366    [0, 0, 0, 0, 0, 0, 0]
1367    [0, 0, 0, 0, 0, 0, 0]
1368    [0, 0, 0, 1, 0, 0, 0]
Name: narrative_hierarchy_9, Length: 1369, dtype: object
0       [0, 0, 0, 0, 0, 0]
1       [0, 0, 1, 0, 0, 0]
2       [0, 0, 1, 0, 0, 0]
3       [0, 0, 1, 0, 0, 0]
4       [0, 0, 0, 0, 0, 0]
               ...       

In [113]:
# Sort order of narratives to start from hierarchy 0
narrative_order = sorted(narrative_to_sub_map.keys())

In [114]:
narrative_order

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [115]:
def aggregate_subnarratives(row, narrative_order, narrative_to_sub_map):
    """Takes in a row, and aggregates all hierarchy columns to 1 list.
    The encoded list will be a list of lists, starting from the first hierarchy"""
    aggregated = []
    for narr_idx in narrative_order:
        column_name = f"narrative_hierarchy_{narr_idx}"
        sub_labels = row[column_name]
        aggregated.append(sub_labels)
    return aggregated

dataset_train['aggregated_subnarratives'] = dataset_train_cpy.apply(
    aggregate_subnarratives,
    axis=1,
    args=(narrative_order, narrative_to_sub_map)
)

dataset_val['aggregated_subnarratives'] = dataset_val_cpy.apply(
    aggregate_subnarratives,
    axis=1,
    args=(narrative_order, narrative_to_sub_map)
)

In [116]:
dataset_train['aggregated_subnarratives']

0       [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0], ...
1       [[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1], ...
2       [[0, 0, 1, 0, 0], [1, 1, 0, 0, 0], [0, 0, 1], ...
3       [[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1], ...
4       [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0], ...
                              ...                        
1364    [[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1], ...
1365    [[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1], ...
1366    [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 1, 0], ...
1367    [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0], ...
1368    [[0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1], ...
Name: aggregated_subnarratives, Length: 1369, dtype: object

In [117]:
y_train_sub_heads = dataset_train['aggregated_subnarratives'].to_numpy()
y_val_sub_heads = dataset_val['aggregated_subnarratives'].to_numpy()

In [118]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiTaskClassifierMultiHead(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        num_narratives=len(mlb_narratives.classes_),
        narrative_to_sub_map=narrative_to_sub_map,
        dropout_rate=0.3
    ):
        super().__init__()
        # Shared layer
        self.shared_layer = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.BatchNorm1d(hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Top-level narratives: multi-label => Sigmoid
        self.narrative_head = nn.Sequential(
            nn.Linear(hidden_size * 2, num_narratives),
            nn.Sigmoid()
        )

        # Subnarrative heads: multi-label => Sigmoid
        self.subnarrative_heads = nn.ModuleDict()
        for narr_idx, sub_indices in narrative_to_sub_map.items():
            num_subs_for_this_narr = len(sub_indices)
            self.subnarrative_heads[str(narr_idx)] = nn.Sequential(
                nn.Linear(hidden_size * 2, num_subs_for_this_narr),
                nn.Sigmoid()
            )

    def forward(self, x):
        shared_out = self.shared_layer(x)
        narr_probs = self.narrative_head(shared_out)

        sub_probs_dict = {}
        for narr_idx, head in self.subnarrative_heads.items():
            sub_probs_dict[narr_idx] = head(shared_out)

        return narr_probs, sub_probs_dict

In [119]:
model_multi_head = MultiTaskClassifierMultiHead(
    input_size=input_size,
    hidden_size=512,
)

In [120]:
# For each subnarrative head, add a weighted version of BCE based on the indices
sub_criterion_dict = {}

for narr_idx, sub_indices in narrative_to_sub_map.items():
    local_weights = [ class_weights_sub_nar[sub_i] for sub_i in sub_indices ]

    sub_criterion = WeightedBCELoss(local_weights)
    sub_criterion_dict[str(narr_idx)] = sub_criterion

In [121]:
def multi_head_loss(narr_probs, sub_probs_dict, y_narr, y_sub_heads):
    narr_loss = narrative_criterion(narr_probs, y_narr)

    sub_loss = 0.0
    count_active = 0
    i = 0
    for narr_idx_str, sub_probs in sub_probs_dict.items():
        narr_idx = int(narr_idx_str)
        # Find the true subnarratives for the batch
        y_sub = [row[narr_idx] for row in y_sub_heads]
        y_sub_tensor = torch.tensor(y_sub, dtype=torch.float32)

        sub_loss_func = sub_criterion_dict[narr_idx_str]
        ce_loss = sub_loss_func(sub_probs, y_sub_tensor)

        sub_loss += ce_loss
        count_active += 1
        i += 1

    if count_active > 0:
        sub_loss = sub_loss / count_active
    else:
        sub_loss = 0.0

    total_loss = narr_loss + sub_loss

    return total_loss

In [122]:
def train_with_multihead(
    model,
    optimizer,
    narrative_criterion,
    subnarrative_criterion,
    train_embeddings=train_embeddings_tensor,
    y_train_nar=y_train_nar,
    y_train_sub_heads=y_train_sub_heads,
    val_embeddings=val_embeddings_tensor,
    y_val_nar=y_val_nar,
    y_val_sub_heads=y_val_sub_heads,
    patience=3,
    num_epochs=100,
):
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        train_narr_probs, train_sub_probs_dict = model(train_embeddings)
        train_loss = multi_head_loss(train_narr_probs, train_sub_probs_dict, y_train_nar, y_train_sub_heads)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_narr_probs, val_sub_probs_dict = model(val_embeddings)
            val_loss = multi_head_loss(val_narr_probs, val_sub_probs_dict, y_val_nar, y_val_sub_heads)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Training Loss: {train_loss.item():.4f} "
              f"Validation Loss: {val_loss.item():.4f} ")

        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            patience_counter = 0
            best_model = model.state_dict()
        else:
            patience_counter += 1
            print(f"Validation loss did not improve for {patience_counter} epoch(s).")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    if best_model:
        model.load_state_dict(best_model)

    return model

In [123]:
optimizer_multi_head = torch.optim.AdamW(model_multi_head.parameters(), lr=0.001)

In [124]:
train_with_multihead(
    model=model_multi_head,
    optimizer=optimizer_multi_head,
    narrative_criterion=narrative_criterion,
    subnarrative_criterion=subnarrative_criterion,
)

Epoch 1/100, Training Loss: 1.4551 Validation Loss: 1.4329 
Epoch 2/100, Training Loss: 1.1907 Validation Loss: 1.4251 
Epoch 3/100, Training Loss: 1.0658 Validation Loss: 1.4175 
Epoch 4/100, Training Loss: 0.9726 Validation Loss: 1.4102 
Epoch 5/100, Training Loss: 0.9029 Validation Loss: 1.4034 
Epoch 6/100, Training Loss: 0.8470 Validation Loss: 1.3964 
Epoch 7/100, Training Loss: 0.7977 Validation Loss: 1.3891 
Epoch 8/100, Training Loss: 0.7573 Validation Loss: 1.3813 
Epoch 9/100, Training Loss: 0.7228 Validation Loss: 1.3728 
Epoch 10/100, Training Loss: 0.6901 Validation Loss: 1.3639 
Epoch 11/100, Training Loss: 0.6636 Validation Loss: 1.3549 
Epoch 12/100, Training Loss: 0.6386 Validation Loss: 1.3456 
Epoch 13/100, Training Loss: 0.6122 Validation Loss: 1.3362 
Epoch 14/100, Training Loss: 0.5925 Validation Loss: 1.3267 
Epoch 15/100, Training Loss: 0.5731 Validation Loss: 1.3174 
Epoch 16/100, Training Loss: 0.5556 Validation Loss: 1.3082 
Epoch 17/100, Training Loss: 0.53

MultiTaskClassifierMultiHead(
  (shared_layer): Sequential(
    (0): Linear(in_features=896, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
  )
  (narrative_head): Sequential(
    (0): Linear(in_features=1024, out_features=21, bias=True)
    (1): Sigmoid()
  )
  (subnarrative_heads): ModuleDict(
    (8): Sequential(
      (0): Linear(in_features=1024, out_features=9, bias=True)
      (1): Sigmoid()
    )
    (9): Sequential(
      (0): Linear(in_features=1024, out_features=7, bias=True)
      (1): Sigmoid()
    )
    (17): Sequential(
      (0): Linear(in_features=1024, out_features=6, bias=True)
      (1): Sigmoid()
    )
    (19): Sequential(
      (0): Linear(in_features=1024, out_features=4, bias=True)
      (1): Sigmoid()
    )
    (10): Sequential(
      (0): Linear(in_features=1024, out_features=3, bias=True)
      (1): Sigmoid()
    )
    (1): Sequent

In [125]:
import numpy as np
import torch
from sklearn.metrics import classification_report, f1_score

def evaluate_multihead_model(
    model,
    embeddings,
    y_nar_true,
    y_sub_hierarchical,
    num_subnarratives = len(mlb_subnarratives.classes_),
    thresholds = np.arange(0.1, 1.0, 0.1),
    target_names_nar=mlb_narratives.classes_,
    target_names_sub=mlb_subnarratives.classes_,
    device='cpu',
):

    def build_global_sub_array(
        y_sub_hierarchical,
        num_subnarratives=74,
        narrative_to_sub_map=narrative_to_sub_map,
        narrative_order=narrative_order,
    ):
        """Reconstructs the subnarratives to flatten them (again) to a single array for evaluation"""
        num_samples = len(y_sub_hierarchical)
        sub_global_array = np.zeros((num_samples, num_subnarratives), dtype=int)

        for i in range(num_samples):
            for j, narr_idx in enumerate(narrative_order):
                sub_label_vec = y_sub_hierarchical[i][j]
                narr_idx = int(narr_idx)
                sub_indices = narrative_to_sub_map[narr_idx]
                for local_sub_i, global_sub_i in enumerate(sub_indices):
                    sub_global_array[i, global_sub_i] = sub_label_vec[local_sub_i]

        return sub_global_array

    embeddings = embeddings.to(device)
    y_nar_true_np = y_nar_true.cpu().numpy()

    best_threshold = 0
    best_f1 = -1
    best_report_nar = None
    best_report_sub = None
    samples = len(embeddings)

    with torch.no_grad():
        # get the predictions for both
        narr_probs, sub_probs_dict = model(embeddings)

        narr_probs = narr_probs.cpu().numpy()
        for k in sub_probs_dict:
            sub_probs_dict[k] = sub_probs_dict[k].cpu().numpy()

    for threshold in thresholds:
        narr_preds = (narr_probs >= threshold).astype(int)

        # Need to reconstruct the subnarratives to flatten them (again) to a single array for evaluation
        sub_preds_global = np.zeros((samples, num_subnarratives), dtype=int)

        for narr_idx, sub_indices in narrative_to_sub_map.items():
            # Get the predictions for this narrative hierarchy
            sub_probs_for_narr = sub_probs_dict[str(narr_idx)]
            # If top-level narrative is 1, then threshold subnarratives; otherwise 0.
            # Finds for each sample, go to the narr_idx position (the hierarchy we are at)
            predicted_narr_mask = narr_preds[:, narr_idx] == 1  # shape (num_samples,)

            # For all samples, threshold sub_probs_for_narr:
            sub_preds_for_narr = (sub_probs_for_narr >= threshold).astype(int)

            # But only keep sub_preds_for_narr if predicted_narr_mask is True:
            # If predicted_narr_mask is False for a sample, subnarratives go to 0.
            for sample_idx in range(samples):
                if predicted_narr_mask[sample_idx] == 1:
                    # Construct the flattened pred array
                    for local_sub_i, global_sub_i in enumerate(sub_indices):
                        sub_preds_global[sample_idx, global_sub_i] = sub_preds_for_narr[sample_idx, local_sub_i]
                else:
                    continue

        f1_nar = f1_score(y_nar_true_np, narr_preds, average="macro", zero_division=0)

        # Also flatten the true y_sub to a single array in the same way as we did with the predictions
        y_sub_true_np = build_global_sub_array(y_sub_hierarchical, num_subnarratives=num_subnarratives)

        f1_sub = f1_score(y_sub_true_np, sub_preds_global, average="macro", zero_division=0)

        avg_f1 = (f1_nar + f1_sub) / 2.0

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_threshold = threshold

            report_nar = classification_report(
                y_nar_true_np,
                narr_preds,
                target_names=target_names_nar,
                zero_division=0
            )
            report_sub = classification_report(
                y_sub_true_np,
                sub_preds_global,
                target_names=target_names_sub,
                zero_division=0
            )
            best_report_nar = report_nar
            best_report_sub = report_sub

    print(f"Best threshold = {best_threshold:.2f}, best (avg) F1 = {best_f1:.4f}")
    print("Best Narratives classification report:")
    print(best_report_nar)
    print("Best Subnarratives classification report:")
    print(best_report_sub)

In [126]:
evaluate_multihead_model(
    model=model_multi_head,
    embeddings=val_embeddings_tensor,
    y_nar_true=y_val_nar,
    y_sub_hierarchical=y_val_sub_heads,
)

Best threshold = 0.50, best (avg) F1 = 0.3748
Best Narratives classification report:
                                                   precision    recall  f1-score   support

                         Amplifying Climate Fears       0.80      0.96      0.87        46
                     Amplifying war-related fears       0.58      0.76      0.66        38
Blaming the war on others rather than the invader       0.32      0.59      0.42        37
                     Climate change is beneficial       0.00      0.00      0.00         1
             Controversy about green technologies       0.36      0.67      0.47         6
                    Criticism of climate movement       0.35      0.73      0.47        11
                    Criticism of climate policies       0.45      0.83      0.59        18
        Criticism of institutions and authorities       0.58      0.88      0.70        32
                             Discrediting Ukraine       0.65      0.83      0.73        75
    