# Semeval 2025 Task 10
### Subtask 2: Narrative Baseline Classification -- Multilingual

Given a news article and a [two-level taxonomy of narrative labels](https://propaganda.math.unipd.it/semeval2025task10/NARRATIVE-TAXONOMIES.pdf) (where each narrative is subdivided into subnarratives) from a particular domain, assign to the article all the appropriate subnarrative labels. This is a multi-label multi-class document classification task.

## 1. Setup

### 1.1 Getting and analyzing data

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import seaborn as sns
import os

In [2]:
data = []
ignore_folders = ['.DS_Store']

base_dir_documents = '../data/semeval_data/raw-documents'

for language_folder in os.listdir(base_dir_documents):

    if language_folder in ignore_folders:
        continue

    language_path = os.path.join(base_dir_documents, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)

                    article_id = file
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    data.append({
                        'language': language_folder,
                        'article_id': article_id,
                        'content': content
                    })

documents_df = pd.DataFrame(data)

In [3]:
print(documents_df.shape)
documents_df.head()

(726, 3)


Unnamed: 0,language,article_id,content
0,PT,PT_53.txt,Mais de 1600 Cientistas Negam a Emergência Cli...
1,PT,PT_47.txt,Zelensky admite a inevitabilidade da retirada ...
2,PT,PT_90.txt,Sanções à Rússia tiveram pouco impacto na guer...
3,PT,PT_84.txt,Zelensky planeia segunda cimeira de paz em nov...
4,PT,PT_166.txt,"Ministro da Agricultura critica ""radicais verd..."


In [4]:
base_dir_labels = '../data/semeval_data/labels'

raw_annotation_data = []

for language_folder in os.listdir(base_dir_labels):

    if language_folder in ignore_folders:
        continue

    print('Now processing language', language_folder)

    language_path = os.path.join(base_dir_labels, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            label_file = 'subtask-2-annotations.txt'
            file_path = os.path.join(root, label_file)

            with open(file_path, 'r') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    article_id = parts[0]
                    narrative_to_subnarratives = parts[2].split(';')
                    narratives = []
                    subnarratives = []

                    for nar_to_sub in narrative_to_subnarratives:
                      subnarrative_list = nar_to_sub.split(' ')
                      if subnarrative_list[0] == 'Other':
                        narratives.append('Other')
                        subnarratives.append('Other')
                        continue

                      nar_to_sub = ' '.join(subnarrative_list[1:])
                      nar, sub = nar_to_sub.split(':')
                      narratives.append(nar.strip())
                      subnarratives.append(sub.strip())

                    raw_annotation_data.append({
                        'article_id': article_id,
                        'narratives': narratives,
                        'subnarratives': subnarratives
                    })

annotations_df = pd.DataFrame(raw_annotation_data)

Now processing language PT
Now processing language BG
Now processing language HI
Now processing language EN


In [5]:
annotations_df.head()

Unnamed: 0,article_id,narratives,subnarratives
0,PT_161.txt,"[Amplifying Climate Fears, Amplifying Climate ...","[Other, Amplifying existing fears of global wa..."
1,PT_13.txt,"[Praise of Russia, Discrediting Ukraine]","[Praise of Russian military might, Discreditin..."
2,PT_153.txt,"[Amplifying Climate Fears, Amplifying Climate ...","[Earth will be uninhabitable soon, Amplifying ..."
3,PT_196.txt,"[Criticism of institutions and authorities, Am...","[Criticism of national governments, Other]"
4,PT_56.txt,"[Criticism of institutions and authorities, Am...","[Criticism of national governments, Amplifying..."


In [6]:
annotations_df.tail()

Unnamed: 0,article_id,narratives,subnarratives
721,EN_UA_013257.txt,"[Russia is the Victim, Blaming the war on othe...",[Russia actions in Ukraine are only self-defen...
722,EN_UA_000104.txt,[Other],[Other]
723,EN_UA_102958.txt,"[Amplifying war-related fears, Amplifying war-...","[Other, There is a real possibility that nucle..."
724,EN_UA_027787.txt,[Blaming the war on others rather than the inv...,"[The West are the aggressors, Ukraine is the a..."
725,EN_CC_100139.txt,[Other],[Other]


In [7]:
annotations_df.shape

(726, 3)

In [8]:
dataset = pd.merge(documents_df, annotations_df, on='article_id')
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,PT,PT_53.txt,Mais de 1600 Cientistas Negam a Emergência Cli...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a..."
1,PT,PT_47.txt,Zelensky admite a inevitabilidade da retirada ...,"[Discrediting Ukraine, Overpraising the West]","[Discrediting Ukrainian military, Other]"
2,PT,PT_90.txt,Sanções à Rússia tiveram pouco impacto na guer...,"[Praise of Russia, Russia is the Victim, Discr...","[Other, Other, Other]"
3,PT,PT_84.txt,Zelensky planeia segunda cimeira de paz em nov...,[Discrediting Ukraine],[Other]
4,PT,PT_166.txt,"Ministro da Agricultura critica ""radicais verd...","[Criticism of climate movement, Criticism of c...","[Other, Other]"


In [9]:
def extract_article_id(filename):
    number_part = filename.split('_')[-1].split('.')[0]
    return number_part

print(extract_article_id('EN_UA_103861.txt'))

103861


In [10]:
dataset['article_id'] = dataset['article_id'].apply(extract_article_id)

In [11]:
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a..."
1,PT,47,Zelensky admite a inevitabilidade da retirada ...,"[Discrediting Ukraine, Overpraising the West]","[Discrediting Ukrainian military, Other]"
2,PT,90,Sanções à Rússia tiveram pouco impacto na guer...,"[Praise of Russia, Russia is the Victim, Discr...","[Other, Other, Other]"
3,PT,84,Zelensky planeia segunda cimeira de paz em nov...,[Discrediting Ukraine],[Other]
4,PT,166,"Ministro da Agricultura critica ""radicais verd...","[Criticism of climate movement, Criticism of c...","[Other, Other]"


In [12]:
dataset.shape

(726, 5)

In [13]:
dataset['narratives']

0      [Downplaying climate change, Criticism of inst...
1          [Discrediting Ukraine, Overpraising the West]
2      [Praise of Russia, Russia is the Victim, Discr...
3                                 [Discrediting Ukraine]
4      [Criticism of climate movement, Criticism of c...
                             ...                        
721    [Criticism of institutions and authorities, Cr...
722                   [Discrediting the West, Diplomacy]
723                                              [Other]
724                       [Amplifying war-related fears]
725                           [Speculating war outcomes]
Name: narratives, Length: 726, dtype: object

In [14]:
unique_narratives = dataset['narratives'].explode().unique()
unique_narratives

array(['Downplaying climate change',
       'Criticism of institutions and authorities',
       'Questioning the measurements and science',
       'Climate change is beneficial', 'Criticism of climate policies',
       'Criticism of climate movement', 'Discrediting Ukraine',
       'Overpraising the West', 'Praise of Russia',
       'Russia is the Victim', 'Discrediting the West, Diplomacy',
       'Other', 'Amplifying Climate Fears',
       'Controversy about green technologies', 'Speculating war outcomes',
       'Amplifying war-related fears',
       'Negative Consequences for the West',
       'Blaming the war on others rather than the invader',
       'Hidden plots by secret schemes of powerful groups',
       'Distrust towards Media',
       'Green policies are geopolitical instruments'], dtype=object)

In [15]:
print(len(dataset['narratives'].explode().value_counts()))
dataset['narratives'].explode().value_counts()

21


narratives
Discrediting Ukraine                                 240
Discrediting the West, Diplomacy                     238
Praise of Russia                                     170
Other                                                147
Amplifying war-related fears                         130
Amplifying Climate Fears                             123
Russia is the Victim                                 107
Blaming the war on others rather than the invader     93
Criticism of institutions and authorities             79
Criticism of climate policies                         59
Speculating war outcomes                              59
Negative Consequences for the West                    56
Criticism of climate movement                         35
Distrust towards Media                                34
Overpraising the West                                 34
Hidden plots by secret schemes of powerful groups     31
Downplaying climate change                            26
Controversy about gr

In [16]:
unique_subnarratives = dataset['subnarratives'].explode().unique()
unique_subnarratives

array(['Other', 'Criticism of political organizations and figures',
       'Scientific community is unreliable',
       'Humans and nature will adapt to the changes', 'CO2 is beneficial',
       'Climate policies have negative impact on the economy',
       'Climate cycles are natural',
       'Methodologies/metrics used are unreliable/faulty',
       'Climate movement is alarmist', 'Discrediting Ukrainian military',
       'Amplifying existing fears of global warming',
       'Doomsday scenarios for humans',
       'Criticism of national governments', 'The West is russophobic',
       'Praise of Russian military might',
       'Climate policies are ineffective',
       'Renewable energy is dangerous',
       'Russia is a guarantor of peace and prosperity',
       'Ukraine is a hub for criminal activities',
       'Russia actions in Ukraine are only self-defence',
       'Ukraine is associated with nazism',
       'Ukraine is a puppet of the West',
       'Situation in Ukraine is hopel

In [17]:
len(unique_subnarratives)

71

In [18]:
pd.set_option('display.max_rows', 100)

dataset['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     497
The West are the aggressors                                                64
Discrediting Ukrainian government and officials and policies               63
Amplifying existing fears of global warming                                56
Praise of Russian military might                                           56
Ukraine is a puppet of the West                                            49
The West does not care about Ukraine, only about its interests             47
Russia is a guarantor of peace and prosperity                              44
The West is russophobic                                                    40
There is a real possibility that nuclear weapons will be employed          39
The West is weak                                                           35
Russia will also attack other countries                                    34
Situation in Ukraine is hopeless                  

In [19]:
dataset['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     497
The West are the aggressors                                                64
Discrediting Ukrainian government and officials and policies               63
Amplifying existing fears of global warming                                56
Praise of Russian military might                                           56
Ukraine is a puppet of the West                                            49
The West does not care about Ukraine, only about its interests             47
Russia is a guarantor of peace and prosperity                              44
The West is russophobic                                                    40
There is a real possibility that nuclear weapons will be employed          39
The West is weak                                                           35
Russia will also attack other countries                                    34
Situation in Ukraine is hopeless                  

### 1.2 Encoding classification labels

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_narratives = MultiLabelBinarizer()
mlb_subnarratives = MultiLabelBinarizer()

In [21]:
narratives_binary = mlb_narratives.fit_transform(dataset['narratives'])
subnarratives_binary = mlb_subnarratives.fit_transform(dataset['subnarratives'])

dataset['narratives_encoded'] = narratives_binary.tolist()
dataset['subnarratives_encoded'] = subnarratives_binary.tolist()

In [22]:
dataset.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,PT,53,Mais de 1600 Cientistas Negam a Emergência Cli...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a...","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ..."
1,PT,47,Zelensky admite a inevitabilidade da retirada ...,"[Discrediting Ukraine, Overpraising the West]","[Discrediting Ukrainian military, Other]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,PT,90,Sanções à Rússia tiveram pouco impacto na guer...,"[Praise of Russia, Russia is the Victim, Discr...","[Other, Other, Other]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,PT,84,Zelensky planeia segunda cimeira de paz em nov...,[Discrediting Ukraine],[Other],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,PT,166,"Ministro da Agricultura critica ""radicais verd...","[Criticism of climate movement, Criticism of c...","[Other, Other]","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
remove_narratives = dataset['narratives'].explode().value_counts()
remove_narratives_idx = remove_narratives[remove_narratives < 2].index.tolist()

remove_subnarratives = dataset['subnarratives'].explode().value_counts()
remove_subnarratives_idx = remove_subnarratives[remove_subnarratives < 2].index.tolist()

In [24]:
def remove_labels_from_list(labels, remove_idx):
    return [label for label in labels if label not in remove_idx]

dataset['narratives'] = dataset['narratives'].apply(lambda x: remove_labels_from_list(x, remove_narratives_idx))

dataset['subnarratives'] = dataset['subnarratives'].apply(lambda x: remove_labels_from_list(x, remove_subnarratives_idx))

In [25]:
dataset['narratives'].explode().value_counts()

narratives
Discrediting Ukraine                                 240
Discrediting the West, Diplomacy                     238
Praise of Russia                                     170
Other                                                147
Amplifying war-related fears                         130
Amplifying Climate Fears                             123
Russia is the Victim                                 107
Blaming the war on others rather than the invader     93
Criticism of institutions and authorities             79
Criticism of climate policies                         59
Speculating war outcomes                              59
Negative Consequences for the West                    56
Criticism of climate movement                         35
Distrust towards Media                                34
Overpraising the West                                 34
Hidden plots by secret schemes of powerful groups     31
Downplaying climate change                            26
Controversy about gr

In [26]:
subnarratives_counts = dataset['subnarratives'].explode().value_counts()
print(len(subnarratives_counts))
subnarratives_counts

67


subnarratives
Other                                                                     497
The West are the aggressors                                                64
Discrediting Ukrainian government and officials and policies               63
Praise of Russian military might                                           56
Amplifying existing fears of global warming                                56
Ukraine is a puppet of the West                                            49
The West does not care about Ukraine, only about its interests             47
Russia is a guarantor of peace and prosperity                              44
The West is russophobic                                                    40
There is a real possibility that nuclear weapons will be employed          39
The West is weak                                                           35
Russia will also attack other countries                                    34
Situation in Ukraine is hopeless                  

### 1.3 Cleaning articles

In [27]:
language_model_map = {
    "BG": "xx_ent_wiki_sm",
    "EN": "en_core_web_sm",
    "HI": "xx_ent_wiki_sm",
    "PT": "pt_core_news_sm",
}

In [28]:
!pip3 -q install emoji

In [30]:
import spacy
import re
import emoji

nlp_models = {lang: spacy.load(model) for lang, model in language_model_map.items()}

def clean_article(article_text, language_code):
    nlp = nlp_models.get(language_code, nlp_models["EN"])

    article_text = re.sub(r'http\S+|www\S+|https\S+|[a-zA-Z0-9.-]+\.com|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|@[A-Za-z0-9_]+', '', article_text)

    doc = nlp(article_text)

    cleaned_tokens = []

    important_entity_types = ["PERSON", "ORG", "GPE"]

    for token in doc:
        if token.is_space or emoji.is_emoji(token.text):
            continue

        if token.ent_type_ in important_entity_types:
            cleaned_tokens.append(token.text + token.whitespace_)
        else:
            cleaned_tokens.append(token.text.lower() + token.whitespace_)

    return "".join(cleaned_tokens).strip()

dataset["content"] = dataset.apply(lambda row: clean_article(row["content"], row["language"]), axis=1)

In [31]:
!pip install -q iterative-stratification

In [32]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def stratified_train_val_split(data, labels_column, train_size=0.8, splits=5, shuffle=True, min_instances=2):
    if shuffle:
        shuffled_indices = np.arange(len(data))
        np.random.shuffle(shuffled_indices)
        data = data.iloc[shuffled_indices].reset_index(drop=True)

    labels = np.array(data[labels_column].tolist())
    rare_indices = []
    common_indices = []

    class_counts = labels.sum(axis=0)
    rare_classes = np.where(class_counts <= min_instances)[0]

    for idx, label_row in enumerate(labels):
        if any(label_row[rare_classes]):
            rare_indices.append(idx)
        else:
            common_indices.append(idx)

    rare_data = data.iloc[rare_indices]
    rare_labels = labels[rare_indices]
    train_rare = rare_data.iloc[:len(rare_data) // 2].reset_index(drop=True)
    val_rare = rare_data.iloc[len(rare_data) // 2:].reset_index(drop=True)

    common_data = data.iloc[common_indices].reset_index(drop=True)
    common_labels = labels[common_indices]

    mskf = MultilabelStratifiedKFold(n_splits=splits)
    for train_idx, val_idx in mskf.split(np.zeros(len(common_labels)), common_labels):
        train_common = common_data.iloc[train_idx]
        val_common = common_data.iloc[val_idx]
        break

    train_data = pd.concat([train_rare, train_common]).reset_index(drop=True)
    val_data = pd.concat([val_rare, val_common]).reset_index(drop=True)

    return train_data, val_data

(dataset_train), (dataset_val) = stratified_train_val_split(
    dataset,
    labels_column="subnarratives_encoded",
    min_instances=2
)

In [33]:
train_sub_nar_counts = dataset_train['subnarratives'].explode().value_counts()
print(len(train_sub_nar_counts))
train_sub_nar_counts

66


subnarratives
Other                                                                     388
The West are the aggressors                                                52
Discrediting Ukrainian government and officials and policies               51
Amplifying existing fears of global warming                                45
Praise of Russian military might                                           45
Ukraine is a puppet of the West                                            39
The West does not care about Ukraine, only about its interests             37
Russia is a guarantor of peace and prosperity                              35
The West is russophobic                                                    32
There is a real possibility that nuclear weapons will be employed          31
The West is weak                                                           28
Situation in Ukraine is hopeless                                           27
Russia will also attack other countries           

In [34]:
val_sub_nar_counts = dataset_val['subnarratives'].explode().value_counts()
print(len(val_sub_nar_counts))
val_sub_nar_counts

62


subnarratives
Other                                                                     109
The West are the aggressors                                                12
Discrediting Ukrainian government and officials and policies               12
Praise of Russian military might                                           11
Amplifying existing fears of global warming                                11
The West does not care about Ukraine, only about its interests             10
Ukraine is a puppet of the West                                            10
Russia is a guarantor of peace and prosperity                               9
There is a real possibility that nuclear weapons will be employed           8
The West is russophobic                                                     8
Criticism of national governments                                           8
Situation in Ukraine is hopeless                                            7
Russia will also attack other countries           

### 1.3.2 Getting more data

In [35]:
lower_freq_bound = 20
low_freq_narratives = dataset_train['narratives'].explode().value_counts()
low_freq_narratives = low_freq_narratives[low_freq_narratives < lower_freq_bound].index.tolist()

low_freq_subnarratives = dataset_train['subnarratives'].explode().value_counts()
low_freq_subnarratives = low_freq_subnarratives[low_freq_subnarratives < lower_freq_bound].index.tolist()

In [36]:
low_freq_narratives

['Downplaying climate change',
 'Questioning the measurements and science',
 'Controversy about green technologies',
 'Green policies are geopolitical instruments',
 'Climate change is beneficial']

In [37]:
low_freq_subnarratives

['Diplomacy does/will not work',
 'Western media is an instrument of propaganda',
 'By continuing the war we risk WWIII',
 'Ukraine is a hub for criminal activities',
 'Praise of Russian President Vladimir Putin',
 'Ukrainian army is collapsing',
 'The EU is divided',
 'Climate policies have negative impact on the economy',
 'Sanctions imposed by Western countries will backfire',
 'The West has the strongest international support',
 'Ukraine is associated with nazism',
 'Doomsday scenarios for humans',
 'Climate policies are ineffective',
 'West is tired of Ukraine',
 'The West belongs in the right side of history',
 'Earth will be uninhabitable soon',
 'Russian army is collapsing',
 'Climate movement is alarmist',
 'NATO should/will directly intervene',
 'Blaming global elites',
 'UA is anti-RU extremists',
 'Criticism of international entities',
 'Climate policies are only for profit',
 'Climate cycles are natural',
 'Climate agenda has hidden motives',
 'Methodologies/metrics used a

In [38]:
low_freq_data = dataset_train[
    dataset_train['narratives'].apply(lambda x: any(label in low_freq_narratives for label in x)) |
    dataset_train['subnarratives'].apply(lambda x: any(label in low_freq_subnarratives for label in x))
]

In [39]:
print(low_freq_data.shape)
low_freq_data.head()

(238, 7)


Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,PT,53,mais de 1600 cientistas negam a emergência cli...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a...","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ..."
1,EN,100124,glaciers give the lie to global warming narrat...,"[Downplaying climate change, Downplaying clima...","[Ice is not melting, Climate cycles are natura...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,PT,52,mudança climática: o tema que não é bem-vindo ...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a...","[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,HI,120,"अब रूस-यूक्रेन युद्ध में मच सकता है कोहराम, us...","[Speculating war outcomes, Overpraising the We...","[Ukrainian army is collapsing, NATO will destr...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,HI,137,बदलती जलवायु में पौधों द्वारा हमारी सोच से कही...,"[Downplaying climate change, Amplifying Climat...","[Humans and nature will adapt to the changes, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [40]:
low_freq_data['language'].value_counts()

language
BG    76
EN    63
PT    56
HI    43
Name: count, dtype: int64

In [41]:
!pip install -q googletrans==4.0.0-rc1

In [42]:
from googletrans import Translator
from time import sleep

translator = Translator()

language_map = {
    'EN': 'en',  # English
    'BG': 'bg',  # Bulgarian
    'PT': 'pt',  # Portuguese
    'HI': 'hi',  # Hindi
}

def translate_text_google(text, target_lang='en'):
    try:
        # sleeping to avoid rate-limiting
        sleep(1)
        translated = translator.translate(text, dest=target_lang)
        return translated.text
    except Exception as e:
        print(f"Translation failed for language {target_lang} with error: {e}")
        return ""

text = "Hola, ¿cómo estás?"
translated_text = translate_text_google(text)
print(translated_text)

Hello how are you?


In [43]:
def augment_with_translation(text, lang='EN'):
    try:
        if lang != 'EN':
            # translate to English, then back to original language
            translated_text = translate_text_google(text, target_lang='en')
            return translate_text_google(translated_text, target_lang=language_map.get(lang, 'EN'))
        else:
            # translate to FR, then back to EN
            temp_translation = translate_text_google(text, target_lang='fr')
            return translate_text_google(temp_translation, target_lang='en')
    except Exception as e:
        print(f"Error processing text: {e}")
        return text

In [44]:
data_augment = True
augmented_df = None

def translate_and_augment_data(low_freq_data):
    augmented_data = []
    low_freq_data = low_freq_data.reset_index(drop=True)

    for index, row in low_freq_data.iterrows():
        original_text = row['content']
        language = row['language']
        article_id = row['article_id']
        narratives = row['narratives']
        subnarratives = row['subnarratives']
        narratives_encoded = row['narratives_encoded']
        subnarratives_encoded = row['subnarratives_encoded']

        try:
            if language != 'EN':
                translated_text = augment_with_translation(original_text, lang=language)
            else:
                translated_text = augment_with_translation(original_text, lang='EN')

            augmented_data.append({
                'language': language,
                'article_id': article_id,
                'content': translated_text,
                'narratives': narratives,
                'subnarratives': subnarratives,
                'narratives_encoded': narratives_encoded,
                'subnarratives_encoded': subnarratives_encoded
            })
        except Exception as e:
            print(f"Error processing row {index}: {e}")

    return pd.DataFrame(augmented_data)

if data_augment: augmented_df = translate_and_augment_data(low_freq_data)

Translation failed for language en with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language hi with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language en with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language hi with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language en with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language hi with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language fr with error: [Errno 54] Connection reset by peer
Translation failed for language en with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language en with error: the JSON object must be str, bytes or bytearray, not NoneType
Translation failed for language hi with error

In [45]:
blank_content = augmented_df[augmented_df['content'].str.strip() == '']
len(blank_content)

14

In [46]:
augmented_df = augmented_df[augmented_df['content'].str.strip() != '']

In [47]:
print(len(augmented_df))
augmented_df.head()

224


Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,PT,53,Mais de 1600 cientistas negam a emergência cli...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a...","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ..."
1,EN,100124,The glaciers put the lying to global warming T...,"[Downplaying climate change, Downplaying clima...","[Ice is not melting, Climate cycles are natura...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,PT,52,Mudança climática: o tema que não é bem -vindo...,"[Downplaying climate change, Criticism of inst...","[Other, Criticism of political organizations a...","[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,HI,120,"अब रूस-यूक्रेन युद्ध में गठित एक हंगामे, अमेरि...","[Speculating war outcomes, Overpraising the We...","[Ukrainian army is collapsing, NATO will destr...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,BG,892,Глобално затопляне? Няма причина за безпокойст...,"[Downplaying climate change, Downplaying clima...","[Ice is not melting, Climate cycles are natural]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
augmented_df['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     121
Discrediting Ukrainian government and officials and policies               20
Ukraine is a puppet of the West                                            20
Diplomacy does/will not work                                               19
The West are the aggressors                                                19
Amplifying existing fears of global warming                                18
The West is russophobic                                                    17
Western media is an instrument of propaganda                               17
The West is weak                                                           17
By continuing the war we risk WWIII                                        17
The EU is divided                                                          16
Ukraine is a hub for criminal activities                                   16
Praise of Russian President Vladimir Putin        

In [49]:
print(augmented_df['narratives_encoded'].head())

0    [0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
2    [0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
5    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
Name: narratives_encoded, dtype: object


In [50]:
print(augmented_df['subnarratives_encoded'].head())

0    [0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...
1    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
Name: subnarratives_encoded, dtype: object


In [51]:
dataset_train = pd.concat([dataset_train, augmented_df], ignore_index=True)

In [52]:
dataset_train.shape

(803, 7)

In [53]:
dataset_train['narratives'].explode().value_counts()

narratives
Discrediting the West, Diplomacy                     301
Discrediting Ukraine                                 293
Praise of Russia                                     196
Amplifying war-related fears                         156
Amplifying Climate Fears                             147
Russia is the Victim                                 123
Other                                                122
Criticism of institutions and authorities            101
Blaming the war on others rather than the invader     98
Speculating war outcomes                              83
Criticism of climate policies                         81
Negative Consequences for the West                    67
Distrust towards Media                                47
Overpraising the West                                 47
Hidden plots by secret schemes of powerful groups     45
Criticism of climate movement                         43
Downplaying climate change                            34
Questioning the meas

### 1.4 Getting embeddings for the articles

In [54]:
embeddings_dir = '../data/embeddings/narrative_classification_multilingual/'
embedding_file_name = 'all_embeddings.npy'
embeddings_full_path = embeddings_dir + embedding_file_name

In [55]:
import os

def are_embeddings_saved(filepath):
    if os.path.exists(filepath):
        return True
    return False

In [56]:
are_embeddings_saved(embeddings_full_path)

False

In [62]:
from sentence_transformers import SentenceTransformer
labse_model = SentenceTransformer('sentence-transformers/LaBSE')

In [58]:
texts = [
    "This is a news article about politics.",  # English
    "यह राजनीति के बारे में एक समाचार लेख है।",  # Hindi
    "Este é um artigo de notícias sobre política.",  # Portuguese
    "Това е новинарска статия за политика.",  # Bulgarian
    "The sun rises in the east."  # Unrelated English sentence
]

embeddings = labse_model.encode(texts)

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim_matrix = cosine_similarity(embeddings)

print("Cosine Similarity Matrix:")
print(cos_sim_matrix)

Cosine Similarity Matrix:
[[1.         0.8829176  0.9265815  0.8651781  0.19801825]
 [0.8829176  1.0000002  0.96578115 0.89237857 0.23692542]
 [0.9265815  0.96578115 1.0000002  0.9107212  0.21260464]
 [0.8651781  0.89237857 0.9107212  1.0000002  0.18266794]
 [0.19801825 0.23692542 0.21260464 0.18266794 1.        ]]


In [60]:
def get_embeddings(contents, model):
    embeddings = []
    for content in contents:
        embedding = model.encode(content)
        embeddings.append(embedding)
    return np.array(embeddings)

train_embeddings = get_embeddings(dataset_train['content'], labse_model)

In [63]:
train_embeddings.shape

(803, 768)

In [64]:
val_embeddings = get_embeddings(dataset_val['content'], labse_model)

In [65]:
val_embeddings.shape

(147, 768)

In [66]:
y_train_nar = dataset_train['narratives_encoded'].tolist()
y_val_nar = dataset_val['narratives_encoded'].tolist()

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr_logistic = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))

In [68]:
train_embeddings.shape

(803, 768)

In [69]:
ovr_logistic.fit(train_embeddings, y_train_nar)

In [70]:
import warnings
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

def get_classification_report(y_true, y_pred):
  # We will ignore the warnings we get
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        report = classification_report(y_true, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    return report_df

def get_cross_val_score(model, x, y, scoring='f1_macro', splits=3):
    """Perform cross-validation and compute scores."""
    cv = StratifiedKFold(n_splits=splits, shuffle=True)
    cross_val_scores = cross_val_score(model, x, y, cv=cv, scoring=scoring)
    print(f"Cross-validation scores: {cross_val_scores}")
    print(f"Mean CV F1 Score: {cross_val_scores.mean()}")

In [71]:
import warnings
from sklearn.metrics import (
    hamming_loss,
)

def evaluate_model(model, x, y_true):
    y_pred = model.predict(x)

    classification_report_df = get_classification_report(y_true, y_pred)
    print("Classification Report:")
    print(classification_report_df)
    print("\n")

    hamming = hamming_loss(y_true, y_pred)
    print(f"Hamming Loss: {hamming:.4f}")
    print("\n")

In [72]:
evaluate_model(ovr_logistic, val_embeddings, y_val_nar)

Classification Report:
              precision    recall  f1-score  support
0              0.541667  0.764706  0.634146     17.0
1              0.416667  0.714286  0.526316     21.0
2              0.212766  0.625000  0.317460     16.0
3              0.000000  0.000000  0.000000      2.0
4              0.400000  0.800000  0.533333      5.0
5              0.466667  1.000000  0.636364      7.0
6              0.277778  0.909091  0.425532     11.0
7              0.256410  0.833333  0.392157     12.0
8              0.604167  0.852941  0.707317     34.0
9              0.430769  0.823529  0.565657     34.0
10             0.333333  0.571429  0.421053      7.0
11             0.454545  0.714286  0.555556      7.0
12             0.000000  0.000000  0.000000      1.0
13             0.090909  0.400000  0.148148      5.0
14             0.333333  0.727273  0.457143     11.0
15             0.523810  0.880000  0.656716     25.0
16             0.166667  0.428571  0.240000      7.0
17             0.430769

In [73]:
y_train_sub_nar = dataset_train['subnarratives_encoded'].tolist()
y_val_sub_nar = dataset_val['subnarratives_encoded'].tolist()

In [74]:
ovr_logistic = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
ovr_logistic.fit(train_embeddings, y_train_sub_nar)



In [75]:
evaluate_model(ovr_logistic, val_embeddings, y_val_sub_nar)

Classification Report:
              precision    recall  f1-score  support
0              1.000000  0.666667  0.800000      3.0
1              0.473684  0.818182  0.600000     11.0
2              0.222222  1.000000  0.363636      2.0
3              0.200000  0.600000  0.300000      5.0
4              0.000000  0.000000  0.000000      1.0
5              0.000000  0.000000  0.000000      2.0
6              0.000000  0.000000  0.000000      1.0
7              0.166667  0.500000  0.250000      2.0
8              0.250000  0.666667  0.363636      3.0
9              0.250000  0.500000  0.333333      2.0
10             0.071429  0.333333  0.117647      3.0
11             0.000000  0.000000  0.000000      2.0
12             0.200000  0.750000  0.315789      4.0
13             0.000000  0.000000  0.000000      3.0
14             0.218750  0.875000  0.350000      8.0
15             0.050000  0.200000  0.080000      5.0
16             0.000000  0.000000  0.000000      1.0
17             0.000000

In [76]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier

xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

multi_xgb_classifier = MultiOutputClassifier(xgb_classifier)
multi_xgb_classifier.fit(train_embeddings, y_train_nar)

In [77]:
evaluate_model(multi_xgb_classifier, val_embeddings, y_val_nar)

Classification Report:
              precision    recall  f1-score  support
0              0.750000  0.529412  0.620690     17.0
1              0.666667  0.476190  0.555556     21.0
2              0.000000  0.000000  0.000000     16.0
3              0.000000  0.000000  0.000000      2.0
4              0.000000  0.000000  0.000000      5.0
5              0.000000  0.000000  0.000000      7.0
6              1.000000  0.090909  0.166667     11.0
7              0.250000  0.083333  0.125000     12.0
8              0.684211  0.382353  0.490566     34.0
9              0.608696  0.411765  0.491228     34.0
10             1.000000  0.142857  0.250000      7.0
11             1.000000  0.285714  0.444444      7.0
12             0.000000  0.000000  0.000000      1.0
13             0.000000  0.000000  0.000000      5.0
14             0.666667  0.181818  0.285714     11.0
15             0.818182  0.360000  0.500000     25.0
16             0.333333  0.142857  0.200000      7.0
17             0.647059

In [78]:
multi_xgb_classifier.fit(train_embeddings, y_train_sub_nar)

In [79]:
evaluate_model(multi_xgb_classifier, val_embeddings, y_val_sub_nar)

Classification Report:
              precision    recall  f1-score  support
0              0.000000  0.000000  0.000000      3.0
1              0.833333  0.454545  0.588235     11.0
2              0.000000  0.000000  0.000000      2.0
3              1.000000  0.200000  0.333333      5.0
4              0.000000  0.000000  0.000000      1.0
5              0.000000  0.000000  0.000000      2.0
6              0.000000  0.000000  0.000000      1.0
7              0.000000  0.000000  0.000000      2.0
8              0.000000  0.000000  0.000000      3.0
9              0.000000  0.000000  0.000000      2.0
10             0.000000  0.000000  0.000000      3.0
11             0.000000  0.000000  0.000000      2.0
12             0.000000  0.000000  0.000000      4.0
13             0.000000  0.000000  0.000000      3.0
14             0.500000  0.125000  0.200000      8.0
15             0.333333  0.200000  0.250000      5.0
16             0.000000  0.000000  0.000000      1.0
17             0.000000

### Fine-tuning Roberta to predict narratives

In [83]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [84]:
def tokenize_data(entity_contexts, max_length=512):
    encodings = tokenizer(entity_contexts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    return encodings

In [85]:
train_encodings = tokenize_data(dataset_train['content'].tolist())
val_encodings = tokenize_data(dataset_val['content'].tolist())

In [86]:
train_narrative_truths = dataset_train['narratives_encoded'].tolist()

train_narrative_truths[:5]

[[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]]

In [87]:
val_narrative_truths = dataset_val['narratives_encoded'].tolist()

val_narrative_truths[:5]

[[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [88]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

model_config = {
    'batch_size': 8,
    'num_epochs': 10,
    'lr': 3e-5,
}

In [89]:
from transformers import XLMRobertaModel

class NarrativeClassificationRoberta(nn.Module):
    def __init__(self, narrative_classes):
        super(NarrativeClassificationRoberta, self).__init__()
        self.narrative_classes = narrative_classes
        self.backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.classifier = nn.Linear(self.backbone.config.hidden_size, narrative_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]

        narrative_logits = self.classifier(pooled_output)

        return narrative_logits

narrative_classifier_roberta = NarrativeClassificationRoberta(narrative_classes=len(mlb_narratives.classes_))

In [90]:
class NarrativesDataset(Dataset):
    def __init__(self, encodings, narrative_labels):
        self.encodings = encodings
        self.narrative_labels = narrative_labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['narrative_labels'] = torch.tensor(self.narrative_labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.narrative_labels)

train_dataset_nar = NarrativesDataset(
    train_encodings,
    dataset_train['narratives_encoded'].tolist(),
)

val_dataset_nar = NarrativesDataset(
    val_encodings,
    dataset_val['narratives_encoded'].tolist(),
)

train_loader_nar = DataLoader(train_dataset_nar, batch_size=model_config['batch_size'], shuffle=True)
val_loader_nar = DataLoader(val_dataset_nar, batch_size=model_config['batch_size'], shuffle=False)

In [92]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer_nar = optim.AdamW(narrative_classifier_roberta.parameters(), lr=model_config['lr'])

narrative_classifier_roberta.to(device)

NarrativeClassificationRoberta(
  (backbone): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [93]:
from torch.nn.functional import binary_cross_entropy_with_logits

def bce_with_weights(outputs, targets, weights=None):
    criterion = torch.nn.BCEWithLogitsLoss(reduction='none')
    loss = criterion(outputs, targets)

    if weights is not None:
        weights = weights.to(outputs.device)
        loss = loss * weights

    loss = loss.mean()
    return loss, loss.item()

In [94]:
from transformers import get_linear_schedule_with_warmup

total_steps = len(train_loader_nar) * model_config['num_epochs']
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer_nar, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [95]:
import torch
import pandas as pd

def compute_class_weights(dataset, label_column, classes, max_weight=10, epsilon=1e-8):
    label_df = pd.DataFrame(dataset[label_column].tolist(), columns=classes)
    class_counts = label_df.sum().sort_values(ascending=False)
    class_counts = class_counts.reindex(classes)

    total_samples = len(dataset)

    class_weights = {
        i: total_samples / (len(class_counts) * (count + epsilon))
        for i, count in enumerate(class_counts.values)
    }

    weights_tensor = torch.tensor(list(class_weights.values()), dtype=torch.float)
    weights_tensor = torch.clamp(weights_tensor, max=max_weight)

    for i, class_name in enumerate(classes):
        print(f"Class: {class_name}, Weight: {weights_tensor[i].item()}")

    return weights_tensor

narrative_weights_tensor = compute_class_weights(
    dataset=dataset_train,
    label_column='narratives_encoded',
    classes=mlb_narratives.classes_
)

Class: Amplifying Climate Fears, Weight: 0.4395183324813843
Class: Amplifying war-related fears, Weight: 0.3213285207748413
Class: Blaming the war on others rather than the invader, Weight: 0.4345238208770752
Class: Climate change is beneficial, Weight: 10.0
Class: Controversy about green technologies, Weight: 2.549206256866455
Class: Criticism of climate movement, Weight: 1.2746031284332275
Class: Criticism of climate policies, Weight: 0.6373015642166138
Class: Criticism of institutions and authorities, Weight: 0.4840265214443207
Class: Discrediting Ukraine, Weight: 0.2078157365322113
Class: Discrediting the West, Diplomacy, Weight: 0.17621241509914398
Class: Distrust towards Media, Weight: 0.8892580270767212
Class: Downplaying climate change, Weight: 1.9119048118591309
Class: Green policies are geopolitical instruments, Weight: 10.0
Class: Hidden plots by secret schemes of powerful groups, Weight: 0.980463981628418
Class: Negative Consequences for the West, Weight: 0.6373015642166138

In [96]:
def freeze_layers(model, num_layers_to_freeze=2):
  assert hasattr(model, 'classifier'), "Model must have a classifier attribute"
  for i in range(num_layers_to_freeze):
      for param in model.backbone.encoder.layer[i].parameters():
          param.requires_grad = False

  for param in model.classifier.parameters():
      param.requires_grad = True

In [97]:
freeze_layers(narrative_classifier_roberta, num_layers_to_freeze=1)

In [98]:
def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    label_column,
    scheduler=None,
    weights=None,
    num_epochs=model_config['num_epochs'],
    device='cuda'
):
    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch[label_column].to(device)

            logits = model(input_ids, attention_mask)
            loss, loss_item = bce_with_weights(logits, labels, weights)

            loss.backward()
            optimizer.step()
            if scheduler:
              scheduler.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch[label_column].to(device)

                logits = model(input_ids, attention_mask)
                _, batch_loss = bce_with_weights(logits, labels)
                val_loss += batch_loss

        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch + 1}, Training Loss (AVG): {loss_item}, Validation Loss (AVG): {avg_val_loss}")

In [None]:
train_model(
    model=narrative_classifier_roberta,
    train_loader=train_loader_nar,
    val_loader=val_loader_nar,
    optimizer=optimizer_nar,
    scheduler=scheduler,
    label_column='narrative_labels',
    num_epochs=model_config.get('num_epochs', 5),
    device=device
)

In [None]:
from sklearn.metrics import f1_score

def evaluate_transformer_model(
    model,
    val_loader,
    label_column,
    thresholds=np.arange(0.1, 1.0, 0.1),
    device='cuda',
    target_names=None
):
    best_threshold = 0
    best_f1 = 0
    best_classification_report = None

    all_preds = []
    all_truths = []

    for threshold in thresholds:
        preds = []
        truths = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch[label_column].to(device)
                logits = model(input_ids, attention_mask)
                probs = torch.sigmoid(logits)

                model_preds = (probs >= threshold).int()
                preds.extend(model_preds.cpu().numpy())
                truths.extend(labels.cpu().numpy())

        classification_rep = classification_report(truths, preds, target_names=target_names, zero_division=0)
        current_f1 = f1_score(truths, preds, average='macro')

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold
            best_classification_report = classification_rep

    print(f"Best Threshold: {best_threshold}, Best F1 Score: {best_f1}")
    print("Best Classification Report:")
    print(best_classification_report)

In [None]:
evaluate_transformer_model(
    narrative_classifier_roberta, 
    val_loader_nar, 
    label_column='narrative_labels', 
    target_names=mlb_narratives.classes_
)

### Fine-tuning Roberta to predict subnarratives

In [None]:
train_sub_narrative_truths = dataset_train['subnarratives_encoded'].tolist()

train_sub_narrative_truths[0][:5]

In [None]:
val_sub_narrative_truths = dataset_val['subnarratives_encoded'].tolist()

val_sub_narrative_truths[0][:5]

In [None]:
import torch
from torch import nn
from transformers import XLMRobertaModel

class SubNarrativeClassificationRoberta(nn.Module):
    def __init__(self, sub_narrative_classes):
        super(SubNarrativeClassificationRoberta, self).__init__()
        self.sub_narrative_classes = sub_narrative_classes
        self.backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.classifier = nn.Linear(self.backbone.config.hidden_size, sub_narrative_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]

        subnarrative_logits = self.classifier(pooled_output)


        return subnarrative_logits

subnarrative_classifier_roberta = SubNarrativeClassificationRoberta(sub_narrative_classes=len(mlb_subnarratives.classes_))

In [None]:
import torch
from torch.utils.data import Dataset

class SubNarrativesDataset(Dataset):
    def __init__(self, encodings, sub_narrative_labels):
        self.encodings = encodings
        self.sub_narrative_labels = sub_narrative_labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['sub_narrative_labels'] = torch.tensor(self.sub_narrative_labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.sub_narrative_labels)

train_dataset = SubNarrativesDataset(
    train_encodings,
    dataset_train['subnarratives_encoded'].tolist(),
)

val_dataset = SubNarrativesDataset(
    val_encodings,
    dataset_val['subnarratives_encoded'].tolist(),
)

train_loader_sub = DataLoader(train_dataset, batch_size=model_config['batch_size'], shuffle=True)
val_loader_sub = DataLoader(val_dataset, batch_size=model_config['batch_size'], shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer_sub = optim.AdamW(subnarrative_classifier_roberta.parameters(), lr=model_config['lr'])
criterion_sub = nn.BCEWithLogitsLoss()

subnarrative_classifier_roberta.to(device)

In [None]:
freeze_layers(subnarrative_classifier_roberta, num_layers_to_freeze=1)

In [None]:
train_model(subnarrative_classifier_roberta, train_loader_sub, val_loader_sub, optimizer_sub, label_column='sub_narrative_labels')

In [None]:
evaluate_transformer_model(subnarrative_classifier_roberta, val_loader_sub, label_column='sub_narrative_labels', target_names=mlb_subnarratives.classes_)

### Predicting narratives and subnarratives using MultiTask learning

In [None]:
from torch import nn
from transformers import XLMRobertaModel

class MultiTaskRoberta(nn.Module):
    def __init__(self, narrative_classes, sub_narrative_classes):
        super(MultiTaskRoberta, self).__init__()
        self.backbone = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.narrative_classifier = nn.Linear(self.backbone.config.hidden_size, narrative_classes)
        self.sub_narrative_classifier = nn.Linear(self.backbone.config.hidden_size, sub_narrative_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]

        narrative_logits = self.narrative_classifier(pooled_output)

        sub_narrative_logits = self.sub_narrative_classifier(pooled_output)

        return narrative_logits, sub_narrative_logits

multi_task_model = MultiTaskRoberta(narrative_classes=len(mlb_narratives.classes_), sub_narrative_classes=len(mlb_subnarratives.classes_))

In [None]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, encodings, narrative_labels, subnarrative_labels):
        self.encodings = encodings
        self.narrative_labels = narrative_labels
        self.subnarrative_labels = subnarrative_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['narrative_labels'] = torch.tensor(self.narrative_labels[idx], dtype=torch.float)
        item['subnarrative_labels'] = torch.tensor(self.subnarrative_labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.narrative_labels)

train_dataset = NewsDataset(
    train_encodings,
    dataset_train['narratives_encoded'].tolist(),
    dataset_train['subnarratives_encoded'].tolist()
)

val_dataset = NewsDataset(
    val_encodings,
    dataset_val['narratives_encoded'].tolist(),
    dataset_val['subnarratives_encoded'].tolist()
)

train_loader = DataLoader(train_dataset, batch_size=model_config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=model_config['batch_size'], shuffle=False)

In [None]:
def mask_sub_narrative_logits(narrative_probs, sub_narrative_logits, narrative_to_sub_mapping, threshold=0.2):
    batch_size, sub_narrative_size = sub_narrative_logits.size()
    mask = torch.zeros_like(sub_narrative_logits)

    for i in range(batch_size):
        active_narratives = torch.where(narrative_probs[i] > threshold)[0]

        for narrative_idx in active_narratives:
            sub_indices = list(narrative_to_sub_mapping[narrative_idx.item()])
            mask[i, sub_indices] = 1

    epsilon = 1e-8
    masked_sub_narrative_logits = sub_narrative_logits * mask + epsilon * (1 - mask)

    return masked_sub_narrative_logits

In [None]:
narrative_to_sub_mapping = {}

for i, row in dataset_train.iterrows():
    narratives_encoded_list = row['narratives_encoded']
    subnarratives_encoded_list = row['subnarratives_encoded']

    for j in range(len(narratives_encoded_list)):
        if narratives_encoded_list[j] == 1:
            if j not in narrative_to_sub_mapping:
                narrative_to_sub_mapping[j] = set()

            subnarratives_encoded_list = np.asarray(subnarratives_encoded_list)

            indices_of_ones = np.where(subnarratives_encoded_list == 1)[0]

            narrative_to_sub_mapping[j].update(indices_of_ones)

In [None]:
from torch.nn.functional import binary_cross_entropy_with_logits

def compute_loss(narrative_logits, sub_narrative_logits, narrative_labels, sub_narrative_labels, narrative_to_sub_mapping, weights=None):
    narrative_loss = binary_cross_entropy_with_logits(narrative_logits, narrative_labels, pos_weight=weights)

    masked_sub_logits = mask_sub_narrative_logits(narrative_labels, sub_narrative_logits, narrative_to_sub_mapping)
    sub_narrative_loss = binary_cross_entropy_with_logits(masked_sub_logits, sub_narrative_labels)

    return narrative_loss + sub_narrative_loss

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.AdamW(multi_task_model.parameters(), lr=model_config['lr'])

multi_task_model.to(device)

In [None]:
def freeze_multi_task_roberta_layers(model, num_layers_to_freeze=2):
  for i in range(num_layers_to_freeze):
    for param in model.backbone.encoder.layer[i].parameters():
      param.requires_grad = False

  for param in model.narrative_classifier.parameters():
    param.requires_grad = True

  for param in model.sub_narrative_classifier.parameters():
    param.requires_grad = True

In [None]:
freeze_multi_task_roberta_layers(multi_task_model)

In [None]:
for epoch in range(5):
    multi_task_model.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        narrative_labels = batch['narrative_labels'].to(device)
        sub_narrative_labels = batch['subnarrative_labels'].to(device)

        narrative_logits, sub_narrative_logits = multi_task_model(input_ids, attention_mask)
        loss = compute_loss(narrative_logits, sub_narrative_logits, narrative_labels, sub_narrative_labels, narrative_to_sub_mapping)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = epoch_loss / len(train_loader)

    multi_task_model.eval()
    val_loss = 0
    for batch in val_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            narrative_labels = batch['narrative_labels'].to(device)
            sub_narrative_labels = batch['subnarrative_labels'].to(device)

            narrative_logits, sub_narrative_logits = multi_task_model(input_ids, attention_mask)
            loss = compute_loss(narrative_logits, sub_narrative_logits, narrative_labels, sub_narrative_labels, narrative_to_sub_mapping)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch + 1}, Training Loss (AVG): {avg_train_loss:.4f}, Validation Loss (AVG): {avg_val_loss:.4f}")

In [None]:
def evaluate_with_threshold_selection(
    model,
    val_loader,
    narrative_targets_name,
    subnarrative_targets_name,
    thresholds=np.arange(0.1, 1.0, 0.1),
    device='cuda'
):
    best_narrative_threshold = 0
    best_subnarrative_threshold = 0
    best_narrative_f1 = 0
    best_subnarrative_f1 = 0
    best_narrative_report = None
    best_subnarrative_report = None

    all_narratives_truth = []
    all_subnarratives_truth = []

    for batch in val_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            narrative_labels = batch['narrative_labels'].to(device)
            subnarrative_labels = batch['subnarrative_labels'].to(device)
            narrative_logits, subnarrative_logits = model(input_ids, attention_mask)

            narrative_probs = torch.sigmoid(narrative_logits)
            subnarrative_probs = torch.sigmoid(subnarrative_logits)

            all_narratives_truth.extend(narrative_labels.cpu().numpy())
            all_subnarratives_truth.extend(subnarrative_labels.cpu().numpy())

    for threshold in thresholds:
        narratives_pred = []
        subnarratives_pred = []

        for batch in val_loader:
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                narrative_logits, subnarrative_logits = model(input_ids, attention_mask)

                narrative_probs = torch.sigmoid(narrative_logits)
                subnarrative_probs = torch.sigmoid(subnarrative_logits)

                narratives_pred.extend((narrative_probs >= threshold).int().cpu().numpy())
                subnarratives_pred.extend((subnarrative_probs >= threshold).int().cpu().numpy())

        # Narrative Metrics
        narrative_report = classification_report(
            all_narratives_truth,
            narratives_pred,
            target_names=narrative_targets_name,
            zero_division=0
        )
        narrative_f1 = f1_score(all_narratives_truth, narratives_pred, average='macro')

        if narrative_f1 > best_narrative_f1:
            best_narrative_f1 = narrative_f1
            best_narrative_threshold = threshold
            best_narrative_report = narrative_report

        subnarrative_report = classification_report(
            all_subnarratives_truth,
            subnarratives_pred,
            target_names=subnarrative_targets_name,
            zero_division=0
        )
        subnarrative_f1 = f1_score(all_subnarratives_truth, subnarratives_pred, average='macro')

        if subnarrative_f1 > best_subnarrative_f1:
            best_subnarrative_f1 = subnarrative_f1
            best_subnarrative_threshold = threshold
            best_subnarrative_report = subnarrative_report

    print(f"Best Narrative Threshold: {best_narrative_threshold}, Best Narrative F1 Score: {best_narrative_f1}")
    print("Best Classification Report for Narratives:")
    print(best_narrative_report)

    print(f"Best Subnarrative Threshold: {best_subnarrative_threshold}, Best Subnarrative F1 Score: {best_subnarrative_f1}")
    print("Best Classification Report for Subnarratives:")
    print(best_subnarrative_report)

evaluate_with_threshold_selection(
    model=multi_task_model,
    val_loader=val_loader,
    narrative_targets_name=mlb_narratives.classes_,
    subnarrative_targets_name=mlb_subnarratives.classes_,
    thresholds=np.arange(0.1, 1.0, 0.1),
    device=device
)