# Semeval 2025 Task 10
### Subtask 2: Narrative Baseline Classification -- Multilingual

Given a news article and a [two-level taxonomy of narrative labels](https://propaganda.math.unipd.it/semeval2025task10/NARRATIVE-TAXONOMIES.pdf) (where each narrative is subdivided into subnarratives) from a particular domain, assign to the article all the appropriate subnarrative labels. This is a multi-label multi-class document classification task.

## 1. Setup

### 1.1 Getting and analyzing data

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt
import seaborn as sns
import os

In [2]:
root_dir = '../../'

We go ahead and read our data, the data are structures in a way that each article or document is stored in a folder that corresponds to each language:

In [3]:
data = []
ignore_folders = ['.DS_Store']

base_dir_documents = root_dir + 'data/semeval_data/train/raw-documents'

for language_folder in os.listdir(base_dir_documents):

    if language_folder in ignore_folders:
        continue

    language_path = os.path.join(base_dir_documents, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)

                    article_id = file
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    data.append({
                        'language': language_folder,
                        'article_id': article_id,
                        'content': content
                    })

documents_df = pd.DataFrame(data)

This is how the dataframe looks like:

In [4]:
print(documents_df.shape)
documents_df.head()

(1781, 3)


Unnamed: 0,language,article_id,content
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...


The dataframe contains languages from 5 different languages:

In [5]:
documents_df['language'].unique()

array(['RU', 'PT', 'BG', 'HI', 'EN'], dtype=object)

The labels are structured as follows:
* Each line contains:
  - `article_id` (the file name of the article)
  - `narratives`: one or more narrative labels (1st level taxonomy)
  - `subnarratives`: one or more sub-narrative labels (2nd level taxonomy)
  
If no specific narrative or subnarrative is assigned, "Other" is used. If only a narrative is provided without a subnarrative, the format `[Narrative]: Other` is used.

**Example:**
```
article_id narratives subnarratives 

EN_10001.txt URW: Blaming Others URW: Ukraine is the aggressor
EN_10002.txt URW: Blaming Others; URW: Praise of Russia URW: Blaming Others: Other; URW: Praising Russia’s military might
EN_10003.txt Other Other
```

In [6]:
base_dir_labels = root_dir + 'data/semeval_data/train/labels'

raw_annotation_data = []

for language_folder in os.listdir(base_dir_labels):

    if language_folder in ignore_folders:
        continue

    print('Now processing language', language_folder)

    language_path = os.path.join(base_dir_labels, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            label_file = 'subtask-2-annotations.txt'
            file_path = os.path.join(root, label_file)

            with open(file_path, 'r') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    article_id = parts[0]
                    narrative_to_subnarratives = parts[2].split(';')
                    narratives = []
                    subnarratives = []

                    for nar_to_sub in narrative_to_subnarratives:
                      subnarrative_list = nar_to_sub.split(' ')
                      if subnarrative_list[0] == 'Other':
                        narratives.append('Other')
                        subnarratives.append('Other')
                        continue
                      nar_to_sub = ' '.join(subnarrative_list[1:])
                      nar, sub = nar_to_sub.split(':')
                      nar = f"{subnarrative_list[0]} {nar}"
                      narratives.append(nar.strip())
                      subnarratives.append(sub.strip())

                    raw_annotation_data.append({
                        'article_id': article_id,
                        'narratives': narratives,
                        'subnarratives': subnarratives
                    })

annotations_df = pd.DataFrame(raw_annotation_data)

Now processing language RU
Now processing language PT
Now processing language BG
Now processing language HI
Now processing language EN


In [7]:
from collections import defaultdict

narrative_to_subnarratives_origin = defaultdict(set)

for record in raw_annotation_data:
    narratives = record['narratives']
    subnarratives = record['subnarratives']

    for nar, sub in zip(narratives, subnarratives):
        narrative_to_subnarratives_origin[nar].add(sub)

narrative_to_subnarratives_origin = {nar: list(subs) for nar, subs in narrative_to_subnarratives_origin.items()}

In [8]:
coarse_classes = sorted(narrative_to_subnarratives_origin.keys())

In [9]:
fine_label_set = set()

for narrative, subnarratives in narrative_to_subnarratives_origin.items():
    if narrative == "Other":
        fine_label_set.add("Other")
    else:
        for sub in subnarratives:
            if sub == "Other":
                fine_label_set.add(f"{narrative}: Other")
            else:
                fine_label_set.add(f"{narrative}: {sub}")

fine_classes = sorted(fine_label_set)

In [10]:
fine_classes[:15]

['CC: Amplifying Climate Fears: Amplifying existing fears of global warming',
 'CC: Amplifying Climate Fears: Doomsday scenarios for humans',
 'CC: Amplifying Climate Fears: Earth will be uninhabitable soon',
 'CC: Amplifying Climate Fears: Other',
 'CC: Amplifying Climate Fears: Whatever we do it is already too late',
 'CC: Climate change is beneficial: CO2 is beneficial',
 'CC: Climate change is beneficial: Other',
 'CC: Climate change is beneficial: Temperature increase is beneficial',
 'CC: Controversy about green technologies: Other',
 'CC: Controversy about green technologies: Renewable energy is costly',
 'CC: Controversy about green technologies: Renewable energy is dangerous',
 'CC: Controversy about green technologies: Renewable energy is unreliable',
 'CC: Criticism of climate movement: Ad hominem attacks on key activists',
 'CC: Criticism of climate movement: Climate movement is alarmist',
 'CC: Criticism of climate movement: Climate movement is corrupt']

In [11]:
import pickle

base_save_folder_dir = './saved/'

misc_folder = os.path.join(base_save_folder_dir, 'Misc')

with open(os.path.join(misc_folder, 'fine_classes.pkl'), 'wb') as f:
    pickle.dump(fine_classes, f)

with open(os.path.join(misc_folder, 'coarse_classes.pkl'), 'wb') as f:
    pickle.dump(coarse_classes, f)

In [12]:
annotations_df.iloc[0].narratives

['URW: Discrediting Ukraine']

In [13]:
annotations_df.iloc[0].subnarratives

['Discrediting Ukrainian government and officials and policies']

In [14]:
annotations_df.head()

Unnamed: 0,article_id,narratives,subnarratives
0,RU-URW-1080.txt,[URW: Discrediting Ukraine],[Discrediting Ukrainian government and officia...
1,RU-URW-1013.txt,"[URW: Discrediting the West, Diplomacy]","[The West does not care about Ukraine, only ab..."
2,RU-URW-1145.txt,[URW: Praise of Russia],[Praise of Russian military might]
3,RU-URW-1277.txt,"[URW: Discrediting the West, Diplomacy]","[The West does not care about Ukraine, only ab..."
4,RU-URW-1048.txt,[URW: Discrediting Ukraine],[Discrediting Ukrainian military]


In [15]:
annotations_df.tail()

Unnamed: 0,article_id,narratives,subnarratives
1776,EN_CC_200022.txt,[CC: Criticism of institutions and authorities...,"[Criticism of national governments, Other, Met..."
1777,EN_CC_100028.txt,[Other],[Other]
1778,EN_CC_300010.txt,[CC: Amplifying Climate Fears],[Other]
1779,EN_UA_013257.txt,"[URW: Russia is the Victim, URW: Blaming the w...",[Russia actions in Ukraine are only self-defen...
1780,EN_UA_000104.txt,[Other],[Other]


In [16]:
annotations_df.shape

(1781, 3)

In [17]:
dataset_train = pd.merge(documents_df, annotations_df, on='article_id')
dataset_train.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[URW: Blaming the war on others rather than th...,"[The West are the aggressors, Other, The West ..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[URW: Discrediting the West, Diplomacy, URW: D...","[The West is weak, Other, The EU is divided]"
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[URW: Distrust towards Media],[Western media is an instrument of propaganda]
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[URW: Discrediting Ukraine, URW: Discrediting ...","[Ukraine is a puppet of the West, Discrediting..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[URW: Praise of Russia],[Russia is a guarantor of peace and prosperity]


In [18]:
dataset_train.shape

(1781, 5)

We do the exact same thing in order to get the dev articles

In [19]:
val_data = []
ignore_folders = ['.DS_Store']

base_dir_documents = root_dir + 'data/semeval_data/dev/raw-documents'

for language_folder in os.listdir(base_dir_documents):

    if language_folder in ignore_folders:
        continue

    print('Processing ', language_folder)
    language_path = os.path.join(base_dir_documents, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)

                    article_id = file
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    val_data.append({
                        'language': language_folder,
                        'article_id': article_id,
                        'content': content
                    })

val_documents_df = pd.DataFrame(val_data)

Processing  RU
Processing  PT
Processing  BG
Processing  HI
Processing  EN


In [20]:
base_dir_labels = root_dir + 'data/semeval_data/dev/labels'

val_raw_annotation_data = []

for language_folder in os.listdir(base_dir_labels):

    if language_folder in ignore_folders:
        continue

    print('Processing language', language_folder)

    language_path = os.path.join(base_dir_labels, language_folder)
    if os.path.isdir(language_path):
        for root, _, files in os.walk(language_path):
            label_file = 'subtask-2-annotations.txt'
            file_path = os.path.join(root, label_file)

            with open(file_path, 'r') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    article_id = parts[0]
                    narrative_to_subnarratives = parts[2].split(';')
                    narratives = []
                    subnarratives = []

                    for nar_to_sub in narrative_to_subnarratives:
                      subnarrative_list = nar_to_sub.split(' ')
                      if subnarrative_list[0] == 'Other':
                        narratives.append('Other')
                        subnarratives.append('Other')
                        continue
                      nar_to_sub = ' '.join(subnarrative_list[1:])
                      nar, sub = nar_to_sub.split(':')
                      nar = f"{subnarrative_list[0]} {nar}"
                      narratives.append(nar.strip())
                      subnarratives.append(sub.strip())

                    val_raw_annotation_data.append({
                        'article_id': article_id,
                        'narratives': narratives,
                        'subnarratives': subnarratives
                    })

val_annotations_df = pd.DataFrame(val_raw_annotation_data)

Processing language RU
Processing language PT
Processing language BG
Processing language HI
Processing language EN


In [21]:
dataset_val = pd.merge(val_documents_df, val_annotations_df, on='article_id')
dataset_val.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives
0,RU,RU-URW-1014.txt,Алаудинов: российские силы растянули и размыли...,[URW: Praise of Russia],[Praise of Russian military might]
1,RU,RU-URW-1174.txt,Других сценариев нет. Никаких переговоров на У...,"[URW: Speculating war outcomes, URW: Discredit...","[Ukrainian army is collapsing, Discrediting Uk..."
2,RU,RU-URW-1166.txt,Попытка Запада изолировать Путина провалилась\...,"[URW: Praise of Russia, URW: Distrust towards ...","[Praise of Russian President Vladimir Putin, W..."
3,RU,RU-URW-1170.txt,Часть территории Украины войдет в состав Польш...,"[URW: Discrediting Ukraine, URW: Speculating w...",[Discrediting Ukrainian government and officia...
4,RU,RU-URW-1004.txt,Зеленскому не очень понравилась идея о временн...,"[URW: Discrediting Ukraine, URW: Discrediting ...",[Discrediting Ukrainian government and officia...


In [22]:
dataset_val.shape

(178, 5)

This is how an English article looks like:
* It is insightful to know that each article is consequently tab separated, indicatiing the start of a new paragraph.

In [23]:
row = 5
english_article = dataset_train[dataset_train['language'] == 'EN'].iloc[row].content
english_article

'Trump Lawyer Demands Accountability From Intel Chiefs Who Backed Hunter Biden \n\n An attorney for former President Donald Trump wants the 51 former intelligence chiefs held responsible for backing Hunter Biden in the unfolding story of the laptop abandoned in a Delaware repair shop.\n\nLawyer Tim Parlatore\'s goal is to uncover alleged communications between the 51 former senior intel leaders and the Biden 2020 campaign.\n\nPolitico had reported that an Oct. 19, 2020, letter, signed by the former intelligence officials, outlined their assessment that a New York Post disclosure of emails allegedly belonging to Hunter Biden "has all the classic earmarks of a Russia information operation."\n\nThose signing the letter included former CIA Directors Leon Panetta, Mike Hayden and John Brennan, along with former Director of National Intelligence James Clapper.\n\nThe letter offered no evidence, but raised suspicions by the former intel officials.\n\nThe Post had previously reported that duri

In [24]:
dataset_train.shape

(1781, 5)

In [25]:
dataset_train['narratives']

0       [URW: Blaming the war on others rather than th...
1       [URW: Discrediting the West, Diplomacy, URW: D...
2                           [URW: Distrust towards Media]
3       [URW: Discrediting Ukraine, URW: Discrediting ...
4                                 [URW: Praise of Russia]
                              ...                        
1776                  [URW: Amplifying war-related fears]
1777    [CC: Criticism of climate movement, CC: Downpl...
1778    [CC: Criticism of institutions and authorities...
1779                      [URW: Speculating war outcomes]
1780                       [CC: Amplifying Climate Fears]
Name: narratives, Length: 1781, dtype: object

In [26]:
unique_narratives = dataset_train['narratives'].explode().unique()
unique_narratives

array(['URW: Blaming the war on others rather than the invader',
       'URW: Discrediting the West, Diplomacy',
       'URW: Hidden plots by secret schemes of powerful groups',
       'URW: Discrediting Ukraine', 'URW: Praise of Russia',
       'URW: Distrust towards Media', 'URW: Russia is the Victim',
       'URW: Negative Consequences for the West',
       'URW: Speculating war outcomes',
       'URW: Amplifying war-related fears', 'Other',
       'URW: Overpraising the West', 'CC: Downplaying climate change',
       'CC: Criticism of institutions and authorities',
       'CC: Questioning the measurements and science',
       'CC: Climate change is beneficial',
       'CC: Criticism of climate policies',
       'CC: Criticism of climate movement',
       'CC: Amplifying Climate Fears',
       'CC: Controversy about green technologies',
       'CC: Hidden plots by secret schemes of powerful groups',
       'CC: Green policies are geopolitical instruments'], dtype=object)

The frequency of narratives in the dataset: 

In [27]:
dataset_train.iloc[0].narratives

['URW: Blaming the war on others rather than the invader',
 'URW: Discrediting the West, Diplomacy',
 'URW: Discrediting the West, Diplomacy',
 'URW: Hidden plots by secret schemes of powerful groups',
 'URW: Discrediting Ukraine',
 'URW: Discrediting Ukraine',
 'URW: Praise of Russia',
 'URW: Discrediting the West, Diplomacy']

In [28]:
dataset_train.iloc[0].subnarratives

['The West are the aggressors',
 'Other',
 'The West is weak',
 'Other',
 'Ukraine is a puppet of the West',
 'Ukraine is associated with nazism',
 'Russia is a guarantor of peace and prosperity',
 'The West does not care about Ukraine, only about its interests']

In [29]:
dataset_train.columns

Index(['language', 'article_id', 'content', 'narratives', 'subnarratives'], dtype='object')

In [30]:
print(len(dataset_train['narratives'].explode().value_counts()))
dataset_train['narratives'].explode().value_counts()

22


narratives
URW: Discrediting Ukraine                                 654
URW: Discrediting the West, Diplomacy                     502
URW: Praise of Russia                                     464
CC: Amplifying Climate Fears                              357
Other                                                     328
URW: Amplifying war-related fears                         304
URW: Russia is the Victim                                 240
CC: Criticism of institutions and authorities             216
URW: Blaming the war on others rather than the invader    203
URW: Speculating war outcomes                             134
CC: Criticism of climate policies                         127
URW: Negative Consequences for the West                   105
CC: Criticism of climate movement                          84
CC: Downplaying climate change                             68
URW: Distrust towards Media                                61
CC: Hidden plots by secret schemes of powerful groups      

In [31]:
unique_subnarratives = dataset_train['subnarratives'].explode().unique()
unique_subnarratives

array(['The West are the aggressors', 'Other', 'The West is weak',
       'Ukraine is a puppet of the West',
       'Ukraine is associated with nazism',
       'Russia is a guarantor of peace and prosperity',
       'The West does not care about Ukraine, only about its interests',
       'The EU is divided',
       'Western media is an instrument of propaganda',
       'Discrediting Ukrainian government and officials and policies',
       'The West is overreacting', 'UA is anti-RU extremists',
       'Discrediting Ukrainian nation and society',
       'Discrediting Ukrainian military',
       'Ukrainian media cannot be trusted',
       'Praise of Russian military might', 'The West is russophobic',
       'Ukrainian army is collapsing',
       'Russia has international support from a number of countries and people',
       'Praise of Russian President Vladimir Putin',
       'By continuing the war we risk WWIII', 'Ukraine is the aggressor',
       'Diplomacy does/will not work',
       

In [32]:
len(unique_subnarratives)

74

The frequency of subnarratives in the dataset:

In [33]:
pd.set_option('display.max_rows', 100)

dataset_train['subnarratives'].explode().value_counts()

subnarratives
Other                                                                     1195
Discrediting Ukrainian government and officials and policies               179
Amplifying existing fears of global warming                                178
Praise of Russian military might                                           172
Discrediting Ukrainian military                                            126
The West are the aggressors                                                117
Russia is a guarantor of peace and prosperity                              115
Ukraine is a puppet of the West                                            115
There is a real possibility that nuclear weapons will be employed           98
The West does not care about Ukraine, only about its interests              93
Criticism of national governments                                           85
Russia has international support from a number of countries and people      84
Ukraine is the aggressor              

### 1.2 Encoding classification labels

We will know transofrm'narratives' and 'subnarratives' columns
into binary format using MultiLabelBinarizer. 
* Each unique label is represented by a binary vector, enabling the model to handle multiple labels per instance for both narratives and subnarratives.

In [34]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_narratives = MultiLabelBinarizer()
mlb_subnarratives = MultiLabelBinarizer()

In [35]:
narratives_binary = mlb_narratives.fit_transform(dataset_train['narratives'])
subnarratives_binary = mlb_subnarratives.fit_transform(dataset_train['subnarratives'])

dataset_train['narratives_encoded'] = narratives_binary.tolist()
dataset_train['subnarratives_encoded'] = subnarratives_binary.tolist()

In [36]:
dataset_train.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[URW: Blaming the war on others rather than th...,"[The West are the aggressors, Other, The West ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[URW: Discrediting the West, Diplomacy, URW: D...","[The West is weak, Other, The EU is divided]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[URW: Distrust towards Media],[Western media is an instrument of propaganda],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[URW: Discrediting Ukraine, URW: Discrediting ...","[Ukraine is a puppet of the West, Discrediting...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[URW: Praise of Russia],[Russia is a guarantor of peace and prosperity],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [37]:
narratives_binary_val = mlb_narratives.transform(dataset_val['narratives'])
subnarratives_binary_val = mlb_subnarratives.transform(dataset_val['subnarratives'])

dataset_val['narratives_encoded'] = narratives_binary_val.tolist()
dataset_val['subnarratives_encoded'] = subnarratives_binary_val.tolist()

In [38]:
import pickle

os.makedirs(base_save_folder_dir, exist_ok=True)
datasets_folder = os.path.join(base_save_folder_dir, 'Dataset')
with open(os.path.join(datasets_folder, 'dataset_train.pkl'), 'wb') as f:
    pickle.dump(dataset_train, f)

In [39]:
with open(os.path.join(datasets_folder, 'dataset_val.pkl'), 'wb') as f:
    pickle.dump(dataset_val, f)

#### 1.2.1 Remapping our subnarrative indices

We know that our articls have many narratives, and each one maps to several subnarratives, creating a hierarchy.  
The problem is, our `subnarratives_encoded` currently looks like a flat list of zeros:

```
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
```

But we need it to reflect the hierarchy properly:

So, we break it down into a list of lists—each inner list represents the true labels for a specific hierarchy:

```
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0] , [0, 0, 0, ...
 ^ hierarchy 0       ^ hierarchy 1          ^ hierarchy 2 ...
```

This will help us significantly later when we need to know for a specific article, the true subnarrative labels for a specific hierarchy.

We’re using the label encoders to get the indices of narratives and subnarratives, which we’ll use later.  
* For each narrative in `narrative_to_subnarratives`, we find the index of the narrative and its corresponding subnarratives using the encoders.

In [40]:
narrative_to_sub_map = {}
narrative_classes = list(mlb_narratives.classes_)
subnarrative_classes = list(mlb_subnarratives.classes_)

for narrative, subnarratives in narrative_to_subnarratives_origin.items():
    narrative_idx = narrative_classes.index(narrative)
    subnarrative_indices = [subnarrative_classes.index(sub) for sub in subnarratives]
    narrative_to_sub_map[narrative_idx] = subnarrative_indices

print(narrative_to_sub_map)

{13: [20, 22, 33, 66, 65, 39, 50, 21, 64], 14: [19, 53, 71, 33, 56, 58, 60], 19: [34, 41, 42, 33, 46, 35], 20: [33, 59, 63, 40], 15: [33, 69, 72], 11: [43, 33, 62, 31, 3], 12: [33, 67, 54], 18: [33, 55, 57, 32], 21: [33, 44, 45, 68], 16: [33], 10: [33], 17: [33, 47, 61], 0: [1, 33, 24, 23, 73], 5: [33, 14, 17, 15, 16], 3: [33, 8, 0, 9], 6: [51, 33, 27, 49, 7, 4, 70, 29, 28], 4: [33, 11, 12, 10], 9: [18, 26, 33, 30, 48], 8: [33, 6, 2], 1: [33, 5, 52], 2: [33, 36, 38, 37], 7: [33, 25, 13]}


In [41]:
narrative_order = sorted(narrative_to_sub_map.keys())
narrative_order

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [42]:
with open(os.path.join(misc_folder, 'narrative_order.pkl'), 'wb') as f:
    pickle.dump(narrative_order, f)

In [43]:
hierarchy_new_column_name = "narrative_hierarchy"

Now, we remap the `subnarratives_encoded` list to reflect the correct hierarchy for each article.  
* For each narrative, we grab its corresponding subnarrative indices from `narrative_to_sub_map` and assign the sublabels to the appropriate hierarchy column.  

This will give us a new set of columns where each one contains the true subnarrative labels for that narrative hierarchy.

In [44]:
def remap_subnarratives(row, narrative_to_sub_map):
    """Takes in a row and encodes the current subnarrative list to the associated hierarchy based on the narr-subnar map"""
    for narr_idx, sub_indices in narrative_to_sub_map.items():
        sub_labels = [row['subnarratives_encoded'][sub_idx] for sub_idx in sub_indices]
        col_name = f"{hierarchy_new_column_name}_{narr_idx}"
        row[col_name] = sub_labels
    return row

dataset_train_cpy = dataset_train.apply(remap_subnarratives, axis=1, args=(narrative_to_sub_map,)).copy()

We do the same for validation dataset:

In [45]:
dataset_val_cpy = dataset_val.apply(remap_subnarratives, axis=1, args=(narrative_to_sub_map,)).copy()

In [46]:
dataset_val_cpy.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded,narrative_hierarchy_13,narrative_hierarchy_14,narrative_hierarchy_19,...,narrative_hierarchy_0,narrative_hierarchy_5,narrative_hierarchy_3,narrative_hierarchy_6,narrative_hierarchy_4,narrative_hierarchy_9,narrative_hierarchy_8,narrative_hierarchy_1,narrative_hierarchy_2,narrative_hierarchy_7
0,RU,RU-URW-1014.txt,Алаудинов: российские силы растянули и размыли...,[URW: Praise of Russia],[Praise of Russian military might],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 1]",...,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]"
1,RU,RU-URW-1174.txt,Других сценариев нет. Никаких переговоров на У...,"[URW: Speculating war outcomes, URW: Discredit...","[Ukrainian army is collapsing, Discrediting Uk...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 1, 0, 0]","[0, 0, 0, 1, 0, 0]",...,"[0, 1, 0, 0, 0]","[1, 0, 0, 0, 0]","[1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0]","[0, 0, 1, 0, 0]","[1, 0, 0]","[1, 0, 0]","[1, 0, 0, 0]","[1, 0, 0]"
2,RU,RU-URW-1166.txt,Попытка Запада изолировать Путина провалилась\...,"[URW: Praise of Russia, URW: Distrust towards ...","[Praise of Russian President Vladimir Putin, W...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0]",...,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]"
3,RU,RU-URW-1170.txt,Часть территории Украины войдет в состав Польш...,"[URW: Discrediting Ukraine, URW: Speculating w...",[Discrediting Ukrainian government and officia...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 1, 0, 0]","[0, 0, 0, 1, 0, 0]",...,"[0, 1, 0, 0, 0]","[1, 0, 0, 0, 0]","[1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0]","[0, 0, 1, 0, 0]","[1, 0, 0]","[1, 0, 0]","[1, 0, 0, 0]","[1, 0, 0]"
4,RU,RU-URW-1004.txt,Зеленскому не очень понравилась идея о временн...,"[URW: Discrediting Ukraine, URW: Discrediting ...",[Discrediting Ukrainian government and officia...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",...,"[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]"


A sample result looks like this:

In [47]:
for narr_idx, sub_indices in narrative_to_sub_map.items():
    dataset_hierarchy_col_name = f"{hierarchy_new_column_name}_{narr_idx}"
    res = dataset_train_cpy[dataset_hierarchy_col_name]
    print(f"Sample of {dataset_hierarchy_col_name}:")
    print(res.head()) 
    print("\n")

Sample of narrative_hierarchy_13:
0    [0, 0, 1, 1, 1, 0, 0, 0, 0]
1    [0, 0, 1, 0, 0, 0, 0, 0, 0]
2    [0, 0, 0, 0, 0, 0, 0, 0, 0]
3    [1, 0, 0, 0, 1, 0, 0, 0, 0]
4    [0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: narrative_hierarchy_13, dtype: object


Sample of narrative_hierarchy_14:
0    [0, 0, 0, 1, 1, 0, 1]
1    [0, 1, 0, 1, 0, 0, 1]
2    [0, 0, 0, 0, 0, 0, 0]
3    [0, 0, 0, 0, 0, 0, 0]
4    [0, 0, 0, 0, 0, 0, 0]
Name: narrative_hierarchy_14, dtype: object


Sample of narrative_hierarchy_19:
0    [0, 0, 1, 1, 0, 0]
1    [0, 0, 0, 1, 0, 0]
2    [0, 0, 0, 0, 0, 0]
3    [0, 0, 0, 0, 0, 0]
4    [0, 0, 1, 0, 0, 0]
Name: narrative_hierarchy_19, dtype: object


Sample of narrative_hierarchy_20:
0    [1, 0, 0, 0]
1    [1, 0, 0, 0]
2    [0, 0, 0, 0]
3    [0, 0, 0, 0]
4    [0, 0, 0, 0]
Name: narrative_hierarchy_20, dtype: object


Sample of narrative_hierarchy_15:
0    [1, 0, 0]
1    [1, 0, 0]
2    [0, 0, 1]
3    [0, 0, 0]
4    [0, 0, 0]
Name: narrative_hierarchy_15, dtype: object


Sample of narra

In [48]:
narrative_order = sorted(narrative_to_sub_map.keys())
narrative_order

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

Now we want to make sure that the true subnarratives for hierarchy 0 are in position 0 of the aggregated list, hierarchy 1 in position 1, and so on.  
This ensures the subnarratives are ordered correctly in the final, aggregated list.

In [49]:
def aggregate_subnarratives(row, narrative_order, narrative_to_sub_map):
    """Takes in a row, and aggregates all hierarchy columns to 1 list.
    The encoded list will be a list of lists, starting from the first hierarchy"""
    aggregated = []
    for narr_idx in narrative_order:
        column_name = f"narrative_hierarchy_{narr_idx}"
        sub_labels = row[column_name]
        aggregated.append(sub_labels)
    return aggregated

dataset_train['aggregated_subnarratives'] = dataset_train_cpy.apply(
    aggregate_subnarratives,
    axis=1,
    args=(narrative_order, narrative_to_sub_map)
)

dataset_val['aggregated_subnarratives'] = dataset_val_cpy.apply(
    aggregate_subnarratives,
    axis=1,
    args=(narrative_order, narrative_to_sub_map)
)

In [50]:
dataset_train.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded,aggregated_subnarratives
0,RU,RU-URW-1161.txt,В ближайшие два месяца США будут стремиться к ...,[URW: Blaming the war on others rather than th...,"[The West are the aggressors, Other, The West ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
1,RU,RU-URW-1175.txt,В ЕС испугались последствий популярности правы...,"[URW: Discrediting the West, Diplomacy, URW: D...","[The West is weak, Other, The EU is divided]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
2,RU,RU-URW-1149.txt,Возможность признания Аллы Пугачевой иностранн...,[URW: Distrust towards Media],[Western media is an instrument of propaganda],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
3,RU,RU-URW-1015.txt,Азаров рассказал о смене риторики Киева по пер...,"[URW: Discrediting Ukraine, URW: Discrediting ...","[Ukraine is a puppet of the West, Discrediting...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
4,RU,RU-URW-1001.txt,В россиянах проснулась массовая любовь к путеш...,[URW: Praise of Russia],[Russia is a guarantor of peace and prosperity],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."


In [51]:
dataset_val.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded,aggregated_subnarratives
0,RU,RU-URW-1014.txt,Алаудинов: российские силы растянули и размыли...,[URW: Praise of Russia],[Praise of Russian military might],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
1,RU,RU-URW-1174.txt,Других сценариев нет. Никаких переговоров на У...,"[URW: Speculating war outcomes, URW: Discredit...","[Ukrainian army is collapsing, Discrediting Uk...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
2,RU,RU-URW-1166.txt,Попытка Запада изолировать Путина провалилась\...,"[URW: Praise of Russia, URW: Distrust towards ...","[Praise of Russian President Vladimir Putin, W...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
3,RU,RU-URW-1170.txt,Часть территории Украины войдет в состав Польш...,"[URW: Discrediting Ukraine, URW: Speculating w...",[Discrediting Ukrainian government and officia...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
4,RU,RU-URW-1004.txt,Зеленскому не очень понравилась идея о временн...,"[URW: Discrediting Ukraine, URW: Discrediting ...",[Discrediting Ukrainian government and officia...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."


### 1.3 Cleaning articles

We will use spaCy to load pre-trained language models for different languages, which will help us clean and preprocess article text.
* For each language-model that spacy supports, we will load it, otherwise we will fallback to `xx_ent_wiki_sm`

In [52]:
language_model_map = {
    "BG": "xx_ent_wiki_sm",
    "EN": "en_core_web_sm",
    "HI": "xx_ent_wiki_sm",
    "PT": "pt_core_news_sm",
    "RU": "ru_core_news_sm",
}

!python3 -m spacy download xx_ent_wiki_sm
!python3 -m spacy download pt_core_news_sm
!python3 -m spacy download ru_core_news_sm
!python3 -m spacy download en_core_web_sm

Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8

We will also use the emoji library to remove certain emojis that appeared in the articles, and that's because they don't add meaningful context

In [53]:
!pip3 -q install emoji

In [54]:
import spacy
import emoji

nlp_models = {lang: spacy.load(model) for lang, model in language_model_map.items()}

The goal of this cleaning is to prepare the article text for analysis by removing irrelevant or noisy data, like URLs, emails, social media mentions, emojis.

* It also normalizes the text by converting non-entity words to lowercase and keeps important entities (like people, organizations, and locations) in their original case. This is more task-focused since I think they may add some context to our classification task.
* Notice also, that we are splitting our text in paragraphs, this is done just because article's are quite long, and this preparation will later help the embedding data preparation.

In [55]:
import re

class ArticleCleaner:
    def __init__(self, nlp_models):
        self.nlp_models = nlp_models

    def _clean_paragraph(self, paragraph, nlp):
        """Cleans individual paragraphs by removing links, emails, and normalizing tokens."""
        # Remove URLs, emails, and mentions
        paragraph = re.sub(
            r'http\S+|www\S+|https\S+|[a-zA-Z0-9.-]+\.com|[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|@[A-Za-z0-9_]+',
            '',
            paragraph
        )

        doc = nlp(paragraph)
        cleaned_tokens = []
        important_entity_types = ["PERSON", "ORG", "GPE"]

        for token in doc:
            if token.is_space or emoji.is_emoji(token.text):
                continue

            if token.ent_type_ in important_entity_types:
                cleaned_tokens.append(token.text + token.whitespace_)
            else:
                cleaned_tokens.append(token.text.lower() + token.whitespace_)

        return "".join(cleaned_tokens).strip()

    def _preprocess_article_text(self, article_text):
        """Preprocess the article text by splitting into header, body, and footer."""
        parts = re.split(r'\n{2,}', article_text)

        if len(parts) > 2:
            header = parts[0].strip()
            footer = parts[-1].strip()
            body = parts[1:-1]
        else:
            header = parts[0].strip() if len(parts) > 0 else ""
            footer = parts[1].strip() if len(parts) > 1 else ""
            body = []

        return header, body, footer

    def clean_article_with_paragraphs(self, article_text, language_code):
        """Main method to clean the article by processing the header, body, and footer."""
        nlp = self.nlp_models.get(language_code, self.nlp_models["EN"])

        header, body, footer = self._preprocess_article_text(article_text)

        cleaned_header = f"<PARA>{self._clean_paragraph(header, nlp)}</PARA>" if header else ""
        cleaned_footer = f"<PARA>{self._clean_paragraph(footer, nlp)}</PARA>" if footer else ""
        cleaned_body = " ".join([self._clean_paragraph(paragraph, nlp) for paragraph in body])

        combined_text = "\n\n".join(filter(None, [cleaned_header, cleaned_body, cleaned_footer]))
        return combined_text.strip()

In [56]:
article_cleaner = ArticleCleaner(nlp_models)

In [57]:
dataset_train["content"] = dataset_train.apply(
    lambda row: article_cleaner.clean_article_with_paragraphs(row["content"], row["language"]),
    axis=1
)

In [58]:
dataset_val["content"] = dataset_val.apply(
    lambda row: article_cleaner.clean_article_with_paragraphs(row["content"], row["language"]),
    axis=1
)

This is how the new, modified article looks like:

In [59]:
row = 7
english_article = dataset_train[dataset_train['language'] == 'EN'].iloc[row].content
english_article



In [60]:
def split_into_sections(content):
    parts = re.split(r'<PARA>|</PARA>', content)
    parts = [p.strip() for p in parts if p.strip()]

    if len(parts) == 1:
        return parts[0], "", ""
    elif len(parts) == 2:
        return parts[0], parts[1], ""
    else:
        header = parts[0]
        footer = parts[-1]
        body = " ".join(parts[1:-1])
        return header, body, footer

We do a sanity check to see if our paragraph split works:

In [61]:
header, body, footer = split_into_sections(english_article)
print("Header: ", header)
print("\n\n")
print("Body: ", body)
print("\n\n")
print("Footer: ", footer)

Header:  UN chief Warns of global economic crisis at world economic forum in Davos






Footer:  other tech firms, such as Amazon, Meta, Alphabet, Salesforce, and Twitter, have announced similar moves in recent weeks. Microsoft, based in Redmond, Washington, had 221,000 full-time employees as of june 30, 2022, according to government filings.


In [62]:
dataset_train.head()

Unnamed: 0,language,article_id,content,narratives,subnarratives,narratives_encoded,subnarratives_encoded,aggregated_subnarratives
0,RU,RU-URW-1161.txt,<PARA>в ближайшие два месяца сша будут стремит...,[URW: Blaming the war on others rather than th...,"[The West are the aggressors, Other, The West ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
1,RU,RU-URW-1175.txt,<PARA>в ес испугались последствий популярности...,"[URW: Discrediting the West, Diplomacy, URW: D...","[The West is weak, Other, The EU is divided]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0], [1, 0, 0], [1, 0, 0, 0], [1,..."
2,RU,RU-URW-1149.txt,<PARA>возможность признания аллы пугачевой ино...,[URW: Distrust towards Media],[Western media is an instrument of propaganda],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
3,RU,RU-URW-1015.txt,<PARA>азаров рассказал о смене риторики киева ...,"[URW: Discrediting Ukraine, URW: Discrediting ...","[Ukraine is a puppet of the West, Discrediting...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."
4,RU,RU-URW-1001.txt,<PARA>в россиянах проснулась массовая любовь к...,[URW: Praise of Russia],[Russia is a guarantor of peace and prosperity],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0], [0, 0, 0], [0, 0, 0, 0], [0,..."


In [63]:
label_encoder_folder = os.path.join(base_save_folder_dir, 'LabelEncoders')

with open(os.path.join(datasets_folder, 'dataset_train_cleaned.pkl'), 'wb') as f:
    pickle.dump(dataset_train, f)

with open(os.path.join(datasets_folder, 'dataset_val_cleaned.pkl'), 'wb') as f:
    pickle.dump(dataset_val, f)

with open(os.path.join(label_encoder_folder, 'mlb_narratives.pkl'), 'wb') as f:
    pickle.dump(mlb_narratives, f)

with open(os.path.join(label_encoder_folder, 'mlb_subnarratives.pkl'), 'wb') as f:
    pickle.dump(mlb_subnarratives, f)

with open(os.path.join(misc_folder, 'narrative_to_subnarratives.pkl'), 'wb') as f:
    pickle.dump(narrative_to_subnarratives_origin, f)

with open(os.path.join(misc_folder, 'narrative_to_subnarratives_map.pkl'), 'wb') as f:
    pickle.dump(narrative_to_sub_map, f)