In [65]:
import pandas as pd
import json
import re

In [66]:
# Release tag that is embedded in the name of every input JSONL file.
RELEASE_DATA = "oct16release"

# Language codes to load; one "<release>_<code>_data.jsonl" file is read per entry.
LANGUAGES = [
    "BG",
    "EN",
    "HI",
    "PT",
]

In [67]:
# Load every per-language JSONL release file into one list of records, then
# build a single DataFrame with two derived columns:
#   - "label":    one "<narrative>__<subnarrative>" string per annotated pair,
#                 with spaces replaced by underscores
#   - "language": the first language code found in "article_id"
#                 ("Unknown" when none is present)
all_data = []

# Compiled once, outside the loops, and derived from LANGUAGES so the detector
# cannot drift from the list of files that is actually being loaded.
language_pattern = re.compile("|".join(re.escape(code) for code in LANGUAGES))

for language in LANGUAGES:
    file_path = f"../data/json/{RELEASE_DATA}_{language}_data.jsonl"
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file); json.loads would raise JSONDecodeError on them.
            if not line.strip():
                continue

            # Each non-empty line is one JSON-encoded article record.
            record = json.loads(line)

            narratives = record["labels"]["narrative"]
            subnarratives = record["labels"]["subnarrative"]
            # zip() stops at the shorter list, which would silently drop
            # labels; surface malformed annotations instead of hiding them.
            if len(narratives) != len(subnarratives):
                raise ValueError(
                    f"{file_path}:{line_number}: article "
                    f"{record.get('article_id')!r} has {len(narratives)} "
                    f"narratives but {len(subnarratives)} subnarratives"
                )

            # Pair each narrative with its subnarrative, replacing spaces with
            # underscores and joining the two parts with a double underscore.
            record["label"] = [
                f"{narrative.replace(' ', '_')}__{subnarrative.replace(' ', '_')}"
                for narrative, subnarrative in zip(narratives, subnarratives)
            ]

            # The language is taken from the article id, not from the file
            # being read.
            language_match = language_pattern.search(record["article_id"])
            record["language"] = (
                language_match.group(0) if language_match else "Unknown"
            )

            all_data.append(record)


# One row per article; built once from the full list of records.
df = pd.DataFrame(all_data)


In [68]:
# Preview the merged frame; the rendered table below elides the middle rows.
df

Unnamed: 0,article_id,content,domain,labels,label,language
0,BG_670.txt,Опитът на колективния Запад да „обезкърви Руси...,URW,{'narrative': ['Blaming the war on others rath...,[Blaming_the_war_on_others_rather_than_the_inv...,BG
1,A9_BG_5871.txt,Зверство! Руснаците започнаха да режат глави н...,,"{'narrative': ['Other'], 'subnarrative': ['Oth...",[Other__Other],BG
2,BG_3245.txt,Подкрепата за Киев от страна на Запада вече не...,URW,"{'narrative': ['Discrediting the West, Diploma...","[Discrediting_the_West,_Diplomacy__The_West_is...",BG
3,A9_BG_5190.txt,"Дмитрий Медведев: НПО-та, спонсорирани от Соро...",URW,"{'narrative': ['Discrediting the West, Diploma...","[Discrediting_the_West,_Diplomacy__Other, Disc...",BG
4,A9_BG_8210.txt,Украйна – след войната\n\nДори след края на во...,URW,"{'narrative': ['Speculating war outcomes'], 's...",[Speculating_war_outcomes__Other],BG
...,...,...,...,...,...,...
721,PT_159.txt,A transição energética\n\nMultiplicam-se os fe...,CC,{'narrative': ['Criticism of institutions and ...,[Criticism_of_institutions_and_authorities__Cr...,PT
722,PT_91.txt,\n\nEspanha detém três pessoas por ataques cib...,URW,"{'narrative': ['Russia is the Victim'], 'subna...",[Russia_is_the_Victim__The_West_is_russophobic],PT
723,PT_74.txt,Rússia assume controlo de mais uma povoação no...,URW,"{'narrative': ['Praise of Russia'], 'subnarrat...",[Praise_of_Russia__Praise_of_Russian_military_...,PT
724,PT_57.txt,Airbus diz que frota global pode atingir os 48...,,"{'narrative': ['Other'], 'subnarrative': ['Oth...",[Other__Other],PT


In [69]:
# Peek at the first rows of the derived combined-label column.
df['label'].head()

0    [Blaming_the_war_on_others_rather_than_the_inv...
1                                       [Other__Other]
2    [Discrediting_the_West,_Diplomacy__The_West_is...
3    [Discrediting_the_West,_Diplomacy__Other, Disc...
4                    [Speculating_war_outcomes__Other]
Name: label, dtype: object

In [70]:
# Raw annotation dict (parallel narrative / subnarrative lists) of the first row.
df.loc[0, "labels"]

{'narrative': ['Blaming the war on others rather than the invader',
  'Discrediting the West, Diplomacy',
  'Discrediting the West, Diplomacy',
  'Amplifying war-related fears'],
 'subnarrative': ['The West are the aggressors',
  'Other',
  'The West does not care about Ukraine, only about its interests',
  'Other']}

In [71]:
# Derived "<narrative>__<subnarrative>" strings of the first row.
df.loc[0, "label"]

['Blaming_the_war_on_others_rather_than_the_invader__The_West_are_the_aggressors',
 'Discrediting_the_West,_Diplomacy__Other',
 'Discrediting_the_West,_Diplomacy__The_West_does_not_care_about_Ukraine,_only_about_its_interests',
 'Amplifying_war-related_fears__Other']

In [72]:
# Number of articles per detected language code.
df['language'].value_counts()

BG    211
EN    200
PT    200
HI    115
Name: language, dtype: int64

In [73]:
# Persist the merged frame as CSV, without the positional index column.
# NOTE(review): the "labels" (dict) and "label" (list) columns hold Python
# objects, which CSV can only store as text -- confirm that downstream readers
# parse them back before use.
output_file_path = "../data/merged_dataframe_with_language.csv"
df.to_csv(path_or_buf=output_file_path, index=False)