In [None]:
# Take a list of article names and copy corresponding enriched files

import os
import shutil
import unicodedata

# Paths
article_list_path = "../article_list.txt"
source_dir = "../enriched"
target_dir = "../enriched_sample"

# Load article names
with open(article_list_path, "r", encoding="utf-8") as f:
    articles = [line.strip() for line in f if line.strip()]

# Convert to normalized filename forms
normalized_articles = {
    unicodedata.normalize("NFC", name.replace(" ", "_") + "_enriched.jsonl"): name
    for name in articles
}
#print(normalized_articles)
# List all files in source_dir and normalize
for filename in os.listdir(source_dir):
    normalized_filename = unicodedata.normalize("NFC", filename).replace(" ", "_")
    #print(normalized_filename)
    if normalized_filename in normalized_articles:
        src_path = os.path.join(source_dir, filename)
        dst_path = os.path.join(target_dir, filename)
        shutil.copyfile(src_path, dst_path)
        print(f"Copied: {filename}")
    else:
        print(f"File not found: {filename}")

File not found: List of Armenian churches in Azerbaijan_enriched.jsonl
File not found: 1727_in_Armenia_enriched.jsonl
File not found: 1918_in_Armenia_enriched.jsonl
File not found: 451_in_Armenia_enriched.jsonl
File not found: 1991_Nagorno-Karabakh_independence_referendum_enriched.jsonl
File not found: Acilisene_enriched.jsonl
File not found: A_Shameful_Act_enriched.jsonl
File not found: ARF_History_Museum_enriched.jsonl
File not found: Aghavnavank_Monastery_enriched.jsonl
File not found: Administrative_divisions_of_Armenia_enriched.jsonl
File not found: Administrative_divisions_of_the_Armenian_Soviet_Socialist_Republic_enriched.jsonl
File not found: Aghoghlan_Gate_enriched.jsonl
File not found: Akhalkalaki_uezd_enriched.jsonl
File not found: Abbas_Mirza_Mosque,_Yerevan_enriched.jsonl
File not found: 2022_Armenian_protests_enriched.jsonl
File not found: Agulis_(historical_village)_enriched.jsonl
File not found: Alik_(daily)_enriched.jsonl
File not found: Akhaltsikhe_uezd_enriched.jsonl

In [1]:
# Subsampling 10% of lines from each file in the target directory

import os
import json
import random

input_dir = "../enriched_sample"
output_dir = "../enriched_sample_subset"
sample_fraction = 0.10  # 10%

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

for file_name in os.listdir(input_dir):
    if not file_name.endswith(".jsonl"):
        continue

    input_path = os.path.join(input_dir, file_name)
    with open(input_path, 'r', encoding='utf-8') as infile:
        lines = [json.loads(line) for line in infile if line.strip()]

    if not lines:
        continue

    sample_size = max(1, int(len(lines) * sample_fraction))
    sampled_lines = random.sample(lines, sample_size)

    base_name = file_name.replace(".jsonl", "_subsampled.jsonl")
    output_path = os.path.join(output_dir, base_name)
    
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for line in sampled_lines:
            outfile.write(json.dumps(line, ensure_ascii=False) + "\n")

    print(f"Sampled {sample_size} lines from {file_name} → {base_name}")


Sampled 865 lines from Armenia_enriched.jsonl → Armenia_enriched_subsampled.jsonl
Sampled 547 lines from Yerevan_enriched.jsonl → Yerevan_enriched_subsampled.jsonl
Sampled 542 lines from Nagorno-Karabakh_enriched.jsonl → Nagorno-Karabakh_enriched_subsampled.jsonl
Sampled 513 lines from Adana_enriched.jsonl → Adana_enriched_subsampled.jsonl
Sampled 472 lines from Armenians_enriched.jsonl → Armenians_enriched_subsampled.jsonl
Sampled 358 lines from Armenian_language_enriched.jsonl → Armenian_language_enriched_subsampled.jsonl
Sampled 338 lines from Mount_Ararat_enriched.jsonl → Mount_Ararat_enriched_subsampled.jsonl
Sampled 324 lines from Armenian_genocide_recognition_enriched.jsonl → Armenian_genocide_recognition_enriched_subsampled.jsonl
Sampled 313 lines from Shusha_enriched.jsonl → Shusha_enriched_subsampled.jsonl
Sampled 302 lines from Dolma_enriched.jsonl → Dolma_enriched_subsampled.jsonl
Sampled 272 lines from Urartu_enriched.jsonl → Urartu_enriched_subsampled.jsonl
Sampled 238 li