In [35]:
import os
import json
import pandas as pd
import shutil
import re

from itertools import chain
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

# Load variables from a discovered .env file into the process environment.
# LOCAL_DOCUMENT_DIR is read from os.environ right after this, so it is
# presumably defined there — confirm against your local .env.
load_dotenv(find_dotenv())

True

In [36]:
# Directory holding the parsed document JSON files; raises KeyError if the
# LOCAL_DOCUMENT_DIR environment variable is not set.
path = Path(os.environ["LOCAL_DOCUMENT_DIR"])
# Lookup table whose "name" and "family_import_id" columns are used by
# get_document_names_from_links to resolve slugs to family import ids.
slug_mapping = pd.read_csv("../data/slug.csv")
# Destination directory for the selected documents.
target_path = Path("../data/documents")

# Slugs of the documents/families to select.
links = [
    "national-climate-change-act-2021_aeec",
    "renewable-energy-sources-act-eeg-latest-version-eeg-2022_1b40",
    "nigeria-s-climate-change-act_5ef7",
    "the-climate-act_dae7",
    "national-climate-change-action-plan-2018-2022-nccap_a381",
    "inflation-reduction-act_b723",
    "the-european-green-deal_60e2",
    "uganda-s-updated-ndc_20d3",
    "philippine-development-plan-2017-2022_86ee",
    "directive-eu-2022-2464-of-the-european-parliament-and-of-the-council-of-14-december-2022-amending-regulation-eu-no-537-2014-directive-2004-109-ec-directive-2006-43-ec-and-directive-2013-34-eu-as-regards-corporate-sustainability-reporting-corporate-sustainability-reporting-directive_3a34",
    # NOTE(review): this entry separates the hash suffix with "-" while every
    # other slug uses "_" — confirm it is not a typo for "..._4baa".
    "climate-change-act-2008-4baa",
]

In [37]:
def get_document_names_from_links(document_dir: Path, links: list[str]) -> list[str]:
    """Find the document JSON files that belong to any of the given slugs.

    A file is selected when at least one of the following holds:

    * its ``document_metadata.family_import_id`` is one of the family import
      ids that the module-level ``slug_mapping`` table associates with ``links``
      (rows whose ``name`` column is in ``links``);
    * its ``document_metadata.slug`` is in ``links``;
    * its top-level ``document_slug`` is in ``links``.

    Args:
        document_dir: Directory that is scanned (non-recursively) for ``*.json``.
        links: Slugs to look for.

    Returns:
        The matching files as full path strings (not bare file names), in the
        order ``document_dir.glob`` yields them.

    Raises:
        KeyError: If a JSON file lacks one of the keys that gets inspected.
    """
    # A set makes the per-file membership test independent of the table size.
    family_import_ids = set(
        slug_mapping[slug_mapping["name"].isin(links)]["family_import_id"].tolist()
    )

    file_paths = []
    for json_path in tqdm(document_dir.glob("*.json")):
        # Keep the path and the open handle under different names; the original
        # rebound the loop variable to the handle and relied on ``handle.name``.
        with open(json_path, "r", encoding="utf-8") as handle:
            document = json.load(handle)

        metadata = document["document_metadata"]
        if (
            metadata["family_import_id"] in family_import_ids
            or metadata["slug"] in links
            or document["document_slug"] in links
        ):
            file_paths.append(str(json_path))

    return file_paths

In [38]:
# Scan every JSON file in `path` and keep the ones matching `links`
# (reads each file once, hence the progress bar).
doc_names = get_document_names_from_links(path, links)

8910it [00:21, 409.53it/s]


In [39]:
def filter_translations(doc_names: list[str]) -> list[str]:
    """Drop every document for which an English translation is also listed.

    A document is considered translated when another entry in ``doc_names``
    has the same file stem followed by ``_translated_en``. In that case only
    the translated entry is kept; documents without such a sibling are kept
    unchanged.

    Args:
        doc_names: File names or paths of the candidate documents.

    Returns:
        The entries of ``doc_names`` that have no ``_translated_en`` sibling,
        in their original order.
    """
    # Build the stem lookup once so each membership test is O(1).
    stems = {Path(name).stem for name in doc_names}

    return [
        name
        for name in doc_names
        if f"{Path(name).stem}_translated_en" not in stems
    ]

In [40]:
# Number of matching documents before the translation filter.
print(len(doc_names))

# Rebinds doc_names: originals that have a *_translated_en sibling are dropped.
doc_names = filter_translations(doc_names)

# Number of documents left after the filter.
print(len(doc_names))

35
24


In [47]:
def _num_text_blocks_from_path(path: Path) -> int:
    """Return the number of text blocks stored in a document JSON file.

    The first of ``pdf_data`` / ``html_data`` that is present and not null
    decides the result; its ``text_blocks`` entry is counted. A missing or
    null ``text_blocks`` counts as zero.

    Args:
        path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        The length of the ``text_blocks`` list, or 0 when there is none.
    """
    with open(path, "r", encoding="utf-8") as file:
        document = json.load(file)

    for section_key in ("pdf_data", "html_data"):
        section = document.get(section_key)
        if section is None:
            continue
        # ``or []`` guards against an explicit JSON null, which would make
        # len() raise a TypeError.
        return len(section.get("text_blocks") or [])

    return 0


def filter_within_families(doc_names: list[str]) -> list[str]:
    """Keep, for every document family, the document with the most text blocks.

    The family of a document is the ``CCLW.<legislative|executive>.<number>``
    part of its name. Documents whose name contains no such part are dropped.
    Families in which no document has any text block contribute nothing.
    One line per family is printed with the outcome.

    Args:
        doc_names: Paths of the candidate document JSON files.

    Returns:
        One path per family that has text blocks, ordered by the first
        appearance of the family in ``doc_names``. On ties, the document
        listed first wins.
    """
    # Dots are escaped and the number is unbounded so that the whole family id
    # is captured, whatever its length.
    pattern = re.compile(r".*(CCLW\.(?:legislative|executive)\.\d+)")

    # Group by the exact extracted family id. A substring test would let
    # "...1041" also claim the documents of "...10413". A dict (rather than a
    # set) keeps the iteration order stable between runs.
    docs_by_family: dict[str, list[str]] = {}
    for name in doc_names:
        match = pattern.match(name)
        if match is None:
            continue
        docs_by_family.setdefault(match.group(1), []).append(name)

    filtered_doc_names = []
    for family, family_docs in docs_by_family.items():
        docs_to_text_block_count = {n: _num_text_blocks_from_path(n) for n in family_docs}

        best_doc = max(docs_to_text_block_count, key=docs_to_text_block_count.get)
        max_text_blocks = docs_to_text_block_count[best_doc]

        if max_text_blocks > 0:
            filtered_doc_names.append(best_doc)
            print(f"{max_text_blocks} max text blocks found for {family}")
        else:
            print(f"No text blocks found for {family}")

    return filtered_doc_names

In [48]:
# Pick one document per family: the one with the most text blocks.
doc_names_filtered = filter_within_families(doc_names)

773 max text blocks found for CCLW.legislative.11041
1843 max text blocks found for CCLW.executive.9647
No text blocks found for CCLW.legislative.10180
1703 max text blocks found for CCLW.legislative.10699
2064 max text blocks found for CCLW.legislative.2072
73 max text blocks found for CCLW.legislative.9363
1713 max text blocks found for CCLW.legislative.1755
906 max text blocks found for CCLW.executive.8737
47 max text blocks found for CCLW.legislative.10390
123 max text blocks found for CCLW.executive.9369


In [49]:
# Show the final selection. NOTE(review): the rendered output contains
# absolute local paths — consider clearing it before sharing the notebook.
doc_names_filtered

['/Users/matyasjuhasz/git/data/CCLW.legislative.11041.6337.json',
 '/Users/matyasjuhasz/git/data/CCLW.executive.9647.4059.json',
 '/Users/matyasjuhasz/git/data/CCLW.legislative.10699.5931.json',
 '/Users/matyasjuhasz/git/data/CCLW.legislative.2072.4367_translated_en.json',
 '/Users/matyasjuhasz/git/data/CCLW.legislative.9363.rtl_152_translated_en.json',
 '/Users/matyasjuhasz/git/data/CCLW.legislative.1755.rtl_71.json',
 '/Users/matyasjuhasz/git/data/CCLW.executive.8737.1424.json',
 '/Users/matyasjuhasz/git/data/CCLW.legislative.10390.0.json',
 '/Users/matyasjuhasz/git/data/CCLW.executive.9369.3236.json']

In [44]:
# Make sure the destination exists: shutil.copyfile does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
target_path.mkdir(parents=True, exist_ok=True)

# Copy each selected document into target_path, keeping its file name.
# Requires doc_names_filtered from the family-filter cell to be defined.
for doc in doc_names_filtered:
    shutil.copyfile(doc, target_path / Path(doc).name)