## Sampling documents for the UNECE sprint

This notebook samples documents evenly across World Bank regions and, within each region, across translated and untranslated documents, and produces a list of document IDs.

Note that there are only 86 docs provided by the UNECE that are PDFs. As this is ~10% of the 800 docs we want to sample in total, no sampling code is needed for those.

In [82]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from cpr_data_access.parser_models import BackendDocument
from tqdm.auto import tqdm

In [69]:
# Config

np.random.seed(42) # used by pandas

# Directory holding one JSON file per CPR document. The default is the original
# author's local copy; set the CPR_DOCUMENTS_PATH environment variable to point
# at your own copy instead of editing this cell.
CPR_DOCUMENTS_PATH = Path(
    os.environ.get(
        "CPR_DOCUMENTS_PATH",
        "/Users/kalyan/documents/cpr/data/s3-buckets-prod/opensearch_input_05_20_2024",
    )
)

# Fail early, with the offending path in the message, rather than silently
# loading zero documents further down.
assert CPR_DOCUMENTS_PATH.exists(), f"CPR documents directory not found: {CPR_DOCUMENTS_PATH}"

In [7]:
def load_metadata_only(doc_path: Path) -> BackendDocument:
    """Load only the metadata from a CPR document JSON file.

    Args:
        doc_path: path to a JSON file that has a top-level
            ``"document_metadata"`` key.

    Returns:
        The ``"document_metadata"`` value validated as a ``BackendDocument``.

    Raises:
        KeyError: if the file has no ``"document_metadata"`` key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    _metadata = json.loads(doc_path.read_text(encoding="utf-8"))["document_metadata"]

    return BackendDocument.model_validate(_metadata)

In [8]:
# Sort the paths: glob order depends on the filesystem, and the row order of the
# resulting dataframe affects which rows the seeded `.sample()` calls pick later.
# Materialising the list also lets tqdm show a total.
doc_paths = sorted(CPR_DOCUMENTS_PATH.glob("*.json"))
doc_metadata = [load_metadata_only(doc_path) for doc_path in tqdm(doc_paths)]

0it [00:00, ?it/s]

In [27]:
# Lookup table from ISO code to World Bank region, built from the backend geography CSV.
backend_geographies_df = pd.read_csv("geography-iso-3166-backend.csv")
iso_region_map = dict(
    zip(backend_geographies_df["Iso"], backend_geographies_df["World Bank Region"])
)

In [28]:
# One row per document, plus the two columns the sampling is stratified on.
# NOTE(review): `translated` is True when the only listed language is English,
# which reads more like "English original" than "translated" - confirm the
# intended meaning before relying on this column's name downstream.
doc_metadata_df = pd.DataFrame([doc.model_dump() for doc in doc_metadata]).assign(
    translated=lambda df: df["languages"].apply(
        lambda langs: len(langs) == 1 and langs[0] == "English"
    ),
    # Geographies missing from the lookup end up as NaN here.
    **{"world bank region": lambda df: df["geography"].map(iso_region_map)},
)
doc_metadata_df.head()

Unnamed: 0,name,description,import_id,slug,family_import_id,family_slug,publication_ts,date,source_url,download_url,type,source,category,geography,languages,metadata,translated,world bank region
0,Decree 69/2021 approving the National Strategy...,This decree approves the National Strategy for...,CCLW.executive.10992.6270,decree-69-2021-approving-the-national-strategy...,CCLW.family.10992.0,decree-69-2021-approving-the-national-strategy...,2021-04-29 00:00:00+00:00,29/04/2021,https://minhacienda-gob.com/wp-content/uploads...,,Decree,CCLW,Executive,GNQ,[Spanish],"{'topic': ['Adaptation', 'Mitigation'], 'hazar...",False,Sub-Saharan Africa
1,Decree of 29-4-2015 creating the Office of Ene...,The Office of Energy and Climate Change (OECC)...,CCLW.executive.9783.4434,decree-of-29-4-2015-creating-the-office-of-ene...,CCLW.family.9783.0,decree-of-29-4-2015-creating-the-office-of-ene...,2015-04-29 00:00:00+00:00,29/04/2015,https://www.bopa.ad/bopa/027038/Pagines/ga2703...,,Decree,CCLW,Executive,AND,[Catalan],"{'topic': ['Adaptation', 'Mitigation'], 'hazar...",False,Europe & Central Asia
2,Sierra Leone Vision 2025,Has two references to ‘climate change' as one ...,CCLW.executive.4747.1918,sierra-leone-vision-2025_db39,CCLW.family.4747.0,sierra-leone-vision-2025_68fe,2003-08-01 00:00:00+00:00,01/08/2003,https://climate-laws.org/rails/active_storage/...,,Vision,CCLW,Executive,SLE,[English],"{'topic': ['Adaptation', 'Loss And Damage'], '...",True,Sub-Saharan Africa
3,Climate Action Plan 2024,"<p>On December 20th, 2023, Ireland approved th...",CCLW.document.i00000201.n0000,climate-action-plan-2024_c4ac,CCLW.family.i00000200.n0000,climate-action-plan-2024_f046,2023-12-20 00:00:00+00:00,20/12/2023,https://www.gov.ie/pdf/?file=https://assets.go...,,Plan,CCLW,Executive,IRL,[English],"{'topic': ['Mitigation'], 'hazard': [], 'secto...",True,Europe & Central Asia
4,Greening Finance: A Roadmap to Sustainable Inv...,This document details Sustainability Disclosur...,CCLW.executive.10239.4855,greening-finance-a-roadmap-to-sustainable-inve...,CCLW.family.10239.0,greening-finance-a-roadmap-to-sustainable-inve...,2021-10-19 00:00:00+00:00,19/10/2021,https://assets.publishing.service.gov.uk/gover...,,Roadmap,CCLW,Executive,GBR,[English],"{'topic': ['Mitigation'], 'hazard': [], 'secto...",True,Europe & Central Asia


In [36]:
# Number of documents per (region, translated) pair, shown with regions as columns.
region_translated_counts = doc_metadata_df.groupby(["world bank region", "translated"]).size()
(
    region_translated_counts
    .unstack()
    .fillna(0)
    .astype(int)
    .T
)

world bank region,East Asia & Pacific,Europe & Central Asia,International,Latin America & Caribbean,Middle East & North Africa,North America,South Asia,Sub-Saharan Africa
translated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,337,1424,0,774,194,41,28,407
True,779,961,43,183,155,174,217,478


In [73]:
# Every International document is included, so no random sampling happens here.

is_international = doc_metadata_df["world bank region"] == "International"
sampled_ids = doc_metadata_df.loc[is_international, "import_id"].tolist()


In [74]:
# Number of remaining docs to sample = 800 - (43 international) - (86 unece) = 671
# Round to 672 so 96 docs per region

docs_per_region = 96
docs_per_class = docs_per_region // 2

# Sample a balanced number of translated and untranslated docs for each region.
# When one class has fewer than half the regional quota, take all of it and top
# up from the other class.
#
# NOTE: this loop appends to the existing `sampled_ids` list, so re-running this
# cell on its own adds a second batch of IDs; re-create `sampled_ids` first.

for region, region_df in doc_metadata_df.groupby("world bank region"):
    if region == "International":
        continue

    translated_df = region_df[region_df["translated"]]
    untranslated_df = region_df[~region_df["translated"]]

    if min(len(translated_df), len(untranslated_df)) >= docs_per_class:
        # Enough of both classes: an even split. Translated is drawn first so the
        # sequence of random draws stays the same for a given seed.
        region_samples = [
            translated_df.sample(docs_per_class),
            untranslated_df.sample(docs_per_class),
        ]
    else:
        # Pick the classes by size explicitly so that a missing class or a tie
        # cannot make the minor and major class coincide.
        if len(translated_df) < len(untranslated_df):
            minor_df, major_df = translated_df, untranslated_df
        else:
            minor_df, major_df = untranslated_df, translated_df

        # Never ask for more rows than the larger class has; `.sample` would raise.
        n_samples_major_class = min(docs_per_region - len(minor_df), len(major_df))
        if len(minor_df) + n_samples_major_class < docs_per_region:
            print(
                f"Warning: only {len(minor_df) + n_samples_major_class} docs available "
                f"for {region} (wanted {docs_per_region})"
            )

        region_samples = [minor_df, major_df.sample(n_samples_major_class)]

    for sample_df in region_samples:
        sampled_ids.extend(sample_df["import_id"].tolist())

In [76]:
# Pull the metadata rows for every sampled ID and summarise the result.
is_sampled = doc_metadata_df["import_id"].isin(sampled_ids)
sampled_df = doc_metadata_df[is_sampled]

n_sampled = len(sampled_df)
print(f"{n_sampled} docs sampled (plus 86 from UNECE) = {n_sampled + 86} docs in total.")

# Same region-by-translated breakdown as for the full corpus, now for the sample.
sampled_counts = sampled_df.groupby(["world bank region", "translated"]).size()
sampled_counts.unstack().fillna(0).astype(int).T

715 docs sampled (plus 86 from UNECE) = 801 docs in total.


world bank region,East Asia & Pacific,Europe & Central Asia,International,Latin America & Caribbean,Middle East & North Africa,North America,South Asia,Sub-Saharan Africa
translated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,48,48,0,48,48,41,28,48
True,48,48,43,48,48,55,68,48


In [81]:
# TODO: writing straight to S3 (the commented-out line) didn't work, so the CSV is
# written locally and was uploaded by hand to that S3 location.
# sampled_df.to_csv("s3://project_rag/data/unece_sampled_cpr_docs.csv", index=False)

sampled_csv_path = "../../data/unece_sampled_cpr_docs.csv"
sampled_df.to_csv(path_or_buf=sampled_csv_path, index=False)

In [91]:
# copy files to new directory for s3 upload

NEW_DIR = Path("../../data/documents_unece/cpr")
# Create the target directory (and any missing parents) so the copy also works
# on a fresh checkout.
NEW_DIR.mkdir(parents=True, exist_ok=True)

for _id in tqdm(sampled_ids):
    doc_filename = f"{_id}.json"
    doc_path = CPR_DOCUMENTS_PATH / doc_filename
    new_doc_path = NEW_DIR / doc_filename

    # Copy raw bytes so the file is reproduced exactly, independent of the
    # platform's default text encoding and newline handling.
    new_doc_path.write_bytes(doc_path.read_bytes())

print(f"Done! {len(sampled_ids)} documents copied to {NEW_DIR}")

  0%|          | 0/715 [00:00<?, ?it/s]

Done! 715 documents copied to ../../data/documents_unece/cpr
