# Explorer glue code

Code to:
1. inspect the pdf parser outputs
2. join the outputs of explorer with the metadata from the scraper.

In [60]:
from pathlib import Path

from cpr_data_access.models import Dataset, BaseDocument
import pandas as pd

# Widen pandas' column display so long cell values (URLs, document names)
# are shown up to 200 characters before being truncated.
pd.options.display.max_colwidth = 200

In [6]:
# Directory holding the document parser output.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DOCUMENT_PARSER_OUTPUT_DIR = "/Users/kalyan/Documents/CPR/data/GST/parser_output/"

# Build a dataset of BaseDocument objects, then populate it from the local
# parser output directory.
dataset = Dataset(document_model=BaseDocument)
dataset = dataset.load_from_local(DOCUMENT_PARSER_OUTPUT_DIR)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 837/837 [00:39<00:00, 21.16it/s]


## 1. Inspecting PDF parser outputs

In [9]:
# One row per document; num_pages is the length of each document's
# page_metadata list (0 when the list is empty).
df = dataset.metadata_df
df["num_pages"] = df["page_metadata"].map(len)
df.head(5)

Unnamed: 0,document_id,document_name,document_source_url,document_content_type,document_md5_sum,languages,translated,has_valid_text,page_metadata,geography,num_pages
0,CCLW.gst.623.623,Thailand20TNC,,application/pdf,,,False,True,[],,0
1,CCLW.gst.722.722,mrt,,application/pdf,,[en],False,True,"[{'page_number': 0, 'dimensions': (596.0, 842....",,19
2,CCLW.gst.817.817,tasr2_ARM,,application/pdf,,[en],False,True,"[{'page_number': 0, 'dimensions': (612.0, 792....",,22
3,CCLW.gst.520.520,NIS_Report_2021_EN_211211-web,,application/pdf,,[en],False,True,"[{'page_number': 0, 'dimensions': (842.0, 596....",,276
4,CCLW.gst.421.421,Fourth20National20Communication20of20Albania20...,,application/pdf,,,False,True,[],,0


In [13]:
# Frequency table (missing values included) for each column of interest,
# preceded by the column name.
for column_name in ("languages", "translated"):
    print(column_name, df[column_name].value_counts(dropna=False), sep="\n")

languages
[en]    626
None    134
[fr]     36
[es]     29
[ru]      9
[ar]      2
[]        1
Name: languages, dtype: int64
translated
False    837
Name: translated, dtype: int64


In [15]:
# Count documents for which no pages were recorded.
print(f"empty docs: {df['num_pages'].eq(0).sum()}")

empty docs: 134


## 2. Joining explorer outputs to scraper metadata

We create a `pdf_name` column from the scraper output (derived from each row's `pdf_link`) and use it as the key to join the scraper metadata onto the `document_name` column of the explorer output.

In [17]:
# Input files for the join.
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.

# Excel workbook containing the explorer output.
EXPLORER_OUTPUT_PATH = (
    "/Users/kalyan/Documents/CPR/explorer/data/test/explorer_output_20230301-180857.xlsx"
)

# CSV of document metadata produced by the scraper.
SCRAPER_METADATA_PATH = "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/unfccc_files.csv"

In [21]:
# Load both sides of the join: explorer results (Excel) and scraper metadata (CSV).
explorer_output = pd.read_excel(io=EXPLORER_OUTPUT_PATH)
scraper_metadata = pd.read_csv(filepath_or_buffer=SCRAPER_METADATA_PATH)

In [54]:
# Derive the join key from each download URL: take the last path segment,
# drop its final four characters (presumably the ".pdf" extension - every
# link is assumed to end in it) and remove any "%" characters.
scraper_metadata["pdf_name"] = scraper_metadata["pdf_link"].apply(
    lambda url: url.split("/")[-1][:-4].replace("%", "")
)

# The key is only usable for a many-to-one join if no two rows share it.
n_distinct_pdf_names = scraper_metadata["pdf_name"].nunique()

if n_distinct_pdf_names != len(scraper_metadata):
    print("PDF names aren't unique. Nonunique names:")
    pdf_name_counts = scraper_metadata["pdf_name"].value_counts()
    duplicated_pdf_names = pdf_name_counts[pdf_name_counts > 1]
    print(duplicated_pdf_names.index.tolist())
else:
    print("PDF names are unique")

PDF names are unique


In [56]:
# Spot-check one derived name: show the scraper rows whose pdf_name begins
# with this prefix.
scraper_metadata.loc[scraper_metadata["pdf_name"].str.startswith("TNC20-20MNE_0")]

Unnamed: 0,md5sum,document_name,theme,type,author,author_type,date,language,link,pdf_link,validation_status,data_error_type,party,translation,version,status,source,topics,pdf_name
753,0921d172aae7a6481bfefe46020f79b6,Montenegro. National Communication (NC). NC 3.,"Policies and measures, National communications (NC), Mitigation",National communications (NC),Montenegro,,2020-10-12T12:00:00Z,English,/documents/254489,https://unfccc.int/sites/default/files/resource/TNC%20-%20MNE_0.pdf,not validated,,Montenegro,,,,UNFCCC Information Portal,"Policies and measures,National communications NC,Mitigation",TNC20-20MNE_0


In [59]:
# Attach scraper metadata to every explorer row. The left join keeps all
# explorer rows; validate="m:1" makes pandas raise if a pdf_name occurs more
# than once in the scraper metadata, so the join cannot duplicate explorer rows.
explorer_output_with_metadata = pd.merge(
    left=explorer_output,
    right=scraper_metadata,
    left_on="document_name",
    right_on="pdf_name",
    how="left",
    validate="m:1",
)

# pdf_link is a scraper-side column (assuming the explorer output has no
# column of that name), so a null here marks an explorer row that found no
# match in the scraper metadata.
unmatched_rows = explorer_output_with_metadata["pdf_link"].isnull()

# Report success only when nothing is unmatched; previously this message was
# printed when the null count was non-zero, i.e. exactly when matches were missing.
if not unmatched_rows.any():
    print(
        "All documents in explorer output matched with metadata from the scraper output CSV"
    )
else:
    unmatched_documents = (
        explorer_output_with_metadata.loc[unmatched_rows, "document_name"]
        .unique()
        .tolist()
    )
    print(
        f"{len(unmatched_documents)} documents in explorer output have no match in the scraper output CSV:"
    )
    print(unmatched_documents)

In [63]:
# Write the joined table next to the explorer output, with "_with_metadata"
# appended to the file stem and the original extension kept.
explorer_output_path = Path(EXPLORER_OUTPUT_PATH)
output_path = explorer_output_path.with_stem(
    f"{explorer_output_path.stem}_with_metadata"
)

explorer_output_with_metadata.to_excel(output_path, index=False)
print(f"Saved to {output_path}")

Saved to /Users/kalyan/Documents/CPR/explorer/data/test/explorer_output_20230301-180857_with_metadata.xlsx
