# Page Data Preparation
This notebook extracts the `bounding boxes` of all elements located in the `pages` sampled by the `pages_sampling.ipynb` notebook. This information is found in the `pages files` stored in a designated S3 bucket. The extracted data is organized into a structured DataFrame for streamlined analysis and further processing.

In [2]:
import pandas as pd

### Load Sampled Pages

In [3]:
test_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/test.csv")
train_small_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_small.csv")
train_large_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_large.csv")

In [4]:
pages_df = pd.concat([test_sample, train_small_sample, train_large_sample], ignore_index=True).drop_duplicates(subset="page_id")
pages_df["issue_id"] = pages_df["page_id"].apply(lambda x: x.split("-p")[0])
pages_df

Unnamed: 0,journal,page_id,issue_id
0,jdpl,jdpl-1814-05-21-a-p0001,jdpl-1814-05-21-a
1,jdpl,jdpl-1814-05-21-a-p0002,jdpl-1814-05-21-a
2,jdpl,jdpl-1815-07-05-a-p0001,jdpl-1815-07-05-a
3,jdpl,jdpl-1815-07-05-a-p0002,jdpl-1815-07-05-a
4,jdpl,jdpl-1819-01-08-a-p0001,jdpl-1819-01-08-a
...,...,...,...
204995,oeuvre,oeuvre-1934-07-12-a-p0002,oeuvre-1934-07-12-a
204996,indeplux,indeplux-1905-11-03-a-p0001,indeplux-1905-11-03-a
204997,jdpl,jdpl-1930-12-02-a-p0006,jdpl-1930-12-02-a
204998,oeuvre,oeuvre-1923-05-09-a-p0006,oeuvre-1923-05-09-a


In [5]:
pages = pages_df["page_id"].to_list()

### Get the Bounding Boxes for all Sampled Pages

In [6]:
import sys
import os
import random

import numpy as np
import pandas as pd
# from dask.bag import random
from dask.dataframe import from_pandas
from dask.distributed import Client, progress, LocalCluster

# add the path to the parent directory of `src` to the system path
sys.path.insert(0, os.path.abspath('../..'))
# import custom utility functions from the `src` package
from src.utils.data_preparation import *

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44273 instead


In [7]:
# helper functions to read/select the relevant info from the page files
def get_bbxs(page):

    rows = []

    for region in page["r"]:

        for paragraph in region["p"]:
            lines_bbxs = []
            lines_tokens = []

            for line in paragraph["l"]:
                lines_bbxs.append(line["c"])
                tokens = []

                for token in line["t"]:
                    tokens.append(token["tx"])

                lines_tokens.append(tokens)

            row = {
                "bbx_region": region["c"],
                "bbx_paragraph": paragraph["c"],
                "lines_bbxs": lines_bbxs,
                "lines_tokens": lines_tokens,
                "pOf": region.get(
                    "pOf", np.nan
                ),  # pOf doesn't always appear as a key in dict
            }
            rows.append(row)

    return pd.DataFrame(rows)


def get_page_info(page):

    keys = ["id", "cdt", "iiif_img_base_uri"]
    nan = math.nan  # pre-define NaN for missing values

    items = [page.get(key, nan) for key in keys]
    bbx_df = get_bbxs(page)
    df = pd.DataFrame([items] * len(bbx_df), columns=keys)
    df = pd.concat([df, bbx_df], axis=1).reset_index(drop=True)

    return df

In [8]:
if __name__ == '__main__':
    memory_per_worker_gb = 15
    cluster = LocalCluster(n_workers=5, threads_per_worker=1, memory_limit=f"{memory_per_worker_gb}GB")
    client = cluster.get_client()

    journals = list(set(pages_df["journal"]))
    # loaded_journals = ["actionfem", "avenirgdl", "courriergdl", "deletz1893", "excelsior", "lepji", "lunion", "marieclaire", "oeuvre", "oerennes"]
    # journals = [j for j in journals if j not in loaded_journals]

    bucket_name = "12-canonical-final"
    storage_options = IMPRESSO_STORAGEOPT

    i = 0
    nb_journals = len(journals)
    
    for journal in journals:
        i += 1
        print(f"Processing journal: {journal}")
        print(f"{i}/{nb_journals}")
        
        # list all the issues for which the canonical data is available
        bucket_file_names = list_journal_page_files(journal, bucket_name=bucket_name)
        
        # get all the issue ids
        issues = pages_df[pages_df["journal"] == journal]["issue_id"].drop_duplicates().to_list()
        
        file_names = [
            fn for fn in bucket_file_names if "-".join(fn.split("/")[-1].split("-")[:-1]) in issues
        ]

        # load the canonical data from s3 bucket
        print("Loading the canonical data from bucket...")
        bags = []
        
        for fn in file_names:

            bag = db.read_text(
                "s3://" + bucket_name + "/" + fn, storage_options=storage_options
            ).map(json.loads)

            bags.append(bag)

        bags = db.concat(bags)
        
        # apply the function lazily across all pages in parallel
        pages_info = bags.map(get_page_info)
        
        print("Computing...")
        with ProgressBar():
            # convert the result to a single Pandas df using Dask df for parallelism
            result = pages_info.compute()

        result_df = pd.concat(
            result, ignore_index=True
        )  # can convert to Dask df instead of a Pandas df
        
        print("Writing file...")
        # save this dataframe for EDA
        result_df = result_df.rename(columns={"id": "page_id", "cdt": "page_cdt"})
        result_df.to_parquet(f"/scratch/students/danae/data/data_preparation/samples/pages_bbox/{journal}_issues_v2.parquet.gzip", compression="gzip")
        # only keep sampled pages
        result_df = result_df[result_df["page_id"].isin(pages)]
        result_df.to_parquet(f"/scratch/students/danae/data/data_preparation/samples/pages_bbox/{journal}_pages_v2.parquet.gzip", compression="gzip")
        

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33779 instead


Processing journal: diekwochen
1/36
Listing Page Files of 'diekwochen' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: lunion
2/36
Listing Page Files of 'lunion' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: avenirgdl
3/36
Listing Page Files of 'avenirgdl' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: lafronde
4/36
Listing Page Files of 'lafronde' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: legaulois
5/36
Listing Page Files of 'legaulois' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: oecaen
6/36
Listing Page Files of 'oecaen' journal
Loading the canonical data from bucket...
Computing...
Writing file...
Processing journal: gazgrdlux
7/36
Listing Page Files of 'gazgrdlux' journal
Loading the canonical data from bucket...
Compu