# Get Page Dimensions from Images

In [1]:
import os
import cv2
import pandas as pd

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

In [2]:
# Load the three page-sample tables (test, small train, large train) in one pass.
test_sample, train_small_sample, train_large_sample = map(
    pd.read_csv,
    (
        "/scratch/students/danae/data/data_preparation/samples/pages/test.csv",
        "/scratch/students/danae/data/data_preparation/samples/pages/train_small.csv",
        "/scratch/students/danae/data/data_preparation/samples/pages/train_large.csv",
    ),
)

In [3]:
# Build the table of pages to measure, one row per unique page_id.
# Only the large training sample is used here; the commented-out line below
# is the variant that combines all three samples.
# pages_df = pd.concat([test_sample, train_small_sample, train_large_sample], ignore_index=True).drop_duplicates(subset="page_id")
pages_df = pd.concat([train_large_sample], ignore_index=True).drop_duplicates(subset="page_id")
# A page id is "<issue_id>-p<page number>". Split once from the RIGHT so that an
# issue id which itself contains "-p" is not truncated at its first occurrence.
pages_df["issue_id"] = pages_df["page_id"].str.rsplit("-p", n=1).str[0]
pages_df

Unnamed: 0,journal,page_id,issue_id
0,jdpl,jdpl-1814-05-22-a-p0001,jdpl-1814-05-22-a
1,jdpl,jdpl-1814-06-05-a-p0001,jdpl-1814-06-05-a
2,jdpl,jdpl-1814-07-26-a-p0001,jdpl-1814-07-26-a
3,jdpl,jdpl-1814-06-08-a-p0001,jdpl-1814-06-08-a
4,jdpl,jdpl-1814-07-29-a-p0001,jdpl-1814-07-29-a
...,...,...,...
149995,oeuvre,oeuvre-1934-07-12-a-p0002,oeuvre-1934-07-12-a
149996,indeplux,indeplux-1905-11-03-a-p0001,indeplux-1905-11-03-a
149997,jdpl,jdpl-1930-12-02-a-p0006,jdpl-1930-12-02-a
149998,oeuvre,oeuvre-1923-05-09-a-p0006,oeuvre-1923-05-09-a


In [4]:
# Image file name of each page: the page id followed by the ".jpg" extension.
pages_df["file_name"] = pages_df["page_id"].map(lambda page_id: page_id + ".jpg")

# Placeholder page size, identical for every row (not measured values).
pages_df["width"], pages_df["height"] = 7000, 8000

pages_df

Unnamed: 0,journal,page_id,issue_id,file_name,width,height
0,jdpl,jdpl-1814-05-22-a-p0001,jdpl-1814-05-22-a,jdpl-1814-05-22-a-p0001.jpg,7000,8000
1,jdpl,jdpl-1814-06-05-a-p0001,jdpl-1814-06-05-a,jdpl-1814-06-05-a-p0001.jpg,7000,8000
2,jdpl,jdpl-1814-07-26-a-p0001,jdpl-1814-07-26-a,jdpl-1814-07-26-a-p0001.jpg,7000,8000
3,jdpl,jdpl-1814-06-08-a-p0001,jdpl-1814-06-08-a,jdpl-1814-06-08-a-p0001.jpg,7000,8000
4,jdpl,jdpl-1814-07-29-a-p0001,jdpl-1814-07-29-a,jdpl-1814-07-29-a-p0001.jpg,7000,8000
...,...,...,...,...,...,...
149995,oeuvre,oeuvre-1934-07-12-a-p0002,oeuvre-1934-07-12-a,oeuvre-1934-07-12-a-p0002.jpg,7000,8000
149996,indeplux,indeplux-1905-11-03-a-p0001,indeplux-1905-11-03-a,indeplux-1905-11-03-a-p0001.jpg,7000,8000
149997,jdpl,jdpl-1930-12-02-a-p0006,jdpl-1930-12-02-a,jdpl-1930-12-02-a-p0006.jpg,7000,8000
149998,oeuvre,oeuvre-1923-05-09-a-p0006,oeuvre-1923-05-09-a,oeuvre-1923-05-09-a-p0006.jpg,7000,8000


In [5]:
def get_img_dim(img_dir, image_fn):
    """
    Read an image with OpenCV and report its pixel dimensions.

    Parameters
    ----------
    img_dir : str
        Directory that contains the image.
    image_fn : str
        File name of the image inside `img_dir`.

    Returns
    -------
    tuple
        `(image_fn, image_height, image_width)` -- note that height comes
        before width.

    Raises
    ------
    FileNotFoundError
        If the image could not be loaded. The message states whether the file
        is missing or exists but could not be decoded.
    """
    image_path = os.path.join(img_dir, image_fn)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports every kind of failure by returning None, so look
        # at the file system to tell a missing file from an unreadable one.
        if os.path.isfile(image_path):
            reason = "file exists but could not be decoded"
        else:
            reason = "file does not exist"
        raise FileNotFoundError(f"Failed to load image: {image_path} ({reason})")
    # The first two axes are (rows, columns). Slicing instead of a 3-way unpack
    # also works for arrays that have no channel axis.
    image_height, image_width = image.shape[:2]

    return image_fn, image_height, image_width

In [6]:
# Settings for the concurrent run.
MAX_THREADS = 15  # upper bound on worker threads in the pool
TIMEOUT = 5  # seconds

# Directory the page images are read from.
img_dir = "/scratch/students/danae/data/images"

# Accumulators filled while the futures complete: one entry per image that was
# read successfully, and one per image that failed.
results, error_log = [], []

def get_img_dim_safe(img_dir, file_name):
    """
    Call `get_img_dim` and re-raise any failure with the file name attached.

    Parameters
    ----------
    img_dir : str
        Directory that contains the image.
    file_name : str
        File name of the image inside `img_dir`.

    Returns
    -------
    Whatever `get_img_dim(img_dir, file_name)` returns.

    Raises
    ------
    RuntimeError
        For any `Exception` raised by `get_img_dim`. The message names the
        file, and the original exception is chained as `__cause__`.
    """
    try:
        return get_img_dim(img_dir, file_name)
    except Exception as e:
        # `from e` marks the original error as the direct cause, so the
        # traceback shows it as such rather than as an unrelated second failure.
        raise RuntimeError(f"Error reading {file_name}: {e}") from e

# Read every image on a thread pool and record either its dimensions or the
# reason it could not be read.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    # submit all file names to the executor, remembering which file each
    # future belongs to
    future_to_url = {
        executor.submit(get_img_dim_safe, img_dir, fn): fn
        for fn in pages_df["file_name"]
    }

    for future in tqdm(concurrent.futures.as_completed(future_to_url), total=len(future_to_url), desc="Fetching image dimensions"):
        file_name = future_to_url[future]
        try:
            # as_completed only yields futures that have already finished, so
            # result() returns (or re-raises the worker's exception) at once.
            # A per-call timeout can never fire here and is not passed.
            results.append(future.result())
        except Exception as e:
            # log the failure together with the file it belongs to
            error_log.append({"file_name": file_name, "error": str(e)})


# Column order follows the tuples returned by the workers: name, height, width.
# NOTE(review): "img_height" / "image_width" use inconsistent prefixes; the
# labels are kept unchanged because later cells or saved CSVs may rely on them.
page_dimensions = pd.DataFrame(results, columns=["file_name", "img_height", "image_width"])
# Explicit columns keep the schema stable even when no image failed.
error_log_df = pd.DataFrame(error_log, columns=["file_name", "error"])

Fetching image dimensions:   0%|          | 248/150000 [00:05<1:16:32, 32.61it/s]Premature end of JPEG file
Fetching image dimensions:   3%|▎         | 4946/150000 [02:45<2:45:21, 14.62it/s][ WARN:0@177.059] global loadsave.cpp:268 findDecoder imread_('/scratch/students/danae/data/images/jdpl-1852-01-01-b-p0004.jpg'): can't open/read file: check file path/integrity
Fetching image dimensions:   4%|▍         | 6276/150000 [03:26<2:31:45, 15.78it/s]Premature end of JPEG file
Fetching image dimensions:   5%|▌         | 7641/150000 [04:10<1:46:35, 22.26it/s][ WARN:1@261.504] global loadsave.cpp:268 findDecoder imread_('/scratch/students/danae/data/images/luxwort-1860-11-01-a-p0002.jpg'): can't open/read file: check file path/integrity
Fetching image dimensions:   5%|▌         | 7868/150000 [04:20<1:24:43, 27.96it/s]Premature end of JPEG file
Fetching image dimensions:   6%|▋         | 9638/150000 [05:27<2:14:36, 17.38it/s][ WARN:2@338.634] global loadsave.cpp:268 findDecoder imread_('/scrat

In [7]:
# Show up to 5 random rows; capping at the frame length avoids the ValueError
# that sample() raises when asked for more rows than exist.
page_dimensions.sample(min(5, len(page_dimensions)))

Unnamed: 0,file_name,img_height,image_width
90399,luxland-1973-02-09-a-p0003.jpg,6425,4252
22145,lematin-1886-10-25-a-p0001.jpg,6956,5162
35551,lepetitparisien-1903-05-17-a-p0005.jpg,6613,4863
65098,jdpl-1935-12-16-a-p0005.jpg,7200,5118
125717,jdpl-1826-01-30-a-p0001.jpg,4022,2532


In [8]:
# Show up to 5 random failures; capping at the frame length avoids the
# ValueError that sample() raises when fewer than 5 images failed.
error_log_df.sample(min(5, len(error_log_df)))

Unnamed: 0,file_name,error
530,waeschfra-1873-06-28-a-p0004.jpg,Error reading waeschfra-1873-06-28-a-p0004.jpg...
687,legaulois-1880-01-31-b-p0004.jpg,Error reading legaulois-1880-01-31-b-p0004.jpg...
120,waeschfra-1869-03-27-a-p0001.jpg,Error reading waeschfra-1869-03-27-a-p0001.jpg...
466,waeschfra-1873-08-10-a-p0001.jpg,Error reading waeschfra-1873-08-10-a-p0001.jpg...
708,jdpl-1900-06-22-a-p0004.jpg,Error reading jdpl-1900-06-22-a-p0004.jpg: Fai...


In [9]:
# page_dimensions.to_csv("/scratch/students/danae/data/page_dimensions/page_dims_train_large.csv", index=False)
# error_log_df.to_csv("/scratch/students/danae/data/page_dimensions/page_dims_errors_train_large.csv", index=False)

In [10]:
# Number of images read successfully, number that failed, and their total.
(
    len(page_dimensions),
    len(error_log_df),
    sum(map(len, (page_dimensions, error_log_df))),
)

(149219, 781, 150000)