# Page Downloading
This notebook is used to download the pages to the server through requests using the pages' IIF links.

In [21]:
import os
import time
import shutil
import requests
import pandas as pd

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [22]:
import dask.dataframe as dd
import pandas as pd
import json
import re

import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

from rapidfuzz.distance import Levenshtein

In [23]:
test_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/test.csv")
train_small_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_small.csv")
train_large_sample = pd.read_csv("/scratch/students/danae/data/data_preparation/samples/pages/train_large.csv")

In [24]:
#pages_df = pd.concat([test_sample, train_small_sample, train_large_sample], ignore_index=True).drop_duplicates(subset="page_id")
pages_df = pd.concat([test_sample, train_small_sample], ignore_index=True).drop_duplicates(subset="page_id")
pages_df["issue_id"] = pages_df["page_id"].apply(lambda x: x.split("-p")[0])
pages_df

Unnamed: 0,journal,page_id,issue_id
0,jdpl,jdpl-1814-05-21-a-p0001,jdpl-1814-05-21-a
1,jdpl,jdpl-1814-05-21-a-p0002,jdpl-1814-05-21-a
2,jdpl,jdpl-1815-07-05-a-p0001,jdpl-1815-07-05-a
3,jdpl,jdpl-1815-07-05-a-p0002,jdpl-1815-07-05-a
4,jdpl,jdpl-1819-01-08-a-p0001,jdpl-1819-01-08-a
...,...,...,...
54995,lepetitparisien,lepetitparisien-1928-11-24-a-p0009,lepetitparisien-1928-11-24-a
54996,jdpl,jdpl-1836-08-20-a-p0004,jdpl-1836-08-20-a
54997,lematin,lematin-1907-02-06-a-p0004,lematin-1907-02-06-a
54998,oecaen,oecaen-1924-04-01-a-p0010,oecaen-1924-04-01-a


In [25]:
# get journals
journals = pages_df["journal"].sort_values().drop_duplicates().to_list()

In [26]:
# get image uri for every page of a particular journal collection
uris_df = pd.DataFrame(columns=["page_id", "iiif_img_base_uri"])

for journal in tqdm(journals, desc="Looping through Journals"):
    
    df = pd.read_parquet(f"/scratch/students/danae/data/data_preparation/samples/pages_bbox/{journal}_pages_v2.parquet.gzip", columns=["page_id", "iiif_img_base_uri"]).drop_duplicates()
    uris_df = pd.concat([uris_df, df], ignore_index=True)
    
uris_df

Looping through Journals:   0%|          | 0/36 [00:00<?, ?it/s]

Unnamed: 0,page_id,iiif_img_base_uri
0,actionfem-1927-10-15-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
1,actionfem-1927-10-15-a-p0002,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
2,actionfem-1927-10-15-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
3,actionfem-1927-10-15-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
4,actionfem-1927-10-15-a-p0005,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
...,...,...
155348,waeschfra-1884-06-14-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155349,waeschfra-1884-06-28-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155350,waeschfra-1884-06-28-a-p0002,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155351,waeschfra-1884-06-28-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...


In [27]:
uris_df = uris_df[uris_df["page_id"].isin(pages_df["page_id"])]
uris_df

Unnamed: 0,page_id,iiif_img_base_uri
8,actionfem-1927-10-15-a-p0009,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
9,actionfem-1927-10-15-a-p0010,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
10,actionfem-1927-10-15-a-p0011,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
11,actionfem-1927-10-15-a-p0012,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
12,actionfem-1927-10-15-a-p0013,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
...,...,...
155328,waeschfra-1884-05-10-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155329,waeschfra-1884-05-17-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155330,waeschfra-1884-05-17-a-p0002,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155331,waeschfra-1884-05-17-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...


### Request Image from IIIF Server

In [28]:
# output dirs
out_dir = "/scratch/students/danae/data"
out_dir_images = os.path.join(out_dir, "images")

os.makedirs(out_dir_images, exist_ok=True)

### Get sampled pages

In [29]:
# add '/full/full/0/default.jpg' to url to get full page
uris_df["iiif_img_base_uri"] = uris_df["iiif_img_base_uri"].apply(lambda x: x + "/full/full/0/default.jpg")
uris_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uris_df["iiif_img_base_uri"] = uris_df["iiif_img_base_uri"].apply(lambda x: x + "/full/full/0/default.jpg")


Unnamed: 0,page_id,iiif_img_base_uri
8,actionfem-1927-10-15-a-p0009,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
9,actionfem-1927-10-15-a-p0010,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
10,actionfem-1927-10-15-a-p0011,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
11,actionfem-1927-10-15-a-p0012,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
12,actionfem-1927-10-15-a-p0013,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
...,...,...
155328,waeschfra-1884-05-10-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155329,waeschfra-1884-05-17-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155330,waeschfra-1884-05-17-a-p0002,https://iiif.eluxemburgensia.lu/image/iiif/2/a...
155331,waeschfra-1884-05-17-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...


In [30]:
import time
import requests
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [31]:
def download_iiif_image(iiif_url, f_out, session):
    try:
        with session.get(iiif_url, stream=True) as r:
            if r.status_code == 200:
                r.raw.decode_content = True
                with open(f_out, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
                return f"Downloaded: {iiif_url}"
            else:
                return f"Failed: {iiif_url} with status {r.status_code}"
    except Exception as e:
        return f"Error: {iiif_url} -> {str(e)}"

In [34]:
import time

In [37]:
def download_iiif_image(iiif_url, f_out):
    
    time.sleep(0.5)
    r = requests.get(url, stream=True)
    
    if r.status_code == 200:
        r.raw.decode_content = True
        with open(f_out,'wb') as f:
            shutil.copyfileobj(r.raw, f)
    else:
        print(f"Could not get {iiif_url}")
        
def create_session():
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def download_iiif_image(iiif_url, dest_path, session):
    try:
        with session.get(iiif_url, stream=True) as r:
            if r.status_code == 200:
                r.raw.decode_content = True
                with open(dest_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
                # print(f"Downloaded: {iiif_url}")
            else:
                print(f"Failed: {iiif_url} with status {r.status_code}")
    except Exception as e:
        print(f"Error: {iiif_url} -> {str(e)}")
        time.sleep(60*10)
        

def download_images_sequential(image_urls, page_ids, out_dir_images, delay=0.5):
    session = create_session()
    for iiif_url, page_id in tqdm(zip(image_urls, page_ids), total=len(image_urls), desc="Downloading Images"):
        dest_path = os.path.join(out_dir_images, page_id + ".jpg")
        if not os.path.exists(dest_path):
            print(f"Downloading {iiif_url} -> {dest_path}")
            start = time.time()
            download_iiif_image(iiif_url, dest_path, session)
            end = time.time()
            if (end - start) < 10:
                time.sleep(10)
            else:
                time.sleep(delay)
        else:
            print(f"Skipping {dest_path}, already exists.")

In [38]:
# output dirs
out_dir = "/scratch/students/danae/data"
out_dir_images = os.path.join(out_dir, "images")

os.makedirs(out_dir_images, exist_ok=True)

# BNF only
df = uris_df[uris_df["iiif_img_base_uri"].str.startswith("https://gallica.bnf")]
# download images
# download_images_parallel
# download_images_sequential
download_images_sequential(image_urls=df["iiif_img_base_uri"], page_ids=df["page_id"], out_dir_images=out_dir_images, delay=1)

Downloading Images:   0%|          | 0/13861 [00:00<?, ?it/s]

Skipping /scratch/students/danae/data/images/excelsior-1910-11-21-a-p0011.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-11-26-a-p0012.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-11-29-a-p0012.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-01-a-p0007.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0001.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0002.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0003.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0004.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0005.jpg, already exists.
Skipping /scratch/students/danae/data/images/excelsior-1910-12-07-a-p0006.jpg, already exists.
Skipping /scratch/students/danae/data/images/excel

KeyboardInterrupt: 