## Processing ALTO XML to JSONL

This notebook does the following:

- downloads ALTO XML versions of [Digitised printed books (18th-19th century)](https://www.bl.uk/collection-guides/digitised-printed-books)
- decompresses these files
- processes the text and some metadata from the XML files
- saves a file for each book in `JSONL` format. 

### Import required packages 


In [None]:
import json
import os.path
import re
import shutil
import subprocess
import tarfile
import xml.etree.ElementTree as ET
import zipfile
from functools import partial
from pathlib import Path
from statistics import mean, stdev
from zipfile import BadZipfile

from tqdm.auto import tqdm

Make some directories for storing our downloads 

In [None]:
# Root folder for everything the notebook writes, plus a subfolder for the raw downloads.
path = Path("data/")
path_in = path / "in"
# parents=True also creates `data/`, so a single call covers both folders.
path_in.mkdir(parents=True, exist_ok=True)

Create a dictionary for the URLs

In [None]:
# Base URL that each archive file name is appended to.
_URL = "https://data.bl.uk/digbks/"

In [None]:
# Download URL of the zip archive for each date range, keyed by a readable label.
_URLS = {
    "unknown": _URL + "unknown.zip",
    "1510-1699": _URL + "1510_1699.zip",
    "1700-1799": _URL + "1700_1799.zip",
    "1800-1809": _URL + "1800_1809.zip",
    "1810-1819": _URL + "1810_1819.zip",
    "1820-1829": _URL + "1820_1829.zip",
    "1830-1839": _URL + "1830_1839.zip",
    "1840-1849": _URL + "1840_1849.zip",
    "1850-1859": _URL + "1850_1859.zip",
    "1860-1869": _URL + "1860_1869.zip",
    "1870-1879": _URL + "1870_1879.zip",
    "1880-1889": _URL + "1880_1889.zip",
    "1890-1899": _URL + "1890_1899.zip",
}

## Some processing helpers

### Getting dates from the alto XML

In [None]:
# Sample raw date string used by the next few cells to try out parsing approaches.
date_text = "[1652.]"

In [None]:
# Splitting on "." leaves the opening bracket attached to the year.
date_text.split(".")

['[1652', ']']

In [None]:
# Matches four consecutive digits.
pattern = re.compile(r"\d{4}")

In [None]:
# First four-digit run in the sample string (a match object, or None).
match = pattern.search(date_text)

In [None]:
# Every four-digit run in the sample string, as a list of strings.
pattern.findall(date_text)

['1652']

In [None]:
# Matches four consecutive digits, i.e. anything that looks like a year.
pattern = re.compile(r"\d{4}")


def get_four_digit(date_text):
    """Pull a single four-digit year out of a free-text date string.

    Args:
        date_text: Raw date text such as ``"[1652.]"`` or ``"1830-1840"``.
            ``None`` and the empty string are accepted.

    Returns:
        A string: ``""`` when no four-digit number is present, the number
        itself when there is exactly one, and otherwise the rounded mean of
        the candidates that lie strictly between 1500 and 1900. If several
        numbers are present but none falls in that window, the first one is
        returned unchanged.
    """
    if not date_text:
        return ""
    matches = pattern.findall(date_text)
    if not matches:
        return ""
    if len(matches) == 1:
        return matches[0]
    plausible = [int(year) for year in matches if 1500 < int(year) < 1900]
    if not plausible:
        # mean() raises StatisticsError on an empty sequence; fall back to the
        # first match, the same unfiltered treatment a lone match receives.
        return matches[0]
    return str(round(mean(plausible)))

In [None]:
# Quick check: each sample must come back as exactly four characters.
test_dates_4 = ["1652", "[1929]", "1830-1840"]
for date in test_dates_4:
    parsed = get_four_digit(date)
    assert len(parsed) == 4
    print(parsed)

1652
1929
1835


In [None]:
# A date string with leading whitespace and square brackets.
weird_date = " [1792]"

In [None]:
# The year is still recovered despite the surrounding characters.
get_four_digit(weird_date)

'1792'

In [None]:
def strip_non_numeric(date_text):
    """Return ``date_text`` with every non-digit character removed."""
    digits_only = [char for char in date_text if char.isdigit()]
    return "".join(digits_only)

### Grabbing metadata we want 

In [None]:
def get_meta(meta_xml):
    """Read the bibliographic fields we keep from a volume's MODS metadata file.

    Args:
        meta_xml: Path (or file object) of the metadata XML, passed to ``ET.parse``.

    Returns:
        dict with the keys ``date`` (an int year, ``0`` when no year could be
        parsed), ``title``, ``place`` and ``record_id`` (the text of the first
        matching element, ``""`` when the element is absent).
    """
    mods = "{http://www.loc.gov/mods/v3}"
    root = ET.parse(meta_xml).getroot()

    def first_text(tag):
        # Text of the first <tag> element anywhere in the tree, "" when absent.
        found = root.findall(f".//{mods}{tag}")
        return found[0].text if found else ""

    # Sometimes there are several dateIssued elements; as a crude filter, take
    # the first one that actually yields a four-digit year. Elements with no
    # text or no year are skipped instead of raising.
    date = 0
    for element in root.findall(f".//{mods}dateIssued"):
        year = get_four_digit(element.text or "")
        if year:
            date = int(year)
            break

    return {
        "date": date,
        "title": first_text("title"),
        "place": first_text("placeTerm"),
        "record_id": first_text("recordIdentifier"),
    }

### Getting the text and related information we want from the XML

In [None]:
def get_text_from_xml(xml):
    """Extract the words and OCR word-confidence statistics from one ALTO page.

    Args:
        xml: Path (or file object) of an ALTO XML page, passed to ``ET.parse``.

    Returns:
        ``(text, wc, std)`` where ``text`` is the list of ``CONTENT`` values of
        every ``String`` element, and ``wc`` / ``std`` are the mean and sample
        standard deviation of their ``WC`` attributes, rounded to three
        decimal places. ``std`` is ``0.0`` when fewer than two ``WC`` values
        exist. A page without ``String`` elements yields ``(None, 0.0, 0.0)``.
    """
    strings = ET.parse(xml).getroot().findall(".//String")
    if not strings:
        return None, 0.0, 0.0
    text = [string.get("CONTENT") for string in strings]
    # Read WC once, skipping elements that lack it (float(None) would raise).
    confidences = [
        float(string.get("WC")) for string in strings if string.get("WC") is not None
    ]
    wc = round(mean(confidences), ndigits=3) if confidences else 0.0
    # stdev() is defined from two data points upwards.
    std = round(stdev(confidences), ndigits=3) if len(confidences) >= 2 else 0.0
    return text, wc, std

### Parsing volumes
Create folder for storing our new output jsonl files 

In [None]:
# Output folder for the per-volume JSONL files. parents=True lets this cell
# succeed even when `data/` does not exist yet.
out_json = Path("data/json")
out_json.mkdir(parents=True, exist_ok=True)

In [None]:
def parse_volume(volume_dir, out_dir=out_json):
    """Convert one extracted volume into a JSONL file with one line per page.

    Each line holds the volume metadata plus ``pg``, ``text``, ``mean_wc_ocr``,
    ``std_wc_ocr`` and ``empty_pg`` for that page. The file is written to
    ``{out_dir}/{date}_{record_id}.jsonl``. A volume without any page file is
    reported and skipped.

    Args:
        volume_dir: Directory of an extracted volume. It must contain a
            ``*metadata.xml`` file and an ``ALTO`` folder with one XML file
            per page.
        out_dir: Directory the JSONL file is written to.
    """
    meta = get_meta(list(volume_dir.glob("*metadata.xml"))[0])
    alto_dir = list(volume_dir.glob("ALTO"))[0]
    volume = []
    # Sorted so that page numbers follow file-name order.
    for page_number, xml in enumerate(sorted(Path(alto_dir).glob("*.xml")), start=1):
        text, ocr, std = get_text_from_xml(xml)
        volume.append(
            {
                **meta,
                "pg": page_number,
                "text": " ".join(text) if text else "",
                "mean_wc_ocr": ocr,
                "std_wc_ocr": std,
                "empty_pg": not text,
            }
        )

    # Taken from the metadata directly so it is available even without pages.
    _id = meta["record_id"]
    if not volume:
        print("no ALTO pages found, skipping", volume_dir)
        return

    date = meta["date"]
    if date == 0:
        # get_meta reports 0 when no year could be parsed. The spelling (sic)
        # is kept so file names match previously generated output.
        date = "UNKOWN"
    elif date > 1950:
        # Surface implausibly late years for manual inspection.
        print(date, _id)

    with open(f"{out_dir}/{date}_{_id}.jsonl", "w") as f:
        for item in volume:
            f.write(json.dumps(item) + "\n")

## Extract a volume

In [None]:
def extract_volume(zipped, out_final):
    """Unzip one volume archive, convert it with ``parse_volume`` and clean up.

    Args:
        zipped: Path of the volume's zip file. The scratch folder is named
            after its parent folder (text before the first ``.``) and its stem.
        out_final: Root directory under which the scratch folder is created.

    Returns:
        ``zipped`` when the archive cannot be read as a zip file, else ``None``.
    """
    save_dir = Path(f"{out_final}/{zipped.parts[-2].split('.')[0]}/{zipped.stem}")
    # exist_ok so leftovers from an interrupted run do not abort this one.
    save_dir.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(zipped, "r") as zip_ref:
            zip_ref.extractall(save_dir)
        parse_volume(save_dir)
    except BadZipfile:
        print("\U0001F92E", zipped)
        return zipped
    finally:
        # Always drop the scratch copy, including after a bad zip or a parsing error.
        shutil.rmtree(save_dir, ignore_errors=True)

In [None]:
# Scratch directory that archives are unpacked into.
out_path = Path("data/tmp")
# Pre-bind the output root so the helper only needs the zip path (`partial` is functools.partial).
_extract_volume = partial(extract_volume, out_final=out_path)

In [None]:
# Flip the insertion order so the last-listed date range is processed first.
# NOTE: re-running this cell flips the order back again.
_URLS = {name: _URLS[name] for name in reversed(list(_URLS))}

In [None]:
# Archives that extract_volume could not open, kept for inspection afterwards.
weird_zips = []

# For each date range: download the archive, unpack it, convert every volume
# zip inside it to JSONL, then delete the scratch copy and the download.
for name, url in tqdm(_URLS.items()):
    tqdm.write(name)
    # check=True: stop if the download fails instead of carrying on with
    # whatever happens to be in the download folder.
    subprocess.run(["aria2c", url, "-d", path_in], check=True)
    file = list(path_in.glob("*.zip"))[0]
    out_path = Path("data/tmp")
    out_path.mkdir(exist_ok=True)
    unpacked = subprocess.run(["7z", "x", file, f"-o{out_path}", "-y"])
    if unpacked.returncode > 1:
        # 7-Zip uses exit code 1 for non-fatal warnings; anything above is a real failure.
        raise subprocess.CalledProcessError(unpacked.returncode, unpacked.args)
    low_level_zip = list(out_path.rglob("*.zip"))
    for z in tqdm(low_level_zip, leave=False, desc=(str(file))):
        bad = extract_volume(z, out_path)
        if bad is not None:
            weird_zips.append(bad)
    shutil.rmtree(out_path)
    file.unlink()

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

1890-1899


HBox(children=(FloatProgress(value=0.0, description='data/in/1890_1899.zip', max=14847.0, style=ProgressStyle(…

1880-1889


HBox(children=(FloatProgress(value=0.0, description='data/in/1880_1889.zip', max=10856.0, style=ProgressStyle(…

1870-1879


HBox(children=(FloatProgress(value=0.0, description='data/in/1870_1879.zip', max=8630.0, style=ProgressStyle(d…

1860-1869


HBox(children=(FloatProgress(value=0.0, description='data/in/1860_1869.zip', max=7498.0, style=ProgressStyle(d…

1850-1859


HBox(children=(FloatProgress(value=0.0, description='data/in/1850_1859.zip', max=5818.0, style=ProgressStyle(d…

1840-1849


HBox(children=(FloatProgress(value=0.0, description='data/in/1840_1849.zip', max=4070.0, style=ProgressStyle(d…

1830-1839


HBox(children=(FloatProgress(value=0.0, description='data/in/1830_1839.zip', max=2639.0, style=ProgressStyle(d…

1820-1829


HBox(children=(FloatProgress(value=0.0, description='data/in/1820_1829.zip', max=2739.0, style=ProgressStyle(d…

1810-1819


HBox(children=(FloatProgress(value=0.0, description='data/in/1810_1819.zip', max=2338.0, style=ProgressStyle(d…

1800-1809


HBox(children=(FloatProgress(value=0.0, description='data/in/1800_1809.zip', max=1502.0, style=ProgressStyle(d…

1700-1799


HBox(children=(FloatProgress(value=0.0, description='data/in/1700_1799.zip', max=2070.0, style=ProgressStyle(d…

1510-1699


HBox(children=(FloatProgress(value=0.0, description='data/in/1510_1699.zip', max=693.0, style=ProgressStyle(de…

unkown


HBox(children=(FloatProgress(value=0.0, description='data/in/unknown.zip', max=284.0, style=ProgressStyle(desc…




Check how many files we have

In [None]:
# Count the JSONL files directly; `ls -l | wc -l` also counts the "total" header line.
len(list(out_json.glob("*.jsonl")))

   48353


### Sorting files ready for upload to repository

In [None]:
# One empty list per year range, keyed "<start> - <end>": two wide early
# ranges followed by one range per decade of the 1800s.
date_files = {
    f"{start} - {end}": []
    for start, end in [(1510, 1699), (1700, 1799)]
    + [(decade, decade + 9) for decade in range(1800, 1900, 10)]
}

In [None]:
def get_decade(x):
    """Return the first three characters of the file stem's prefix before ``_``."""
    prefix, _sep, _rest = x.stem.partition("_")
    return prefix[:3]

In [None]:
def get_year(x):
    """Return the part of the file stem that precedes the first ``_``."""
    prefix, _sep, _rest = x.stem.partition("_")
    return prefix

In [None]:
# Every JSONL file produced by the conversion step.
files = list(Path("data/json").glob("*.jsonl"))

In [None]:
# Distinct file-name prefixes; the trailing semicolon hides the output.
{get_year(f) for f in files};

In [None]:
from collections import defaultdict

# Maps a year-range key to its list of files; missing keys start as an empty list.
dates_dict = defaultdict(list)

Organise files into dictionary depending on year group  

In [None]:
# Route every file to the year range its file-name prefix falls into. Files
# whose prefix is not a number, or whose year lies outside every range, go to "unk".
for file in tqdm(files):
    try:
        year = int(get_year(file))
    except ValueError:
        dates_dict["unk"].append(file)
        continue
    for key in date_files:
        start, end = (int(bound) for bound in key.split("-"))
        if start <= year <= end:
            dates_dict[key].append(file)
            break
    else:
        # No range matched (e.g. a year after 1899): keep the file rather than dropping it.
        dates_dict["unk"].append(file)

HBox(children=(FloatProgress(value=0.0, max=48352.0), HTML(value='')))




In [None]:
# Trailing semicolon suppresses the cell output.
dates_dict;

make folders for year ranges and copy files into new folders 

In [None]:
# Copy each group's files into a folder named after its year range
# (e.g. "1800 - 1809" -> "1800_1809"), created in the current working directory.
# Loop-local names are used on purpose so the notebook-level `files` list and
# `out_path` are not overwritten.
for key, group in tqdm(dates_dict.items()):
    range_dir = Path(key.replace(" ", "").replace("-", "_"))
    range_dir.mkdir(exist_ok=True)
    for source in group:
        shutil.copy(source, range_dir / source.name)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [None]:
# List the groups that actually received files.
dates_dict.keys()

dict_keys(['1850 - 1859', '1880 - 1889', '1800 - 1809', '1810 - 1819', '1890 - 1899', '1830 - 1839', '1870 - 1879', 'unk', '1700 - 1799', '1860 - 1869', '1840 - 1849', '1820 - 1829', '1510 - 1699'])

We now compress each file with `gzip`

In [None]:
import gzip

In [None]:
# Write a gzip-compressed copy of every JSONL file next to the original.
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    for file in directory.glob("*.jsonl"):
        gz_file = file.with_name(f"{file.name}.gz")
        with open(file, "rb") as f_in, gzip.open(gz_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




Small sense check

In [None]:
from toolz.itertoolz import count

In [None]:
# Each folder must hold exactly as many compressed files as uncompressed ones.
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    json_count, json_count_gzip = (
        count(directory.rglob(file_glob)) for file_glob in ("*.jsonl", "*.jsonl.gz")
    )
    assert json_count == json_count_gzip

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




We now create a new folder for our final folders with only the compressed files

In [None]:
# Destination for the compressed-only folders. parents=True lets this cell
# succeed even when `data/` does not exist yet.
final = Path("data/final")
final.mkdir(parents=True, exist_ok=True)

and copy the `gzip` compressed JSON files to their relevant folder under the `final` directory

In [None]:
# Mirror every year-range folder under `final`, copying only the .jsonl.gz files.
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    compressed_dir = final / directory
    compressed_dir.mkdir(exist_ok=True)
    archives = directory.glob("*.jsonl.gz")
    for file in archives:
        shutil.copy(file, compressed_dir / file.name)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




We finally compress each folder. This won't save much extra room, but it makes each directory easier to share.

In [None]:
%%bash
# Bundle each year-range directory under data/final into <name>.tar.gz.
# Stop at the first failure (including a failed cd) instead of carrying on.
set -euo pipefail
cd data/final/
# Prefer GNU tar where it is installed as `gtar`; otherwise use plain `tar`.
TAR=$(command -v gtar || command -v tar)
for dir in */
do
  # With no sub-directories the glob stays literal; skip it.
  [ -d "$dir" ] || continue
  base=$(basename "$dir")
  "$TAR" -czf "${base}.tar.gz" "$dir"
done

fin