## Processing ALTO XML to JSONL

This notebook does the following:

- downloads ALTO XML versions of [Digitised printed books (18th-19th century)](https://www.bl.uk/collection-guides/digitised-printed-books)
- decompresses these files
- processes the text and some metadata from the XML files
- saves a file for each book in `JSONL` format. 

### Import required packages 


In [1]:
import re
import zipfile
import shutil
from pathlib import Path
import json
import subprocess
import tarfile
import os.path
from tqdm.auto import tqdm
import xml.etree.ElementTree as ET
from statistics import mean, stdev
from zipfile import BadZipfile

Make some directories for storing our downloads 

In [2]:
# Everything this notebook downloads or writes lives under data/.
path = Path("data/")
path.mkdir(parents=True, exist_ok=True)
# Downloaded archives are stored in data/in.
path_in = path.joinpath("in")
path_in.mkdir(exist_ok=True)

Create a dictionary for the URLs

In [3]:
# Base URL that archive file names are appended to when building download links.
_URL = "https://data.bl.uk/digbks/"

In [4]:
# Map each date-range label to the URL of the zip archive for that range.
# The "unkown" spelling of the first label is kept exactly as it was, since the
# label is printed while downloading.
_URLS = {"unkown": _URL + "unknown.zip"}
_URLS.update(
    (f"{start}-{end}", f"{_URL}{start}_{end}.zip")
    # Two long early ranges, then one archive per decade from 1800 to 1899.
    for start, end in [(1510, 1699), (1700, 1799)]
    + [(decade, decade + 9) for decade in range(1800, 1900, 10)]
)

## Some processing helpers

### Getting dates from the alto XML

In [5]:
# Example of a messy date string, used in the following cells to try out year extraction.
date_text = "[1652.]"

In [6]:
# Splitting on "." leaves the opening bracket attached, so plain splitting is not enough.
date_text.split(".")

['[1652', ']']

In [7]:
# Four consecutive digits, i.e. something that looks like a year.
pattern = re.compile(r"\d{4}")

In [8]:
# re.search only gives the first occurrence (or None when nothing matches).
match = re.search(pattern, date_text)

In [9]:
# re.findall returns every four-digit run as a list of strings.
re.findall(pattern, date_text)

['1652']

In [10]:
# Four consecutive digits, i.e. something that looks like a year.
pattern = re.compile(r"\d{4}")


def get_four_digit(date_text):
    """Extract a single four-digit year, as a string, from free-text date metadata.

    Args:
        date_text: Raw date string such as "[1652.]" or "1830-1840".

    Returns:
        - "" when the text contains no four-digit number.
        - The number itself when exactly one is found (no range check is applied).
        - Otherwise the rounded mean of the numbers that fall strictly between
          1500 and 1900; if none of them do, the rounded mean of all numbers found.
    """
    matches = re.findall(pattern, date_text)
    if not matches:
        return ""
    if len(matches) == 1:
        return matches[0]
    years = [int(found) for found in matches]
    plausible = [year for year in years if 1500 < year < 1900]
    # mean() raises StatisticsError on empty input, so fall back to every match
    # when the plausibility filter rejects all of them.
    return str(round(mean(plausible or years)))

In [11]:
# Sanity check: each sample must come back as exactly four characters.
test_dates_4 = ["1652", "[1929]", "1830-1840"]
for date in test_dates_4:
    year = get_four_digit(date)
    assert len(year) == 4
    print(year)

1652
1929
1835


In [12]:
# A date string with leading whitespace and square brackets.
weird_date = " [1792]"

In [13]:
# Only the four digits come back; the whitespace and brackets are ignored.
get_four_digit(weird_date)

'1792'

In [14]:
def strip_non_numeric(date_text):
    """Return `date_text` with every non-digit character removed."""
    digits_only = [char for char in date_text if char.isdigit()]
    return "".join(digits_only)

### Grabbing metadata we want 

In [15]:
# XML namespace prefix used by the MODS elements read below.
_MODS_NS = "{http://www.loc.gov/mods/v3}"


def _first_text(root, tag):
    """Return the text of the first MODS `tag` element found under `root`, or None."""
    element = root.find(f".//{_MODS_NS}{tag}")
    return element.text if element is not None else None


def _parse_year(date_text):
    """Turn one raw date string into an int year, or None when no year can be read."""
    if not date_text:
        # An empty element has text None; there is nothing to parse.
        return None
    try:
        return int(get_four_digit(date_text))
    except ValueError:
        # get_four_digit returns "" when it finds no four-digit number.
        return None


def get_meta(meta_xml):
    """Read date, title, place and record id from a volume's metadata XML.

    Args:
        meta_xml: Path or file object accepted by `ET.parse`.

    Returns:
        dict with keys "date" (int or None), "title", "place" and "record_id"
        (each a str or None). When several `dateIssued` elements are present,
        the first one that yields a year is used.
    """
    root = ET.parse(meta_xml).getroot()
    # sometimes we have multiple dates: take the first one a year can be read from
    years = (
        _parse_year(element.text)
        for element in root.iterfind(f".//{_MODS_NS}dateIssued")
    )
    date = next((year for year in years if year is not None), None)
    return {
        "date": date,
        "title": _first_text(root, "title"),
        "place": _first_text(root, "placeTerm"),
        "record_id": _first_text(root, "recordIdentifier"),
    }

### Getting the text and related information we want from the XML

In [16]:
def get_text_from_xml(xml):
    """Collect the words of one ALTO page plus summary statistics of their WC values.

    Args:
        xml: Path or file object accepted by `ET.parse`.

    Returns:
        A `(text, wc, std)` tuple:
        - text: list with the CONTENT attribute of every `String` element,
          or None when the page has no `String` elements.
        - wc: mean of the WC attributes rounded to 3 decimals, or None when
          there is no text or no `String` carries a WC attribute.
        - std: sample standard deviation of the WC attributes rounded to
          3 decimals, or None when fewer than two WC values are available.
    """
    root = ET.parse(xml).getroot()
    strings = root.findall(".//String")
    text = [string.get("CONTENT") for string in strings]
    if not text:
        return None, None, None
    # Parse each WC once; skip elements without the attribute rather than
    # failing on float(None).
    confidences = [
        float(string.get("WC")) for string in strings if string.get("WC") is not None
    ]
    wc = round(mean(confidences), ndigits=3) if confidences else None
    # stdev() needs at least two data points.
    std = round(stdev(confidences), ndigits=3) if len(confidences) >= 2 else None
    return text, wc, std

### Parsing volumes
Create folder for storing our new output jsonl files 

In [17]:
# Folder that receives the generated JSONL files.
out_json = Path("data/json")
# No parents=True here: this relies on data/ having been created earlier in the notebook.
out_json.mkdir(exist_ok=True)

In [18]:
def parse_volume(volume_dir, out_dir=out_json):
    """Convert one extracted volume into a JSONL file with one line per page.

    Args:
        volume_dir: Path of the extracted volume. It must contain a file matching
            `*metadata.xml` and an `ALTO` entry holding the page XML files.
        out_dir: Folder the JSONL file is written to.

    Each output line holds the volume metadata from `get_meta` merged with the
    page fields "pg" (1-based position in sorted file order), "text",
    "mean_wc_ocr", "std_wc_ocr" and "empty_pg". The file is named
    `{date}_{record_id}.jsonl`, with "UNKOWN" in place of a missing date.

    Raises:
        IndexError: if the metadata file or the ALTO entry is missing.
    """
    meta = get_meta(list(volume_dir.glob("*metadata.xml"))[0])
    alto_dir = list(volume_dir.glob("ALTO"))[0]
    volume = []
    page_files = sorted(Path(alto_dir).glob("*.xml"))
    for page_number, xml in enumerate(page_files, start=1):
        text, ocr, std = get_text_from_xml(xml)
        page = {
            "pg": page_number,
            "text": " ".join(text) if text else None,
            "mean_wc_ocr": ocr,
            "std_wc_ocr": std,
            "empty_pg": not text,
        }
        volume.append({**meta, **page})
    # Read the id from the metadata rather than from the first page, so a volume
    # without any ALTO pages does not raise IndexError.
    _id = meta["record_id"]
    date = meta["date"]
    if date is None:
        # Spelling kept as-is so file names stay the same as in earlier runs.
        date = "UNKOWN"
    elif int(date) > 1950:
        # Surface suspiciously late dates for manual inspection.
        print(date, _id)
    with open(Path(out_dir) / f"{date}_{_id}.jsonl", "w", encoding="utf-8") as f:
        for item in volume:
            f.write(json.dumps(item) + "\n")

## Extract a volume

In [19]:
def extract_volume(zipped, out_final):
    """Unzip one volume archive into a scratch folder, parse it, then clean up.

    Args:
        zipped: Path of the volume's zip file. The name of its parent folder
            (up to the first ".") and its stem are used to build the scratch path.
        out_final: Folder under which the scratch directory is created.

    Returns:
        None on success, or `zipped` when the archive cannot be read as a zip.

    The scratch directory is removed on every exit path, including a bad
    archive or an exception raised by `parse_volume`.
    """
    save_dir = Path(f"{out_final}/{zipped.parts[-2].split('.')[0]}/{zipped.stem}")
    if save_dir.exists():
        # Leftovers from an interrupted run must not mix with this extraction.
        shutil.rmtree(save_dir)
    save_dir.mkdir(parents=True)
    try:
        with zipfile.ZipFile(zipped, "r") as zip_ref:
            zip_ref.extractall(save_dir)
        parse_volume(save_dir)
    except BadZipfile:
        print("\U0001F92E", zipped)
        return zipped
    finally:
        shutil.rmtree(save_dir, ignore_errors=True)

In [20]:
# Scratch folder that downloaded archives are unpacked into.
out_path = Path("data/tmp")

In [21]:
# Reverse the insertion order so the download loop starts with the latest decade.
# NOTE: this cell is not idempotent; running it a second time flips the order back.
_URLS = dict(reversed(_URLS.items()))
_URLS

{'1890-1899': 'https://data.bl.uk/digbks/1890_1899.zip',
 '1880-1889': 'https://data.bl.uk/digbks/1880_1889.zip',
 '1870-1879': 'https://data.bl.uk/digbks/1870_1879.zip',
 '1860-1869': 'https://data.bl.uk/digbks/1860_1869.zip',
 '1850-1859': 'https://data.bl.uk/digbks/1850_1859.zip',
 '1840-1849': 'https://data.bl.uk/digbks/1840_1849.zip',
 '1830-1839': 'https://data.bl.uk/digbks/1830_1839.zip',
 '1820-1829': 'https://data.bl.uk/digbks/1820_1829.zip',
 '1810-1819': 'https://data.bl.uk/digbks/1810_1819.zip',
 '1800-1809': 'https://data.bl.uk/digbks/1800_1809.zip',
 '1700-1799': 'https://data.bl.uk/digbks/1700_1799.zip',
 '1510-1699': 'https://data.bl.uk/digbks/1510_1699.zip',
 'unkown': 'https://data.bl.uk/digbks/unknown.zip'}

In [22]:
# Volume zips that could not be opened, collected for later inspection.
bad_zips = []
for name, url in tqdm(_URLS.items()):
    tqdm.write(name)
    # Fail fast if the download fails instead of processing whatever is on disk.
    subprocess.run(["aria2c", url, "-d", path_in, "-q"], check=True)
    # Use the archive named in the URL rather than an arbitrary *.zip in the
    # download folder (assumes aria2c keeps the URL's file name).
    file = path_in / url.rsplit("/", 1)[-1]
    out_path = Path("data/tmp")
    out_path.mkdir(exist_ok=True)
    result = subprocess.run(["7z", "x", file, f"-o{out_path}", "-y"])
    # 7z uses exit code 1 for non-fatal warnings; anything above that is an error.
    if result.returncode > 1:
        raise RuntimeError(f"7z failed on {file} (exit code {result.returncode})")
    low_level_zip = list(out_path.rglob("*.zip"))
    for z in tqdm(low_level_zip, leave=False, desc=(str(file))):
        failed = extract_volume(z, out_path)
        if failed is not None:
            bad_zips.append(failed)
    # Free disk space before fetching the next archive.
    shutil.rmtree(out_path)
    file.unlink()

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

1890-1899

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 47019711337 bytes (44 GiB)

Extracting archive: data/in/1890_1899.zip
--
Path = data/in/1890_1899.zip
Type = zip
Physical Size = 47019711337
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 14847
Size:       47016510803
Compressed: 47019711337


HBox(children=(FloatProgress(value=0.0, description='data/in/1890_1899.zip', max=14847.0, style=ProgressStyle(…

1880-1889

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 40886363006 bytes (39 GiB)

Extracting archive: data/in/1880_1889.zip
--
Path = data/in/1880_1889.zip
Type = zip
Physical Size = 40886363006
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 10856
Size:       40884024598
Compressed: 40886363006


HBox(children=(FloatProgress(value=0.0, description='data/in/1880_1889.zip', max=10856.0, style=ProgressStyle(…

1870-1879

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 32205584255 bytes (30 GiB)

Extracting archive: data/in/1870_1879.zip
--
Path = data/in/1870_1879.zip
Type = zip
Physical Size = 32205584255
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 8630
Size:       32203727593
Compressed: 32205584255


HBox(children=(FloatProgress(value=0.0, description='data/in/1870_1879.zip', max=8630.0, style=ProgressStyle(d…

1860-1869

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 31055400974 bytes (29 GiB)

Extracting archive: data/in/1860_1869.zip
--
Path = data/in/1860_1869.zip
Type = zip
Physical Size = 31055400974
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 7498
Size:       31053786202
Compressed: 31055400974


HBox(children=(FloatProgress(value=0.0, description='data/in/1860_1869.zip', max=7498.0, style=ProgressStyle(d…

1850-1859

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 28375238329 bytes (27 GiB)

Extracting archive: data/in/1850_1859.zip
--
Path = data/in/1850_1859.zip
Type = zip
Physical Size = 28375238329
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 5818
Size:       28373986125
Compressed: 28375238329


HBox(children=(FloatProgress(value=0.0, description='data/in/1850_1859.zip', max=5818.0, style=ProgressStyle(d…

1840-1849

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 20478731424 bytes (20 GiB)

Extracting archive: data/in/1840_1849.zip
--
Path = data/in/1840_1849.zip
Type = zip
Physical Size = 20478731424
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 4070
Size:       20477859236
Compressed: 20478731424


HBox(children=(FloatProgress(value=0.0, description='data/in/1840_1849.zip', max=4070.0, style=ProgressStyle(d…

1830-1839

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 10962264514 bytes (11 GiB)

Extracting archive: data/in/1830_1839.zip
--
Path = data/in/1830_1839.zip
Type = zip
Physical Size = 10962264514
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 2639
Size:       10961704172
Compressed: 10962264514


HBox(children=(FloatProgress(value=0.0, description='data/in/1830_1839.zip', max=2639.0, style=ProgressStyle(d…

1820-1829

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 10667434360 bytes (10174 MiB)

Extracting archive: data/in/1820_1829.zip
--
Path = data/in/1820_1829.zip
Type = zip
Physical Size = 10667434360
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 2739
Size:       10666854328
Compressed: 10667434360


HBox(children=(FloatProgress(value=0.0, description='data/in/1820_1829.zip', max=2739.0, style=ProgressStyle(d…

1810-1819

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 6701393352 bytes (6391 MiB)

Extracting archive: data/in/1810_1819.zip
--
Path = data/in/1810_1819.zip
Type = zip
Physical Size = 6701393352
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 2338
Size:       6700904894
Compressed: 6701393352


HBox(children=(FloatProgress(value=0.0, description='data/in/1810_1819.zip', max=2338.0, style=ProgressStyle(d…

1800-1809

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 5153194342 bytes (4915 MiB)

Extracting archive: data/in/1800_1809.zip
--
Path = data/in/1800_1809.zip
Type = zip
Physical Size = 5153194342
64-bit = +
Characteristics = Zip64

Everything is Ok

Folders: 1
Files: 1502
Size:       5152884050
Compressed: 5153194342


HBox(children=(FloatProgress(value=0.0, description='data/in/1800_1809.zip', max=1502.0, style=ProgressStyle(d…

1700-1799

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 2458460131 bytes (2345 MiB)

Extracting archive: data/in/1700_1799.zip
--
Path = data/in/1700_1799.zip
Type = zip
Physical Size = 2458460131

Everything is Ok

Folders: 1
Files: 2070
Size:       2458039729
Compressed: 2458460131


HBox(children=(FloatProgress(value=0.0, description='data/in/1700_1799.zip', max=2070.0, style=ProgressStyle(d…

1510-1699

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 623534017 bytes (595 MiB)

Extracting archive: data/in/1510_1699.zip
--
Path = data/in/1510_1699.zip
Type = zip
Physical Size = 623534017

Everything is Ok

Folders: 1
Files: 693
Size:       623393557
Compressed: 623534017


HBox(children=(FloatProgress(value=0.0, description='data/in/1510_1699.zip', max=693.0, style=ProgressStyle(de…

unkown

7-Zip [64] 17.04 : Copyright (c) 1999-2021 Igor Pavlov : 2017-08-28
p7zip Version 17.04 (locale=utf8,Utf16=on,HugeFiles=on,64 bits,8 CPUs x64)

Scanning the drive for archives:
1 file, 1503674253 bytes (1435 MiB)

Extracting archive: data/in/unknown.zip
--
Path = data/in/unknown.zip
Type = zip
Physical Size = 1503674253

Everything is Ok

Folders: 1
Files: 284
Size:       1503617107
Compressed: 1503674253


HBox(children=(FloatProgress(value=0.0, description='data/in/unknown.zip', max=284.0, style=ProgressStyle(desc…




Check how many files we have

In [23]:
# `ls -l` also prints a leading "total" line, so the number of files is one less than this count.
!ls -l {out_json} | wc -l

   48353
