# Updating metadata

Metadata isn't static. The metadata we have got via the ALTO XML has sometimes been superseded by new metadata for an item. There is also some additional metadata which we want to include in our jsonl files. We'll use an exported metadata csv to update existing metadata and add some additiional fields that weren't there already. 

In [289]:
import json
import shutil
import numpy as np
from pathlib import Path
from toolz import itertoolz
from itertools import zip_longest
from statistics import mean
from tqdm.auto import tqdm
from statistics import mean
import pandas as pd

In [230]:
df_public = pd.read_excel(
    "MS digitised books 2021-01-09.xlsx", dtype={"BL record ID": "string"}
)

In [231]:
df_public.head(1)

Unnamed: 0,BL record ID,Type of resource,BNB number,ISBN,Name,Dates associated with name,Type of name,Role,All names,Title,...,Publisher,Date of publication,Edition,Physical description,Dewey classification,BL shelfmark,Topics,Genre,Languages,Notes
0,14602826,Monograph,,,"Yearsley, Ann",1753-1806,person,,"More, Hannah, 1745-1833 [person] ; Yearsley, A...",Poems on several occasions [With a prefatory l...,...,,1786,Fourth edition MANUSCRIPT note,,,Digital Store 11644.d.32,,,English,


In [232]:
len(df_public["BL record ID"].str.len() == 9) == len(df_public)

True

In [233]:
df_marc = pd.read_excel(
    "MS digitised books 2021-01-09.xlsx", sheet_name="Not for release", skiprows=[0]
)

In [234]:
df_marc["_id"] = df_marc["Digitised Record Match"].str.split(" ").str[0]

In [235]:
df_matcher = df_marc[["_id", "Control Number"]]

In [236]:
df = pd.concat([df_public, df_matcher], axis=1)

In [None]:
df[df["BL record ID"] != df["Control Number"]]

## Try and match to a BL metadata record

In [238]:
def try_get_meta(file):
    with open(file, "rb") as f:
        data = json.loads(f.readline())
        record_id = data["record_id"]
        m = df[df["_id"].str.contains(record_id)]
        if len(m) > 0:
            return m
        return None

In [239]:
df[df["_id"].str.contains("002816907")]

Unnamed: 0,BL record ID,Type of resource,BNB number,ISBN,Name,Dates associated with name,Type of name,Role,All names,Title,...,Edition,Physical description,Dewey classification,BL shelfmark,Topics,Genre,Languages,Notes,_id,Control Number
21193,14823461,Monograph,,,,,,,,The Penny Library of Famous Books,...,,177 no (8°),,Digital Store 012601.l.24,,,English,,2816907,14823461


## Processing 

In [240]:
json_files = [f for f in Path("data/json").rglob("*.jsonl")]

These lists define the fiels which we are going to replace i.e. which are already present but will be updated, and new fields to add i.e. fields which we don't have in our current json files. 

In [241]:
field_to_replace = [
    "Title",
    "Date of publication",
    "Place of publication",
]
fields_to_add = [
    "Name",
    "All names",
    "Languages",
    "Physical description",
    "Country of publication",
    "Publisher",
]

## Metadata fields to replace

## Processing dates

In [243]:
def process_date(date):
    if date is np.nan:
        return {"date": None, "raw_date": None}
    elif len(date) == 4:
        return {"date": date, "raw_date": date}
    elif len(date) == 5:
        return {"date": date.strip("-"), "raw_date": date}
    elif len(date) == 9:
        dates = date.split("-")
        dates = [int(date) for date in dates]
        mean_date = round(mean(dates))
        return {"date": mean_date, "raw_date": date}

In [244]:
process_date(np.nan)

{'date': None, 'raw_date': None}

In [245]:
process_date("1860")

{'date': '1860', 'raw_date': '1860'}

In [246]:
process_date("1860-")

{'date': '1860', 'raw_date': '1860-'}

In [247]:
raw_date = "1850-1860"
date_dict = process_date(raw_date)
assert date_dict["date"] == 1855
assert date_dict["raw_date"] == raw_date

In [248]:
process_date(raw_date)

{'date': 1855, 'raw_date': '1850-1860'}

## Processing languages

In [249]:
default_language_dict = {
    "Language_1": None,
    "Language_2": None,
    "Language_3": None,
    "Language_4": None,
    "multi_language": False,
}

In [250]:
list(default_language_dict.keys())

['Language_1', 'Language_2', 'Language_3', 'Language_4', 'multi_language']

In [253]:
language_keys = list(default_language_dict.keys())
language_keys

['Language_1', 'Language_2', 'Language_3', 'Language_4', 'multi_language']

In [254]:
def process_languages(languages):
    multi_language = False
    if languages is not np.nan:
        languages = languages.split(";")
        languages = [language.strip(" ") for language in languages]
        if len(languages) > 1:
            multi_language = True
        language_dict = {
            key: lang for (key, lang) in zip_longest(language_keys, languages)
        }
        return {**language_dict, **{"multi_language": multi_language}}
    else:
        language_dict = {key: None for key in language_keys}
        return {**language_dict, **{"multi_language": multi_language}}

### Process country of publication

In [255]:
{
    "Country of publication 1": "England",
    "All Countries of publication": ["England", "Ireland", "Norther Ireland"],
}

{'Country of publication 1': 'England',
 'All Countries of publication': ['England', 'Ireland', 'Norther Ireland']}

In [256]:
def process_country(country):
    if country is not np.nan:
        countries = country.split(";")
        countries_dict = {
            "Country of publication 1": countries[0],
            "All Countries of publication": country,
        }
        return countries_dict
    else:
        return {"Country of publication 1": None, "All Countries of publication": None}

In [257]:
process_country(np.nan)

{'Country of publication 1': None, 'All Countries of publication': None}

In [258]:
process_country("England; Wales")

{'Country of publication 1': 'England',
 'All Countries of publication': 'England; Wales'}

In [259]:
from toolz.dicttoolz import valfilter, update_in

In [260]:
def replace_nan_with_none(dictionary):
    keys = list(valfilter(lambda x: x is np.nan, dictionary).keys())
    for key in keys:
        dictionary[key] = None
    return dictionary

### Put keys in the order we want

In [261]:
def sort_dictionary_to_key_order(dictionary, key_order):
    return {k: dictionary[k] for k in key_order}

In [262]:
empty_language_dict = {
    "Language_1": None,
    "Language_2": None,
    "Language_3": None,
    "Language_4": None,
    "multi_language": False,
}

In [263]:
empty_country_dict = {
    "Country of publication 1": None,
    "All Countries of publication": None,
}

In [264]:
new_meta_to_add_if_empty = {k: None for k in fields_to_add}
new_meta_if_empty = {**new_meta_to_add_if_empty, **empty_language_dict}
new_meta_if_empty = {**new_meta_if_empty, **empty_country_dict}
new_meta_if_empty

{'Name': None,
 'All names': None,
 'Languages': None,
 'Physical description': None,
 'Country of publication': None,
 'Publisher': None,
 'Language_1': None,
 'Language_2': None,
 'Language_3': None,
 'Language_4': None,
 'multi_language': False,
 'Country of publication 1': None,
 'All Countries of publication': None}

In [265]:
new_meta_if_empty.update({"raw_date": None})

In [266]:
def try_get_meta(file):
    with open(file, "rb") as f:
        data = json.loads(f.readline())
        record_id = data["record_id"]
        m = df[df["_id"].str.contains(record_id)]
        if len(m) > 0:
            return m
        return None

In [267]:
def process_updated_metadata(metadata: pd.DataFrame):
    metadata_to_update = metadata[field_to_replace].to_dict("records")[0]
    metadata_to_update["title"] = metadata_to_update.pop("Title")
    metadata_to_update["date"] = metadata_to_update.pop("Date of publication")
    metadata_to_update["place"] = metadata_to_update.pop("Place of publication")
    process_date(metadata_to_update["date"])
    return {**metadata_to_update, **process_date(metadata_to_update["date"])}

In [268]:
def process_additional_metadata(metadata: pd.DataFrame):
    additional_metadata = metadata[fields_to_add].to_dict("records")[0]
    languages = process_languages(additional_metadata["Languages"])
    additional_metadata.pop("Languages")
    additional_metadata = {**additional_metadata, **languages}
    countries = process_country(additional_metadata["Country of publication"])
    additional_metadata.pop("Country of publication")
    additional_metadata = {**additional_metadata, **countries}
    additional_metadata = replace_nan_with_none(additional_metadata)
    return additional_metadata

In [269]:
def update_metadata(data, metadata):
    for k, v in metadata.items():
        data[k] = v
    return data

In [270]:
key_order = [
    "record_id",
    "date",
    "raw_date",
    "title",
    "place",
    "empty_pg",
    "text",
    "pg",
    "mean_wc_ocr",
    "std_wc_ocr",
    "Name",
    "All names",
    "Publisher",
    "Country of publication 1",
    "All Countries of publication",
    "place",
    "Physical description",
    "Language_1",
    "Language_2",
    "Language_3",
    "Language_4",
    "multi_language",
]

In [271]:
def update_json(json_file: Path, out_dir: Path):
    new_data = []
    metadata = try_get_meta(json_file)
    if metadata is not None:
        metadata_to_update = process_updated_metadata(metadata)
        additional_metadata = process_additional_metadata(metadata)
        with open(json_file, "r") as f:
            data = [json.loads(line) for line in f.readlines()]
        for line_dict in data:
            line_dict.update(metadata_to_update)
            line_dict.update(additional_metadata)
            line_dict = sort_dictionary_to_key_order(line_dict, key_order)
            new_data.append(line_dict)
        date = metadata_to_update["date"]
        if not date:
            date = "UNKOWN"
        _id = file.stem.split("_")[-1]
        fname = f"{out_dir}/{date}_{_id}.jsonl"
        with open(fname, "w") as f:
            for item in new_data:
                f.write(json.dumps(item) + "\n")
    if metadata is None:
        with open(json_file, "r") as f:
            data = [json.loads(line) for line in f.readlines()]
            new_data = []
            for line_dict in data:
                line_dict.update(new_meta_if_empty)
                line_dict = sort_dictionary_to_key_order(line_dict, key_order)
                new_data.append(line_dict)

        with open(f"{out_dir}/{json_file.name}", "w") as f:
            for item in new_data:
                f.write(json.dumps(item) + "\n")

In [272]:
Path("update_metadata").mkdir(exist_ok=True)

## Process files

In [None]:
for file in tqdm(json_files):
    update_json(file, "update_metadata")

check number of new json files

In [276]:
!ls -l update_metadata | wc -l

   48353


### Sorting files ready for upload to repository

In [277]:
date_files = {
    "1510 - 1699": [],
    "1700 - 1799": [],
    "1800 - 1809": [],
    "1810 - 1819": [],
    "1820 - 1829": [],
    "1830 - 1839": [],
    "1840 - 1849": [],
    "1850 - 1859": [],
    "1860 - 1869": [],
    "1870 - 1879": [],
    "1880 - 1889": [],
    "1890 - 1899": [],
}

In [278]:
def get_decade(x):
    return x.stem.split("_")[0][:3]

In [279]:
def get_year(x):
    return x.stem.split("_")[0]

In [295]:
files = [f for f in Path("update_metadata").glob("*.jsonl")]

In [296]:
set([get_year(f) for f in files]);

In [298]:
from collections import defaultdict

dates_dict = defaultdict(list)

Organise files into dictionary depending on year group  

In [299]:
for file in tqdm(files):
    year = get_year(file)
    try:
        year = int(year)
        for key in date_files.keys():
            start = key.split("-")[0]
            end = key.split("-")[1]
            if int(start) <= year <= int(end):
                dates_dict[key].append(file)
    except ValueError:
        dates_dict["unk"].append(file)

HBox(children=(FloatProgress(value=0.0, max=48352.0), HTML(value='')))




In [300]:
dates_dict;

make folder for year ranges and copy files into these new folders 

In [301]:
for key, value in tqdm(dates_dict.items()):
    out_path = Path(key.replace(" ", "").replace("-", "_"))
    out_path.mkdir(exist_ok=True)
    files = value
    [shutil.copy(f, Path(out_path / f.name)) for f in files]

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [302]:
dates_dict.keys()

dict_keys(['1850 - 1859', '1880 - 1889', '1800 - 1809', '1810 - 1819', '1890 - 1899', '1830 - 1839', '1870 - 1879', '1700 - 1799', '1860 - 1869', '1840 - 1849', '1820 - 1829', '1510 - 1699', 'unk'])

We now compress each file with `gzip`

In [303]:
import gzip

In [304]:
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    for file in Path(directory).glob("*.jsonl"):
        gz_file = file.parent / (file.name + ".gz")
        with open(file, "rb") as f_in:
            with gzip.open(gz_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




Small sense check

In [305]:
from toolz.itertoolz import count

In [306]:
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    json_count = count(Path(directory).rglob("*.jsonl"))
    json_count_gzip = count(Path(directory).rglob("*.jsonl.gz"))
    assert json_count == json_count_gzip

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




We now create a new folder for our final folders with only the compressed files

In [307]:
final = Path("data/final")
final.mkdir(exist_ok=True)

and move the `gzip` compressed json files to their relevant folder under the `final` directory

In [308]:
for key, value in tqdm(dates_dict.items()):
    directory = Path(key.replace(" ", "").replace("-", "_"))
    compressed_dir = Path(final / directory)
    compressed_dir.mkdir(exist_ok=True)
    for file in Path(directory).glob("*.jsonl.gz"):
        shutil.copy(file, compressed_dir / file.name)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




We finally compress each folder, this won't save too much extra room but makes it easier to share each directory 

In [309]:
%%bash
cd data/final/
for dir in */
do
  base=$(basename "$dir")
  gtar -czf "${base}.tar.gz" "$dir"
done

fin