# Internationalisation bibliographic data cleaning

## Setup

Mount Google Drive and use Python packages previously installed into Google Drive.

In [1]:
# Mount your Google Drive so that modules stored there can be imported.
from google.colab import drive
drive.mount('/content/gdrive')
# Change the path to the directory where the module is located.
import sys
MODULE_DIR = '/content/gdrive/My Drive/Colab Notebooks'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if MODULE_DIR not in sys.path:
    sys.path.append(MODULE_DIR)
# The required packages themselves are imported in the next cell.

Mounted at /content/gdrive


In [2]:
from pymarc import MARCReader, Subfield
# from pathlib import Path
import html
import el_internationalisation as eli
# import json
import regex as re

## Configuration

__NORMALISE_DEFAULT__: Unicode normalisation to apply to bibliographic data. Possible normalisation forms include NFC, NFD and NFM21. NFM21 is a variation of NFD.

__THAI_LAO_ROM__: Setting for Thai and Lao romanisations. Possible values are `2011`, `1997` or `None`.

__CYRILLIC_ROM__: A flag indicating whether Cyrillic romanisations should be normalised, i.e. if `True` any half marks are converted to a double spanning diacritic.

__required_fields__: a list of MARC21 fields that should be processed.

Configuration settings can be adjusted before running the cell.

In [3]:
# Unicode normalisation form to apply: "NFM21", "NFC" or "NFD".
NORMALISE_DEFAULT = "NFM21"

# Thai/Lao romanisation setting: 2011, 1997 or None.
THAI_LAO_ROM = 2011

# Whether Cyrillic romanisations should be normalised: True or False.
CYRILLIC_ROM = True

# MARC21 field tags to process, laid out one hundreds-range per line.
required_fields = [
    "100", "110", "111", "130",
    "240", "245", "246", "250", "260", "264",
    "300",
    "490",
    "500", "504", "505", "508", "510", "511", "520", "521", "530", "586",
    "600", "610", "650", "651", "656",
    "700", "710", "711", "730", "740",
    "800", "830", "852", "880",
]


## Clean MARC21 record

In [4]:
from google.colab import files
import os

# Prompt for a file upload; the result maps each uploaded file name to its content.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; upload a MARC file and re-run this cell.")

# Only the first uploaded file is processed.
uploaded_name, marc_file = next(iter(uploaded.items()))

# Split on the LAST dot only, so names with several dots ("my.file.mrc") or
# with no extension at all no longer break the two-way unpacking.
file_name, dotted_ext = os.path.splitext(uploaded_name)
# Kept without the leading dot, as before, for any later cell that reads it.
file_ext = dotted_ext.lstrip(".")
output_file = f"{file_name}_cleaned{dotted_ext}"

Saving garay_m8.mrc to garay_m8.mrc


Process the records:

In [5]:
# Clean the subfields of the configured fields in every uploaded record.
# marc_records is rebuilt from scratch each time this cell runs.
marc_records = []
skipped_records = 0
reader = MARCReader(marc_file, to_unicode=True)
for record in reader:
    if record is None:
        # pymarc yields None for a record it could not decode and stores the
        # reason on the reader; report and skip it instead of failing with a
        # TypeError on the field lookup below.
        skipped_records += 1
        print(f"Skipping unreadable record: {reader.current_exception}")
        continue
    # Prefer the language code in 041 $a; otherwise fall back to characters
    # 35-37 of the 008 control field. A record lacking both still raises KeyError.
    try:
        record_lang = record['041']['a']
    except KeyError:
        record_lang = record['008'].value()[35:38]
    for field in record.get_fields(*required_fields):
        # Replace each subfield in place with a cleaned copy, keeping its code.
        for i, subfield in enumerate(field.subfields):
            cleaned_value = eli.clean_marc_subfield(subfield.value, record_lang, NORMALISE_DEFAULT)
            field.subfields[i] = Subfield(subfield.code, cleaned_value)
    marc_records.append(record)

if skipped_records:
    print(f"{skipped_records} record(s) could not be read and were skipped.")

Write cleaned records to a file on Google Colab:

In [6]:
# Serialise each cleaned record to binary MARC and write them out in order.
with open(output_file, 'wb') as marc_out:
    marc_out.writelines(cleaned.as_marc() for cleaned in marc_records)

Download file to your computer:

In [7]:

# Send the cleaned output file to the browser as a download.
files.download(output_file) 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>