In [1]:
!pip install nbformat lxml



In [2]:
"""
    Utilities
"""

import datetime
from io import BytesIO
import json
import logging
from lxml import etree
import re
import requests
import os
from zipfile import ZipFile
"""
    Globals
"""

 # Poland
set_id = '9200357'

data_dir = f'{os.path.expanduser("~")}/data'
metadata_dir = f'{os.path.expanduser("~")}/work/temp/metadata/{set_id}'
nsmap = {"cmd": "http://www.clarin.eu/cmd/1",
         "cmdp_text": "http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1633000337997"}
output_file = f'{os.path.expanduser("~")}/output'


with open(f'{data_dir}/{set_id}/id_file_map.json', 'r') as id_filename_map_file:
    id_filename_map = json.load(id_filename_map_file)

YYYY_MM_DD = re.compile(r"(?P<year>[0-9]{4})-(?P<month>[0-9]{1,2})-(?P<day>[0-9]{1,2})")
MAX_NER_TASK_SIZE = 2000000

"""
    Metadata printing
"""
def print_xml(tree, declaration: bool = False):
    print(etree.tostring(tree, encoding='UTF-8', xml_declaration=declaration, pretty_print=True).decode())

"""
    Data acess
"""
def get_resource_file(identifier):
    """
        Resolves Europeana subresource identifier to it's local location. 
        
        :param str identifier: Europeana subresource identifier
        :return str: Path to local location of the resource
    """
    if identifier in id_filename_map:
        filename = id_filename_map[identifier]
        return f'{data_dir}/{set_id}/{filename}'

def get_date_from_metadata(metadata_tree):
    dates = metadata_tree.xpath("///cmdp_text:TemporalCoverage/cmdp_text:Start/cmdp_text:date/text()", namespaces=nsmap)
    dates = [YYYY_MM_DD.match(date) for date in dates]
    dates = [datetime.date(int(date.group("year")), int(date.group("month")), int(date.group("day"))) for date in dates]
    return dates
    
def get_description_from_metadata(metadata_tree):
    descriptions = metadata_tree.xpath('//cmdp_text:TextResource/cmdp_text:Description/cmdp_text:description/text()', namespaces=nsmap)
    if len(descriptions) > 0:
        return descriptions[0]
    
def get_resource_ids_from_metadata(metadata_tree):
    ids = metadata_tree.xpath('//cmdp_text:SubresourceDescription/cmdp_text:IdentificationInfo/cmdp_text:identifier/text()', namespaces=nsmap)
    # The result can be any number of identifiers. We do want to filter the values a bit: only the numeric identifiers are useful 
    # to us so we use the special syntax below to make a new list by picking only the matching values from the query results list
    return [id for id in ids if id.isnumeric()]

def get_title_from_metadata(metadata_tree):
    # Get all the values from the xpath
    titles = metadata_tree.xpath('//cmdp_text:TextResource/cmdp_text:TitleInfo/cmdp_text:title/text()', namespaces=nsmap)
    # Check if there is an actual value
    if len(titles) > 0:
        # Return the first (assuming only) value
        return titles[0]

def unpack_metadata(set_id, target_dir):
    # Construct the address of the .zip file with the metadata for one set
    md_zip_url = f'https://europeana-oai.clarin.eu/metadata/fulltext-aggregation/{set_id}.zip'
    
    # Retrieve the .zip file
    print(f'Retrieving {md_zip_url}')
    resp = requests.get(md_zip_url)
    zipfile = ZipFile(BytesIO(resp.content))
    
    # Uncompress the .zip into the target directory
    print(f'Extracting content in {target_dir}')
    zipfile.extractall(path=target_dir)
    print('Done')

"""
    Zip/Unzip
"""
def zip_file(input_path, output_path=""):
    """
        Zips input file/directory
        
        :param str input_path: path to file to be zipped, if file is directory, entire dir gets zipped
        :param str output_path: path to location, where to save the archive. If empty, zip archive uses same location and name as input
        :returns str: path to archive
    """
    if not output_path:
        output_path = f"{os.path.dirname(input_path)}/{os.path.basename(input_path).split('.')[0]}.zip"
    with ZipFile(output_path, 'r') as zip_handle:
        logger.info(f'Zipping {input_path} to {output_path}')
        if os.path.isdir(input_path):
            _zip_dir(input_path, zip_handle)
        else:
            zip_handle.write(input_path, os.path.basename(input_path))
            # _zip_chunker(input_path)
        
    return output_path
        
def unzip_file(input_path, output_path=""):
    """
        Unzips input .zip file
        
        :param str input_path: path to file to be unzipped
        :param str output_path: path to location, where to unpack the archive. If empty, archive is extracted at its location.
    """
    if not output_path:
        output_path = f"{os.path.dirname(input_path)}/{os.path.basename(input_path).split('.')[0]}"
    with ZipFile(input_path, 'r') as zip_ref:
        logger.info(f'Unzipping {input_path} to {output_path}')
        extracted_files_paths = zip_ref.namelist()
        zip_ref.extractall(output_path)
        
    return [os.path.join(output_path, extracted_file_path) for extracted_file_path in extracted_files_paths]

def _zip_dir(input_dir_path, zip_handle):
    for dirname, subdirs, files in os.walk(input_dir_path):
        for filename in files:          
            logger.info(f'Zipping {dirname}/{filename}')
            # _zip_chunker(os.path.join(dirname, filename))
            zip_handle.write(os.path.join(dirname, filename), 
                             os.path.relpath(os.path.join(dirname, filename), 
                                             os.path.join(input_dir_path, '..')))
            
"""
    Safety
"""
def _check_task_size(resources):
    size = sum([os.path.getsize(resource) for resource in resources])
    if size > MAX_NER_TASK_SIZE:
        raise TaskTooBigError
        
class TaskTooBigError(Exception):
    """
        Exception raised for tasks with too big payload.
    """

    def __init__(self, size, max_size):
        self.message = f"Task's payload is too big, it has {size} and maximum is {max_size}"
        super().__init__(self.message)
   
"""
    Logging
"""
logger = logging.getLogger(__name__)


## Named Entity Recognition with Liner2

In this section we will present how to use Europeana bibliographic resources with Named Entity Recognition (NER) tool for Polish using [lpmn_client](https://wiki.clarin-pl.eu/en/nlpws/lpmn_client) and [Liner2](https://github.com/CLARIN-PL/Liner2). Due to the server side limitation, we ensure 2M limit on size on the task with function defined below. 


In [3]:
"""
    Install lpmn client and import it
"""
!pip install -i https://pypi.clarin-pl.eu lpmn_client

from lpmn_client import download_file, upload_file
from lpmn_client import Task

Looking in indexes: https://pypi.clarin-pl.eu




### Spellchecking with lpmn client

OCR output tends to contains spelling mistakes that adds noise to the data. Below we show how to use lpmn clinet to specify task pipeline in order to obtain spell-checked textual data. 

In [4]:
"""
    Function for tasking lpmn client with Liner2 NER pipeline with task size control 
"""

def lpmn_client_task(resources, task, names=[]):
    """
        Wrap over CLARIN-PL lpmn client with control of the task size in order to avoid jamming the task queue on the server side
        
        :param list resources: list of paths to the resources to be processed
        :param str task: string defining pipeline, e.g. "speller2" or ""
        :param list names: optional list of names for output files, has to be same length as resources
        :returns list: list of paths to the output zip files
    """
    
    # Size check
    _check_task_size(resources)
    # Upload reasources to task queue
    job_ids = [upload_file(resource_file) for resource_file in resources]
    # Specify pipeline 
    t = Task(task)
    # Run uploaded tasks with pipeline
    output_file_ids = [t.run(job_id, verbose=True) for job_id in job_ids]
    if names:
        output = [download_file(output_file_id, output_file, f"{filename}.zip") 
                         for output_file_id, filename in zip(output_file_ids, names)]
    else:
        output = [download_file(output_file_id, output_file, f"{os.path.basename(resource)}.zip") 
                         for output_file_id, resource in zip(output_file_ids, resources)]
    return output

In [16]:
"""
    Let's select resources. We will use Izraelita newspapers from 1910-1913 to investigate most frequent Named Entities over the years on January. 
"""

# Prepare metadata of the collection
unpack_metadata(set_id, metadata_dir)
izraelita_metadata_files = [f"{metadata_dir}/Izraelita_1910.xml",
                            f"{metadata_dir}/Izraelita_1911.xml",
                            f"{metadata_dir}/Izraelita_1912.xml",
                            f"{metadata_dir}/Izraelita_1913.xml",
                           ]

# Use getters to obtain date and id from metadata
izraelita_metadata_trees = [etree.parse(izraelita_metadata_file) for izraelita_metadata_file in izraelita_metadata_files]
ids = [get_resource_ids_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]
dates = [get_date_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]

# Get all issues from January
ids_dates_january = []
for _ids, _dates in zip (ids, dates):
    year = []
    for _id, date in zip(_ids, _dates):
        if date.month==1:
            year.append((_id, date))
    ids_dates_january.append(year)

# print(ids_dates_january)
# # Get dates of first issues per year
# first_issues_date_per_year = {min([date for _, date in year]) for year in ids_dates_january}
# print(first_issues_date_per_year)
# # Filter ids of first issues per year 
# id_date_first_issue_january = [(_id, date) for year in ids_dates_january for (_id, date) in year if date in first_issues_date_per_year]
print(ids_dates_january)
# Map id to reasource
resources_dates = [(get_resource_file(_id), date) for year in ids_dates_january for _id, date in year ]

Retrieving https://europeana-oai.clarin.eu/metadata/fulltext-aggregation/9200357.zip
Extracting content in /home/jovyan/work/temp/metadata/9200357
Done
[[('3000095241372', datetime.date(1910, 1, 1))], [('3000095240849', datetime.date(1911, 1, 20)), ('3000095241292', datetime.date(1911, 1, 6)), ('3000095240806', datetime.date(1911, 1, 13)), ('3000095242040', datetime.date(1911, 1, 1)), ('3000095240930', datetime.date(1911, 1, 27))], [('3000095241756', datetime.date(1912, 1, 26)), ('3000095241501', datetime.date(1912, 1, 5)), ('3000095241903', datetime.date(1912, 1, 1)), ('3000095242070', datetime.date(1912, 1, 12)), ('3000095240837', datetime.date(1912, 1, 19))], [('3000095241288', datetime.date(1913, 1, 3)), ('3000095241413', datetime.date(1913, 1, 17)), ('3000095241992', datetime.date(1913, 1, 24)), ('3000095241127', datetime.date(1913, 1, 31)), ('3000095241955', datetime.date(1913, 1, 10))]]


In [18]:
"""
    Let's spell check selected resources
"""

# Safety check of the size of all resources meant for the task
_check_task_size([resource for resource, _ in resources_dates])
for resource, date in resources_dates:
    lpmn_client_task([resource], "speller2", [f"{date}_speller"])



  0%|          | 0/100 [00:00<?, ?it/s][A
  0%|          | 0.0/100 [00:00<?, ?it/s][A
  0%|          | 0.0/100 [00:01<?, ?it/s][A
  0%|          | 0.0/100 [00:02<?, ?it/s][A
  0%|          | 0.0/100 [00:02<?, ?it/s][A
  0%|          | 0.0/100 [00:03<?, ?it/s][A
  0%|          | 0.0/100 [00:04<?, ?it/s][A
  0%|          | 0.0/100 [00:05<?, ?it/s][A
  0%|          | 0.0/100 [00:05<?, ?it/s][A
  0%|          | 0.0/100 [00:06<?, ?it/s][A
  0%|          | 0.0/100 [00:07<?, ?it/s][A
  0%|          | 0.0/100 [00:07<?, ?it/s][A
  0%|          | 0.0/100 [00:08<?, ?it/s][A
  0%|          | 0.0/100 [00:09<?, ?it/s][A
  0%|          | 0.0/100 [00:09<?, ?it/s][A
  0%|          | 0.0/100 [00:10<?, ?it/s][A
  0%|          | 0.0/100 [00:11<?, ?it/s][A
  0%|          | 0.0/100 [00:12<?, ?it/s][A
  0%|          | 0.0/100 [00:12<?, ?it/s][A
  0%|          | 0.0/100 [00:13<?, ?it/s][A
  0%|          | 0.0/100 [00:14<?, ?it/s][A
  0%|          | 0.0/100 [00:14<?, ?it/s][A
  0%|      

In [21]:
speller2_output_file_paths = [unzip_file(f"{output_file}/{date}_speller.zip")[0] for _, date in resources_dates]
print(speller2_output_file_paths)

['/home/jovyan/output/1910-01-01_speller/home%jovyan%data%9200357%BibliographicResource_3000095241372.txt', '/home/jovyan/output/1911-01-20_speller/home%jovyan%data%9200357%BibliographicResource_3000095240849.txt', '/home/jovyan/output/1911-01-06_speller/home%jovyan%data%9200357%BibliographicResource_3000095241292.txt', '/home/jovyan/output/1911-01-13_speller/home%jovyan%data%9200357%BibliographicResource_3000095240806.txt', '/home/jovyan/output/1911-01-01_speller/home%jovyan%data%9200357%BibliographicResource_3000095242040.txt', '/home/jovyan/output/1911-01-27_speller/home%jovyan%data%9200357%BibliographicResource_3000095240930.txt', '/home/jovyan/output/1912-01-26_speller/home%jovyan%data%9200357%BibliographicResource_3000095241756.txt', '/home/jovyan/output/1912-01-05_speller/home%jovyan%data%9200357%BibliographicResource_3000095241501.txt', '/home/jovyan/output/1912-01-01_speller/home%jovyan%data%9200357%BibliographicResource_3000095241903.txt', '/home/jovyan/output/1912-01-12_spel

NameError: name 'output_path' is not defined

In [22]:
# Safety check of the size of all resources meant for the task
_check_task_size(speller2_output_file_paths)

for resource, (_, date) in zip(speller2_output_file_paths, resources_dates):
    lpmn_client_task([resource],'any2txt|wcrft2|liner2({"model":"top9"})', [f"{date}_liner"])
    



  0%|          | 0/100 [00:00<?, ?it/s][A
 40%|████      | 40.0/100 [00:00<00:01, 55.45it/s][A
 60%|██████    | 60.0/100 [00:01<00:01, 39.31it/s][A
100%|██████████| 100.0/100 [00:02<00:00, 46.57it/s][A

  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20.0/100 [00:00<00:02, 26.78it/s][A
 40%|████      | 40.0/100 [00:01<00:02, 27.57it/s][A
 60%|██████    | 60.0/100 [00:05<00:04,  8.44it/s][A
100%|██████████| 100.0/100 [00:06<00:00, 15.39it/s][A

  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20.0/100 [00:00<00:02, 27.28it/s][A
 40%|████      | 40.0/100 [00:01<00:02, 27.60it/s][A
100%|██████████| 100.0/100 [00:05<00:00, 17.13it/s][A

  0%|          | 0/100 [00:00<?, ?it/s][A
 20%|██        | 20.0/100 [00:00<00:02, 27.81it/s][A
 40%|████      | 40.0/100 [00:02<00:03, 17.68it/s][A
100%|██████████| 100.0/100 [00:06<00:00, 15.56it/s][A

  0%|          | 0/100 [00:00<?, ?it/s][A
 40%|████      | 40.0/100 [00:00<00:01, 56.40it/s][A
100%|██████████| 100

In [28]:
liner2_output_file_paths = [unzip_file(f"{output_file}/{date}_liner.zip")[0] for _, date in resources_dates]
print(liner2_output_file_paths)

['/home/jovyan/output/1910-01-01_liner/home%jovyan%output%1910-01-01_speller%home%jovyan%data%9200357%BibliographicResource_3000095241372.txt', '/home/jovyan/output/1911-01-20_liner/home%jovyan%output%1911-01-20_speller%home%jovyan%data%9200357%BibliographicResource_3000095240849.txt', '/home/jovyan/output/1911-01-06_liner/home%jovyan%output%1911-01-06_speller%home%jovyan%data%9200357%BibliographicResource_3000095241292.txt', '/home/jovyan/output/1911-01-13_liner/home%jovyan%output%1911-01-13_speller%home%jovyan%data%9200357%BibliographicResource_3000095240806.txt', '/home/jovyan/output/1911-01-01_liner/home%jovyan%output%1911-01-01_speller%home%jovyan%data%9200357%BibliographicResource_3000095242040.txt', '/home/jovyan/output/1911-01-27_liner/home%jovyan%output%1911-01-27_speller%home%jovyan%data%9200357%BibliographicResource_3000095240930.txt', '/home/jovyan/output/1912-01-26_liner/home%jovyan%output%1912-01-26_speller%home%jovyan%data%9200357%BibliographicResource_3000095241756.txt'

In [50]:
"""
    Function for extracting annotations from Liner2 xml output
"""


def liner2_xml_to_annotation(xml_tree):
    """
        Converts xml doc into list of annotations and tokens
        
        :param str path_to_xml: path to .xml Liner2 output file
        :returns list: list of tuples (annotation_type, [tokens])
    """

    sentences = xml_tree.xpath("//sentence")
    annotated_tokens = [sentence.xpath("./tok[./ann!=0]") for sentence in sentences]
    # Prune empty lists
    annotated_tokens = filter(lambda x: True if x else False, annotated_tokens)
    annotated_tokens = [_chain_annotations(sentence) for sentence in annotated_tokens]
    return annotated_tokens
        
def _chain_annotations(sentence: list):
    annotation_heads = [token.xpath("./ann[@head]") for token in sentence]
    for token in annotation_heads:
        for annotation_head in token:
            annotation_channel = annotation_head.xpath("./text()")[0]
            annotation_type = annotation_head.get("chan")
            annotation_tokens = [token.xpath("./orth/text()")[0] for token in sentence if token.xpath(f"./ann[text()={annotation_channel}]")]
    return annotation_type, annotation_tokens

In [61]:
"""
    Parse NER output and investigate 10 most common named entities by years
"""

from collections import Counter

named_entities_counts = {}
for _, date in resources_dates:
    for root, _, files in os.walk(f"{output_file}/{date}_liner"):
        for filename in files:
            path_to_annotated_output = os.path.join(root, filename)
            xml_tree = etree.parse(path_to_annotated_output)
            # Get stats
            token_nb = sum([1 for _ in xml_tree.xpath("//tok")])
            annotation_list = liner2_xml_to_annotation(xml_tree)
            annotation_counts = Counter(f"{annotation_type}|{' '.join(annotation_tokens)}" for annotation_type, annotation_tokens in annotation_list)
            if date.year in named_entities_counts.keys():
                named_entities_counts[date.year].append((token_nb, annotation_counts))
            else:
                named_entities_counts[date.year] = [(token_nb, annotation_counts)]
{year: (sum([token_nb for token_nb, _ in named_entities_counts[year]]), 
        sum([counter for _, counter in named_entities_counts[year]], Counter()).most_common(10)
       )
 for year in named_entities_counts.keys()}

{1910: (4708,
  [('nam_liv|Leona Lichtenbaum', 5),
   ('nam_liv|Henryka Lichtenbaum', 4),
   ('nam_liv|Józefa Wassercug', 3),
   ('nam_liv|L', 3),
   ('nam_loc|Br', 3),
   ('nam_liv|J . D', 3),
   ('nam_liv|Lambda', 3),
   ('nam_liv|Bolesława Bostońskiego', 2),
   ('nam_oth|N . M', 2),
   ('nam_liv|Henryka Nussbauma', 2)]),
 1911: (80848,
  [('nam_loc|Warszawie', 21),
   ('nam_loc|Polsce', 19),
   ('nam_liv|Sułkowski', 17),
   ('nam_loc|Galicji', 13),
   ('nam_liv|Abram', 13),
   ('nam_loc|Rosji', 12),
   ('nam_liv|Lambda', 10),
   ('nam_loc|Austrii', 9),
   ('nam_loc|Warszawa', 8),
   ('nam_pro|Unser Leben', 7)]),
 1912: (66276,
  [('nam_liv|Boże', 14),
   ('nam_liv|Henryk', 12),
   ('nam_loc|Warszawie', 11),
   ('nam_loc|Warszawy', 11),
   ('nam_loc|Galicji', 10),
   ('nam_loc|Cesarstwa', 7),
   ('nam_loc|Warszawa', 7),
   ('nam_loc|Polsce', 7),
   ('nam_loc|Polski', 6),
   ('nam_loc|Izraela', 6)]),
 1913: (66370,
  [('nam_liv|Joel', 22),
   ('nam_loc|Rumunii', 18),
   ('nam_liv|Joel