In [1]:
!pip install nbformat lxml

from lxml import etree



In [2]:
# Globals
from common import _check_task_size, data_dir, metadata_dir, set_id, output_file, unpack_metadata, unzip_file, zip_file
# Getters
from common import get_liner_output_files_ch3, get_date_from_metadata, get_resource_ids_from_metadata, get_resource_file, get_spellchecked_resources_ch3

## Named Entity Recognition with Liner2

In this section we will present how to use Europeana bibliographic resources with Named Entity Recognition (NER) tool for Polish using [lpmn_client](https://wiki.clarin-pl.eu/en/nlpws/lpmn_client) and [Liner2](https://github.com/CLARIN-PL/Liner2). Due to the server side limitation, we ensure 2M limit on size on the task with function defined below. 


In [3]:
"""
    Install lpmn client and import it
"""
!pip install -i https://pypi.clarin-pl.eu lpmn_client

from lpmn_client import download_file, upload_file
from lpmn_client import Task

Looking in indexes: https://pypi.clarin-pl.eu




### Spellchecking with lpmn client

OCR output tends to contains spelling mistakes that adds noise to the data. Below we show how to use lpmn clinet to specify task pipeline in order to obtain spell-checked textual data. 

In [4]:
"""
    Function for tasking lpmn client with Liner2 NER pipeline with task size control 
"""

def lpmn_client_task(resources, task, names=[]):
    """
        Wrap over CLARIN-PL lpmn client with control of the task size in order to avoid jamming the task queue on the server side
        
        :param list resources: list of paths to the resources to be processed
        :param str task: string defining pipeline, e.g. "speller2" or ""
        :param list names: optional list of names for output files, has to be same length as resources
        :returns list: list of paths to the output zip files
    """
    
    # Size check
    _check_task_size(resources)
    # Upload reasources to task queue
    job_ids = [upload_file(resource_file) for resource_file in resources]
    # Specify pipeline 
    t = Task(task)
    # Run uploaded tasks with pipeline
    output_file_ids = [t.run(job_id, verbose=True) for job_id in job_ids]
    if names:
        output = [download_file(output_file_id, output_file, f"{filename}.zip") 
                         for output_file_id, filename in zip(output_file_ids, names)]
    else:
        output = [download_file(output_file_id, output_file, f"{os.path.basename(resource)}.zip") 
                         for output_file_id, resource in zip(output_file_ids, resources)]
    return output

In [5]:
"""
    Let's select resources. We will use Izraelita newspapers from 1910-1913 to investigate most frequent Named Entities over the years on January. 
"""

# Prepare metadata of the collection
unpack_metadata(set_id, metadata_dir)
izraelita_metadata_files = [f"{metadata_dir}/Izraelita_1910.xml",
                            f"{metadata_dir}/Izraelita_1911.xml",
                            f"{metadata_dir}/Izraelita_1912.xml",
                            f"{metadata_dir}/Izraelita_1913.xml",
                           ]

# Use getters to obtain date and id from metadata
izraelita_metadata_trees = [etree.parse(izraelita_metadata_file) for izraelita_metadata_file in izraelita_metadata_files]
ids = [get_resource_ids_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]
dates = [get_date_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]

# Get all issues from January
ids_dates_january = []
for _ids, _dates in zip (ids, dates):
    year = []
    for _id, date in zip(_ids, _dates):
        if date.month==1:
            year.append((_id, date))
    ids_dates_january.append(year)

# Map id to reasource
resources_dates = [(get_resource_file(_id), date) for year in ids_dates_january for _id, date in year ]
print(resources_dates)

Retrieving https://europeana-oai.clarin.eu/metadata/fulltext-aggregation/9200357.zip
Extracting content in /home/jovyan/temp/metadata/9200357
Done
[('/home/jovyan/data/9200357/BibliographicResource_3000095241372.txt', datetime.date(1910, 1, 1)), ('/home/jovyan/data/9200357/BibliographicResource_3000095240849.txt', datetime.date(1911, 1, 20)), ('/home/jovyan/data/9200357/BibliographicResource_3000095241292.txt', datetime.date(1911, 1, 6)), ('/home/jovyan/data/9200357/BibliographicResource_3000095240806.txt', datetime.date(1911, 1, 13)), ('/home/jovyan/data/9200357/BibliographicResource_3000095242040.txt', datetime.date(1911, 1, 1)), ('/home/jovyan/data/9200357/BibliographicResource_3000095240930.txt', datetime.date(1911, 1, 27)), ('/home/jovyan/data/9200357/BibliographicResource_3000095241756.txt', datetime.date(1912, 1, 26)), ('/home/jovyan/data/9200357/BibliographicResource_3000095241501.txt', datetime.date(1912, 1, 5)), ('/home/jovyan/data/9200357/BibliographicResource_3000095241903.

In [6]:
"""
    Let's spell check selected resources
"""

# Safety check of the size of all resources meant for the task
_check_task_size([resource for resource, _ in resources_dates])

# For offline run uncomment block below and comment last line
# for resource, date in resources_dates:
#     lpmn_client_task([resource], "speller2", [f"{date}_speller"])
# speller2_output_file_paths = [unzip_file(f"{output_file}/{date}_speller.zip")[0] for _, date in resources_dates]

speller2_output_file_paths = get_spellchecked_resources_ch3()
speller2_output_file_paths

['/home/jovyan/data/preprocessed_data/Chapter3/1912-01-05_speller/home%jovyan%data%9200357%BibliographicResource_3000095241501.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1912-01-26_speller/home%jovyan%data%9200357%BibliographicResource_3000095241756.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1911-01-06_speller/home%jovyan%data%9200357%BibliographicResource_3000095241292.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1911-01-20_speller/home%jovyan%data%9200357%BibliographicResource_3000095240849.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1911-01-27_speller/home%jovyan%data%9200357%BibliographicResource_3000095240930.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1913-01-24_speller/home%jovyan%data%9200357%BibliographicResource_3000095241992.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1912-01-19_speller/home%jovyan%data%9200357%BibliographicResource_3000095240837.txt',
 '/home/jovyan/data/preprocessed_data/Chapter3/1912-01-12_speller/hom

In [7]:
# Safety check of the size of all resources meant for the task
_check_task_size(speller2_output_file_paths)

for resource, (_, date) in zip(speller2_output_file_paths, resources_dates):
    lpmn_client_task([resource],'any2txt|wcrft2|liner2({"model":"top9"})', [f"{date}_liner"])
    


100%|██████████| 100.0/100 [00:04<00:00, 22.92it/s]
100%|██████████| 100.0/100 [00:05<00:00, 17.08it/s]
100%|██████████| 100.0/100 [00:05<00:00, 19.04it/s]
100%|██████████| 100.0/100 [00:06<00:00, 15.85it/s]
100%|██████████| 100.0/100 [00:05<00:00, 17.37it/s]
100%|██████████| 100.0/100 [00:04<00:00, 22.95it/s]
100%|██████████| 100.0/100 [00:04<00:00, 22.09it/s]
100%|██████████| 100.0/100 [00:04<00:00, 22.44it/s]
100%|██████████| 100.0/100 [00:05<00:00, 17.12it/s]
100%|██████████| 100.0/100 [00:02<00:00, 45.84it/s]
100%|██████████| 100.0/100 [00:02<00:00, 43.45it/s]
100%|██████████| 100.0/100 [00:05<00:00, 19.95it/s]
100%|██████████| 100.0/100 [00:04<00:00, 22.74it/s]
100%|██████████| 100.0/100 [00:03<00:00, 27.51it/s]
100%|██████████| 100.0/100 [00:04<00:00, 23.11it/s]
100%|██████████| 100.0/100 [00:02<00:00, 46.24it/s]


In [8]:
liner2_output_file_paths = [unzip_file(f"{output_file}/{date}_liner.zip")[0] for _, date in resources_dates]
liner2_output_file_paths

['/home/jovyan/output/1910-01-01_liner/home%jovyan%data%preprocessed_data%Chapter3%1912-01-05_speller%home%jovyan%data%9200357%BibliographicResource_3000095241501.txt',
 '/home/jovyan/output/1911-01-20_liner/home%jovyan%data%preprocessed_data%Chapter3%1912-01-26_speller%home%jovyan%data%9200357%BibliographicResource_3000095241756.txt',
 '/home/jovyan/output/1911-01-06_liner/home%jovyan%data%preprocessed_data%Chapter3%1911-01-06_speller%home%jovyan%data%9200357%BibliographicResource_3000095241292.txt',
 '/home/jovyan/output/1911-01-13_liner/home%jovyan%data%preprocessed_data%Chapter3%1911-01-20_speller%home%jovyan%data%9200357%BibliographicResource_3000095240849.txt',
 '/home/jovyan/output/1911-01-01_liner/home%jovyan%data%preprocessed_data%Chapter3%1911-01-27_speller%home%jovyan%data%9200357%BibliographicResource_3000095240930.txt',
 '/home/jovyan/output/1911-01-27_liner/home%jovyan%data%preprocessed_data%Chapter3%1913-01-24_speller%home%jovyan%data%9200357%BibliographicResource_300009

In [9]:
"""
    Function for extracting annotations from Liner2 xml output
"""


def liner2_xml_to_annotation(xml_tree):
    """
        Converts xml doc into list of annotations and tokens
        
        :param str path_to_xml: path to .xml Liner2 output file
        :returns list: list of tuples (annotation_type, [tokens])
    """

    sentences = xml_tree.xpath("//sentence")
    annotated_tokens = [sentence.xpath("./tok[./ann!=0]") for sentence in sentences]
    # Prune empty lists
    annotated_tokens = filter(lambda x: True if x else False, annotated_tokens)
    annotated_tokens = [_chain_annotations(sentence) for sentence in annotated_tokens]
    return annotated_tokens
        
def _chain_annotations(sentence: list):
    annotation_heads = [token.xpath("./ann[@head]") for token in sentence]
    for token in annotation_heads:
        for annotation_head in token:
            annotation_channel = annotation_head.xpath("./text()")[0]
            annotation_type = annotation_head.get("chan")
            annotation_tokens = [token.xpath("./lex/base/text()")[0] for token in sentence if token.xpath(f"./ann[text()={annotation_channel}]")]
    return annotation_type, annotation_tokens

In [20]:
"""
    Parse NER output and investigate 10 most common named entities by years
"""

from collections import Counter
import os

named_entities_counts = {}
for _, date in resources_dates:
    for root, _, files in os.walk(f"{output_file}/{date}_liner"):
        for filename in files:
            path_to_annotated_output = os.path.join(root, filename)
            xml_tree = etree.parse(path_to_annotated_output)
            # Get stats
            token_nb = sum([1 for _ in xml_tree.xpath("//tok")])
            annotation_list = liner2_xml_to_annotation(xml_tree)
            annotation_counts = Counter(f"{annotation_type}|{' '.join(annotation_tokens)}" for annotation_type, annotation_tokens in annotation_list)
            if date.year in named_entities_counts.keys():
                named_entities_counts[date.year].append((token_nb, annotation_counts))
            else:
                named_entities_counts[date.year] = [(token_nb, annotation_counts)]
most_common_overall = {year: (sum([token_nb for token_nb, _ in named_entities_counts[year]]), 
                              sum([counter for _, counter in named_entities_counts[year]], Counter()).most_common(10)
                             ) for year in named_entities_counts.keys()}

In [21]:
most_common_overall

{1910: (13791,
  [('nam_liv|bóg', 7),
   ('nam_loc|Ameryka', 5),
   ('nam_loc|Warszawa', 5),
   ('nam_adj|polski', 4),
   ('nam_liv|Jan', 4),
   ('nam_adj|żydowski', 3),
   ('nam_liv|Nobel', 3),
   ('nam_loc|cesarstwo', 2),
   ('nam_liv|Leon Lichtenbaum', 2),
   ('nam_liv|Ania', 2)]),
 1911: (87538,
  [('nam_loc|Warszawa', 38),
   ('nam_loc|Polska', 25),
   ('nam_liv|bóg', 21),
   ('nam_loc|Izrael', 16),
   ('nam_loc|Rosja', 15),
   ('nam_liv|Abram', 13),
   ('nam_adj|żydowski', 12),
   ('nam_liv|Sułkowski', 12),
   ('nam_loc|Niemcy', 11),
   ('nam_loc|Galicja', 11)]),
 1912: (59763,
  [('nam_loc|Polska', 24),
   ('nam_loc|Warszawa', 21),
   ('nam_liv|lambda', 21),
   ('nam_loc|Galicja', 19),
   ('nam_loc|Rosja', 13),
   ('nam_loc|Ameryka', 11),
   ('nam_liv|bóg', 11),
   ('nam_liv|Sułkowski', 11),
   ('nam_org|Polak', 10),
   ('nam_liv|Henryk', 10)]),
 1913: (58675,
  [('nam_liv|Joel', 28),
   ('nam_loc|Warszawa', 28),
   ('nam_loc|Rumunia', 18),
   ('nam_adj|polski', 13),
   ('nam_li