In [1]:
!pip install -i https://pypi.clarin-pl.eu lpmn_client

Looking in indexes: https://pypi.clarin-pl.eu


In [2]:
!pip install nbformat lxml



In [119]:
"""
    Utilities
"""

import datetime
from io import BytesIO
import json
import logging
from lxml import etree
import re
import requests
import os
from zipfile import ZipFile
"""
    Globals
"""

 # Poland
set_id = '9200357'

data_dir = f'{os.path.expanduser("~")}/work/data'
metadata_dir = f'{os.path.expanduser("~")}/work/temp/metadata/{set_id}'
nsmap = {"cmd": "http://www.clarin.eu/cmd/1",
         "cmdp_text": "http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1633000337997"}
output_file = f'{os.path.expanduser("~")}/work/output'


with open(f'{data_dir}/{set_id}/id_file_map.json', 'r') as id_filename_map_file:
    id_filename_map = json.load(id_filename_map_file)

YYYY_MM_DD = re.compile(r"(?P<year>[0-9]{4})-(?P<month>[0-9]{1,2})-(?P<day>[0-9]{1,2})")
MAX_NER_TASK_SIZE = 2000000

"""
    Metadata printing
"""
def print_xml(tree, declaration: bool = False):
    print(etree.tostring(tree, encoding='UTF-8', xml_declaration=declaration, pretty_print=True).decode())

"""
    Data acess
"""
def get_resource_file(identifier):
    """
        Resolves Europeana subresource identifier to it's local location. 
        
        :param str identifier: Europeana subresource identifier
        :return str: Path to local location of the resource
    """
    if identifier in id_filename_map:
        filename = id_filename_map[identifier]
        return f'{data_dir}/{set_id}/{filename}'

def get_date_from_metadata(metadata_tree):
    dates = metadata_tree.xpath("///cmdp_text:TemporalCoverage/cmdp_text:Start/cmdp_text:date/text()", namespaces=nsmap)
    dates = [YYYY_MM_DD.match(date) for date in dates]
    dates = [datetime.date(int(date.group("year")), int(date.group("month")), int(date.group("day"))) for date in dates]
    return dates
    
def get_description_from_metadata(metadata_tree):
    descriptions = metadata_tree.xpath('//cmdp_text:TextResource/cmdp_text:Description/cmdp_text:description/text()', namespaces=nsmap)
    if len(descriptions) > 0:
        return descriptions[0]
    
def get_resource_ids_from_metadata(metadata_tree):
    ids = metadata_tree.xpath('//cmdp_text:SubresourceDescription/cmdp_text:IdentificationInfo/cmdp_text:identifier/text()', namespaces=nsmap)
    # The result can be any number of identifiers. We do want to filter the values a bit: only the numeric identifiers are useful 
    # to us so we use the special syntax below to make a new list by picking only the matching values from the query results list
    return [id for id in ids if id.isnumeric()]

def get_title_from_metadata(metadata_tree):
    # Get all the values from the xpath
    titles = metadata_tree.xpath('//cmdp_text:TextResource/cmdp_text:TitleInfo/cmdp_text:title/text()', namespaces=nsmap)
    # Check if there is an actual value
    if len(titles) > 0:
        # Return the first (assuming only) value
        return titles[0]

def unpack_metadata(set_id, target_dir):
    # Construct the address of the .zip file with the metadata for one set
    md_zip_url = f'https://europeana-oai.clarin.eu/metadata/fulltext-aggregation/{set_id}.zip'
    
    # Retrieve the .zip file
    print(f'Retrieving {md_zip_url}')
    resp = requests.get(md_zip_url)
    zipfile = ZipFile(BytesIO(resp.content))
    
    # Uncompress the .zip into the target directory
    print(f'Extracting content in {target_dir}')
    zipfile.extractall(path=target_dir)
    print('Done')

"""
    Zip/Unzip
"""
def zip_file(input_path, output_path=""):
    """
        Zips input file/directory
        
        :param str input_path: path to file to be zipped, if file is directory, entire dir gets zipped
        :param str output_path: path to location, where to save the archive. If empty, zip archive uses same location and name as input
        :returns str: path to archive
    """
    if not output_path:
        output_path = f"{os.path.dirname(input_path)}/{os.path.basename(input_path).split('.')[0]}.zip"
    with ZipFile(output_path, 'r') as zip_handle:
        logger.info(f'Zipping {input_path} to {output_path}')
        if os.path.isdir(input_path):
            _zip_dir(input_path, zip_handle)
        else:
            zip_handle.write(input_path, os.path.basename(input_path))
            # _zip_chunker(input_path)
        
    return output_path
        
def unzip_file(input_path, output_path=""):
    """
        Unzips input .zip file
        
        :param str input_path: path to file to be unzipped
        :param str output_path: path to location, where to unpack the archive. If empty, archive is extracted at its location.
    """
    if not output_path:
        output_path = f"{os.path.dirname(input_path)}/{os.path.basename(input_path).split('.')[0]}"
        print("$$")
        print(output_path)
        print(input_path)
    with ZipFile(input_path, 'r') as zip_ref:
        logger.info(f'Unzipping {input_path} to {output_path}')
        zip_ref.extractall()
        extracted_files_paths = zip_ref.namelist()
        
    return [os.path.join(output_path, extracted_file_path) for extracted_file_path in extracted_files_paths]

def _zip_dir(input_dir_path, zip_handle):
    for dirname, subdirs, files in os.walk(input_dir_path):
        for filename in files:          
            logger.info(f'Zipping {dirname}/{filename}')
            # _zip_chunker(os.path.join(dirname, filename))
            zip_handle.write(os.path.join(dirname, filename), 
                             os.path.relpath(os.path.join(dirname, filename), 
                                             os.path.join(input_dir_path, '..')))
            
"""
    Safety
"""
def _check_task_size(resources):
    size = sum([os.path.getsize(resource) for resource in resources])
    if size > MAX_NER_TASK_SIZE:
        raise TaskTooBigError
        
class TaskTooBigError(Exception):
    """
        Exception raised for tasks with too big payload.
    """

    def __init__(self, size, max_size):
        self.message = f"Tasks payload is too big, it has {size} and maximum is {max_size}"
        super().__init__(self.message)
   
"""
    Logging
"""
logger = logging.getLogger(__name__)


## Named Entity Recognition with Liner2

In this section we will present how to use Europeana bibliographic resources with Named Entity Recognition (NER) tool for Polish using [lpmn_client](https://wiki.clarin-pl.eu/en/nlpws/lpmn_client) and [Liner2](https://github.com/CLARIN-PL/Liner2). Due to the server side limitation, we ensure 2M limit on size on the task with function defined below. 


In [120]:
"""
    Function for tasking lpmn client with Liner2 NER pipeline with task size control 
"""

from lpmn_client import download_file, upload_file
from lpmn_client import Task


def liner2_NER(resources, names=[]):
    """
        Wrap over CLARIN-PL lpmn client with control of the task size in order to avoid jamming the task queue on the server side
        
        :param list resources: list of paths to the resources to be processed
        :returns list: list of paths to the output zip files
    """
    
    # Size check
    _check_task_size(resources)
    # Upload reasources to task queue
    job_ids = [upload_file(resource_file) for resource_file in resources]
    # Specify pipeline 
    t = Task("any2txt|wcrft2|liner2")
    # Run uploaded tasks with pipeline
    output_file_ids = [t.run(job_id, verbose=True) for job_id in job_ids]
    if names:
        liner2_output = [download_file(output_file_id, output_file, filename) for output_file_id, filename in zip(output_file_ids, names)]
    else:
        liner2_output = [download_file(output_file_id, output_file) for output_file_id in output_file_ids]
    return liner2_output

In [121]:
"""
    Let's select resources. We will use Izraelita newspapers from 1910-1913 to investigate most frequent Named Entities over the years on January. 
"""

# Prepare metadata of the collection
unpack_metadata(set_id, metadata_dir)
izraelita_metadata_files = [f"{metadata_dir}/Izraelita_1910.xml",
                            f"{metadata_dir}/Izraelita_1911.xml",
                            f"{metadata_dir}/Izraelita_1912.xml",
                            f"{metadata_dir}/Izraelita_1913.xml",
                           ]

# Use getters to obtain date and id from metadata
izraelita_metadata_trees = [etree.parse(izraelita_metadata_file) for izraelita_metadata_file in izraelita_metadata_files]
ids = [get_resource_ids_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]
dates = [get_date_from_metadata(izraelita_metadata_tree) for izraelita_metadata_tree in izraelita_metadata_trees]

# Get all issues from January
ids_dates_january = []
for _ids, _dates in zip (ids, dates):
    year = []
    for _id, date in zip(_ids, _dates):
        if date.month==1:
            year.append((_id, date))
    ids_dates_january.append(year)

# Get dates of first issues per year
first_issues_date_per_year = {min([date for _, date in year]) for year in ids_dates_january}

# Filter ids of first issues per year 
id_date_first_issue_january = [(_id, date) for year in ids_dates_january for _ids, date in year if date in first_issues_date_per_year]

# Map id to reasource
resources_dates = [(get_resource_file(_id), date) for _id, date in id_date_first_issue_january]

Retrieving https://europeana-oai.clarin.eu/metadata/fulltext-aggregation/9200357.zip
Extracting content in /home/jovyan/work/temp/metadata/9200357
Done


In [122]:
"""
    Let's use selected resources
"""

for resource, date in resources_dates:
    liner2_NER([resource], [date])


100%|██████████| 100.0/100 [00:03<00:00, 27.79it/s]
100%|██████████| 100.0/100 [00:03<00:00, 28.03it/s]
100%|██████████| 100.0/100 [00:03<00:00, 27.91it/s]
100%|██████████| 100.0/100 [00:02<00:00, 34.96it/s]


In [123]:
for _, date in resources_dates:
    print(f"{output_file}/{date}")
liner2_output_file_paths = [unzip_file(f"{output_file}/{date}") for _, date in resources_dates]
# We flatten list [[file_path]] to [file_path]

print(liner2_output_file_paths)

with open(liner2_output_file_paths[0], 'r') as liner2_output_file:
    print(liner2_output_file)

/home/jovyan/work/output/1910-01-01
/home/jovyan/work/output/1911-01-01
/home/jovyan/work/output/1912-01-01
/home/jovyan/work/output/1913-01-03
$$
/home/jovyan/work/output/1910-01-01
/home/jovyan/work/output/1910-01-01
$$
/home/jovyan/work/output/1911-01-01
/home/jovyan/work/output/1911-01-01
$$
/home/jovyan/work/output/1912-01-01
/home/jovyan/work/output/1912-01-01
$$
/home/jovyan/work/output/1913-01-03
/home/jovyan/work/output/1913-01-03
[['/home/jovyan/work/output/1910-01-01/home%jovyan%work%data%9200357%BibliographicResource_3000095241512.txt'], ['/home/jovyan/work/output/1911-01-01/home%jovyan%work%data%9200357%BibliographicResource_3000095241512.txt'], ['/home/jovyan/work/output/1912-01-01/home%jovyan%work%data%9200357%BibliographicResource_3000095241512.txt'], ['/home/jovyan/work/output/1913-01-03/home%jovyan%work%data%9200357%BibliographicResource_3000095241512.txt']]


TypeError: expected str, bytes or os.PathLike object, not list