In [1]:
from bs4 import BeautifulSoup
import time
import joblib
import os
import sys
import json
import re
import stanza

# Fetch the German stanza resources and build the German pipeline once at
# import/cell-run time; `nlp` is shared by the helper functions in this file.
stanza.download('de')
nlp = stanza.Pipeline('de')

# File name of the cached soup object; the driver code looks for it under 'tmp/'.
HLEX_SOUP_PICKLE = 'hlex_soup.pickle'
# Module-level list that load_occupation_list() appends to in place.
occupation_list = []


def load_transformed_hlex_to_soup():
    """Read the Heiligenlexikon XML file and return it as a BeautifulSoup tree.

    The file is opened as UTF-8 text from a fixed relative path and parsed
    with BeautifulSoup's "xml" feature set.
    """
    with open('../data/Heiligenlex-1858.xml', 'r', encoding='utf-8') as xml_file:
        return BeautifulSoup(xml_file, features="xml")


def pickle_it(object_to_pickle, path: str):
    """Serialise ``object_to_pickle`` with joblib into the file at ``path``.

    The target is opened in binary write mode, so an existing file is
    overwritten. A short progress message is printed first.
    """
    print("Attempting to pickle...")
    with open(path, 'wb') as sink:
        joblib.dump(object_to_pickle, sink)

def timing_wrapper(func, param):
    """Call ``func`` and print how long the call took.

    Args:
        func: Callable to time.
        param: Single positional argument for ``func``. Pass ``None`` to call
            ``func`` without arguments.

    Returns:
        Whatever ``func`` returned.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments while the call is running.
    start = time.perf_counter()
    # Only None means "no argument": falsy values such as 0, "" or [] are
    # legitimate arguments and must still be forwarded to func.
    if param is not None:
        value = func(param)
    else:
        print("no param found, running function without params")
        value = func()
    elapsed = time.perf_counter() - start
    print("Finished after ", elapsed)
    return value


def extract_occupation(paragraph_list):
    """Exploratory helper that dumps NLP analyses of an entry's paragraphs.

    In its current state this function does not extract anything: it prints
    the paragraphs, runs the module-level ``nlp`` pipeline on the first two,
    prints their dependency parses and the entities of the second one, and
    then terminates the interpreter via ``sys.exit()``.

    Args:
        paragraph_list: Indexable collection of elements exposing ``.text``.
            Indices 0-3 are accessed unconditionally, so fewer than four
            elements will fail on the index lookup (IndexError for a list).

    Returns:
        Nothing in practice: the final ``return occupation`` is unreachable
        because of the preceding ``sys.exit()``.
    """
    occupation = None
    # NOTE(review): the original remark said "only look at the first two
    # paragraphs for now", but the code below touches the first four.

    print("Full paragraph list")
    print(paragraph_list)

    print("First paragraph")
    print(paragraph_list[0])
    first_paragraph_text = paragraph_list[0].text

    print(paragraph_list[1])

    # Disabled substring lookup against the module-level occupation_list:
    # for item in occupation_list:
    #     if item.lower() in paragraph_text.lower():
    #         occupation = item
    #         continue
    doc = nlp(first_paragraph_text)
    print(doc)
    doc.sentences[0].print_dependencies()

    second_paragraph_text = paragraph_list[1].text
    print("Second paragraph")
    print(second_paragraph_text)
    doc2 = nlp(second_paragraph_text)
    # The first sentence is printed here and again by the loop below.
    doc2.sentences[0].print_dependencies()
    for sentence in doc2.sentences:
        sentence.print_dependencies()
    print(doc2.entities)

    third_paragraph_text = paragraph_list[2].text
    print("Third paragraph")
    print(third_paragraph_text)
    # doc3 = nlp(third_paragraph_text)
    # doc3.sentences[0].print_dependencies()

    fourth_paragraph_text = paragraph_list[3].text
    print("Fourth paragraph")
    print(fourth_paragraph_text)

    # Debugging stop: ends the whole run after the first entry that has
    # paragraphs, so callers never receive a value from this function.
    sys.exit()
    return occupation


# The term of the entry contains the name of the saint and their title, usually one of: S., B., V. (Sanctus, Beatus or Venerabilis)
def parse_term(term):
    """Split the text of a ``term`` element into name, status and number.

    Args:
        term: Object exposing the raw term string as ``.text``.

    Returns:
        A ``(saint_name, canonization_status, hlex_number)`` tuple.
        ``canonization_status`` is the first run of capitals followed by a
        dot (e.g. ``"S."``) or ``None``. ``hlex_number`` is the parenthesised
        part, with a bracketed footnote appended after a space when one is
        present (or the footnote alone), or ``None``.

    The process is terminated with ``sys.exit()`` when no name can be found.
    Intermediate results are printed for inspection.
    """
    raw_term = term.text
    print("Raw:")
    print(raw_term)

    def first_match(pattern):
        # Whole text of the first hit, or None when the pattern is absent.
        hit = re.search(pattern, raw_term)
        return hit.group() if hit else None

    name_hit = re.search(r"((\w|\s)+\w)\,?\(?", raw_term)
    if not name_hit:
        print("No match found for ", raw_term)
        sys.exit()
    saint_name = name_hit.group(1)

    canonization_status = first_match(r"[A-Z]+\.")
    hlex_number = first_match(r"\(.*\)")
    footnote = first_match(r"\[.*\]")

    print("----------")
    print(saint_name)
    for optional_part in (canonization_status, hlex_number):
        if optional_part:
            print(optional_part)
    if footnote:
        print(footnote)
        # The footnote is folded into the number field rather than returned
        # as a separate value.
        hlex_number = hlex_number + " " + footnote if hlex_number else footnote
    print("\n")
    return saint_name, canonization_status, hlex_number


# The paragraph contains free form text, but often starts with the feast day if it is available,
# May also contain occupation of saint
def parse_paragraph(paragraph_list):
    """Pull the feast day and occupation out of an entry's paragraphs.

    The feast day is the first parenthesised span in the first paragraph
    that starts, after at most one other character, with a digit. The
    occupation is delegated to ``extract_occupation`` on the full list.

    Returns:
        ``(feast_day, occupation)``; ``feast_day`` is ``None`` when no such
        span exists.
    """
    opening_text = paragraph_list[0].text
    day_hit = re.search(r"\(.?[0-9][0-9]?.*?\)", opening_text)
    feast_day = day_hit.group() if day_hit else None
    return feast_day, extract_occupation(paragraph_list)

def parse_entry(entry):
    """Convert one ``entry`` element into ``(entry_id, entry_dict)``.

    ``entry_dict`` always carries the keys ``SaintName``,
    ``CanonizationStatus``, ``NumberInHlex``, ``OriginalText``, ``FeastDay``
    and ``Occupation``. The last two are ``None`` when the entry has no
    paragraphs.

    The run is aborted with ``sys.exit()`` when the entry violates the
    assumptions made about the data: more than one ``sense``, or anything
    other than exactly one ``term``.
    """
    # namespace is found on linux, not in windows, maybe a module version error?
    # term_list = entry.find_all('tei:term')
    term_list = entry.find_all('term')
    entry_id = entry.get('xml:id')

    # TODO: This is more of a sanity check to verify an assumption about the
    # data, would be nicer to move this to tests
    if len(entry.find_all('sense')) > 1:
        print("Error: More than one sense found in entry!")
        sys.exit()

    # paragraph_list = entry.find_all('tei:p')
    paragraph_list = entry.find_all('p')

    print("Looking at entry: ", entry_id)
    print(entry)
    # Exactly one term per entry is assumed.
    if len(term_list) > 1:
        print(f"Error, found more than one term in entry {entry_id}!")
        sys.exit()
    if not term_list:
        # Previously this fell through to term_list[0] and died with a bare
        # IndexError that did not say which entry was at fault.
        print(f"Error, found no term in entry {entry_id}!")
        sys.exit()

    print(term_list)
    saint_name, canonization_status, hlex_number = parse_term(term_list[0])

    # TODO looking only at first paragraph for now, will have to look at more later
    if paragraph_list:
        feast_day, occupation = parse_paragraph(paragraph_list)
    else:
        feast_day, occupation = None, None

    entry_dict = {
        'SaintName': saint_name,
        'CanonizationStatus': canonization_status,
        'NumberInHlex': hlex_number,
        'OriginalText': entry.text,
        'FeastDay': feast_day,
        # Was misspelled 'Ocupation' in the with-paragraphs branch, so the
        # two branches produced different schemas.
        'Occupation': occupation,
    }
    return entry_id, entry_dict

def write_dict_to_json(data: dict):
    """Dump ``data`` as JSON to ``tmp/parsed_heiligenlexikon.json``.

    The dictionary is serialised completely before the file is opened, so a
    serialisation error leaves any existing output file untouched.
    """
    serialized = json.dumps(data)
    with open('tmp/parsed_heiligenlexikon.json', 'w') as out_file:
        out_file.write(serialized)

def parse_soup(soup):
    """Parse every ``entry`` element of ``soup`` and write the result as JSON.

    Entries are keyed by the id returned from ``parse_entry``. Encountering
    the same id twice prints an error and ends the run via ``sys.exit()``.
    The collected mapping is handed to ``write_dict_to_json`` at the end.
    """
    parsed = {}
    for entry in soup.find_all('entry'):
        entry_id, entry_dict = parse_entry(entry)
        if entry_id in parsed:
            print("ERROR: Duplicate entry id found!", entry_id)
            sys.exit()
        parsed[entry_id] = entry_dict

    write_dict_to_json(parsed)


def load_occupation_list():
    """Append the entries of ``occupation_list.txt`` to ``occupation_list``.

    One entry per line is expected. Surrounding whitespace is stripped;
    blank lines and lines whose first non-blank character is ``#`` are
    skipped. The resulting module-level list is printed afterwards.
    """
    with open(r"occupation_list.txt", "r") as occupation_file:
        for raw_line in occupation_file:
            entry = raw_line.strip()
            # An empty entry is a substring of every text, so keeping blank
            # lines would make any later substring lookup match everything.
            if not entry or entry.startswith('#'):
                continue
            occupation_list.append(entry)

    print(occupation_list)

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 85.8MB/s]
2023-08-10 03:57:39 INFO: Downloading default packages for language: de (German) ...
2023-08-10 03:57:41 INFO: File exists: C:\Users\chenz\stanza_resources\de\default.zip
2023-08-10 03:57:45 INFO: Finished downloading models and saved to C:\Users\chenz\stanza_resources.
2023-08-10 03:57:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 39.0MB/s]
2023-08-10 03:57:46 INFO: Loading these models for language: de (German):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd      

In [None]:

# Driver: load the lexicon (from the pickle cache when available, otherwise
# from XML), then parse all entries and write the JSON output.
hlex_soup = None
load_occupation_list()
#sys.setrecursionlimit(sys.getrecursionlimit()*50)
#print("Attempting with recursionlimit:", sys.getrecursionlimit())
#TODO: Add a check to see if pickle file is corrupt
pickle_path = 'tmp/' + HLEX_SOUP_PICKLE
if os.path.isfile(pickle_path):
    print("Pickle found, loading...")
    # NOTE: unpickling executes code embedded in the file; only load a cache
    # that this script produced itself.
    with open(pickle_path, 'rb') as pickle_file:
        hlex_soup = timing_wrapper(joblib.load, pickle_file)
        #print("Hlex_soup is: ")
        #print(hlex_soup)
else:
    print("No pickle found, loading from XML...")
    hlex_soup = timing_wrapper(load_transformed_hlex_to_soup, None)
    print("Size of Hlex Object: ",sys.getsizeof(hlex_soup))
    # Make sure the cache/output directory exists; otherwise the write below
    # fails only after the expensive XML parse has already been done.
    os.makedirs('tmp', exist_ok=True)
    pickle_it(hlex_soup, pickle_path)
print("Loaded", hlex_soup.title.text)
parse_soup(hlex_soup)

In [13]:
# Exploratory: list the attributes of whatever `doc` is currently bound to in
# the notebook session (presumably a stanza document from an earlier cell).
dir(doc)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_count_words',
 '_ents',
 '_lang',
 '_num_tokens',
 '_num_words',
 '_process_sentences',
 '_sentences',
 '_text',
 'add_property',
 'build_ents',
 'entities',
 'ents',
 'from_serialized',
 'get',
 'get_mwt_expansions',
 'iter_tokens',
 'iter_words',
 'lang',
 'num_tokens',
 'num_words',
 'reindex_sentences',
 'sentence_comments',
 'sentences',
 'set',
 'set_mwt_expansions',
 'text',
 'to_dict',
 'to_serialized']

In [44]:
def extract_gender(input_name: str):
    """Return the grammatical gender stanza assigns to ``input_name``.

    The name is run through the module-level ``nlp`` pipeline and the
    ``Gender=...`` value is read from the morphological features of the
    first word. The parsed document and the feature list are printed for
    inspection.

    Args:
        input_name: Name to analyse.

    Returns:
        The gender value (e.g. ``"Masc"``), or ``None`` when the input yields
        no words, the first word has no features, or its features carry no
        gender.
    """
    gender_pattern = re.compile(r"Gender=(\w+)")
    doc = nlp(input_name)
    extracted_gender = None
    print(doc)
    feats = doc.get("feats")
    print(feats)
    # The list can be empty (no words), and an individual item is None when
    # the word has no features (seen for "Anso" in the captured output).
    # Indexing or substring-testing blindly raised IndexError / TypeError.
    feats_str = feats[0] if feats else None
    if feats_str and "Gender" in feats_str:
        print("found gender")
        gender_match = gender_pattern.search(feats_str)
        if gender_match:
            extracted_gender = gender_match.group(1)
    return extracted_gender

In [52]:
# Manual check of extract_gender on a single name.
extract_gender("Albericus")

[
  [
    {
      "id": 1,
      "text": "Albericus",
      "lemma": "Albericus",
      "upos": "PROPN",
      "xpos": "NE",
      "feats": "Case=Nom|Gender=Masc|Number=Sing",
      "head": 0,
      "deprel": "root",
      "start_char": 0,
      "end_char": 9,
      "ner": "S-PER",
      "multi_ner": [
        "S-PER"
      ]
    }
  ]
]
['Case=Nom|Gender=Masc|Number=Sing']
found gender


'Masc'

In [63]:
# Probe a name that the pipeline does not tag as a proper noun.
doc = nlp("Anso")
print(doc)
# NOTE(review): the captured output has no "feats" key for this word, yet the
# branch below still runs, presumably because get() returns one item per word
# (here a single None), making the list itself truthy. TODO confirm.
if doc.get("feats"):
    print("yay")

[
  [
    {
      "id": 1,
      "text": "Anso",
      "lemma": "anso",
      "upos": "ADV",
      "xpos": "ADV",
      "head": 0,
      "deprel": "root",
      "start_char": 0,
      "end_char": 4,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    }
  ]
]
yay


In [None]:
# NOTE(review): this downloads the Italian ('it') resources but then rebuilds
# the German ('de') pipeline. Confirm whether stanza.Pipeline('it') was
# intended here.
stanza.download('it')
nlp = stanza.Pipeline('de')