# Parse HLS persons notices

The overall goal is to "clean" the notices by removing HTML markup, links, abbreviations, etc.

In [1]:
import pandas as pd
import re
import geovpylib.database as db
from gmpykit.ipython import infos
import gmpykit as kit


# Progress helper; used later around the cleaning loop (eta.begin / eta.iter / eta.end).
eta = kit.Eta()
# Connect to the "switzerland_and_beyond" database.
# NOTE(review): the meaning of execute=True is defined in geovpylib — presumably it
# allows statements to actually run; confirm.
db.connect_yellow('switzerland_and_beyond', execute=True)

[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


## Fetch the data

In [2]:
# Load every row and column of hls.person (no filter).
persons = db.query('select * from hls.person')
# Display a summary of the fetched table (shape and an extract, per the output below).
infos(persons)

Shape:  (24966, 5) - extract:


Unnamed: 0,id,url,name,notice_html,notice
0,31,https://hls-dhs-dss.ch/fr/articles/023373,"Burckhardt, Emanuel","<span class=""hls-dnais"" locale=""fr"">25.11.1744...",Association fondée en 1933 par la congrégation...
1,47,https://hls-dhs-dss.ch/fr/articles/023622,"Dubois, René","<span class=""hls-dnais"" locale=""fr"">17.8.1905<...",Association fondée en 1933 par la congrégation...
2,72,https://hls-dhs-dss.ch/fr/articles/023462,"Cuénoud, Bernard","<span class=""hls-dnais"" locale=""fr"">2.3.1899</...",Association fondée en 1933 par la congrégation...
3,122,https://hls-dhs-dss.ch/fr/articles/023693,"Frey, Richard","<span class=""hls-dnais"" locale=""fr"">2.9.1894</...",Association fondée en 1933 par la congrégation...
4,151,https://hls-dhs-dss.ch/fr/articles/026041,"Gléresse, Joseph de","<span class=""hls-dnais"" locale=""fr"">6.12.1661<...",Association fondée en 1933 par la congrégation...


## Update the table

In [3]:
# Create the target column only when the fetched `persons` table does not already
# expose a "notice" column.
if "notice" not in persons.columns:
    # NOTE(review): the connection is reopened with execute=True for the ALTER and then
    # with execute=False; the semantics of that flag live in geovpylib — confirm that the
    # UPDATE statements issued later still run after execute=False.
    db.connect_yellow('switzerland_and_beyond', execute=True)
    db.execute('alter table hls.person add notice varchar;')
    db.connect_yellow('switzerland_and_beyond', execute=False)

## Clean the HTML

In [4]:
def get_name(name):
    """Turn a "Last, First" heading into "First Last".

    The split happens on the first ``', '`` only; both halves are stripped and
    joined with a single space.

    Args:
        name: The heading as stored in the table, e.g. ``"Dubois, René"``.

    Returns:
        The reordered name, or ``name`` unchanged when it holds no ``', '``
        separator or is not a string (e.g. ``None``).
    """
    # The previous bare ``except`` returned the input for anything that failed;
    # keep that tolerance for non-string values, but explicitly.
    if not isinstance(name, str):
        return name

    last, separator, first = name.partition(', ')
    if not separator:
        return name

    return first.strip() + ' ' + last.strip()

def is_date_format(string):
    """Tell whether ``string`` starts with three dot-separated integers.

    Only the first three dot-separated fields are inspected, so ``"25.11.1744"``
    is accepted while ``"vers 1744"`` or ``"14.6.1892 (et non le 4)"`` are not.

    Args:
        string: Text found inside a date span.

    Returns:
        ``True`` when the first three fields all parse with ``int()``,
        ``False`` otherwise.
    """
    fields = string.split('.')
    if len(fields) < 3:
        return False
    try:
        for field in fields[:3]:
            int(field)
    except ValueError:
        return False
    return True


def _verbalise_span(notice, css_class, with_date, without_date):
    """Replace the first ``<span class="css_class" locale="fr">`` by plain text.

    The span content is kept and prefixed with ``with_date`` when it looks like
    a full date (see ``is_date_format``) and with ``without_date`` otherwise.
    Every occurrence identical to that first span is replaced as well.

    Args:
        notice: The notice being cleaned.
        css_class: Value of the span's ``class`` attribute.
        with_date: Prefix used when the content is a day.month.year date.
        without_date: Prefix used for any other content.

    Returns:
        The rewritten notice, or ``notice`` unchanged when it is not a string
        or when no complete span of that class is found.
    """
    if not isinstance(notice, str):
        return notice

    begin_words = f'<span class="{css_class}" locale="fr">'
    end_words = '</span>'

    begin = notice.find(begin_words)
    if begin == -1:
        return notice
    closing = notice.find(end_words, begin)
    if closing == -1:
        return notice

    origin = notice[begin:closing + len(end_words)]
    content = origin.replace(begin_words, '').replace(end_words, '')
    prefix = with_date if is_date_format(content) else without_date
    return notice.replace(origin, prefix + content)


def handle_baptisme(notice):
    """Rewrite the ``hls-bapt`` span as "baptisé le <date>" / "baptisé <text>"."""
    return _verbalise_span(notice, 'hls-bapt', 'baptisé le ', 'baptisé ')


def handle_birthdate(notice):
    """Rewrite the ``hls-dnais`` span as "naît le <date>" / "naît <text>"."""
    return _verbalise_span(notice, 'hls-dnais', 'naît le ', 'naît ')


def handle_deathdate(notice):
    """Rewrite the ``hls-ddec`` span as "meurt le <date>" / "meurt <text>"."""
    return _verbalise_span(notice, 'hls-ddec', 'meurt le ', 'meurt ')


def handle_sepulture(notice):
    """Rewrite the ``hls-sepu`` span as "enterré le <date>" / "enterré <text>"."""
    # The dated prefix used to lack its trailing space ("enterré le12.3.1800").
    return _verbalise_span(notice, 'hls-sepu', 'enterré le ', 'enterré ')


def handle_cit(notice):
    """Unwrap the first ``hls-cit`` span, keeping only the text it encloses.

    Only the first span (and any exact duplicate of it) is unwrapped. The
    notice is returned untouched when it is not a string or when no complete
    span is found.
    """
    opening = '<span class="hls-cit" locale="fr">'
    closing = '</span>'

    if not isinstance(notice, str):
        return notice

    start = notice.find(opening)
    stop = notice.find(closing, start) if start != -1 else -1
    if stop == -1:
        return notice

    wrapped = notice[start:stop + len(closing)]
    bare = wrapped.replace(opening, '').replace(closing, '')
    return notice.replace(wrapped, bare)

def handle_cit2(notice):
    """Unwrap the first ``hls-cit2`` span, keeping only the text it encloses.

    Only the first span (and any exact duplicate of it) is unwrapped. The
    notice is returned untouched when it is not a string or when no complete
    span is found.
    """
    opening = '<span class="hls-cit2" locale="fr">'
    closing = '</span>'

    if not isinstance(notice, str):
        return notice

    start = notice.find(opening)
    stop = notice.find(closing, start) if start != -1 else -1
    if stop == -1:
        return notice

    wrapped = notice[start:stop + len(closing)]
    bare = wrapped.replace(opening, '').replace(closing, '')
    return notice.replace(wrapped, bare)


def handle_mariage(notice):
    """Replace ``hls-alli`` (marriage) spans by a French wording.

    Empty spans are rewritten first, most specific pattern first:
    sentence start followed by "avant"/"après", sentence start alone, then
    anywhere else. Afterwards the first non-empty span (the one carrying a
    date) becomes ``Marié(e) (<content>) à``.

    Args:
        notice: The notice being cleaned.

    Returns:
        The rewritten notice.
    """
    empty_span = '<span class="hls-alli" locale="fr"></span>'

    # Order matters: the longer patterns must be consumed before the bare span.
    empty_span_wordings = (
        ('. ' + empty_span + 'avant', '. Marié(e) avant'),
        ('. ' + empty_span + 'après', '. Marié(e) après'),
        ('. ' + empty_span, '. Marié(e) à '),
        (empty_span, 'marié(e) à'),
    )
    for marker, wording in empty_span_wordings:
        notice = notice.replace(marker, wording)

    # Remaining case: a span that carries a date.
    # NOTE(review): only the first dated span (and exact duplicates of it) is
    # rewritten; a notice with several differently dated marriages keeps the
    # later spans as raw HTML — confirm whether that is intended.
    begin_words = '<span class="hls-alli" locale="fr">'
    end_words = '</span>'
    begin = notice.find(begin_words)
    if begin == -1:
        return notice
    closing = notice.find(end_words, begin)
    if closing == -1:
        return notice

    origin = notice[begin:closing + len(end_words)]
    result = origin.replace(begin_words, '').replace(end_words, '')
    return notice.replace(origin, 'Marié(e) (' + result + ') à')


def handle_links(notice):
    """Replace wikilink spans by "<link text> (<absolute URL>)".

    Each ``<span class="wikilink">…</span>`` holding an ``<a … href="…">``
    anchor is replaced by the anchor text followed by the absolute URL in
    parentheses, or by the URL alone when the anchor text is empty once the
    ``-&gt;`` arrow is removed.

    The href is read from the span being replaced, and spans that cannot be
    parsed are left as they are. The previous version looped on
    ``'href' in notice`` and swallowed every error, so it never returned when
    an href lived outside a wikilink span or when a span was malformed.

    Args:
        notice: The notice being cleaned.

    Returns:
        The notice with every parseable wikilink span replaced.
    """
    begin_words = '<span class="wikilink">'
    end_words = '</span>'
    base_url = "https://hls-dhs-dss.ch"

    cursor = 0
    while True:
        begin = notice.find(begin_words, cursor)
        if begin == -1:
            break
        closing = notice.find(end_words, begin)
        if closing == -1:
            break
        end = closing + len(end_words)
        origin = notice[begin:end]

        try:
            href_begin = origin.index('"', origin.index('href')) + 1
            href_end = origin.index('"', href_begin)
            link_content_begin = origin.index('>', origin.index('<a ')) + 1
            link_content_end = origin.index('</a>')
        except ValueError:
            # Not a well-formed link: keep it and look further in the notice.
            cursor = end
            continue

        link_content = origin[link_content_begin:link_content_end].strip()
        link_content = link_content.replace('-&gt;', '')
        url = base_url + origin[href_begin:href_end]
        replacement = f'{link_content} ({url})' if link_content else url

        notice = notice.replace(origin, replacement)
        # Resume after the inserted text so it is never parsed again.
        cursor = begin + len(replacement)

    return notice


def format_begin_end(notice):
    """Trim surrounding whitespace and capitalise the first character.

    Whitespace is stripped before capitalising, so a notice that starts with
    spaces still gets an upper-case first letter. The previous order
    upper-cased the leading space instead.

    Args:
        notice: The notice being cleaned.

    Returns:
        The trimmed notice with its first character upper-cased.
    """
    trimmed = notice.strip()
    return trimmed[:1].upper() + trimmed[1:]


def expand_name(notice, name):
    """Replace one-letter capital abbreviations by the person's full name.

    Every capital letter that follows whitespace and precedes a dot is treated
    as an abbreviation, whatever the letter. Its ``" X. "`` and ``" X.,"``
    forms are both replaced by the name surrounded by single spaces, so the
    comma of the second form is dropped.

    Args:
        notice: The notice being cleaned.
        name: The full name to insert.

    Returns:
        The notice with the abbreviations expanded.
    """
    padded_name = ' ' + name + ' '
    for initial in re.findall(r'\s([A-Z])\.', notice):
        for abbreviated in (f' {initial}. ', f' {initial}.,'):
            notice = notice.replace(abbreviated, padded_name)
    return notice


def expand_shortcuts(notice):
    """Spell out the French abbreviations listed below.

    Replacements are applied in order; ``cath.-chr.`` must come before
    ``cath.`` so the longer form is not split.
    """
    expansions = (
        ('cath.-chr.', 'catholique chrétien'),
        ('cath.', 'catholique'),
        ('prot.', 'protestant'),
        ('comm.', 'commune de'),
        ('auj.', "aujourd'hui"),
    )
    for short_form, long_form in expansions:
        notice = notice.replace(short_form, long_form)
    return notice

def handle_power_letters(notice):
    """Flatten a superscript "e" (as in "XIXe") into a plain letter."""
    superscript_e = '<sup><span locale="fr">e</span></sup>'
    return notice.replace(superscript_e, 'e')

def replace_chars(notice):
    """Normalise space-like entities and characters.

    ``&nbsp;`` and ``&ZeroWidthSpace;`` become a regular space, while the raw
    zero-width space character (U+200B) is removed.
    """
    substitutions = (
        ('&nbsp;', ' '),
        ('&ZeroWidthSpace;', ' '),
        ('\u200b', ''),
    )
    for found, substitute in substitutions:
        notice = notice.replace(found, substitute)
    return notice

def handle_stanger_words(notice):
    """Unwrap every ``<em><span locale="fr">…</span></em>`` emphasis.

    The loop is driven by the exact opening markup and stops as soon as no
    complete wrapper is left. The previous version looped while ``<em>`` was
    present and swallowed lookup errors, so a bare ``<em>`` or an unclosed
    wrapper made it spin forever.

    Args:
        notice: The notice being cleaned.

    Returns:
        The notice with each complete wrapper replaced by its inner text;
        any other ``<em>`` markup is left untouched.
    """
    begin_words = '<em><span locale="fr">'
    end_words = '</span></em>'

    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        closing = notice.find(end_words, begin)
        if closing == -1:
            break
        origin = notice[begin:closing + len(end_words)]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Each pass strictly shortens the notice, so the loop terminates.
        notice = notice.replace(origin, result)
    return notice

def handle_stanger_words2(notice):
    """Unwrap every ``<sup><span locale="fr">…</span></sup>`` superscript.

    The loop is driven by the exact opening markup and stops as soon as no
    complete wrapper is left. The previous version looped while ``<sup>`` was
    present and swallowed lookup errors, so a bare ``<sup>`` or an unclosed
    wrapper made it spin forever.

    Args:
        notice: The notice being cleaned.

    Returns:
        The notice with each complete wrapper replaced by its inner text;
        any other ``<sup>`` markup is left untouched.
    """
    begin_words = '<sup><span locale="fr">'
    end_words = '</span></sup>'

    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        closing = notice.find(end_words, begin)
        if closing == -1:
            break
        origin = notice[begin:closing + len(end_words)]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Each pass strictly shortens the notice, so the loop terminates.
        notice = notice.replace(origin, result)
    return notice

def handle_numbers(notice):
    """Placeholder pipeline step: currently returns ``notice`` unchanged."""
    return notice



# Replacement HTML for https://hls-dhs-dss.ch/fr/articles/004431: the cleaning loop
# passes this string to handle_notice instead of that row's notice_html.
special_case_1 = '''<span class="hls-dnais" locale="fr">17.6.1835</span> à Neuchâtel, <span class="hls-ddec" locale="fr">14.6.1892 (et non le 4)</span> à Berne, protestant, de Couvet et Neuchâtel. Fils de François-Victor Borel, professeur de calligraphie et directeur de la Maison des orphelins, et de Louise née Fauche. Petit-fils d'<span class="wikilink"><a class="rtxt" title="Abraham Louis Fauche" target="_blank" rel="noopener noreferrer" href="/fr/articles/015888/2006-03-28/">Abraham Louis Fauche-Borel</a></span>, arrière-petit-fils de <span class="wikilink"><a target="_blank" rel="noopener noreferrer" href="/fr/articles/044214/2004-11-23/">Samuel Fauche</a></span>. <span class="hls-alli" locale="fr">1861</span> Marie Guillaume, des Verrières,<strong><span locale="fr">&nbsp;</span></strong>fille de Louis-Constant Guillaume, notaire, et de Marianne-Virginie née Fatton,<strong><span locale="fr">&nbsp;</span></strong>sœur de <span class="wikilink"><a class="rtxt" title="Louis Guillaume" target="_blank" rel="noopener noreferrer" href="/fr/articles/014395/2008-03-13/">Louis Guillaume</a></span>. Après ses humanités au gymnase de Neuchâtel, Eugène Borel étudia le droit à Munich et Heidelberg et devint avocat dans sa ville natale. Il se lança très tôt dans la vie politique sous l'étiquette radicale (<span class="wikilink"><a target="_blank" rel="noopener noreferrer" href="/fr/articles/017378/2022-01-24/">Parti radical- démocratique</a></span>, PRD): conseiller général (1857) puis municipal (1864) de la ville de Neuchâtel, député au Grand Conseil neuchâtelois (1862) et conseiller d'Etat (Militaire, 1865-1870; Justice, 1870-1872; <span class="wikilink"><a target="_blank" rel="noopener noreferrer" href="/fr/articles/010243/2015-12-18/">gouvernements cantonaux</a></span>).'''

def handle_notice(name, notice):
    """Run the whole cleaning pipeline on one notice.

    Args:
        name: The person's heading in "Last, First" form; it is reordered
            with ``get_name`` and used to expand one-letter abbreviations.
        notice: The raw HTML of the notice.

    Returns:
        The cleaned notice, with every run of spaces collapsed to one space.
    """
    name = get_name(name)

    # Character-level normalisation first, so later steps see plain spaces.
    notice = kit.remove_bin_chars(notice)
    notice = replace_chars(notice)

    # Unwrap purely presentational markup.
    notice = handle_stanger_words(notice)
    notice = handle_stanger_words2(notice)
    notice = handle_cit(notice)
    notice = handle_cit2(notice)

    # Turn the semantic spans into French wording.
    notice = handle_birthdate(notice)
    notice = handle_baptisme(notice)
    notice = handle_deathdate(notice)
    notice = handle_sepulture(notice)
    notice = handle_mariage(notice)
    notice = handle_links(notice)

    # NOTE(review): handle_stanger_words2 above already unwraps every
    # <sup><span locale="fr">…</span></sup>, so this step looks like a no-op here.
    notice = handle_power_letters(notice)

    notice = format_begin_end(notice)
    notice = expand_shortcuts(notice)
    notice = expand_name(notice, name)
    notice = handle_numbers(notice)

    # A single replace('  ', ' ') leaves a double space wherever three or more
    # spaces were adjacent; repeat until none is left.
    while '  ' in notice:
        notice = notice.replace('  ', ' ')
    return notice

In [5]:
# Clean every notice and write it back to hls.person, one UPDATE statement per row.
eta.begin(len(persons), "Cleaning notices")
for i, row in persons.iterrows():

    # For this one article the stored HTML is ignored in favour of special_case_1.
    if row['url'] == "https://hls-dhs-dss.ch/fr/articles/004431":
        notice_html = special_case_1
    else:
        notice_html = row['notice_html']

    cleaned = handle_notice(row['name'], notice_html)
    # NOTE(review): the statement is built by string interpolation, with single quotes
    # doubled by hand. If db.execute accepts bound parameters, prefer them — confirm
    # against geovpylib.
    db.execute(f"""
        update hls.person
            set notice = '{cleaned.replace("'", "''")}'
        where id = {row['id']};
    """)
    eta.iter()
eta.end()

Cleaning notices - 24966 iterations in 00h12m52s (avg of 00h00m00s/iter)               
