# Parse HLS places notices

The overall goal is to "clean" the notices: strip the HTML markup, rewrite links as plain text, expand abbreviations, etc.

In [1]:
import re
import geovpylib.database as db
from gmpykit.ipython import infos
import gmpykit as kit


# Progress tracker; its begin/iter/end methods are called around the cleaning loop further down.
eta = kit.Eta()
# Connect to the "switzerland_and_beyond" database.
# NOTE(review): execute=True presumably allows write statements to run — confirm against geovpylib.
db.connect_yellow('switzerland_and_beyond', execute=True)

[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


## Fetch the data

In [2]:
# Load every row and column of hls.place.
places = db.query('select * from hls.place')
# Print the shape and a short extract of the result.
infos(places)

Shape:  (5351, 5) - extract:


Unnamed: 0,id,url,name,notice_html,notice
0,28,https://hls-dhs-dss.ch/fr/articles/002345,Penthalaz,"Comm. VD, distr. du Gros-de-Vaud, sur la rive ...","commune de VD, district du Gros-de-Vaud, sur l..."
1,40,https://hls-dhs-dss.ch/fr/articles/001395,Jonschwil,"Comm. SG, région de Wil, comprenant les villag...","commune de SG, région de Wil, comprenant les v..."
2,41,https://hls-dhs-dss.ch/fr/articles/001324,Eggersriet,"Comm. SG, région de Rorschach. Village-rue sit...","commune de SG, région de Rorschach. Village-ru..."
3,59,https://hls-dhs-dss.ch/fr/articles/001166,Hägendorf,"Comm. SO, distr. d'Olten, au pied sud du Jura....","commune de SO, district d'Olten, au pied sud d..."
4,62,https://hls-dhs-dss.ch/fr/articles/001345,Rüthi (SG),"Comm. SG, région du Rheintal, comprenant les v...","commune de SG, région du Rheintal, comprenant ..."


## Update the table

In [3]:
# Create the column that receives the cleaned text, only when the fetched
# table does not have it yet.
if "notice" not in places.columns:
    db.connect_yellow('switzerland_and_beyond', execute=True)
    # The column must be added to hls.place: that is the table checked above
    # and the one updated by the cleaning loop (not hls.person).
    db.execute('alter table hls.place add notice varchar;')
    # NOTE(review): reconnecting with execute=False presumably disables writes;
    # if so, the update loop below would not persist anything on the run that
    # creates the column — confirm against geovpylib.
    db.connect_yellow('switzerland_and_beyond', execute=False)

## Clean the HTML

In [4]:
def handle_exponent(notice):
    """Unwrap superscript markup, keeping only the inner text.

    Every `<sup><span locale="fr">...</span></sup>` wrapper is removed and its
    content is kept inline, e.g. `XIX<sup><span locale="fr">e</span></sup>`
    becomes `XIXe`. Any other `<sup>` markup (no inner span, another locale,
    unclosed wrapper) is left untouched.

    Args:
        notice: HTML text of a notice.

    Returns:
        The notice with the superscript wrappers removed.
    """
    begin_words = '<sup><span locale="fr">'
    end_words = '</span></sup>'

    # Loop on the exact wrapper rather than on a bare '<sup>': a '<sup>' that
    # does not match the wrapper can never be consumed and would otherwise
    # keep the loop spinning forever.
    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            # Unclosed wrapper: nothing more can be unwrapped safely.
            break
        origin = notice[begin:end + len(end_words)]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Identical snippets elsewhere in the notice are unwrapped at once.
        notice = notice.replace(origin, result)
    return notice

def handle_ampersand(notice):
    """Decode the `&amp;` HTML entity into a literal ampersand.

    Args:
        notice: text of a notice.

    Returns:
        The notice with every `&amp;` turned into `&`.
    """
    pieces = notice.split('&amp;')
    return '&'.join(pieces)


def handle_italics(notice):
    """Unwrap italic markup, keeping only the inner text.

    Every `<em><span locale="fr">...</span></em>` wrapper is removed and its
    content is kept inline. Any other `<em>` markup (no inner span, another
    locale, unclosed wrapper) is left untouched.

    Args:
        notice: HTML text of a notice.

    Returns:
        The notice with the italic wrappers removed.
    """
    begin_words = '<em><span locale="fr">'
    end_words = '</span></em>'

    # Loop on the exact wrapper rather than on a bare '<em>': an '<em>' that
    # does not match the wrapper can never be consumed and would otherwise
    # keep the loop spinning forever.
    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            # Unclosed wrapper: nothing more can be unwrapped safely.
            break
        origin = notice[begin:end + len(end_words)]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Identical snippets elsewhere in the notice are unwrapped at once.
        notice = notice.replace(origin, result)
    return notice

def replace_chars(notice):
    """Normalise special whitespace found in the notices.

    `&nbsp;` and `&ZeroWidthSpace;` entities become a regular space, while the
    raw zero-width space character (U+200B) is dropped.

    Args:
        notice: text of a notice.

    Returns:
        The notice with those sequences substituted.
    """
    substitutions = (
        ('&nbsp;', ' '),
        ('&ZeroWidthSpace;', ' '),
        ('\u200b', ''),
    )
    for needle, replacement in substitutions:
        notice = notice.replace(needle, replacement)
    return notice

def expand_shortcuts(notice):
    """Expand the French abbreviations used in the notices.

    The substitutions are plain, case-sensitive text replacements applied in
    the order listed; `cath.-chr.` must come before `cath.` so the longer
    form is not broken up by the shorter one.

    Args:
        notice: text of a notice.

    Returns:
        The notice with the abbreviations written out.
    """
    expansions = [
        ('cath.-chr.', 'catholique chrétien'),
        ('cath.', 'catholique'),
        ('prot.', 'protestant'),
        ('comm.', 'commune de'),
        ('distr.', 'district'),
        ('Comm.', 'commune de'),
        ('auj.', 'aujourd\'hui'),
        (' s.', ' siècle'),
        (' hab.', ' habitants'),
        (' franç.', ' français'),
    ]
    for short, full in expansions:
        notice = notice.replace(short, full)
    return notice


def handle_links(notice):
    """Rewrite wiki links as plain text followed by their absolute URL.

    Each `<span class="wikilink"><a href="PATH">LABEL</a></span>` becomes
    `LABEL (https://hls-dhs-dss.ch/PATH)`. When the label is empty once the
    `-&gt;` arrow has been removed, only the URL is kept.

    Spans that do not contain a complete `<a href="...">...</a>` are left as
    they are, and `href` attributes outside a wikilink span are ignored, so
    the function always terminates.

    Args:
        notice: HTML text of a notice.

    Returns:
        The notice with the wiki links rewritten.
    """
    begin_words = '<span class="wikilink">'
    end_words = '</span>'
    base_url = "https://hls-dhs-dss.ch"

    # Position from which the next span is searched; it only moves forward,
    # which guarantees progress even when a span cannot be rewritten.
    cursor = 0
    while True:
        begin = notice.find(begin_words, cursor)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            break
        end += len(end_words)
        origin = notice[begin:end]

        try:
            # Read the href from inside this span, not from the first href of
            # the whole notice, so an earlier anchor cannot leak its URL here.
            href_begin = origin.index('"', origin.index('href')) + 1
            href_end = origin.index('"', href_begin)
            link_content_begin = origin.index('>', origin.index('<a ')) + 1
            link_content_end = origin.index('</a>')
        except ValueError:
            # Malformed span: keep it untouched and look further.
            cursor = end
            continue

        url = base_url + origin[href_begin:href_end]
        link_content = origin[link_content_begin:link_content_end].strip()
        link_content = link_content.replace('-&gt;', '')
        if link_content:
            replacement = link_content + ' (' + url + ')'
        else:
            replacement = url

        notice = notice[:begin] + replacement + notice[end:]
        cursor = begin + len(replacement)

    return notice


def expand_name(notice, name):
    """Replace single-letter abbreviations in a notice by the place name.

    Every capital letter that follows a whitespace and is followed by a dot
    is looked up; its ` X. ` and ` X.,` occurrences are replaced by the name
    surrounded by spaces. Finally `-X.`, where X is the first character of
    the name, is replaced by `-` followed by the full name.

    NOTE(review): any capital initial is expanded, not only the one matching
    the name, and the comma of ` X.,` is replaced by a space — confirm that
    both are intended.

    Args:
        notice: text of a notice.
        name: name of the place the notice describes.

    Returns:
        The notice with the abbreviations expanded.
    """
    padded_name = ' ' + name + ' '
    # Duplicated hits are deliberately kept: a later pass can catch an
    # occurrence that the previous non-overlapping replacement left behind.
    for initial in re.findall(r'\s([A-Z])\.', notice):
        for abbreviated in (f' {initial}. ', f' {initial}.,'):
            notice = notice.replace(abbreviated, padded_name)
    hyphenated_initial = f'-{name[0:1]}.'
    return notice.replace(hyphenated_initial, f'-{name}')

def handle_notice(name, notice):
    """Run the whole cleaning pipeline on one notice.

    The `wikiinternallink` class is first renamed to `wikilink` so that both
    kinds of links go through the same handling, then the cleaning steps are
    applied in a fixed order and the result is stripped of surrounding
    whitespace.

    Args:
        name: name of the place the notice describes.
        notice: raw HTML of the notice.

    Returns:
        The cleaned notice.
    """
    text = notice.replace('wikiinternallink', 'wikilink')

    # Order matters: each step works on the output of the previous one.
    steps = (
        handle_italics,
        replace_chars,
        handle_exponent,
        handle_ampersand,
        expand_shortcuts,
        handle_links,
    )
    for step in steps:
        text = step(text)

    return expand_name(text, name).strip()

In [5]:
# Clean each notice and write it back to hls.place, one UPDATE per row.
# NOTE(review): the statement is built by string interpolation; a parameterized
# query would be safer if the db helper supports one — confirm against geovpylib.
eta.begin(len(places), "Cleaning notices")
for _, place in places.iterrows():
    notice_clean = handle_notice(place['name'], place['notice_html'])
    # Double the single quotes so the text is a valid SQL string literal.
    notice_sql = notice_clean.replace("'", "''")
    db.execute(f"""
        update hls.place
            set notice = '{notice_sql}'
        where id = {place['id']};
    """)
    eta.iter()
eta.end()

Cleaning notices - 5351 iterations in 00h02m38s (avg of 00h00m00s/iter)               


___