# Parse HLS themes notices

The overall goal is to "clean" the notices: strip HTML markup, rewrite links as plain text followed by their URL, expand abbreviations, etc.

In [1]:
import re
import geovpylib.database as db
from gmpykit.ipython import infos
import gmpykit as kit


# Progress/ETA reporter; it is driven by the notice-cleaning loop at the end of the notebook.
eta = kit.Eta()
# Connect to the "switzerland_and_beyond" database on the YELLOW server (prints a confirmation line).
# NOTE(review): execute=True presumably enables real execution of statements — confirm against geovpylib.
db.connect_yellow('switzerland_and_beyond', execute=True)

[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


## Fetch the data

In [2]:
# Load every row of hls.theme; rows are later iterated with .iterrows(),
# so the result is presumably a pandas DataFrame — confirm against geovpylib.
themes = db.query('select * from hls.theme')
# Print the shape and a short extract of the loaded table.
infos(themes)

Shape:  (3089, 5) - extract:


Unnamed: 0,id,url,name,notice_html,notice
0,2721,https://hls-dhs-dss.ch/fr/articles/007779,Grêle,Précipitation de grains de glace de 5 à 50&nbs...,
1,2734,https://hls-dhs-dss.ch/fr/articles/014889,"Gachnang, affaire de",Hektor von Beroldingen ayant essayé de ramener...,
2,2889,https://hls-dhs-dss.ch/fr/articles/026838,"Ringgenberg, affaire de",Lorsque Petermann von Ringgenberg devint seign...,
3,2890,https://hls-dhs-dss.ch/fr/articles/048330,Nidwaldner Volksblatt,"Ancien journal du canton de Nidwald, le <em><s...",
4,285,https://hls-dhs-dss.ch/fr/articles/017358,Mammelus,Surnom donné aux Genevois - par analogie aux m...,


## Update the table

In [3]:
# Make sure the target column exists before cleaned notices are written into it.
# `themes` is loaded from hls.theme and the cleaning loop updates hls.theme.notice,
# so the column must be added to hls.theme (the statement used to target hls.person,
# which would leave hls.theme without the column and make the later UPDATE fail).
if "notice" not in themes.columns:
    db.connect_yellow('switzerland_and_beyond', execute=True)
    db.execute('alter table hls.theme add notice varchar;')
    # NOTE(review): reconnecting with execute=False presumably switches execution off again;
    # if so, the UPDATE loop further down needs execute=True to be restored — confirm.
    db.connect_yellow('switzerland_and_beyond', execute=False)

## Clean the HTML

In [46]:
def handle_exponent(notice):
    """Unwrap superscripts written as ``<sup><span locale="fr">…</span></sup>``.

    The wrapper tags are dropped and the inner text is kept in place
    (``XIX<sup><span locale="fr">e</span></sup>`` becomes ``XIXe``).

    Any other ``<sup>`` markup (for instance a plain ``<sup>2</sup>``) or an
    unterminated wrapper is left untouched. The previous implementation looped
    on ``'<sup>' in notice`` and swallowed the lookup error, so such input made
    the function spin forever.

    Args:
        notice: HTML fragment of a notice.

    Returns:
        The notice with every complete wrapper removed.
    """
    begin_words = '<sup><span locale="fr">'
    end_words = '</span></sup>'

    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            # Opening wrapper without a closing one: nothing more can be unwrapped.
            break
        end += len(end_words)
        origin = notice[begin:end]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Every pass strictly shortens the text, so the loop always terminates.
        notice = notice.replace(origin, result)
    return notice

def handle_ampersand(notice):
    """Decode the ``&amp;`` HTML entity into a literal ``&``.

    Args:
        notice: HTML fragment of a notice.

    Returns:
        The notice with every ``&amp;`` replaced by ``&``.
    """
    # Splitting on the entity and re-joining with "&" is equivalent to a plain replace.
    pieces = notice.split('&amp;')
    return '&'.join(pieces)


def handle_italics(notice):
    """Unwrap italics written as ``<em><span locale="fr">…</span></em>``.

    The wrapper tags are dropped and the inner text is kept in place.

    Any other ``<em>`` markup (for instance a plain ``<em>texte</em>``) or an
    unterminated wrapper is left untouched. The previous implementation looped
    on ``'<em>' in notice`` and swallowed the lookup error, so such input made
    the function spin forever.

    Args:
        notice: HTML fragment of a notice.

    Returns:
        The notice with every complete wrapper removed.
    """
    begin_words = '<em><span locale="fr">'
    end_words = '</span></em>'

    while True:
        begin = notice.find(begin_words)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            # Opening wrapper without a closing one: nothing more can be unwrapped.
            break
        end += len(end_words)
        origin = notice[begin:end]
        result = origin.replace(begin_words, '').replace(end_words, '')
        # Every pass strictly shortens the text, so the loop always terminates.
        notice = notice.replace(origin, result)
    return notice

def replace_chars(notice):
    """Normalise special spacing characters found in the notices.

    ``&nbsp;`` and ``&ZeroWidthSpace;`` entities each become a single space,
    while the raw zero-width space character (U+200B) is removed.

    Args:
        notice: HTML fragment of a notice.

    Returns:
        The notice with the substitutions applied, in the order listed above.
    """
    substitutions = (
        ('&nbsp;', ' '),
        ('&ZeroWidthSpace;', ' '),
        ('\u200b', ''),
    )
    for needle, replacement in substitutions:
        notice = notice.replace(needle, replacement)
    return notice

def expand_shortcuts(notice):
    """Expand the French abbreviations used in the notices.

    The substitutions are plain substring replacements applied in the order
    listed; ``cath.-chr.`` must come before ``cath.`` so the longer form is
    not broken up by the shorter one.

    Args:
        notice: Text of a notice.

    Returns:
        The notice with the known abbreviations spelled out.
    """
    expansions = [
        ('cath.-chr.', 'catholique chrétien'),
        ('cath.', 'catholique'),
        ('prot.', 'protestant'),
        ('comm.', 'commune de'),
        ('auj.', "aujourd'hui"),
        (' s.', ' siècle'),
    ]
    for short_form, long_form in expansions:
        notice = notice.replace(short_form, long_form)
    return notice


def handle_links(notice):
    """Rewrite HLS wiki links as plain text followed by their absolute URL.

    Each ``<span class="wikilink"><a href="/path">label</a></span>`` becomes
    ``label (https://hls-dhs-dss.ch/path)``. When the label is empty, or is
    only the ``-&gt;`` arrow, the bare URL is emitted instead.

    Fixes over the previous implementation:

    * The href is read from the span being replaced, not from the first
      ``href`` found anywhere in the notice, so a plain ``<a href>`` located
      before a wikilink no longer leaks its URL into the replacement.
    * Spans that cannot be parsed (no href, no ``<a ``, no ``</a>`` before
      the first ``</span>``) and hrefs outside any wikilink span are left
      untouched. The loop used to run on ``'href' in notice`` while silently
      swallowing every error, which never terminated on such input.

    Args:
        notice: HTML fragment of a notice.

    Returns:
        The notice with every well-formed wikilink span replaced.
    """
    begin_words = '<span class="wikilink">'
    end_words = '</span>'
    base_url = "https://hls-dhs-dss.ch"

    cursor = 0  # position from which the next wikilink span is searched
    while True:
        begin = notice.find(begin_words, cursor)
        if begin == -1:
            break
        end = notice.find(end_words, begin)
        if end == -1:
            break
        end += len(end_words)
        origin = notice[begin:end]

        try:
            href_begin = origin.index('"', origin.index('href')) + 1
            href_end = origin.index('"', href_begin)
            link_content_begin = origin.index('>', origin.index('<a ')) + 1
            link_content_end = origin.index('</a>')
        except ValueError:
            # Malformed span: keep it as is and resume the scan after it.
            cursor = end
            continue

        link_content = origin[link_content_begin:link_content_end].strip()
        link_content = link_content.replace('-&gt;', '')
        url = base_url + origin[href_begin:href_end]
        replacement = f'{link_content} ({url})' if link_content else url

        notice = notice[:begin] + replacement + notice[end:]
        cursor = begin + len(replacement)

    return notice

def handle_notice(name, notice):
    """Run the full cleaning pipeline on one notice.

    The ``wikiinternallink`` class name is first rewritten to ``wikilink`` so
    that both kinds of link go through the same link handler, then each
    cleaning step is applied in a fixed order and the result is stripped of
    surrounding whitespace.

    Args:
        name: Name of the theme; accepted for the caller's convenience but not
            used by the cleaning itself.
        notice: Raw HTML of the notice.

    Returns:
        The cleaned notice text.
    """
    cleaned = notice.replace('wikiinternallink', 'wikilink')

    # Order matters: the steps are applied exactly in this sequence.
    pipeline = (
        handle_italics,
        replace_chars,
        handle_exponent,
        handle_ampersand,
        expand_shortcuts,
        handle_links,
    )
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned.strip()

In [49]:
# Clean every notice and store the result in hls.theme.notice, one UPDATE per row.
eta.begin(len(themes), "Cleaning notices")
for _, theme in themes.iterrows():
    # Single quotes are doubled so the text can sit inside a SQL string literal.
    # NOTE(review): the statement is built by string interpolation; if db.execute
    # supports bound parameters, prefer them — confirm against geovpylib.
    notice_sql = handle_notice(theme['name'], theme['notice_html']).replace("'", "''")
    theme_id = theme['id']
    db.execute(f"""
        update hls.theme
            set notice = '{notice_sql}'
        where id = {theme_id};
    """)
    eta.iter()
eta.end()

Cleaning notices - 3089 iterations in 00h01m29s (avg of 00h00m00s/iter)               


___