# Tutorial
This tutorial explains how named entities can be normalized in a file-based workflow. The tutorial and the accompanying files illustrate the normalization using the two named entities (NEs) "la Haye" and "la Haïe", which are eventually both normalized to "The Hague".

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Settings

In [2]:
# Which kind of named entity to process: 'places', "works" or "persons".
type_of_NE = 'places'

# Number of characters to display before and after the named entity.
character_margin = 50

### Loading

In [3]:
# Load the pickled dataframe of texts.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
texts_df = pd.read_pickle("data/processed/texts.p")

In [4]:
# Show the first three rows as a quick check of the loaded columns.
texts_df.head(3)

Unnamed: 0,nde,ndf,text,places,persons,works,filename,title,volume,issue,journal_title,author,country,language,date,topics
0,E1,,N°. xv . Le Lundi 18. de Juin 1714. pruntons l...,"{'la Haye': {'norm': '', 'num': 2}, 'Montpelli...","{'H. Scheurleer': {'norm': 'Scheurleer, Henri'...",{},mws.6497.xml,N°. XV.,1,15,Le Censeur ou Caractères des Mœurs de la Haye,Anonym [Jean Rousset de Missy / Nicolas de Gue...,Frankreich,French,1715 [1714],{Manners and Customs}
1,E2,,A la vérité on se lasse d’entendre un donneur ...,{},{},{},mws.6497.xml,N°. XV.,1,15,Le Censeur ou Caractères des Mœurs de la Haye,Anonym [Jean Rousset de Missy / Nicolas de Gue...,Frankreich,French,1715 [1714],{Manners and Customs}
2,E2,MT,Em Boileau pour entamer ce Discours-ci. Il me ...,"{'la Haïe': {'norm': '', 'num': 2}, 'Paris': {...","{'Boileau': {'norm': 'Boileau-Despréaux, Nicol...",{},mws.6497.xml,N°. XV.,1,15,Le Censeur ou Caractères des Mœurs de la Haye,Anonym [Jean Rousset de Missy / Nicolas de Gue...,Frankreich,French,1715 [1714],{Manners and Customs}


In [5]:
# Count how often every named entity occurs in the selected annotation column.
# A spelling that already carries a normalization is counted under its
# normalized name; every other spelling is counted under the spelling itself.
#
# The counts are collected in a plain dict and converted once at the end:
# a bare pd.Series() has no explicit dtype, and growing a Series item by item
# is slow.
NE_counts = {}
for NEs in texts_df[type_of_NE]:
    for NE, NE_info in NEs.items():
        norm = NE_info.get('norm')
        # A missing, empty or non-string (e.g. NaN) 'norm' means "not normalized yet".
        name = norm if isinstance(norm, str) and norm else NE
        NE_counts[name] = NE_counts.get(name, 0) + 1

# dict insertion order is kept, so entities appear in order of first occurrence.
NE_series = pd.Series(NE_counts, dtype='int64')

print('Number of unique names entities: ' + str(len(NE_series)))
print('Number of names entities occurring more than once: ' + str(int((NE_series > 1).sum())))
print('Number of names entities occurring more than 5 times: ' + str(int((NE_series > 5).sum())))

Number of unique names entities: 1657
Number of names entities occurring more than once: 564
Number of names entities occurring more than 5 times: 154


### Storing all unnormalized named entities
In this file, there is an additional column titled 'Unclear'. If a named entity is unclear, add an "x" in this column.

In [6]:
# Build a table with one row per named entity: its count plus an empty
# 'Unclear' column that is filled in by hand, then write it to disk.
NE_df = NE_series.to_frame(name="Counts").assign(Unclear="")
NE_df.to_csv('NE_names.csv')
NE_df.head(5)

Unnamed: 0,Counts,Unclear
la Haye,45,
Montpellier,4,
Groningue,6,
Amsterdam,66,
la Haïe,2,


# Interaction Necessary:
Open the file "NE_names.csv" in Excel and browse through the list of unnormalized named entities. If any of the named entities is unclear or ambiguous, add an "x" in the corresponding column "Unclear". Save the changed Excel file as "NE_names_marked.csv".
# ______________________

In [7]:
# Read the manually marked file back in; the first CSV column (the named entity) becomes the index.
NE_marked=pd.read_csv('NE_names_marked.csv',index_col=0)
NE_marked.head(5)

Unnamed: 0,Counts,Unclear
la Haye,45,x
Montpellier,4,
Groningue,6,
Amsterdam,66,
la Haïe,2,x


In [8]:
# Collect the named entities that were marked in the 'Unclear' column.
# The marks are typed by hand, so surrounding blanks and an upper-case "X"
# are accepted as well. Empty cells are read as NaN and never match.
is_marked = NE_marked["Unclear"].astype(str).str.strip().str.lower() == "x"
list_of_unclear_NEs = list(NE_marked.loc[is_marked].index)
list_of_unclear_NEs

['la Haye', 'la Haïe']

In [9]:
def _context_snippet(full_text, NE, margin):
    """Return `NE` with up to `margin` characters of text before and after it."""
    # str.find returns -1 when the NE does not occur verbatim in the text;
    # fall back to the start of the text instead of computing offsets from -1.
    position = max(full_text.find(NE), 0)
    return full_text[max(0, position - margin):position + len(NE) + margin]


# One record per text in which an unclear named entity occurs, either under its
# own spelling or as the existing normalization of another spelling.
# The records are collected in a list and turned into a dataframe once:
# DataFrame.append was removed in pandas 2.0 and made the loop quadratic.
records = []
for unclear_NE in list_of_unclear_NEs:
    # Iterate over the index labels so that 'old_index' can be used to look
    # the row up again later, whatever index texts_df has.
    for ind, NEs, full_text in zip(texts_df.index, texts_df[type_of_NE], texts_df['text']):
        if unclear_NE in NEs:
            # The text uses the unclear spelling itself; keep its current
            # normalization (possibly empty) for reference.
            records.append({
                'old_index': ind,
                'NE': unclear_NE,
                'normalized_NE': NEs[unclear_NE].get('norm'),
                'text': _context_snippet(full_text, unclear_NE, character_margin),
            })
        else:
            # Another spelling in this text is already normalized to the unclear name.
            for NE, NE_info in NEs.items():
                if NE_info.get('norm') == unclear_NE:
                    records.append({
                        'old_index': ind,
                        'NE': NE,
                        'normalized_NE': unclear_NE,
                        'text': _context_snippet(full_text, NE, character_margin),
                    })

# Passing the columns explicitly keeps the layout stable even without records.
output_df = pd.DataFrame(records, columns=['old_index', 'NE', 'normalized_NE', 'text'])

In [10]:
# Write the collected occurrences to disk, using '|' as the column separator.
output_df.to_csv('NEs_to_normalize.csv',sep='|')

# Interaction Necessary:
Open the file "NEs_to_normalize.csv" in Excel, using only the symbol "|" as separator. This file contains every occurrence of every named entity that was marked as unclear or ambiguous, together with the text surrounding it. For reference, it also lists the existing normalization of the named entity, if available. For every occurrence, add the normalization in the column "normalized_NE". Save the changed Excel file as "NEs_normalized.csv".
# ______________________

In [11]:
# Read the manually completed file ('|'-separated); its first column is used as the index.
normalized_df=pd.read_csv('NEs_normalized.csv',sep='|',index_col=0)
normalized_df.head(3)

Unnamed: 0,old_index,NE,normalized_NE,text
0,0,la Haye,The Hague,"ement . On trouve chez H. Scheurleer , Librair..."
1,284,la Haye,The Hague,N°. xli . Le Lundi 17. de Décembre 1714. A la ...
2,328,la Haye,The Hague,N°. xxiii . Le Lundi 13. d’Août 1714. A la Hay...


### Add normalization to the dataframe

In [12]:
# Write the manually entered normalizations back into the annotation dicts of texts_df.
# Iterating over the column values avoids depending on normalized_df's index labels.
for old_index, NE, normalized_NE in zip(normalized_df['old_index'],
                                        normalized_df['NE'],
                                        normalized_df['normalized_NE']):
    # Cells left empty in the CSV are read back as NaN; skip them so that no
    # NaN ends up as a 'norm' value (it would later be counted as an entity of its own).
    if pd.isna(normalized_NE):
        continue
    # .at returns the dict stored in that cell, which is then updated in place.
    texts_df.at[old_index, type_of_NE][NE]['norm'] = normalized_NE

In [13]:
# Recount the named entities now that the normalizations have been added.
# A spelling that carries a normalization is counted under its normalized
# name; every other spelling is counted under the spelling itself.
#
# The counts are collected in a plain dict and converted once at the end:
# a bare pd.Series() has no explicit dtype, and growing a Series item by item
# is slow.
NE_counts = {}
for NEs in texts_df[type_of_NE]:
    for NE, NE_info in NEs.items():
        norm = NE_info.get('norm')
        # A missing, empty or non-string (e.g. NaN) 'norm' means "not normalized yet".
        name = norm if isinstance(norm, str) and norm else NE
        NE_counts[name] = NE_counts.get(name, 0) + 1

# dict insertion order is kept, so entities appear in order of first occurrence.
NE_series = pd.Series(NE_counts, dtype='int64')

print('Number of unique names entities: ' + str(len(NE_series)))
print('Number of names entities occurring more than once: ' + str(int((NE_series > 1).sum())))
print('Number of names entities occurring more than 5 times: ' + str(int((NE_series > 5).sum())))

Number of unique names entities: 1655
Number of names entities occurring more than once: 562
Number of names entities occurring more than 5 times: 153


One can now save back the normalization to the pickle file.

In [14]:
# Write texts_df to a separate pickle file; the input file "texts.p" is not overwritten.
texts_df.to_pickle("data/processed/texts_normalized.p")