In [1]:
import unicodedata

import sys

sys.path.append("../../../src")
sys.path.append("../../../scripts")
from inception_fishing import Annotation



In [2]:
from s2_prepare_articles import *
import spacy

In [3]:
spacy_tokenizer = spacy.load("fr_core_news_sm")

# Documents' text preparation
## Normalizing texts and investigating text issues

In [4]:
# Evaluating normalized text difference with original text:

polities_dtf["normalized_text"] = polities_dtf.document.apply(lambda d: unicodedata.normalize("NFKC",d.text))

polities_dtf["len_normalized_text"] = polities_dtf["normalized_text"].apply(len)
polities_dtf["len_unnormalized_text"] = polities_dtf.document.apply(lambda d: len(d.text))
polities_dtf["len_diff_normalized_text"] = polities_dtf["len_unnormalized_text"] - polities_dtf["len_normalized_text"]
polities_dtf["len_diff_normalized_text"].value_counts()

 0    4411
-4       2
-2       1
Name: len_diff_normalized_text, dtype: int64

In [5]:
# Character normalization investigation -> NFKC is the way to go :-)

# Zoug had a very big 2529 char diff using NFKD
lendif2529NFKD = polities_dtf[polities_dtf["hds_article_id"]=="007373"]

def investigate_norm_len_diff(dtf, i=0):
    norm_text = dtf["normalized_text"].iloc[i]
    unnorm_text = dtf.document.iloc[i].text

    norm_chars = set(norm_text)
    unnorm_chars = set(unnorm_text)

    unnorm_missing_chars = [ c for c in unnorm_chars if c not in norm_chars]
    norm_missing_chars = [ c for c in norm_chars if c not in unnorm_chars]

    #return (norm_text, unnorm_text, norm_chars, unnorm_chars, norm_missing_chars, unnorm_missing_chars)
    return (norm_missing_chars, unnorm_missing_chars)

investigate_norm_len_diff(lendif2529NFKD)

([], ['\xa0'])

In [6]:
lendifNFKC = polities_dtf[polities_dtf["len_diff_normalized_text"]!=0]

"""Still some problem with some minor characters... we'll come to it when we need ta"""
polities_dtf.loc[polities_dtf.polity_id.apply(lambda i: i in ["001256-c", "001321-c", "007384-ct"]),:]


[investigate_norm_len_diff(lendifNFKC, i) for i in range(lendifNFKC.shape[0])]

[([], ['…', '\xa0']), (['⁄'], ['⅔', '\xa0', '⅓']), ([], ['…', '\xa0'])]

In [7]:
def normalize_unicode_text(text):
    """unicode normalization NFKD removes accents in characters -> NFKC is the way to go :-)
    
        /!\\ /!\\ /!\\ normalizing text yields different text lengths for 3 articles (out of 4000), use with caution /!\\/!\\/!\\
        but it is needed for proper tokenization/prediction/learning"""

    return unicodedata.normalize("NFKC",text)

In [8]:
grandson_dtf = polities_dtf[polities_dtf.toponym=="Grandson"]
grandson_article = grandson_dtf.article.iloc[0]
grandson_document = grandson_dtf.document.iloc[0]
doc = spacy_tokenizer(normalize_unicode_text(grandson_document.text))

In [9]:
grandson_tokens = [token for token in doc if token.text =="Grandson"]

grandson_tokens

[Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson,
 Grandson]

In [10]:
seigneurs_tokens = [token for token in doc if token.text =="seigneurs"]
[seigneurs_tokens[0].nbor(i) for i in range(-5,5)]

[Liée, à, l', apparition, des, seigneurs, de, Grandson, et, de]

In [11]:
nb_prev = 2

grandson_tokens_3g = [[t.nbor(i) for i in range(-nb_prev,1)] for t in grandson_tokens]

grandson_tokens_3g

[[commune, de, Grandson],
 [importante, de, Grandson],
 [attesté, à, Grandson],
 [seigneurs, de, Grandson],
 [,, les, Grandson],
 [spirituel, ,, Grandson],
 [ville, de, Grandson],
 [cimetière, de, Grandson],
 [commun, de, Grandson],
 [introduites, à, Grandson],
 [occidentale, de, Grandson],
 [traversée, de, Grandson],
 [), ,, Grandson],
 [population, de, Grandson],
 [qui, relie, Grandson]]

In [12]:
"""
Strategy rule-based annotations:
- tokenize list of articles
    - use doc text
    - normalize
- frequency table of N predecessor words
    -> identify the ones that are relevant statuswords
- extract all sequences of the form statuswords-X-X-toponym
- identify the relevant sequences representing an entity
- identify which entity each sequence corresponds to
    -> each statuswords refers to a list of possible entity type
    -> entity type + statusword

"""

'\nStrategy rule-based annotations:\n- tokenize list of articles\n    - use doc text\n    - normalize\n- frequency table of N predecessor words\n    -> identify the ones that are relevant statuswords\n- extract all sequences of the form statuswords-X-X-toponym\n- identify the relevant sequences representing an entity\n- identify which entity each sequence corresponds to\n    -> each statuswords refers to a list of possible entity type\n    -> entity type + statusword\n\n'

In [13]:
polities_dtf.drop(
    ["normalized_text", "len_normalized_text", "len_unnormalized_text",
    "len_diff_normalized_text", "len_diff_normalized_text"],
    axis=1,
    inplace=True
)
if False:

    def normalize_doc_text(d):
        """ /!\\ use with caution, see above"""
        d.text = normalize_unicode_text(d.text)

    polities_dtf.document.apply(normalize_doc_text)
    ""

In [14]:
sampled_articles_ids = set(sampled_articles_ids)
#sampled_polities_dtf = polities_dtf[polities_dtf.hds_article_id.apply(lambda id: id in sampled_articles_ids)]

In [15]:
additional_columns = ["article", "document"]
articles_dtf = get_articles_dtf_from_polities_dtf(polities_dtf, additional_columns)

# Propositions to improve code quality

...

#  Polity recognition

## Identifying toponyms' tokens

In [16]:
def add_tokenized_text(dtf, tokenizer):
    """
    takes a dtf with "document" column
    + adds the following columns:
        - tokens: spacy tokenization of text column
    """
    dtf["tokens"] = dtf.document.apply(lambda d: tokenizer(normalize_unicode_text(d.text)))
    return dtf

def add_toponyms(dtf, tokenizer):
    """
    takes a dtf with a "toponym" column
    + adds the following columns:
        - tokenized_toponym: spacy tokenization texts of toponym
    """
    dtf["tokenized_toponym"] = dtf.toponym.apply(lambda t: set([tok.text for tok in tokenizer(normalize_unicode_text(t))]))

In [17]:
# take into account the fact that toponym might span multiple tokens
#articles_dtf["tokenized_toponym"] = articles_dtf.toponym.apply(lambda t: set([tok.text for tok in spacy_tokenizer(normalize_unicode_text(t))]))
add_toponyms(articles_dtf, spacy_tokenizer)
toponym_tokens = articles_dtf["tokenized_toponym"].explode()
#[t for t in utoponym_tokens if len(t)==4]
toponym_tokens_value_counts = toponym_tokens.value_counts()
toponym_tokens_value_counts[toponym_tokens_value_counts>1].shape
toponym_tokens_value_counts.shape


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
toponym_tokens_value_counts

-             180
La             45
bei            33
Les            25
Le             25
             ... 
Onex            1
Liesberg        1
Prangins        1
Weesen          1
Tarentaise      1
Name: tokenized_toponym, Length: 3768, dtype: int64

In [18]:
not_toponym_tokens = {"'",
 '-',
 '/',
 "L'",
 'La',
 'Lac',
 'Le',
 'Les',
 'Nord',
 'S',
 'See',
 'Sud',
 'Sur',
 'am',
 'an',
 'bei',
 'ch',
 "d'",
 'da',
 'dans',
 'de',
 'der',
 'des',
 'di',
 'du',
 'et',
 'im',
 'in',
 'l',
 'la',
 'le',
 'les',
 'près',
 'sur',
 'zum',
 "vers"
}
ambiguous_toponym_tokens={
 'Au',
 'Bois',
 'Col',
 'Dieu',
 'Eaux',
 'Ile',
 'Part',
 'Pays',
 'Pont',
 'Port',
 'Rue',
 "vaudois",
 "helvétique",
}

In [19]:



def normalize_toponyms(dtf, not_toponym_tokens_texts, ambiguous_toponym_tokens_texts):
    """
    takes a dtf coming from add_toponyms()
    + adds the following columns:
        - loose_normalized_tokenized_toponym: all toponym tokens that are generally toponym (exlcuding "les", "la", etc...)
        - strict_normalized_tokenized_toponym: same as loose_normalized_tokenized_toponym, excluding in addition ambiguous toponym tokens ("eaux", "ile", "bois", "col", etc...)
    + returns:
        - normalized_toponym_tokens: set of all strict_normalized_tokenized_toponym
    """
    dtf["loose_normalized_tokenized_toponym"] = [[s for s in texts if s not in not_toponym_tokens_texts] for texts in dtf["tokenized_toponym"]]
    dtf["strict_normalized_tokenized_toponym"] = [[s for s in texts if s not in ambiguous_toponym_tokens_texts] for texts in dtf["loose_normalized_tokenized_toponym"]]
    normalized_toponym_tokens = set(dtf["strict_normalized_tokenized_toponym"].explode())
    return normalized_toponym_tokens

In [20]:
#articles_dtf["loose_normalized_tokenized_toponym"] = [[s for s in texts if s not in not_toponym_tokens] for texts in articles_dtf["tokenized_toponym"]]
#articles_dtf["strict_normalized_tokenized_toponym"] = [[s for s in texts if s not in ambiguous_toponym_tokens] for texts in articles_dtf["loose_normalized_tokenized_toponym"]]

#normalized_toponym_tokens = set(articles_dtf["strict_normalized_tokenized_toponym"].explode())

normalized_toponym_tokens = normalize_toponyms(articles_dtf, not_toponym_tokens, ambiguous_toponym_tokens)

In [21]:
sampled_articles_dtf = articles_dtf[articles_dtf.hds_article_id.apply(lambda id: id in sampled_articles_ids)].copy()

#sampled_articles_dtf["tokens"] = sampled_articles_dtf.document.apply(lambda d: spacy_tokenizer(normalize_unicode_text(d.text)))
add_tokenized_text(sampled_articles_dtf, spacy_tokenizer)
sampled_articles_dtf.head()

Unnamed: 0,hds_article_id,toponym,geoidentifier,article_title,nbtags,max_level,article,document,polities_ids,nb_polities,tokenized_toponym,loose_normalized_tokenized_toponym,strict_normalized_tokenized_toponym,tokens
43,627,Meierskappel,,Meierskappel,1,10.0,"DhsArticle(fr, 000627, Meierskappel, text loaded)","Document(Meierskappel, 5 annot., text=""Meiersk...",[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"(Meierskappel, \n, Comm, ., LU, ,, distr, ., d..."
77,90,Rüti,ZH,"Rüti ZH, commune",1,10.0,"DhsArticle(fr, 000090, Rüti (ZH, commune), tex...","Document(Rüti ZH, commune, 8 annot., text=""Rüt...",[000090-c],1,{Rüti},[Rüti],[Rüti],"(RütiZ, H, ,, commune, \n, Comm, ., ZH, ,, dis..."
112,2946,Pleigne,,Pleigne,1,10.0,"DhsArticle(fr, 002946, Pleigne, text loaded)","Document(Pleigne, 4 annot., text=""Pleigne\nCom...",[002946-c],1,{Pleigne},[Pleigne],[Pleigne],"(Pleigne, \n, Comm, ., JU, ,, distr, ., de, De..."
234,1713,Villigen,,Villigen,1,10.0,"DhsArticle(fr, 001713, Villigen, text loaded)","Document(Villigen, 3 annot., text=""Villigen\nC...",[001713-c],1,{Villigen},[Villigen],[Villigen],"(Villigen, \n, Comm, ., AG, ,, distr, ., de, B..."
285,297,Diemerswil,,Diemerswil,1,10.0,"DhsArticle(fr, 000297, Diemerswil, text loaded)","Document(Diemerswil, 2 annot., text=""Diemerswi...",[000297-c],1,{Diemerswil},[Diemerswil],[Diemerswil],"(Diemerswil, \n, Comm, ., BE, ,, distr, ., de,..."


In [22]:
def is_token_toponym(token, dtf_row, normalized_toponym_tokens):
    """Checks that a given token is a toponym (either corresponding to any strict toponym, or a loose toponym from the particular article toponym
    """
    return (
        token.text in normalized_toponym_tokens
        or token.text in dtf_row.loose_normalized_tokenized_toponym
    )

## Extracting toponym sequences

In [23]:
def add_toponym_tokens_sequences(dtf, normalized_toponym_tokens, nb_predecessors = 10, nb_successors = 3):
    """takes nb_predecessors, nb_successors and a dtf coming from add_toponyms() and add_tokenized_text()
    + adds the following columns:
        - toponym_tokens: tokens from the text that are identified as a toponym 
        - toponyms_tokens_sequences: for each toponym_token, a sequence according from nb_pred to nb_succ

    Note: are not considered toponyms token sequences:
    - the first token of a text (cannot have an accompanying statusword)
    - a toponym token directly following another toponym token, they are very probably 2 tokens from the same toponym
    """
    # toponym_tokens: tokens in the text that denote a toponym
    dtf["toponym_tokens"] = [
        # handle the case of multiple toponym tokens: in this case only take first token each time
        [
            token for token in row.tokens
            if is_token_toponym(token, row, normalized_toponym_tokens)
            # ensure we're not taking toponym tokens that just follow another toponym 
            and (token.i==0 or not is_token_toponym(token.nbor(-1), row, normalized_toponym_tokens))
        ]
        for k, row in dtf.iterrows()
    ]


    dtf["toponyms_tokens_sequences"] = [
        [
            [t.nbor(i) for i in range(-min(nb_predecessors,t.i),min(nb_successors, len(t.doc)-t.i))]
            for t in row.toponym_tokens
        ]
        for k, row in dtf.iterrows()
    ]


In [24]:
nb_predecessors = 10
nb_successors = 3
add_toponym_tokens_sequences(sampled_articles_dtf, normalized_toponym_tokens, nb_predecessors, nb_successors)


In [25]:
# toponym_sequence_tokens: all tokens present in any toponym sequence
toponym_sequence_tokens = pd.Series([
    [
        #(print(f"nb_successors={nb_successors}, len(t.doc)={len(t.doc)}, t.i={t.i}, len(t.doc)-t.i)+1={(len(t.doc)-t.i)+1}, total={min(nb_successors, len(t.doc)-t.i)}"),
        t.nbor(i)
        for t in row.toponym_tokens
        for i in range(-min(nb_predecessors,t.i),min(nb_successors, len(t.doc)-t.i))
        if (t.i==0 or not is_token_toponym(t.nbor(-1), row, normalized_toponym_tokens))
    ]
    for k, row in sampled_articles_dtf.iterrows()
])
toponym_sequence_tokens = toponym_sequence_tokens.apply(lambda ts: [t.text for t in ts]).explode()
toponym_sequence_tokens_value_counts = toponym_sequence_tokens.value_counts().to_frame()
toponym_sequence_tokens_value_counts.columns = ['toponym_sequence_tokens']
toponym_sequence_tokens_value_counts.to_csv("toponym_sequence_tokens_value_counts.csv", sep="\t")

In [26]:
toponym_sequence_tokens_value_counts

Unnamed: 0,toponym_sequence_tokens
",",3588
de,3111
.,1517
et,1145
la,1112
...,...
annexée,1
1224,1
Création,1
explosion,1


## Identifying statuswords

In [27]:
#pd.set_option('display.max_rows', None)
toponym_sequence_tokens_value_counts[toponym_sequence_tokens_value_counts.toponym_sequence_tokens==2]

statusword_token_text = [
    # lordships
    "seigneur",
    "seigneurs",
    "seigneurie",
    "seigneuries",
    "châtellenie",
    "châtellenies",
    "comte",
    "comtes",
    "comté",
    "comtés",
    "baron",
    "barons",
    "baronnie",
    "baronnies",
    "duc",
    "ducs",
    "duché",
    "duchés",
    # cantons
    "canton",
    "cantons",
    # administrative: district/bailliage/etc
    "district",
    "districts",
    "distr",
    "dizain",
    "dizains"
    "bailliage",
    "bailliages",
    "bailli",
    "baillis",
    # commmunes
    "commune",
    "communes",
    "comm",
    "village",
    "villages",
    "municipalité",
    "municipalités"
    "communauté",
    "communautés",
    "munizipalgemeinde",
    "ortsgemeinden",
    "ville",
    "hameau",
    "hameaux",
    "paroisse",
    "paroisses",
    "bourgade",
    "bourgades",
    "localité",
    "localités",
    "cercle",
    "cercles",
    # ecclesiastical
    "chapitre",
    "prieuré",
    "prieurés",
    "abbaye",
    "abbayes",
    "abbé",
    "abbés",
    "évêque",
    "évêques",
    "évêché",
    "évêchés",
    "diocèse",
    "diocèses",
    "monastère",
    "monastères",
    "avouerie",
    "avoueries",
    "commanderie",
    "commanderies",
    # fuzzy terms
    "région",
    "régions",
    "domaine",
    "domaines",
    "vallon",
    "administration",
    "circonscriptions",

]

ambiguous_statusword_token_text = [
    "vallée",
    "château",
    "châteaux"
    "terres",
    "territoire",
    "église",
    #"dîme", # -> ?
    "juridictions",
    "juridiction",
    "forêt",
    "forêts",
    "possession",
    "possessions",
    "ferme",
    "fermes",
    "suzeraineté",
    #"vestiges",# -> ?
    #"hôpital",

]

In [28]:
toponyms_tokens_sequences = [
    seq
    for toponyms_tokens_sequences in sampled_articles_dtf.toponyms_tokens_sequences
    for seq in toponyms_tokens_sequences
]

In [29]:
def identify_statuswords_toponyms_sequences(dtf, statusword_token_text,
    columns_filter =['hds_article_id', 'toponym', 'geoidentifier', 'article_title', 'polities_ids', 'nb_polities',
        'tokenized_toponym', 'loose_normalized_tokenized_toponym',
        'strict_normalized_tokenized_toponym',
        'statusword_tokens_sequences'
    ]):
    """
    takes a dtf coming from add_toponym_tokens_sequences()
    + adds a column "statusword_tokens_sequences" to dtf containing all the toponyms_tokens_sequences also containing a statusword
    + returns a new dtf statusword_tokens_sequences_dtf with one row per sequence containing at least 1 statusword and 1 toponym
    """
    dtf["statusword_tokens_sequences"] = [
        [
            seq
            for seq in toponyms_tokens_sequences if any(token.text.lower() in statusword_token_text for token in seq)]
            for toponyms_tokens_sequences in dtf.toponyms_tokens_sequences
    ]

    statusword_tokens_sequences_dtf = dtf.explode("statusword_tokens_sequences")
    statusword_tokens_sequences_dtf = statusword_tokens_sequences_dtf[columns_filter].copy()

    statusword_tokens_sequences_dtf = statusword_tokens_sequences_dtf[~statusword_tokens_sequences_dtf.statusword_tokens_sequences.isna()]
    return statusword_tokens_sequences_dtf


In [30]:
statusword_tokens_sequences_dtf = identify_statuswords_toponyms_sequences(sampled_articles_dtf, statusword_token_text)

## Analysing sequences structure
STATUS-XX-TOPONYM

In [31]:
def analyse_statusword_tokens_sequence_single(dtf_row, token_sequence, statusword_index, toponym_index):
    """Analyses a single statusword-toponym combination

    returns a sequence whose first term is the sequence's statusword, and the last word is the sequence's toponym
    """
    sequence = token_sequence[statusword_index:(toponym_index+1)]
    sequence_structure = [
        "STATUS" if token.text.lower() in statusword_token_text else(
        "TOPONYM" if is_token_toponym(token, dtf_row, normalized_toponym_tokens)
        else token.text
        )
        for token in sequence
    ]
    statusword = token_sequence[statusword_index]
    toponym = token_sequence[toponym_index]
    return (statusword, toponym, sequence, sequence_structure)

def analyse_statusword_tokens_sequence(dtf_row, token_sequence):
    """Returns all the possible statusword-toponym combination analyses for a given token sequence
    """
    statusword_indices = [i for i,tok in enumerate(token_sequence) if tok.text.lower() in statusword_token_text]
    #toponym_indices = [i for i,tok in enumerate(token_sequence) if tok.text in normalized_toponym_tokens or tok.text in dtf_row.loose_normalized_tokenized_toponym]
    toponym_indices = [len(token_sequence)-nb_successors] # the toponym is always at the same spot in the sequence
    sequences_analyses = [
        analyse_statusword_tokens_sequence_single(dtf_row, token_sequence, i, j)
        for i in statusword_indices for j in toponym_indices if i<j
    ]
    return sequences_analyses

In [32]:
def analyse_statuswords_toponyms_sequences(dtf):
    """
    takes a dtf coming from identify_statuswords_toponyms_sequences()
    + adds "sequence_analysis" column to dtf (from analyse_statusword_tokens_sequence())
    + returns a new dtf sequences_analyses_dtf with one row per statusword+toponym combination (multiple rows possible for one toponym sequence)"""
    dtf["sequence_analysis"] = [
        analyse_statusword_tokens_sequence(row, row.statusword_tokens_sequences)
        for k, row in dtf.iterrows()
    ]
    sequences_analyses_dtf = dtf.explode("sequence_analysis")
    sequences_analyses_dtf = sequences_analyses_dtf[~sequences_analyses_dtf.sequence_analysis.isna()]
    sequences_analyses_dtf["statusword"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[0])
    sequences_analyses_dtf["sequence_toponym"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[1])
    sequences_analyses_dtf["sequence"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[2])
    sequences_analyses_dtf["sequence_structure"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[3])
    sequences_analyses_dtf["sequence_structure_str"] = sequences_analyses_dtf["sequence_structure"].apply(lambda ss: "-".join(ss))
    return sequences_analyses_dtf

In [33]:
sequences_analyses_dtf = analyse_statuswords_toponyms_sequences(statusword_tokens_sequences_dtf)
sequence_structures = sequences_analyses_dtf["sequence_structure_str"].value_counts()


In [34]:
sequence_structures
sequence_structures.to_frame().to_csv(s2_sequence_structures_counts_csv, sep="\t")
sequence_structures[sequence_structures>3]

STATUS-de-TOPONYM                        495
STATUS-d'-TOPONYM                         70
STATUS-.-de-TOPONYM                       39
STATUS-du-TOPONYM                         25
STATUS-.-TOPONYM                          15
STATUS-de-TOPONYM---TOPONYM               13
STATUS-de-la-TOPONYM                      12
STATUS-de-TOPONYM-,-TOPONYM               11
STATUS-du-STATUS-de-TOPONYM                9
STATUS-des-TOPONYM                         8
STATUS-.-du-TOPONYM                        7
STATUS-.-VD-,-STATUS-.-du-TOPONYM          6
STATUS-.-d'-TOPONYM                        6
STATUS-TOPONYM                             6
STATUS-,-TOPONYM                           5
STATUS-(-TOPONYM                           5
STATUS-de-TOPONYM-(-TOPONYM                5
STATUS-TI-,-STATUS-de-TOPONYM              5
STATUS-.-BE-,-STATUS-.-de-TOPONYM          5
STATUS-de-TOPONYM-à-TOPONYM                5
STATUS-)-de-TOPONYM                        4
STATUS-cathédral-de-TOPONYM                4
STATUS-.-A

In [35]:
sequence_structure = "STATUS-\n-Dizain-du-TOPONYM"

sequence_structures_human_columns = ['toponym', 'article_title', 'polities_ids', "statusword", "sequence", "sequence_structure"]

sequences_analyses_dtf.loc[sequences_analyses_dtf["sequence_structure_str"]==sequence_structure,sequence_structures_human_columns]

Unnamed: 0,toponym,article_title,polities_ids,statusword,sequence,sequence_structure


## Isolating valid statusword-toponym sequences structures

In [36]:
valid_sequence_structures = pd.read_csv(s2_sequence_structures_validation_csv, sep="\t")
valid_sequence_structures = set(valid_sequence_structures[valid_sequence_structures.validity=="yes"].structure)
valid_sequence_structures

{'STATUS---rue-de-TOPONYM',
 'STATUS-.-TOPONYM',
 "STATUS-.-d'-TOPONYM",
 'STATUS-.-de-TOPONYM',
 'STATUS-.-de-TOPONYM---TOPONYM',
 'STATUS-.-de-la-TOPONYM',
 'STATUS-.-du-TOPONYM',
 'STATUS-TOPONYM',
 'STATUS-bernois-de-TOPONYM',
 'STATUS-bénédictine-de-TOPONYM',
 'STATUS-cathédral-de-TOPONYM',
 'STATUS-clunisien-de-TOPONYM',
 'STATUS-commun-de-TOPONYM',
 "STATUS-d'-TOPONYM",
 'STATUS-de-La-TOPONYM',
 'STATUS-de-TOPONYM',
 'STATUS-de-TOPONYM-,-TOPONYM-,-TOPONYM-,-TOPONYM',
 'STATUS-de-TOPONYM-,-TOPONYM-,-TOPONYM-et-TOPONYM',
 'STATUS-de-TOPONYM-et-TOPONYM',
 'STATUS-de-TOPONYM-et-de-TOPONYM',
 "STATUS-de-l'-TOPONYM",
 'STATUS-de-la-TOPONYM',
 'STATUS-de-la-TOPONYM---TOPONYM',
 'STATUS-des-TOPONYM',
 'STATUS-du-Lac-de-TOPONYM',
 'STATUS-du-TOPONYM',
 "STATUS-libre-d'-TOPONYM",
 'STATUS-savoyarde-de-TOPONYM',
 'STATUS-sur-le-TOPONYM',
 'STATUS-électoral-de-TOPONYM'}

In [37]:
sequence_structures.shape

(832,)

In [38]:
def validate_statuswords_toponyms_sequences(dtf, valid_sequence_structures):
    """
    takes valid_sequence_structures set of strings and a dtf coming from explode_statuswords_toponyms_sequences()
    + returns a new dtf valid_sequences_dtf containing the valid statuswords_toponyms_sequences
    """
    valid_sequences_dtf = dtf[dtf.sequence_structure_str.apply(lambda struct: struct in valid_sequence_structures)].copy()
    return valid_sequences_dtf

In [39]:
valid_sequences_dtf = validate_statuswords_toponyms_sequences(sequences_analyses_dtf, valid_sequence_structures)
valid_sequences_dtf.shape

(727, 16)

In [40]:
with open(s2_statusword_to_typology_json) as f:
    statusword_keys_dict = json.load(f)

statusword_to_typology_dict = {
    statusword : t[1] 
    for t in statusword_keys_dict
    for statusword in t[0]
}

statusword_to_hdstag_dict = {
    statusword : t[2] 
    for t in statusword_keys_dict
    for statusword in t[0]
}

## Getting polities_dtf toponyms' tokens

In [41]:
polities_dtf[polities_dtf.typology=="baillage"].tail()

Unnamed: 0,polity_id,hds_tag,canonic_title,typology,toponym,geoidentifier,hds_article_id,article_title,nbtags,level,max_level,article,document
3885,008434-b,"Entités politiques / Bailliage, châtellenie",baillage de Aarberg,baillage,Aarberg,,8434,"Aarberg seigneurie, district",3,20.0,20.0,"DhsArticle(fr, 008434, Aarberg (seigneurie, di...","Document(Aarberg seigneurie, district, 2 annot..."
3890,007528-b,"Entités politiques / Bailliage, châtellenie",baillage de Büron,baillage,Büron,,7528,Büron seigneurie,2,20.0,20.0,"DhsArticle(fr, 007528, Büron (seigneurie), tex...","Document(Büron seigneurie, 2 annot., text=""Bür..."
3901,008295-b,"Entités politiques / Bailliage, châtellenie",baillage de Lenzbourg,baillage,Lenzbourg,,8295,"Lenzbourg comté, district",2,20.0,20.0,"DhsArticle(fr, 008295, Lenzbourg (comté, distr...","Document(Lenzbourg comté, district, 11 annot.,..."
3903,008296-b,"Entités politiques / Bailliage, châtellenie",baillage de Baden,baillage,Baden,,8296,"Baden comté, district",3,20.0,20.0,"DhsArticle(fr, 008296, Baden (comté, district)...","Document(Baden comté, district, 8 annot., text..."
3909,007619-b,"Entités politiques / Bailliage, châtellenie",baillage de Val-de-Travers,baillage,Val-de-Travers,,7619,Val-de-Travers vallée,2,20.0,20.0,"DhsArticle(fr, 007619, Val-de-Travers (vallée)...","Document(Val-de-Travers vallée, 11 annot., tex..."


In [42]:
polities_dtf["tokenized_toponym"] = polities_dtf.toponym.apply(lambda t: spacy_tokenizer(t))
polities_dtf["tokenized_toponym_texts"] = polities_dtf.tokenized_toponym.apply(lambda tokens: [t.text for t in tokens])

polities_dtf["tokenized_toponym"].apply(len).value_counts()
polities_dtf[polities_dtf["tokenized_toponym"].apply(len)>1]

Unnamed: 0,polity_id,hds_tag,canonic_title,typology,toponym,geoidentifier,hds_article_id,article_title,nbtags,level,max_level,article,document,tokenized_toponym,tokenized_toponym_texts
12,000342-c,Entités politiques / Commune,Niederried bei Interlaken,,Niederried bei Interlaken,,000342,Niederried bei Interlaken,1,10.0,10.0,"DhsArticle(fr, 000342, Niederried bei Interlak...","Document(Niederried bei Interlaken, 2 annot., ...","(Niederried, bei, Interlaken)","[Niederried, bei, Interlaken]"
22,001279-c,Entités politiques / Commune,Neuhausen am Rheinfall,,Neuhausen am Rheinfall,,001279,Neuhausen am Rheinfall,1,10.0,10.0,"DhsArticle(fr, 001279, Neuhausen am Rheinfall,...","Document(Neuhausen am Rheinfall, 8 annot., tex...","(Neuhausen, am, Rheinfall)","[Neuhausen, am, Rheinfall]"
33,002882-c,Entités politiques / Commune,Les Verrières,,Les Verrières,,002882,Les Verrières,1,10.0,10.0,"DhsArticle(fr, 002882, Verrières, Les, text lo...","Document(Les Verrières, 3 annot., text=""Les Ve...","(Les, Verrières)","[Les, Verrières]"
62,049390-c,Entités politiques / Commune,commune de La Tène,commune,La Tène,,049390,La Tène commune,1,10.0,10.0,"DhsArticle(fr, 049390, Tène, La (commune), tex...","Document(La Tène commune, 4 annot., text=""La T...","(La, Tène)","[La, Tène]"
64,000343-c,Entités politiques / Commune,Oberried am Brienzersee,,Oberried am Brienzersee,,000343,Oberried am Brienzersee,1,10.0,10.0,"DhsArticle(fr, 000343, Oberried am Brienzersee...","Document(Oberried am Brienzersee, 3 annot., te...","(Oberried, am, Brienzersee)","[Oberried, am, Brienzersee]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4385,046530-et,Entités politiques / Etat historique disparu,République démocratique allemande (RDA),,République démocratique allemande,RDA,046530,République démocratique allemande (RDA),1,40.0,40.0,"DhsArticle(fr, 046530, RDA, text loaded)",Document(République démocratique allemande (RD...,"(République, démocratique, allemande)","[République, démocratique, allemande]"
4390,008806-h,Entités ecclésiastiques / Hospice,Col du Simplon,,Col du Simplon,,008806,Col du Simplon,1,15.0,15.0,"DhsArticle(fr, 008806, Simplon, col du, text l...","Document(Col du Simplon, 16 annot., text=""Col ...","(Col, du, Simplon)","[Col, du, Simplon]"
4392,007466-h,Entités ecclésiastiques / Hospice,Col du Saint-Gothard,,Col du Saint-Gothard,,007466,Col du Saint-Gothard,1,15.0,15.0,"DhsArticle(fr, 007466, Gothard, col du, text l...","Document(Col du Saint-Gothard, 34 annot., text...","(Col, du, Saint-Gothard)","[Col, du, Saint-Gothard]"
4393,008815-h,Entités ecclésiastiques / Hospice,Col du Septimer,,Col du Septimer,,008815,Col du Septimer,1,15.0,15.0,"DhsArticle(fr, 008815, Septimer, col du, text ...","Document(Col du Septimer, 3 annot., text=""Col ...","(Col, du, Septimer)","[Col, du, Septimer]"


# Polity linking

## Linking valid statuswords sequences to their polity 

In [43]:
def link_entity_by_typology(dtf_row, polities_dtf):
    possible_typologies = statusword_to_typology_dict.get(dtf_row.statusword.text.lower())

    if possible_typologies is None:
        print("WARNING: statusword without corresponding typology: |"+dtf_row.statusword.text.lower()+"|")
        return []

    possible_polities = [
        polities_dtf.loc[(polities_dtf.typology==typology) & polities_dtf.toponym.apply(lambda t: dtf_row.sequence_toponym.text == t)]
        for typology in possible_typologies
    ]
    possible_polities = [dtf for dtf in possible_polities if dtf.shape[0]>0]
    return possible_polities

def count_nb_matching_tokens(sequence_dtf_row, tokenized_toponym_texts):
    sequence_dtf_row_tokens_texts = [t.text for t in sequence_dtf_row.statusword_tokens_sequences]
    nb_matching_tokens = sum([
        word in sequence_dtf_row_tokens_texts[-(nb_successors+1):]
        for word in tokenized_toponym_texts
    ])
    return nb_matching_tokens

def link_entity_by_hdstag(dtf_row, polities_dtf, statusword_to_hdstag_dict):
    """
        # find possible polities: take polities that have matching hds_tag AND an exact match between the searched toponym and th sequence's identified toponym

    replacement proposition:
    - tokenize polities_dtf canonic_title
    - computing toponym matching score

    toponym matching score:
    - nb_matching_tokens= nb of polities_dtf.toponym_tokens present in sequence_tokens
    - all_tokens_matched: whether all tokens of the polities_dtf.toponym_tokens are in the sequence_tokens 
    - hds_tag_score: score inversely proportional to the rank an hds_tag has in the ordering (rank 0 -> highest score)

    ranking algorithm:
    -> order according to following order:
        1) all_tokens_matched*nb_matching_tokens
        2) hds_tag_score
        3) nb_matched_tokens
    -> score = 100* all_tokens_matched*nb_matching_tokens +
                10 * hds_tag_score + 
                nb_matched_tokens
    """
    possible_hdstags = statusword_to_hdstag_dict.get(dtf_row.statusword.text.lower())

    if possible_hdstags is None:
        print("WARNING: statusword without corresponding hdstag: |"+dtf_row.statusword.text.lower()+"|")
        return []

    possible_polities = [(
            i,
            polities_dtf.loc[(polities_dtf.hds_tag==hds_tag) &
            polities_dtf.tokenized_toponym_texts.apply(lambda tokens:
                any([dtf_row.sequence_toponym.text == t for t in tokens])
            )].copy()
        )for i,hds_tag in enumerate(possible_hdstags)
    ]
    for i,dtf in possible_polities:
        dtf["possibility_hds_tag_rank"] = i 
    possible_polities_dtf = pd.concat([dtf for i,dtf in possible_polities])
    possible_polities_dtf["nb_matching_tokens"] = possible_polities_dtf.tokenized_toponym_texts.apply(lambda ttt: count_nb_matching_tokens(dtf_row, ttt))
    possible_polities_dtf["possible_polity_score"] = \
        100* (possible_polities_dtf.tokenized_toponym_texts.apply(len)==possible_polities_dtf["nb_matching_tokens"]) * possible_polities_dtf["nb_matching_tokens"] + \
        10* (possible_polities_dtf["possibility_hds_tag_rank"].max() - possible_polities_dtf["possibility_hds_tag_rank"])+ \
        possible_polities_dtf["nb_matching_tokens"]
    possible_polities_dtf = possible_polities_dtf.sort_values(by ='possible_polity_score', ascending = False)

    return possible_polities_dtf

def link_statuswords_toponyms_sequences(dtf, polities_dtf, statusword_to_hdstag_dict):
    """
    takes a dtf (valid_sequences_dtf) coming from validate_statuswords_toponyms_sequences()
    + adds columns
        - possible_polities
        - possible_polities_min_rank
        - linked_polity_id, linked_hds_tag, linked_toponym
    """
    dtf["possible_polities"] = [
        link_entity_by_hdstag(row, polities_dtf, statusword_to_hdstag_dict)
        for i, row in dtf.iterrows()
    ]

    #valid_sequences_dtf["possible_polities_ranks"] = valid_sequences_dtf["possible_polities"].apply(lambda pp: [t[0] for t in pp])
    dtf["possible_polities_min_rank"] = dtf["possible_polities"].apply(lambda pp_dtf: pp_dtf.possibility_hds_tag_rank.min() if pp_dtf.shape[0]>0 else None)

    dtf["linked_polity_id"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["polity_id"] if pp.shape[0]>0 else None)
    dtf["linked_hds_tag"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["hds_tag"]if pp.shape[0]>0 else None)
    dtf["linked_toponym"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["toponym"]if pp.shape[0]>0 else None)


In [44]:
link_statuswords_toponyms_sequences(valid_sequences_dtf, polities_dtf, statusword_to_hdstag_dict)


In [45]:
if False:
    valid_sequences_dtf["possible_polities_by_typology"] = [
        link_entity_by_typology(row, polities_dtf)
        for i, row in valid_sequences_dtf.iterrows()
    ]

In [46]:
valid_sequences_dtf["possible_polities"].apply(lambda pp_dtf: pp_dtf.shape[0]).value_counts()
valid_sequences_dtf["possible_polities_min_rank"].value_counts()

0.0    535
1.0     56
2.0     12
4.0      2
3.0      2
5.0      2
Name: possible_polities_min_rank, dtype: int64

## Exploring linking results

In [47]:
linked_sequences_human_columns = ["hds_article_id", "statusword", "sequence_toponym", "sequence", "linked_polity_id", "linked_hds_tag", "linked_toponym"]

valid_sequences_dtf.loc[:,linked_sequences_human_columns]

Unnamed: 0,hds_article_id,statusword,sequence_toponym,sequence,linked_polity_id,linked_hds_tag,linked_toponym
43,000627,distr,Lucerne,"[distr, ., de, Lucerne]",011152-d,Entités politiques / District,Lucerne
43,000627,chapitre,Beromünster,"[chapitre, de, Beromünster]",012007-cclg,Entités ecclésiastiques / Chapitre collégial,Beromünster
43,000627,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
43,000627,paroisse,Cham,"[paroisse, de, Cham]",000788-c,Entités politiques / Commune,Cham
43,000627,évêque,Constance,"[évêque, de, Constance]",008561-ev,"Entités ecclésiastiques / Evêché, diocèse",Constance
...,...,...,...,...,...,...,...
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug


In [48]:
linked_sequences_dtf = valid_sequences_dtf.loc[valid_sequences_dtf["possible_polities"].apply(lambda pp: pp is not None)].copy()
linked_sequences_dtf.loc[:,linked_sequences_human_columns]

Unnamed: 0,hds_article_id,statusword,sequence_toponym,sequence,linked_polity_id,linked_hds_tag,linked_toponym
43,000627,distr,Lucerne,"[distr, ., de, Lucerne]",011152-d,Entités politiques / District,Lucerne
43,000627,chapitre,Beromünster,"[chapitre, de, Beromünster]",012007-cclg,Entités ecclésiastiques / Chapitre collégial,Beromünster
43,000627,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
43,000627,paroisse,Cham,"[paroisse, de, Cham]",000788-c,Entités politiques / Commune,Cham
43,000627,évêque,Constance,"[évêque, de, Constance]",008561-ev,"Entités ecclésiastiques / Evêché, diocèse",Constance
...,...,...,...,...,...,...,...
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug
4335,007373,ville,Zoug,"[ville, de, Zoug]",007373-ct,Entités politiques / Canton,Zoug


In [49]:
unlinked_sequences_human_columns = ["hds_article_id", "statusword", "sequence_toponym", "sequence"]

unlinked_sequences_dtf = valid_sequences_dtf.loc[valid_sequences_dtf["possible_polities"].apply(lambda pp: pp is None)].copy()
unlinked_sequences_dtf.loc[:,unlinked_sequences_human_columns]

Unnamed: 0,hds_article_id,statusword,sequence_toponym,sequence


In [50]:
polities_dtf[polities_dtf.typology.apply(lambda t: t is None)].hds_tag.value_counts()

Entités politiques / Commune                                        1973
Entités politiques / Ancienne commune                               1068
Entités politiques / Seigneurie                                      186
Entités politiques / Ville, commune, localité (étranger)             167
Entités ecclésiastiques / Abbaye, couvent, monastère, prieuré        132
Entités politiques / Bailliage, châtellenie                           91
Entités politiques / District                                         45
Entités politiques / Etat historique disparu                          29
Entités politiques / Comté, landgraviat                               29
Entités politiques / Ancien district                                  20
Entités ecclésiastiques / Chapitre collégial                          14
Entités politiques / Canton, Département, République (1790-1813)      14
Entités politiques / Canton                                           13
Entités ecclésiastiques / Commanderie              

## Annotating linked polities in documents

In [51]:
def add_annotation_to_document_from_valid_sequences(document, valid_sequences_dtf_rows):
    new_annotations = [
        Annotation(
            row.sequence[0].idx,
            row.sequence[-1].idx+len(row.sequence[-1]),
            extra_fields={
                "type": "polity_id_LOC",
                "polity_id": row.linked_polity_id
            }
        )
        for i, row in valid_sequences_dtf_rows.iterrows()
    ]
    document.annotations = document.annotations + new_annotations

In [52]:
for i, row in sampled_articles_dtf.iterrows():
    add_annotation_to_document_from_valid_sequences(row.document, valid_sequences_dtf[valid_sequences_dtf.hds_article_id==row.hds_article_id])

In [53]:
# COMPLETING ANNOTATIONS OF MULTI-TOKEN TOPONYMS
sampled_articles_dtf.iloc[32,:].hds_article_id

valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"]
dtf_row = valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"].iloc[1,:]

valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"].iloc[0,:].possible_polities

#hds_tag = valid_sequences_dtf.loc[valid_sequences_dtf.hds_article_id=="001245",["hds_tag"]]

Unnamed: 0,polity_id,hds_tag,canonic_title,typology,toponym,geoidentifier,hds_article_id,article_title,nbtags,level,max_level,article,document,tokenized_toponym,tokenized_toponym_texts,possibility_hds_tag_rank,nb_matching_tokens,possible_polity_score
4327,007477-ct,Entités politiques / Canton,Bâle-Campagne,,Bâle-Campagne,,7477,Bâle-Campagne,1,30.0,30.0,"DhsArticle(fr, 007477, Bâle-Campagne, text loa...","Document(Bâle-Campagne, 83 annot., text=""Bâle-...","(Bâle, -, Campagne)","[Bâle, -, Campagne]",0,3,313
4312,007387-ct,Entités politiques / Canton,canton de Bâle,canton,Bâle,,7387,Bâle canton,2,30.0,30.0,"DhsArticle(fr, 007387, Bâle (canton), text loa...","Document(Bâle canton, 121 annot., text=""Bâleca...",(Bâle),[Bâle],0,1,111
4313,007387-ct,"Entités politiques / Canton, Département, Répu...",canton de Bâle,canton,Bâle,,7387,Bâle canton,2,30.0,30.0,"DhsArticle(fr, 007387, Bâle (canton), text loa...","Document(Bâle canton, 121 annot., text=""Bâleca...",(Bâle),[Bâle],1,1,101
1780,007478-ct,Entités politiques / Canton,Bâle-Ville,,Bâle-Ville,,7478,Bâle-Ville,2,30.0,30.0,"DhsArticle(fr, 007478, Bâle-Ville, text loaded)","Document(Bâle-Ville, 222 annot., text=""Bâle-Vi...","(Bâle, -, Ville)","[Bâle, -, Ville]",0,2,12


In [54]:
test_values = [
    valid_sequences_dtf.shape[0],      statusword_tokens_sequences_dtf.shape[0]
]
truth_sequence = [
    valid_sequences_dtf.shape[0]==727, statusword_tokens_sequences_dtf.shape[0]==1456
]

print(all(truth_sequence))
print(test_values)
print(truth_sequence)

True
[727, 1456]
[True, True]


In [55]:
valid_sequences_dtf

Unnamed: 0,hds_article_id,toponym,geoidentifier,article_title,polities_ids,nb_polities,tokenized_toponym,loose_normalized_tokenized_toponym,strict_normalized_tokenized_toponym,statusword_tokens_sequences,...,statusword,sequence_toponym,sequence,sequence_structure,sequence_structure_str,possible_polities,possible_polities_min_rank,linked_polity_id,linked_hds_tag,linked_toponym
43,000627,Meierskappel,,Meierskappel,[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"[Meierskappel, \n, Comm, ., LU, ,, distr, ., d...",...,distr,Lucerne,"[distr, ., de, Lucerne]","[STATUS, ., de, TOPONYM]",STATUS-.-de-TOPONYM,polity_id hds_tag ...,0.0,011152-d,Entités politiques / District,Lucerne
43,000627,Meierskappel,,Meierskappel,[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"[qui, les, acheta, en, 1447, ), ., Le, chapitr...",...,chapitre,Beromünster,"[chapitre, de, Beromünster]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id ...,0.0,012007-cclg,Entités ecclésiastiques / Chapitre collégial,Beromünster
43,000627,Meierskappel,,Meierskappel,[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"[attribuèrent, Meierskappel, au, bailliage, de...",...,ville,Zoug,"[ville, de, Zoug]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ...,0.0,007373-ct,Entités politiques / Canton,Zoug
43,000627,Meierskappel,,Meierskappel,[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"[\n, La, chapelle, de, Meierskappel, dépendait...",...,paroisse,Cham,"[paroisse, de, Cham]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ca...,0.0,000788-c,Entités politiques / Commune,Cham
43,000627,Meierskappel,,Meierskappel,[000627-c],1,{Meierskappel},[Meierskappel],[Meierskappel],"[droits, de, collation, passèrent, du, Fraumün...",...,évêque,Constance,"[évêque, de, Constance]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id ...,0.0,008561-ev,"Entités ecclésiastiques / Evêché, diocèse",Constance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,007373,Zoug,,Zoug canton,[007373-ct],1,{Zoug},[Zoug],[Zoug],"[au, secondaire, et, supérieure, au, gymnase, ...",...,ville,Zoug,"[ville, de, Zoug]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ...,0.0,007373-ct,Entités politiques / Canton,Zoug
4335,007373,Zoug,,Zoug canton,[007373-ct],1,{Zoug},[Zoug],[Zoug],"[enseignement, par, branches, ., La, fondation...",...,ville,Zoug,"[ville, de, Zoug]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ...,0.0,007373-ct,Entités politiques / Canton,Zoug
4335,007373,Zoug,,Zoug canton,[007373-ct],1,{Zoug},[Zoug],[Zoug],"[des, sociétés, de, gymnastique, (, d', abord,...",...,ville,Zoug,"[ville, de, Zoug]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ...,0.0,007373-ct,Entités politiques / Canton,Zoug
4335,007373,Zoug,,Zoug canton,[007373-ct],1,{Zoug},[Zoug],[Zoug],"[onze, communes, ., \n, Longtemps, ,, seule, l...",...,ville,Zoug,"[ville, de, Zoug]","[STATUS, de, TOPONYM]",STATUS-de-TOPONYM,polity_id hds_tag ...,0.0,007373-ct,Entités politiques / Canton,Zoug
