In [None]:
import unicodedata

import sys
import re
from os import path

sys.path.append("../../../src")
sys.path.append("../../../scripts")
from inception_fishing import Annotation

from tqdm import tqdm



In [None]:
from s2_prepare_articles import *
import spacy

In [None]:
spacy_tokenizer = spacy.load("fr_core_news_sm")

# Documents' text preparation
## Normalizing texts and investigating text issues

In [None]:
# Evaluating normalized text difference with original text:

polities_dtf["normalized_text"] = polities_dtf.document.apply(lambda d: unicodedata.normalize("NFKC",d.text))

polities_dtf["len_normalized_text"] = polities_dtf["normalized_text"].apply(len)
polities_dtf["len_unnormalized_text"] = polities_dtf.document.apply(lambda d: len(d.text))
polities_dtf["len_diff_normalized_text"] = polities_dtf["len_unnormalized_text"] - polities_dtf["len_normalized_text"]
polities_dtf["len_diff_normalized_text"].value_counts()

In [None]:
# Character normalization investigation -> NFKC is the way to go :-)

# Zoug had a very big 2529 char diff using NFKD
lendif2529NFKD = polities_dtf[polities_dtf["hds_article_id"]=="007373"]

def investigate_norm_len_diff(dtf, i=0):
    norm_text = dtf["normalized_text"].iloc[i]
    unnorm_text = dtf.document.iloc[i].text

    norm_chars = set(norm_text)
    unnorm_chars = set(unnorm_text)

    unnorm_missing_chars = [ c for c in unnorm_chars if c not in norm_chars]
    norm_missing_chars = [ c for c in norm_chars if c not in unnorm_chars]

    #return (norm_text, unnorm_text, norm_chars, unnorm_chars, norm_missing_chars, unnorm_missing_chars)
    return (norm_missing_chars, unnorm_missing_chars)

investigate_norm_len_diff(lendif2529NFKD)

In [None]:
lendifNFKC = polities_dtf[polities_dtf["len_diff_normalized_text"]!=0]

"""Still some problem with some minor characters... we'll come to it when we need ta"""
polities_dtf.loc[polities_dtf.polity_id.apply(lambda i: i in ["001256-c", "001321-c", "007384-ct"]),:]


[investigate_norm_len_diff(lendifNFKC, i) for i in range(lendifNFKC.shape[0])]

In [None]:
def normalize_unicode_text(text):
    """unicode normalization NFKD removes accents in characters -> NFKC is the way to go :-)
    
        /!\\ /!\\ /!\\ normalizing text yields different text lengths for 3 articles (out of 4000), use with caution /!\\/!\\/!\\
        but it is needed for proper tokenization/prediction/learning"""

    return unicodedata.normalize("NFKC",text)

In [None]:
grandson_dtf = polities_dtf[polities_dtf.toponym=="Grandson"]
grandson_article = grandson_dtf.article.iloc[0]
grandson_document = grandson_dtf.document.iloc[0]
doc = spacy_tokenizer(normalize_unicode_text(grandson_document.text))

In [None]:
grandson_tokens = [token for token in doc if token.text =="Grandson"]

grandson_tokens

In [None]:
seigneurs_tokens = [token for token in doc if token.text =="seigneurs"]
[seigneurs_tokens[0].nbor(i) for i in range(-5,5)]

In [None]:
nb_prev = 2

grandson_tokens_3g = [[t.nbor(i) for i in range(-nb_prev,1)] for t in grandson_tokens]

grandson_tokens_3g

In [None]:
"""
Strategy rule-based annotations:
- tokenize list of articles
    - use doc text
    - normalize
- frequency table of N predecessor words
    -> identify the ones that are relevant statuswords
- extract all sequences of the form statuswords-X-X-toponym
- identify the relevant sequences representing an entity
- identify which entity each sequence corresponds to
    -> each statuswords refers to a list of possible entity type
    -> entity type + statusword

"""

In [None]:
polities_dtf.drop(
    ["normalized_text", "len_normalized_text", "len_unnormalized_text",
    "len_diff_normalized_text", "len_diff_normalized_text"],
    axis=1,
    inplace=True
)
if False:

    def normalize_doc_text(d):
        """ /!\\ use with caution, see above"""
        d.text = normalize_unicode_text(d.text)

    polities_dtf.document.apply(normalize_doc_text)
    ""

In [None]:
sampled_articles_ids = set(sampled_articles_ids)
#sampled_polities_dtf = polities_dtf[polities_dtf.hds_article_id.apply(lambda id: id in sampled_articles_ids)]

In [None]:
polities_dtf.geoidentifier.unique()

#  Polity recognition

## Identifying toponyms' tokens

In [None]:
def add_tokenized_text(dtf, tokenizer):
    """
    takes a dtf with "document" column
    + adds the following columns:
        - tokens: spacy tokenization of text column
    """
    dtf["tokens"] = dtf.document.apply(lambda d: tokenizer(normalize_unicode_text(d.text)))
    return dtf

def add_toponyms(dtf, tokenizer):
    """
    takes a dtf with a "toponym" column
    + adds the following columns:
        - tokenized_toponym: spacy tokenization texts of toponym
    """
    dtf["tokenized_toponym"] = [tokenizer(t) for t in tqdm(dtf.toponym, total=dtf.shape[0], desc="Tokenizing toponyms")]


In [None]:
# take into account the fact that toponym might span multiple tokens
#articles_dtf["tokenized_toponym"] = articles_dtf.toponym.apply(lambda t: set([tok.text for tok in spacy_tokenizer(normalize_unicode_text(t))]))
add_toponyms(polities_dtf, spacy_tokenizer)
toponym_tokens = polities_dtf["tokenized_toponym"].explode().apply(lambda t: t.text)
#[t for t in utoponym_tokens if len(t)==4]
toponym_tokens_value_counts = toponym_tokens.value_counts()
toponym_tokens_value_counts[toponym_tokens_value_counts>1].shape
toponym_tokens_value_counts.shape


#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
toponym_tokens_value_counts

In [None]:
not_toponym_tokens = {"'",
 '-',
 '/',
 "L'",
 'La',
 'Lac',
 'Le',
 'Les',
 'Nord',
 'S',
 'See',
 'Sud',
 'Sur',
 'am',
 'an',
 'bei',
 'ch',
 "d'",
 'da',
 'dans',
 'de',
 'der',
 'des',
 'di',
 'du',
 'et',
 'im',
 'in',
 'l',
 'la',
 'le',
 'les',
 'près',
 'sur',
 'zum',
 "vers"
}
ambiguous_toponym_tokens={
 'Au',
 'Bois',
 'Col',
 'Dieu',
 'Eaux',
 'Ile',
 'Part',
 'Pays',
 'Pont',
 'Port',
 'Rue',
 "vaudois",
 "helvétique",
}

In [None]:
def normalize_toponyms(dtf, not_toponym_tokens_texts, ambiguous_toponym_tokens_texts):
    """
    normalize_toponyms(): takes a dtf coming from add_toponyms()
    + adds the following columns:
        - loose_normalized_tokenized_toponym: all toponym tokens that are generally toponym (exlcuding "les", "la", etc...)
        - strict_normalized_tokenized_toponym: same as loose_normalized_tokenized_toponym, excluding in addition ambiguous toponym tokens ("eaux", "ile", "bois", "col", etc...)
        - trimmed_normalized_tokenized_toponym: removing leading tokens that are not_toponym_tokens (mainly to avoid confusion with regard to toponyms such as "Les Verrières")
    + returns:
        - normalized_toponym_tokens: set of all strict_normalized_tokenized_toponym
    """
    dtf["trimmed_normalized_tokenized_toponym"] = [
        [s for s in tokens[:1] if s.text not in not_toponym_tokens_texts]+
        [t for t in tokens[1:]]
        for tokens in dtf["tokenized_toponym"]
    ]   
    dtf["loose_normalized_tokenized_toponym"] = [[s.text for s in tokens if s.text not in not_toponym_tokens_texts] for tokens in dtf["tokenized_toponym"]]
    dtf["strict_normalized_tokenized_toponym"] = [[st for st in texts if st not in ambiguous_toponym_tokens_texts] for texts in dtf["loose_normalized_tokenized_toponym"]]
    normalized_toponym_tokens = set(dtf["strict_normalized_tokenized_toponym"].explode())
    trimmed_normalized_tokenized_toponyms_texts = [t for t in dtf["trimmed_normalized_tokenized_toponym"].apply(lambda tokens: "".join([t.text+t.whitespace_ for t in tokens])) if len(t)>0]
    return normalized_toponym_tokens, trimmed_normalized_tokenized_toponyms_texts

In [None]:
#articles_dtf["loose_normalized_tokenized_toponym"] = [[s for s in texts if s not in not_toponym_tokens] for texts in articles_dtf["tokenized_toponym"]]
#articles_dtf["strict_normalized_tokenized_toponym"] = [[s for s in texts if s not in ambiguous_toponym_tokens] for texts in articles_dtf["loose_normalized_tokenized_toponym"]]

#normalized_toponym_tokens = set(articles_dtf["strict_normalized_tokenized_toponym"].explode())

normalized_toponym_tokens, trimmed_normalized_tokenized_toponyms_texts = normalize_toponyms(polities_dtf, not_toponym_tokens, ambiguous_toponym_tokens)

polities_dtf.tokenized_toponym

In [None]:
additional_columns = [
    "article", "document", "tokenized_toponym",
    "trimmed_normalized_tokenized_toponym", "loose_normalized_tokenized_toponym", "strict_normalized_tokenized_toponym"]
articles_dtf = get_articles_dtf_from_polities_dtf(polities_dtf, additional_columns)
polities_dtf["tokenized_toponym_texts"] = [[t.text for t in tokens] for tokens in polities_dtf.tokenized_toponym]


In [None]:
sampled_articles_dtf = articles_dtf[articles_dtf.hds_article_id.apply(lambda id: id in sampled_articles_ids)].copy()

#sampled_articles_dtf["tokens"] = sampled_articles_dtf.document.apply(lambda d: spacy_tokenizer(normalize_unicode_text(d.text)))
add_tokenized_text(sampled_articles_dtf, spacy_tokenizer)
sampled_articles_dtf.head()

In [None]:
def is_token_toponym(token, dtf_row, normalized_toponym_tokens):
    """Checks that a given token is a toponym (either corresponding to any strict toponym, or a loose toponym from the particular article toponym
    """
    return (
        token.text in normalized_toponym_tokens
        or token.text in dtf_row.loose_normalized_tokenized_toponym
    )

In [None]:
# toponyms_pattern regex test
if False:
    trimmed_normalized_tokenized_toponyms_texts

    toponyms_pattern = re.compile("("+(r")\W|\W(".join(trimmed_normalized_tokenized_toponyms_texts))+")")
    text = " "+sampled_articles_dtf.document.iloc[0].text+" "

    match_list = [m for m in toponyms_pattern.finditer(text)] #, re.IGNORECASE)]
    (text, match_list)

In [None]:
# might simple string operation be faster than regex? who knows...
def find_indices(string, substring):
    """returns all the start+end boundaries of the occurences of the substring inside the string"""
    indices = [(index,len(substring)) for index in range(len(string)) if string.startswith(substring, index)]
    return indices

a_string = "the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog"
# Find all indices of 'the'
indices = [index for index in range(len(a_string)) if a_string.startswith('the', index)]
print(indices)


In [None]:
def add_text_toponyms_spans(dtf, trimmed_normalized_tokenized_toponyms_texts):
    """add_text_toponyms_spans():  takes a dtf coming from normalize_toponyms()
    + adds the following columns: 
        - toponym_tokens_spans: list of spacy Spans, each Span containing a toponym, ensures no overlaps

    algorithm:
    The pattern requires whitespace before&after toponym,
    hence we add whitespace at beginning and end of text to ensure detection at those places
    and correct match indices afterwards
    
    example:
    real text:
    "MEIERKAPPEL, BLABLA"	    -> real bounds 0, 11
    with added space:
    " MEIERKAPPEL, BLABLA" -> displaced bounds 1, 12
    detected:
    " MEIERKAPPEL,"         -> detected bounds 0, 13

    real_start = m.start()+1-1, +1 because \W in pattern, -1 because " " added in front of text
    real_end = m.end()-2 because \W at start and end of pattern
    
    """

    toponyms_pattern = re.compile("("+(r")\W|\W(".join(trimmed_normalized_tokenized_toponyms_texts))+")")

    # multi-tokens toponyms
    dtf["toponym_tokens_spans"]=[
        [
            row.tokens.char_span(m.start(), m.end()-2, alignment_mode="contract")
            for m in toponyms_pattern.finditer(" "+row.document.text+" ") #, re.IGNORECASE):
        ]
        for i, row in tqdm(dtf.iterrows(), total = dtf.shape[0], desc ="Adding token spans")
    ]
    dtf["toponym_tokens_spans"] = dtf["toponym_tokens_spans"].apply(lambda spans: [s for s in spans if s is not None])
    dtf["toponym_tokens_indices"] = dtf["toponym_tokens_spans"].apply(lambda spans: set([t.i for span in spans for t in span]))

    # single token toponyms that are in the row.loose_normalized_tokenized_toponym (think "Au", "le Pont", "See", etc...)
    dtf["toponym_tokens_spans"] = [
        row.toponym_tokens_spans+
        [
            row.tokens[token.i:(token.i+1)] for token in row.tokens
            if 
                token.i not in row.toponym_tokens_indices and # ensure we don't have twice the same toponyms
                token.text in row.loose_normalized_tokenized_toponym
        ]
            for i, row in dtf.iterrows()
    ]
    del dtf["toponym_tokens_indices"]
        


In [None]:
# as add_text_toponyms_spans() takes 10min to run on 100 articles,
# here is a simple cache system.

def serialize_spacy_span(spacy_span):
    return (spacy_span.start, spacy_span.end)
def unserialize_spacy_span(spacy_doc, serialized_span):
    return spacy.tokens.span.Span(spacy_doc, serialized_span[0], serialized_span[1])
def serialize_spacy_spans_series(spans_series):
    """serialize a series of spans list: i.e. a dtf column containing, for each row, a list of spans"""
    return spans_series.apply(lambda spans: [serialize_spacy_span(s) for s in spans])
def unserialize_spacy_spans_column(dtf, doc_column, serialized_spans_column):
    return [
        [unserialize_spacy_span(row[doc_column], s) for s in row[serialized_spans_column]]
        for i, row in dtf.iterrows()
    ]
    #return spacy_doc.char_span(serialized_span[0], serialized_span[1])



def save_toponym_tokens_spans(dtf, picklepath):
    pickle_dtf = dtf.loc[:,["hds_article_id", "toponym_tokens_spans"]].copy()
    pickle_dtf["toponym_tokens_spans"] = serialize_spacy_spans_series(pickle_dtf["toponym_tokens_spans"])
    pickle_dtf.to_pickle(picklepath)

def restore_toponym_tokens_spans(dtf, picklepath):
    """/!\ doesn't change dtf inplace, must assign result"""
    pickle_dtf = pd.read_pickle(picklepath)
    dtf = dtf.merge(pickle_dtf, on="hds_article_id")
    dtf["toponym_tokens_spans"] = unserialize_spacy_spans_column(dtf, "tokens", "toponym_tokens_spans")
    return dtf

def restore_or_compute_and_save_toponym_spans(dtf, picklepath, trimmed_normalized_tokenized_toponyms_texts):
    """/!\ doesn't change dtf inplace, must assign result"""
    if "toponym_tokens_spans" in dtf.columns:
        print('"toponym_tokens_spans" column already present in dataframe, no need to restore or compute.')
        return dtf
    if path.exists(picklepath):
        print("picklepath",picklepath,"exists, restoring...")
        return restore_toponym_tokens_spans(dtf, picklepath)
    else:
        print("picklepath",picklepath," not existing, computing toponym_tokens_spans...")
        add_text_toponyms_spans(dtf, trimmed_normalized_tokenized_toponyms_texts)
        save_toponym_tokens_spans(dtf, picklepath)
        return dtf


In [None]:
#add_text_toponyms_spans(sampled_articles_dtf, trimmed_normalized_tokenized_toponyms_texts)

sampled_articles_dtf = restore_or_compute_and_save_toponym_spans(sampled_articles_dtf, s2_toponyms_spans_dtf_pickle, trimmed_normalized_tokenized_toponyms_texts)
sampled_articles_dtf.head()

In [None]:
def to_toponyms_dtf(dtf):
    return dtf.explode("toponym_tokens_spans")

toponyms_dtf = to_toponyms_dtf(sampled_articles_dtf)
toponyms_dtf.head()

## Extracting toponym sequences

In [None]:
def add_toponym_tokens_sequence(dtf, nb_predecessors = 10, nb_successors = 3):
    """add_toponym_tokens_sequence(): takes nb_predecessors, nb_successors and a dtf coming from add_text_toponyms_spans() and add_tokenized_text()
    + adds the following columns:
        - toponym_tokens_sequence: for each toponym_token_span, a sequence according from nb_pred to nb_succ (indexed on first token from each span)"""

    dtf["toponym_tokens_sequence"] = dtf.toponym_tokens_spans.apply(lambda span: [
        span[0].nbor(i)
        for i in range(
            -min(nb_predecessors,span[0].i),
            min(nb_successors, len(span[0].doc)-span[0].i)
        )
    ])


In [None]:
nb_predecessors = 10
nb_successors = 3
add_toponym_tokens_sequence(toponyms_dtf, nb_predecessors, nb_successors)


In [None]:
# toponym_sequence_tokens: all tokens present in any toponym sequence
toponym_sequence_tokens = toponyms_dtf.toponym_tokens_sequence.apply(lambda span:
    [t.text for t in span]
).explode()

toponym_sequence_tokens_value_counts = toponym_sequence_tokens.value_counts().to_frame()
toponym_sequence_tokens_value_counts.columns = ['toponym_sequence_tokens']
toponym_sequence_tokens_value_counts.to_csv("toponym_sequence_tokens_value_counts.csv", sep="\t")

In [None]:
toponym_sequence_tokens_value_counts

## Statusword-toponym combination
### Identifying statuswords

In [None]:
#pd.set_option('display.max_rows', None)
toponym_sequence_tokens_value_counts[toponym_sequence_tokens_value_counts.toponym_sequence_tokens==2]
s2_statuswords_json = path.join(s2_polities_to_extract_folder, "statuswords.json")

with open(s2_statuswords_json) as f:
    statusword_token_text = json.load(f)


ambiguous_statusword_token_text = statusword_token_text["ambiguous_statuswords"]
statusword_token_text = statusword_token_text["statuswords"]


In [None]:
def identify_statusword_toponym_sequences(dtf, statusword_token_text):
    """
    takes a dtf coming from add_toponym_tokens_sequence()
    + adds a column "toponym_tokens_sequence" to dtf containing all the toponym_tokens_sequence also containing a statusword
    + returns a new dtf statusword_tokens_sequences_dtf with one row per sequence containing at least 1 statusword and 1 toponym
    """
    dtf["is_statusword_toponym_sequence"] = [
            any(token.text.lower() in statusword_token_text for token in seq)
            for seq in dtf.toponym_tokens_sequence
    ]

In [None]:
identify_statusword_toponym_sequences(toponyms_dtf, statusword_token_text)
statusword_tokens_sequences_columns_filter =['hds_article_id', 'toponym', 'geoidentifier', 'article_title', 'polities_ids', 'nb_polities',
        'tokenized_toponym', "trimmed_normalized_tokenized_toponym", 'loose_normalized_tokenized_toponym',
        'strict_normalized_tokenized_toponym', "toponym_tokens_spans", 'toponym_tokens_sequence'
    ]
statusword_tokens_sequences_dtf = toponyms_dtf.loc[toponyms_dtf.is_statusword_toponym_sequence,statusword_tokens_sequences_columns_filter].copy()
only_toponyms_sequences_dtf = toponyms_dtf.loc[~toponyms_dtf.is_statusword_toponym_sequence,statusword_tokens_sequences_columns_filter].copy()

(
    toponyms_dtf.shape,
    statusword_tokens_sequences_dtf.shape[0]+only_toponyms_sequences_dtf.shape[0],
    statusword_tokens_sequences_dtf.shape, only_toponyms_sequences_dtf.shape
)
print(toponyms_dtf.toponym_tokens_sequence.iloc[0])
print(statusword_tokens_sequences_dtf.toponym_tokens_sequence.iloc[0][0].i)
print(only_toponyms_sequences_dtf.toponym_tokens_sequence.iloc[0][0].i)
toponyms_dtf.head()

### Analysing sequences structure
STATUS-XX-TOPONYM

In [None]:
def analyse_statusword_tokens_sequence_single(dtf_row, token_sequence, statusword_index, toponym_index):
    """Analyses a single statusword-toponym combination

    returns a sequence whose first term is the sequence's statusword, and the last word is the sequence's toponym
    """
    sequence = token_sequence[statusword_index:(toponym_index+1)]
    sequence_structure = [
        "STATUS" if token.text.lower() in statusword_token_text else(
        "TOPONYM" if is_token_toponym(token, dtf_row, normalized_toponym_tokens)
        else token.text
        )
        for token in sequence
    ]
    statusword = token_sequence[statusword_index]
    toponym = token_sequence[toponym_index]
    return (statusword, toponym, sequence, sequence_structure)

def analyse_statusword_tokens_sequence(dtf_row, token_sequence):
    """Returns all the possible statusword-toponym combination analyses for a given token sequence
    """
    statusword_indices = [i for i,tok in enumerate(token_sequence) if tok.text.lower() in statusword_token_text]
    #toponym_indices = [i for i,tok in enumerate(token_sequence) if tok.text in normalized_toponym_tokens or tok.text in dtf_row.loose_normalized_tokenized_toponym]
    toponym_indices = [i for i, t in enumerate(token_sequence) if t.i == dtf_row.toponym_tokens_spans[0].i]
    #toponym_indices = [len(token_sequence)-nb_successors] # the toponym is always at the same spot in the sequence
    sequences_analyses = [
        analyse_statusword_tokens_sequence_single(dtf_row, token_sequence, i, j)
        for i in statusword_indices for j in toponym_indices if i<j
    ]
    return sequences_analyses

In [None]:
def analyse_statuswords_toponyms_sequences(dtf):
    """
    takes a dtf coming from identify_statuswords_toponyms_sequences()
    + adds "sequence_analysis" column to dtf (from analyse_statusword_tokens_sequence())
    + returns a new dtf sequences_analyses_dtf with one row per statusword+toponym combination (multiple rows possible for one toponym sequence)"""
    dtf["sequence_analysis"] = [
        analyse_statusword_tokens_sequence(row, row.toponym_tokens_sequence)
        for k, row in dtf.iterrows()
    ]
    sequences_analyses_dtf = dtf.explode("sequence_analysis")
    sequences_analyses_dtf = sequences_analyses_dtf[~sequences_analyses_dtf.sequence_analysis.isna()]
    sequences_analyses_dtf["statusword"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[0])
    sequences_analyses_dtf["sequence_toponym"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[1])
    sequences_analyses_dtf["sequence"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[2])
    sequences_analyses_dtf["sequence_structure"] = sequences_analyses_dtf.sequence_analysis.apply(lambda sa: sa[3])
    sequences_analyses_dtf["sequence_structure_str"] = sequences_analyses_dtf["sequence_structure"].apply(lambda ss: "-".join(ss))
    return sequences_analyses_dtf

In [None]:
sequences_analyses_dtf = analyse_statuswords_toponyms_sequences(statusword_tokens_sequences_dtf)
sequence_structures = sequences_analyses_dtf["sequence_structure_str"].value_counts()


In [None]:
sequence_structures
sequence_structures.to_frame().to_csv(s2_sequence_structures_counts_csv, sep="\t")
sequence_structures[sequence_structures>3]

In [None]:
sequence_structure = "STATUS-\n-Dizain-du-TOPONYM"

sequence_structures_human_columns = ['toponym', 'article_title', 'polities_ids', "statusword", "sequence", "sequence_structure"]

sequences_analyses_dtf.loc[sequences_analyses_dtf["sequence_structure_str"]==sequence_structure,sequence_structures_human_columns]

### Isolating valid statusword-toponym sequences structures

In [None]:
valid_sequence_structures = pd.read_csv(s2_sequence_structures_validation_csv, sep="\t")
valid_sequence_structures = set(valid_sequence_structures[valid_sequence_structures.validity=="yes"].structure)
valid_sequence_structures

In [None]:
sequence_structures.shape

In [None]:
def validate_statuswords_toponyms_sequences(dtf, valid_sequence_structures):
    """
    takes valid_sequence_structures set of strings and a dtf coming from explode_statuswords_toponyms_sequences()
    + returns a new dtf valid_sequences_dtf containing the valid statuswords_toponyms_sequences
    """
    valid_sequences_dtf = dtf[dtf.sequence_structure_str.apply(lambda struct: struct in valid_sequence_structures)].copy()
    return valid_sequences_dtf

In [None]:
valid_sequences_dtf = validate_statuswords_toponyms_sequences(sequences_analyses_dtf, valid_sequence_structures)
valid_sequences_dtf.shape

In [None]:
with open(s2_statusword_to_typology_json) as f:
    statusword_keys_dict = json.load(f)

statusword_to_typology_dict = {
    statusword : t[1] 
    for t in statusword_keys_dict
    for statusword in t[0]
}

statusword_to_hdstag_dict = {
    statusword : t[2] 
    for t in statusword_keys_dict
    for statusword in t[0]
}

## Getting polities_dtf toponyms' tokens

In [None]:
polities_dtf[polities_dtf.typology=="baillage"].tail()

In [None]:
#polities_dtf["tokenized_toponym"] = [spacy_tokenizer(t) for t in tqdm(polities_dtf.toponym, total=polities_dtf.shape[0], desc="Tokenizing polities' toponyms")]

polities_dtf["tokenized_toponym"].apply(len).value_counts()
polities_dtf[polities_dtf["tokenized_toponym"].apply(len)>1]

# Polity linking

In [None]:
with open(s2_statusword_to_typology_json) as f:
    statusword_keys_dict = json.load(f)

statusword_to_typology_dict = {
    statusword : t[1] 
    for t in statusword_keys_dict
    for statusword in t[0]
}

statusword_to_hdstag_dict = {
    statusword : t[2] 
    for t in statusword_keys_dict
    for statusword in t[0]
}


## Linking single toponyms to their polity 

In [None]:
default_hdstag_priorization = statusword_keys_dict[0][2]
default_hdstag_priorization

In [None]:

def create_toponyms_exact_match_dict(polities_dtf, default_hdstag_priorization):
    default_hdstag_priorization_dict = {t:i for i,t in enumerate(default_hdstag_priorization)}
    toponyms_exact_match_dict = {}
    for i, row in polities_dtf.iterrows():
        toponym_key = "".join([t.text+t.whitespace_ for t in row.trimmed_normalized_tokenized_toponym])
        toponym_possible_polities = toponyms_exact_match_dict.get(toponym_key)
        if toponym_possible_polities is None:
            toponyms_exact_match_dict[toponym_key] = [None]*len(default_hdstag_priorization_dict)
            toponym_possible_polities = toponyms_exact_match_dict[toponym_key]
        hdstag_priorization = default_hdstag_priorization_dict.get(row.hds_tag)
        if hdstag_priorization is not None:
            # if it has a priorization: put it in its proper priorization position
            toponym_possible_polities[hdstag_priorization] = row.polity_id
        else: 
            # else append it at the end
            toponym_possible_polities.append(row.polity_id)

    for k,v in toponyms_exact_match_dict.items():
        # remove None entries from the priorization:
        toponyms_exact_match_dict[k] = [pid for pid in v if pid is not None]

    #pd.Series([len(v) for v in toponyms_exact_match_dict.values()]).value_counts()
    return toponyms_exact_match_dict

In [None]:
toponyms_exact_match_dict = create_toponyms_exact_match_dict(polities_dtf, default_hdstag_priorization)
toponyms_exact_match_dict

In [92]:
def link_toponym_by_exact_match(toponym, toponyms_exact_match_dict):
    """Links a single toponym to its possible polities by toponym exact match
    """
    possible_polities= toponyms_exact_match_dict.get(toponym)
    return possible_polities if possible_polities is not None else []

def link_single_toponyms(dtf, polities_dtf, toponyms_exact_match_dict):
    """
    link_single_toponyms(): takes a dtf coming from identify_statusword_toponym_sequences()
    + links single toponyms to the polity ids found in the result from create_toponyms_exact_match_dict()
    + adds columns
        - possible_polities
        - linked_polity_id, linked_hds_tag, linked_toponym
    """
    dtf["possible_polities"] = dtf.toponym_tokens_spans.apply(lambda span:
        link_toponym_by_exact_match(span.text, toponyms_exact_match_dict)
    )
    dtf["linked_polity_id"] = [(pp[0] if len(pp)>0 else None) for pp in dtf["possible_polities"]]
    dtf["linked_hds_tag"] = [(polities_dtf.hds_tag[polities_dtf.polity_id==lpi].iloc[0] if lpi is not None else None) for lpi in dtf["linked_polity_id"]]
    dtf["linked_toponym"] = [(polities_dtf.toponym[polities_dtf.polity_id==lpi].iloc[0] if lpi is not None else None) for lpi in dtf["linked_polity_id"]]



In [94]:
#only_toponyms_sequences_dtf.toponym_tokens_spans[only_toponyms_sequences_dtf.toponym_tokens_spans.apply(len)>1].head()#iloc[1].text
link_single_toponyms(only_toponyms_sequences_dtf, polities_dtf, toponyms_exact_match_dict)

In [95]:
only_toponyms_sequences_dtf.head()

Unnamed: 0,hds_article_id,toponym,geoidentifier,article_title,polities_ids,nb_polities,tokenized_toponym,trimmed_normalized_tokenized_toponym,loose_normalized_tokenized_toponym,strict_normalized_tokenized_toponym,toponym_tokens_spans,toponym_tokens_sequence,possible_polities,linked_polity_id,linked_hds_tag,linked_toponym
0,627,Meierskappel,,Meierskappel,[000627-c],1,(Meierskappel),[Meierskappel],[Meierskappel],[Meierskappel],(Zoug),"[s', étendant, du, flanc, sud-est, du, Rooterb...","[000797-c, 007373-ct]",000797-c,Entités politiques / Commune,Zoug
0,627,Meierskappel,,Meierskappel,[000627-c],1,(Meierskappel),[Meierskappel],[Meierskappel],[Meierskappel],(Meierskappel),"[selon, copie, du, XIVe, s., ), ;, le, nom, de...",[000627-c],000627-c,Entités politiques / Commune,Meierskappel
0,627,Meierskappel,,Meierskappel,[000627-c],1,(Meierskappel),[Meierskappel],[Meierskappel],[Meierskappel],(Zoug),"[basse, justice, et, droits, de, pêche, dans, ...","[000797-c, 007373-ct]",000797-c,Entités politiques / Commune,Zoug
0,627,Meierskappel,,Meierskappel,[000627-c],1,(Meierskappel),[Meierskappel],[Meierskappel],[Meierskappel],(Habsbourg),"[Böschenrot, (, attestée, en, 1173, et, 1346/1...",[007503-b],007503-b,"Entités politiques / Bailliage, châtellenie",Habsbourg
0,627,Meierskappel,,Meierskappel,[000627-c],1,(Meierskappel),[Meierskappel],[Meierskappel],[Meierskappel],(Lucerne),"[au, Fraumünster, la, presqu', île, du, Chieme...","[000624-c, 007382-ct, 007382-ct, 011152-b, 011...",000624-c,Entités politiques / Commune,Lucerne


In [96]:
only_toponyms_sequences_dtf.possible_polities.apply(len).value_counts()

1    1038
2     339
4     209
3     174
5      55
0      13
Name: possible_polities, dtype: int64

In [None]:
only_toponyms_sequences_dtf[only_toponyms_sequences_dtf.possible_polities.apply(len)>4].head()


## Linking valid statuswords sequences to their polity 

In [None]:
def link_entity_by_typology(dtf_row, polities_dtf):
    possible_typologies = statusword_to_typology_dict.get(dtf_row.statusword.text.lower())

    if possible_typologies is None:
        print("WARNING: statusword without corresponding typology: |"+dtf_row.statusword.text.lower()+"|")
        return []

    possible_polities = [
        polities_dtf.loc[(polities_dtf.typology==typology) & polities_dtf.toponym.apply(lambda t: dtf_row.sequence_toponym.text == t)]
        for typology in possible_typologies
    ]
    possible_polities = [dtf for dtf in possible_polities if dtf.shape[0]>0]
    return possible_polities

def count_nb_matching_tokens(sequence_dtf_row, tokenized_toponym_texts):
    sequence_dtf_row_tokens_texts = [t.text for t in sequence_dtf_row.toponym_tokens_sequence]
    nb_matching_tokens = sum([
        word in sequence_dtf_row_tokens_texts[-(nb_successors+1):]
        for word in tokenized_toponym_texts
    ])
    return nb_matching_tokens

def link_entity_by_hdstag(dtf_row, polities_dtf, statusword_to_hdstag_dict):
    """
        # find possible polities: take polities that have matching hds_tag AND an exact match between the searched toponym and th sequence's identified toponym

    replacement proposition:
    - tokenize polities_dtf canonic_title
    - computing toponym matching score

    toponym matching score:
    - nb_matching_tokens= nb of polities_dtf.toponym_tokens present in sequence_tokens
    - all_tokens_matched: whether all tokens of the polities_dtf.toponym_tokens are in the sequence_tokens 
    - hds_tag_score: score inversely proportional to the rank an hds_tag has in the ordering (rank 0 -> highest score)

    ranking algorithm:
    -> order according to following order:
        1) all_tokens_matched*nb_matching_tokens
        2) hds_tag_score
        3) nb_matched_tokens
    -> score = 100* all_tokens_matched*nb_matching_tokens +
                10 * hds_tag_score + 
                nb_matched_tokens
    """
    possible_hdstags = statusword_to_hdstag_dict.get(dtf_row.statusword.text.lower())

    if possible_hdstags is None:
        print("WARNING: statusword without corresponding hdstag: |"+dtf_row.statusword.text.lower()+"|")
        return []

    possible_polities = [(
            i,
            polities_dtf.loc[(polities_dtf.hds_tag==hds_tag) &
            polities_dtf.tokenized_toponym_texts.apply(lambda tokens:
                any([dtf_row.sequence_toponym.text == t for t in tokens])
            )].copy()
        )for i,hds_tag in enumerate(possible_hdstags)
    ]
    for i,dtf in possible_polities:
        dtf["possibility_hds_tag_rank"] = i 
    possible_polities_dtf = pd.concat([dtf for i,dtf in possible_polities])
    possible_polities_dtf["nb_matching_tokens"] = possible_polities_dtf.tokenized_toponym_texts.apply(lambda ttt: count_nb_matching_tokens(dtf_row, ttt))
    possible_polities_dtf["possible_polity_score"] = \
        100* (possible_polities_dtf.tokenized_toponym_texts.apply(len)==possible_polities_dtf["nb_matching_tokens"]) * possible_polities_dtf["nb_matching_tokens"] + \
        10* (possible_polities_dtf["possibility_hds_tag_rank"].max() - possible_polities_dtf["possibility_hds_tag_rank"])+ \
        possible_polities_dtf["nb_matching_tokens"]
    possible_polities_dtf = possible_polities_dtf.sort_values(by ='possible_polity_score', ascending = False)

    return possible_polities_dtf

def link_statuswords_toponyms_sequences(dtf, polities_dtf, statusword_to_hdstag_dict):
    """
    takes a dtf (valid_sequences_dtf) coming from validate_statuswords_toponyms_sequences()
    + adds columns
        - possible_polities
        - possible_polities_min_rank
        - linked_polity_id, linked_hds_tag, linked_toponym
    """
    dtf["possible_polities"] = [
        link_entity_by_hdstag(row, polities_dtf, statusword_to_hdstag_dict)
        for i, row in tqdm(dtf.iterrows(), total=dtf.shape[0], desc="Linking entities by HDS tag")
    ]

    #valid_sequences_dtf["possible_polities_ranks"] = valid_sequences_dtf["possible_polities"].apply(lambda pp: [t[0] for t in pp])
    dtf["possible_polities_min_rank"] = dtf["possible_polities"].apply(lambda pp_dtf: pp_dtf.possibility_hds_tag_rank.min() if pp_dtf.shape[0]>0 else None)

    dtf["linked_polity_id"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["polity_id"] if pp.shape[0]>0 else None)
    dtf["linked_hds_tag"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["hds_tag"]if pp.shape[0]>0 else None)
    dtf["linked_toponym"] = dtf["possible_polities"].apply(lambda pp: pp.iloc[0]["toponym"]if pp.shape[0]>0 else None)


In [None]:
link_statuswords_toponyms_sequences(valid_sequences_dtf, polities_dtf, statusword_to_hdstag_dict)


In [None]:
valid_sequences_dtf.possible_polities.iloc[0]

In [None]:
if False:
    valid_sequences_dtf["possible_polities_by_typology"] = [
        link_entity_by_typology(row, polities_dtf)
        for i, row in valid_sequences_dtf.iterrows()
    ]

In [None]:
valid_sequences_dtf["possible_polities"].apply(lambda pp_dtf: pp_dtf.shape[0]).value_counts()
valid_sequences_dtf["possible_polities_min_rank"].value_counts()

## Exploring linking results

In [None]:
linked_sequences_human_columns = ["hds_article_id", "statusword", "sequence_toponym", "sequence", "linked_polity_id", "linked_hds_tag", "linked_toponym"]

valid_sequences_dtf.loc[:,linked_sequences_human_columns]

In [None]:
linked_sequences_dtf = valid_sequences_dtf.loc[valid_sequences_dtf["possible_polities"].apply(lambda pp: pp.shape[0]>0)].copy()
linked_sequences_dtf.loc[:,linked_sequences_human_columns]

In [None]:
unlinked_sequences_human_columns = ["hds_article_id", "statusword", "sequence_toponym", "sequence"]

unlinked_sequences_dtf = valid_sequences_dtf.loc[valid_sequences_dtf["possible_polities"].apply(lambda pp: pp.shape[0]==0)].copy()

unlinked_sequences_dtf.loc[:,unlinked_sequences_human_columns]


In [None]:
unlinked_sequences_dtf["sequence_text"] = ["".join([t.text_with_ws for t in s]) for s in unlinked_sequences_dtf.sequence]
unlinked_sequences_dtf.sequence_text.value_counts().head(20)


In [None]:
polities_dtf[polities_dtf.typology.apply(lambda t: t is None)].hds_tag.value_counts()

## Annotating linked polities in documents

In [None]:
def add_annotation_to_document_from_valid_sequences(document, valid_sequences_dtf_rows):
    new_annotations = [
        Annotation(
            row.sequence[0].idx,
            row.sequence[-1].idx+len(row.sequence[-1]),
            extra_fields={
                "type": "polity_id_LOC",
                "polity_id": row.linked_polity_id
            }
        )
        for i, row in valid_sequences_dtf_rows.iterrows()
    ]
    document.annotations = document.annotations + new_annotations

In [None]:
for i, row in sampled_articles_dtf.iterrows():
    add_annotation_to_document_from_valid_sequences(row.document, valid_sequences_dtf[valid_sequences_dtf.hds_article_id==row.hds_article_id])

In [None]:
# COMPLETING ANNOTATIONS OF MULTI-TOKEN TOPONYMS
sampled_articles_dtf.iloc[32,:].hds_article_id

valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"]
dtf_row = valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"].iloc[1,:]

valid_sequences_dtf[valid_sequences_dtf.hds_article_id=="001245"].iloc[0,:].possible_polities

#hds_tag = valid_sequences_dtf.loc[valid_sequences_dtf.hds_article_id=="001245",["hds_tag"]]

In [None]:
test_values = [
    valid_sequences_dtf.shape[0],      statusword_tokens_sequences_dtf.shape[0]
]
truth_sequence = [
    valid_sequences_dtf.shape[0]==727, statusword_tokens_sequences_dtf.shape[0]==1456
]

print(all(truth_sequence))
print(test_values)
print(truth_sequence)

In [None]:
valid_sequences_dtf

In [None]:
polities_dtf.tokenized_toponym_texts.apply(len).value_counts()

In [None]:
polities_dtf.loc[:,["polity_id", "tokenized_toponym_texts"]]

In [None]:
test = linked_sequences_dtf.merge(polities_dtf.loc[:,["polity_id", "tokenized_toponym_texts"]], left_on="linked_polity_id", right_on="polity_id", how="left")
test.tokenized_toponym_texts.apply(len).value_counts()
