In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import unicodedata
import gc

import spacy
import sqlite3
from sentence_transformers import CrossEncoder

import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

tqdm.pandas()

### Load Alias Table and Models

In [3]:
# Load the knowledge-base alias table, the spaCy NER pipeline and the two
# fine-tuned cross-encoder re-rankers.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only load alias_table.pkl from a trusted source.
alias_table = pd.read_pickle("../../knowledge_base/alias_table.pkl")
nlp = spacy.load("../../output/model-best")
# NOTE(review): both re-rankers are loaded here, but ranking() only calls
# cross_encoder_marco; skip the unused one if memory is tight.
cross_encoder_marco = CrossEncoder("../../bert_output/ms-marco-MiniLM/checkpoint-279")
cross_encoder_bertje = CrossEncoder("../../bert_output/bert-base-dutch/checkpoint-279")

# named_entity_recognition() reads ent.sent, which requires sentence boundaries.
nlp.add_pipe('sentencizer')
alias_table.head()

Unnamed: 0,qid,surface_form,normal_form,label,description
0,Q23304137,karnemelksegat,Karnemelksegat,Karnemelksegat,meer in Noord-Holland
1,Q23304137,karnemelksgat,Karnemelksgat,Karnemelksegat,meer in Noord-Holland
2,Q23304529,zoel,Zoel,Zoel,rivier in Nederland
3,Q23306241,kalkwerk,Kalkwerk,Kalkwerk,"voormalig buurtschap in Groningen, Nederland"
4,Q59244527,roomgracht,Roomgracht,Roomgracht,"voormalige gracht in Leiden, Nederland"


### Index Alias Table

In [5]:
# Build an in-memory SQLite database holding the alias table; `cur` is the
# module-level cursor that candidate_generation() queries.
db = sqlite3.connect(':memory:')
cur = db.cursor()

# Create the FTS5 table: only surface_form is full-text indexed, the remaining
# columns are stored UNINDEXED so they can be returned with each match.
cur.execute('create virtual table kb using fts5(qid UNINDEXED, surface_form, normal_form UNINDEXED, label UNINDEXED, description UNINDEXED, tokenize="porter unicode61");')

# Populate the kb table with one row per alias-table record.
cur.executemany(
    'insert into kb (qid, surface_form, normal_form, label, description) values (?,?,?,?,?);',
    alias_table[['qid', 'surface_form', 'normal_form','label', 'description']].to_records(index=False))
db.commit()

### Load Woogle data
To save RAM, it is recommended to load only the one dataframe you intend to process; the cells below use `df1` only.


In [7]:
# Each CSV is one cleaned Woogle export. Only df1 is consumed by the sampling
# cell below; the other frames can be left unloaded to reduce memory use.
df1 = pd.read_csv("../../woo_data/2b_clean.csv")
df2 = pd.read_csv("../../woo_data/2c_clean.csv")
df3 = pd.read_csv("../../woo_data/2e-b_clean.csv")
df4 = pd.read_csv("../../woo_data/2i_clean.csv")

### Sample (optional) and group documents

In [9]:
def sample_documents(df, n_samples, seed=42):
    """Return every row belonging to a random sample of documents.

    Args:
        df: DataFrame with a 'foi_documentId' column (one row per page).
        n_samples: Number of distinct document ids to draw.
        seed: Random state passed to pandas so the draw is reproducible.

    Returns:
        A copy of the rows of `df` whose 'foi_documentId' was drawn.
    """
    unique_ids = df['foi_documentId'].drop_duplicates()
    chosen_ids = unique_ids.sample(n=n_samples, random_state=seed)
    keep_mask = df['foi_documentId'].isin(chosen_ids)
    return df.loc[keep_mask].copy()

n = 10
df = sample_documents(df1, n)

# Collapse the page-level rows into one row per document: non-null page texts
# are cast to str and joined with single spaces, in their existing row order.
df = (
    df.groupby("foi_documentId")['foi_bodyTextOCR']
    .apply(lambda pages: " ".join(pages.dropna().astype(str)))
    .reset_index()
)

df.head()

Unnamed: 0,foi_documentId,foi_bodyTextOCR
0,nl.oorg10002.2b.1997.29-2333-2333.doc.1,Voorzitter: Weisglas Tegenwoordig zijn 107 led...
1,nl.oorg10002.2b.1998.21-1356-1379.doc.1,daarin wordt gevraagd om een zo spoedig mogeli...
2,nl.oorg10002.2b.2015.61-12.doc.1,12 Stemmingen overige moties Rapport Onderzoek...
3,nl.oorg10002.2b.2019.102-17.doc.1,17 Stemmingen moties Ontwerpbesluit maatregele...
4,nl.oorg10002.2b.2023.18-6.doc.1,6 Afrikastrategie Voorzitter: Kamminga Afrikas...


### Helper Functions

In [11]:
def normalize_unicode(text):
    """Decompose `text` with NFKD and drop all combining characters.

    Accented letters therefore lose their diacritics (e.g. 'é' -> 'e').
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

def clean_text(text):
    """Reduce `text` to lowercase alphanumeric tokens separated by single spaces.

    Steps: cast to str and strip diacritics, remove a hyphen together with the
    whitespace that follows it (re-joining words split across a line break),
    replace every remaining non [a-zA-Z0-9] character by a space, then
    collapse whitespace and lowercase.
    """
    folded = normalize_unicode(str(text))
    dehyphenated = re.sub(r'-\s+', '', folded)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', dehyphenated)
    # str.split() already discards surrounding whitespace from each token.
    return ' '.join(alnum_only.split()).lower()

def prepare_candidates(candidates):
    """Split candidate rows into parallel lists used for re-ranking.

    Args:
        candidates: Iterable of 5-tuples
            (qid, rank, label, normal_form, description), as returned by
            candidate_generation().

    Returns:
        (fts_scores, labels, descriptions) where each description is the
        string "<label> - <description>".
    """
    rows = [
        (rank, label, f"{label} - {desc}")
        for _qid, rank, label, _normal_form, desc in candidates
    ]
    fts_scores = [row[0] for row in rows]
    labels = [row[1] for row in rows]
    descriptions = [row[2] for row in rows]
    return fts_scores, labels, descriptions

def normalize_scores(scores, min_val, max_val):
    """Min-max rescale `scores` so that min_val maps to 0 and max_val to 1.

    Values outside [min_val, max_val] are not clipped.
    """
    span = max_val - min_val
    shifted = scores - min_val
    return shifted / span

def compute_final_scores(encoder_scores, fts_scores, alpha):
    """Blend cross-encoder and full-text-search scores into one score.

    Both inputs are min-max normalised with fixed bounds and combined as
    alpha * encoder + (1 - alpha) * fts. The absolute value of the FTS
    scores is used.

    Args:
        encoder_scores: Array of cross-encoder scores.
        fts_scores: Array-like of FTS rank values.
        alpha: Weight of the encoder score.

    Returns:
        Array of combined scores, one per candidate.
    """
    # NOTE(review): fixed normalisation bounds; presumably estimated
    # empirically on a tuning set -- TODO confirm their origin.
    fts_range = (4.2948, 25.2106)
    encoder_range = (-11.0917, 6.2494)

    lexical = normalize_scores(np.abs(fts_scores), *fts_range)
    semantic = normalize_scores(encoder_scores, *encoder_range)

    return alpha * semantic + (1 - alpha) * lexical

### Named Entity Recognition and Entity Linking

In [13]:
def named_entity_recognition(text):
    """Run the spaCy pipeline on `text` and describe each detected entity.

    Returns:
        A list with one dict per entity in doc.ents, holding the entity text
        ('mention'), the text of its sentence ('context') and the entity's
        `start` attribute ('start').
    """
    parsed = nlp(text)
    return [
        {
            'mention': span.text,
            'context': span.sent.text,
            'start': span.start,
        }
        for span in parsed.ents
    ]

def candidate_generation(mention, limit=15, window=10):
    """Retrieve knowledge-base candidates for a mention via the FTS5 index.

    A single-term mention is matched against the surface_form column; a
    multi-term mention is matched with a NEAR query so that all terms must
    occur within `window` tokens of each other.

    Args:
        mention: Mention text (whitespace separated terms).
        limit: Maximum number of distinct qids to return.
        window: Token distance passed to the FTS5 NEAR operator.

    Returns:
        (qids, rows): `rows` are tuples
        (qid, best_rank, label, normal_form, description) ordered by
        best_rank, and `qids` lists their first element in the same order.
        Both are empty when the mention contains no terms.
    """
    terms = mention.split()
    if not terms:
        # An empty MATCH expression is a syntax error in FTS5; there is
        # nothing to look up, so report "no candidates" instead of raising.
        return [], []

    # Pass every term as an FTS5 string (embedded double quotes are doubled)
    # so that it is never parsed as query syntax.
    quoted_terms = ['"{}"'.format(term.replace('"', '""')) for term in terms]
    if len(quoted_terms) >= 2:
        query = f"NEAR({' '.join(quoted_terms)}, {window})"
    else:
        query = f"surface_form:{quoted_terms[0]}"

    # One row per qid: MIN(rank) keeps the best-ranked alias of each entity.
    res = cur.execute("""
        SELECT qid, MIN(rank) as best_rank, label, normal_form, description
        FROM kb
        WHERE surface_form MATCH ?
        GROUP BY qid
        ORDER BY best_rank
        LIMIT ?
        """, (query, limit)).fetchall()

    qids = [candidate[0] for candidate in res]
    return qids, res

def ranking(qids, candidates, context, alpha):
    """Re-rank candidates with the cross-encoder and return the best one.

    Args:
        qids: Candidate entity ids, aligned with `candidates`.
        candidates: Candidate rows as returned by candidate_generation().
        context: Sentence in which the mention occurs.
        alpha: Weight of the cross-encoder score in the combined score.

    Returns:
        None when there are no candidates, otherwise a tuple
        (qid, label, score) for the candidate with the highest combined score.
    """
    if not len(qids):
        return None

    bm25_ranks, cand_labels, cand_texts = prepare_candidates(candidates)

    # Score every (context, "<label> - <description>") pair.
    model_inputs = [(context, cand_text) for cand_text in cand_texts]
    rerank_scores = np.array(cross_encoder_marco.predict(model_inputs)) # Select re-ranking model
    bm25_abs = abs(np.array(bm25_ranks))

    combined = compute_final_scores(rerank_scores, bm25_abs, alpha)

    winner = np.argmax(combined)
    return qids[winner], cand_labels[winner], combined[winner]

def entity_linking(mention, context, limit=10, alpha=0.55, threshold=0.55): # Select hyperparameters
    """Link a mention to a knowledge-base entity id, or return 'NILL'.

    Args:
        mention: Raw mention text.
        context: Sentence in which the mention occurs (used for re-ranking).
        limit: Maximum number of candidates to retrieve.
        alpha: Weight of the cross-encoder score in the combined score.
        threshold: Minimum combined score (exclusive) required to accept the
            best candidate.

    Returns:
        The qid of the best candidate when its score exceeds `threshold`,
        otherwise the sentinel string 'NILL'.
    """
    entity_clean = clean_text(mention)
    if not entity_clean:
        # Mentions made up solely of punctuation or non [a-zA-Z0-9] characters
        # clean to '' and cannot form a valid search query. Treat them as
        # unlinkable rather than letting the lookup raise, which would cost
        # the caller every entity of the document.
        return 'NILL'

    qids, candidates = candidate_generation(entity_clean, limit)
    result = ranking(qids, candidates, context, alpha)

    if result is None:
        return 'NILL'

    qid, label, score = result
    if score > threshold:
        return qid #Change to 'label' to return page titles
    return 'NILL'

def return_entities(text):
    """Detect entities in `text` and keep those that link to the knowledge base.

    Returns:
        A list of {'entity': <linked id>, 'start': <entity start>} dicts, in
        detection order. Texts longer than 200000 characters are skipped and
        yield an empty list.
    """
    # Very long documents can overload the RAM, so they are not processed.
    if len(text) > 200000:
        return []

    linked_mentions = (
        (entity_linking(found['mention'], found['context']), found['start'])
        for found in named_entity_recognition(text)
    )
    # 'NILL' marks a mention that could not be linked.
    return [
        {'entity': target, 'start': position}
        for target, position in linked_mentions
        if target != 'NILL'
    ]

In [14]:
batch_size = 5
results = []
failed_docs = []  # (document id, error description) for documents that raised

# Stepping through df in strides of batch_size visits every row exactly once
# and never schedules an empty trailing batch (which `len(df) // batch_size + 1`
# does whenever len(df) is a multiple of batch_size).
for start in tqdm(range(0, len(df), batch_size), desc="Processing Batches"):
    batch = df.iloc[start:start + batch_size].copy()
    batch_results = []

    for doc_id, text in zip(batch['foi_documentId'], batch['foi_bodyTextOCR']):
        try:
            ents = return_entities(text)
        except Exception as e:
            # Best effort: one failing document must not stop the run, but
            # record and report it so the loss of its entities is visible.
            failed_docs.append((doc_id, f"{type(e).__name__}: {e}"))
            tqdm.write(f"Skipping {doc_id}: {type(e).__name__}: {e}")
            ents = []
        batch_results.append(ents)

    batch['entities'] = batch_results
    results.append(batch)

    # Release the per-batch objects before the next batch is processed.
    del batch, batch_results
    gc.collect()

if results:
    df_results = pd.concat(results, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns instead.
    df_results = df.assign(entities=[])
df_results.head()

# Optional: write to pkl
# df_results.to_pickle("results.pkl") 

Processing Batches: 100%|███████████████████████████████████████████████████████████████| 2/2 [04:54<00:00, 147.24s/it]


Unnamed: 0,foi_documentId,foi_bodyTextOCR,entities
0,nl.oorg10002.2b.1997.29-2333-2333.doc.1,Voorzitter: Weisglas Tegenwoordig zijn 107 led...,"[{'entity': 'Q2368020', 'start': 15}, {'entity..."
1,nl.oorg10002.2b.1998.21-1356-1379.doc.1,daarin wordt gevraagd om een zo spoedig mogeli...,"[{'entity': 'Q667680', 'start': 24}, {'entity'..."
2,nl.oorg10002.2b.2015.61-12.doc.1,12 Stemmingen overige moties Rapport Onderzoek...,"[{'entity': 'Q752', 'start': 366}, {'entity': ..."
3,nl.oorg10002.2b.2019.102-17.doc.1,17 Stemmingen moties Ontwerpbesluit maatregele...,"[{'entity': 'Q275441', 'start': 162}, {'entity..."
4,nl.oorg10002.2b.2023.18-6.doc.1,6 Afrikastrategie Voorzitter: Kamminga Afrikas...,"[{'entity': 'Q22001627', 'start': 252}, {'enti..."


### Co-occurrence Network

In [16]:
def make_network_documents(df):
    """Build a document-level co-occurrence graph of linked entities.

    For every row, each pair of entries in row['entities'] that refer to
    different entities adds 1 to the weight of the edge between them, so
    repeated mentions within one document count multiple times.

    Args:
        df: DataFrame with an 'entities' column holding lists of dicts that
            have an 'entity' key.

    Returns:
        An undirected networkx Graph with a 'weight' attribute on each edge.
    """
    G = nx.Graph()

    for _, row in df.iterrows():
        mentions = row['entities']
        if not mentions:
            continue

        # Register nodes first so entities without any edge are kept too.
        G.add_nodes_from(
            m['entity'] for m in mentions if m['entity'] is not None
        )

        for first, second in combinations(mentions, 2):
            u, v = first['entity'], second['entity']
            if u == v:
                continue

            if not G.has_edge(u, v):
                G.add_edge(u, v, weight=1)
            else:
                G[u][v]["weight"] += 1

    return G

def make_network_proximity(df, k=50):
    """Build a co-occurrence graph from entities that appear close together.

    For every row, each pair of entries in row['entities'] that refer to
    different entities and whose 'start' values differ by at most `k` adds 1
    to the weight of the edge between them.

    Args:
        df: DataFrame with an 'entities' column holding lists of dicts that
            have 'entity' and 'start' keys.
        k: Maximum absolute difference between two 'start' values for the
            pair to count as co-occurring.

    Returns:
        An undirected networkx Graph with a 'weight' attribute on each edge.
    """
    G = nx.Graph()

    for _, row in df.iterrows():
        mentions = row['entities']
        if not mentions:
            continue

        # Register nodes first so entities without a nearby partner are kept.
        G.add_nodes_from(m['entity'] for m in mentions)

        for first, second in combinations(mentions, 2):
            u, v = first['entity'], second['entity']
            if u == v:
                continue

            if abs(first['start'] - second['start']) > k:
                continue

            if not G.has_edge(u, v):
                G.add_edge(u, v, weight=1)
            else:
                G[u][v]['weight'] += 1

    return G

In [17]:
# Optional: read pkl
# df_results = pd.read_pickle("results.pkl")

# Build the proximity-based co-occurrence graph with the default k of
# make_network_proximity (50).
graph = make_network_proximity(df_results)

# Report the size of the unfiltered graph.
print("Nodes in graph 1:", graph.number_of_nodes())
print("Edges in graph 1:", graph.number_of_edges())

Nodes in graph 1: 146
Edges in graph 1: 856


In [18]:
def filter_edges(graph, threshold=1):
    """Return a new graph containing only the sufficiently heavy edges.

    Args:
        graph: Graph whose edges may carry a 'weight' attribute; edges
            without one are treated as weight 0.
        threshold: An edge is kept only when its weight is strictly greater
            than this value. The default of 1 drops pairs seen only once.

    Returns:
        A new undirected networkx Graph built from the kept edges (with their
        attributes). Nodes that lose all their edges are not carried over.
    """
    G = nx.Graph()
    G.add_edges_from(
        (u, v, d)
        for u, v, d in graph.edges(data=True)
        if d.get('weight', 0) > threshold
    )
    return G

# Replace the graph by its filtered version (edges with weight > 1 only).
# NOTE: `graph` is rebound here, so the unfiltered graph is no longer
# available under this name in later cells.
graph = filter_edges(graph)

# Report the size of the filtered graph.
print("Nodes in graph 1:", graph.number_of_nodes())
print("Edges in graph 1:", graph.number_of_edges())

Nodes in graph 1: 41
Edges in graph 1: 76


In [19]:
# Optional: k-core decomposition
# graph = nx.k_core(graph, k=3)

In [20]:
# Optional: write to node and edge dataframes
nodes = pd.DataFrame({'Id': list(graph.nodes)})

# One row per edge; edge attributes become additional columns.
edges_proximity = nx.to_pandas_edgelist(graph)

# The graphs built in this notebook store the co-occurrence count in the
# 'weight' edge attribute, so that is the column to rename (renaming 'width'
# silently did nothing and left the column lowercase).
edges_proximity = edges_proximity.rename(columns={'source': 'Source', 'target': 'Target', 'weight': 'Weight'})

# nodes.to_csv('nodes.csv', index=False)
# edges_proximity.to_csv('edges_proximity.csv', index=False)