## Load ICIJ dataset


In [1]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

In [2]:
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)

In [3]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy")
len(doc_bin)

2

In [4]:
docs = list(doc_bin.get_docs(scrape_nlp.vocab))

## Load example Wikidata KB (manual input)

In [5]:
entities = [Entity(**entity) for entity in srsly.read_jsonl("data/icij-example/entities.jsonl")]

In [6]:
aliases = [Alias(**alias) for alias in srsly.read_jsonl("data/icij-example/aliases.jsonl")] + [
    Alias(alias=entity.name, entities=[entity.entity_id], probabilities=[1]) for entity in entities
]

In [7]:
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)



In [8]:
ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)

In [9]:
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker']

In [10]:
doc = scrape_nlp(docs[0])

In [11]:
displacy.render(doc, style="ent")

## Load custom KB built from Senzing results

In [12]:
import pytextrank

/home/donbr/erkg-tutorials/venv/lib/python3.12/site-packages


In [13]:
entities = [Entity(**entity) for entity in srsly.read_jsonl("data/senzing/entities.jsonl")]
len(entities)

29

In [14]:
aliases = [Alias(**alias) for alias in srsly.read_jsonl("data/senzing/aliases.jsonl")] + [
    Alias(alias=entity.name, entities=[entity.entity_id], probabilities=[1]) for entity in entities
]
len(aliases)

2502

In [15]:
uri = "data/sample-lancedb"
ann_kb = AnnKnowledgeBase(uri=uri)
ann_kb.add_entities(entities)
ann_kb.add_aliases(aliases)

In [16]:
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)
ann_linker = scrape_nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(ann_kb)
scrape_nlp.add_pipe("textrank")
scrape_nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'ann_linker',
 'textrank']

In [17]:
doc = scrape_nlp(docs[1])

In [18]:
displacy.render(doc, style="ent")

In [19]:
records = []
for phrase in doc._.phrases[:30]:
    record = (phrase.text, phrase.rank, phrase.count, set((ent.text, ent.kb_id_) for chunk in phrase.chunks for ent in chunk.ents))
    records.append((record[0], record[1], record[2], [{"text": e[0], "kb_id": e[1]} for e in record[-1]]))

In [20]:
import pandas as pd

In [21]:
raw_entities = pd.DataFrame.from_records(records, columns=["phrase", "rank", "count", "entities"]).explode("entities")
df = pd.concat(  # type: ignore
    [
        raw_entities.drop(columns="entities"),
        pd.json_normalize(raw_entities.entities).set_index(raw_entities.index),  # type: ignore
    ],
    axis=1,
)

In [22]:
df

Unnamed: 0,phrase,rank,count,text,kb_id
0,Azerbaijan President Ilham Aliyev,0.082795,2,Ilham Aliyev,1342265.0
0,Azerbaijan President Ilham Aliyev,0.082795,2,Azerbaijan,246799.0
1,President Aliyev,0.071555,1,Aliyev,1342265.0
2,AtaHolding Azerbaijan,0.069281,2,AtaHolding Azerbaijan,246799.0
3,Mossack Fonseca,0.069242,24,Mossack Fonseca,388148.0
4,Azerbaijan,0.065796,25,Azerbaijan,918573.0
4,Azerbaijan,0.065796,25,Azerbaijan,442619.0
4,Azerbaijan,0.065796,25,Azerbaijan,281073.0
4,Azerbaijan,0.065796,25,Azerbaijan,1551574.0
4,Azerbaijan,0.065796,25,Azerbaijan,246799.0


In [23]:
entities_to_review = df.loc[lambda d: (d.text.notnull()) & (d.kb_id == '')]
entities_to_review

Unnamed: 0,phrase,rank,count,text,kb_id
5,Azerbaijan Richard D. Kauzlarich,0.060485,1,Richard D. Kauzlarich,
13,Azeri investigative journalist Khadija Ismayilova,0.049191,1,Khadija Ismayilova,
14,S. President Barack Obama,0.048902,2,Barack Obama,
17,Investigative reporter Khadija Ismayilova,0.045958,2,Khadija Ismayilova,
19,Londex Resources S.A.,0.045497,2,Londex Resources S.A.,
21,Londex Resources,0.044973,2,Londex Resources,
25,Khadija Ismayilova,0.042795,3,Khadija Ismayilova,


In [24]:
for_review: list[pd.DataFrame] = []
for doc in scrape_nlp.pipe(docs):
    records = []
    for phrase in doc._.phrases[:30]:
        records.append(
            (
                phrase.text,
                phrase.rank,
                phrase.count,
                [
                    {"text": text, "kb_id": kb_id}
                    for text, kb_id in set(
                        (ent.text, ent.kb_id_) for chunk in phrase.chunks for ent in chunk.ents
                    )
                ],
            )
        )
    raw_entities = pd.DataFrame.from_records(
        records, columns=["phrase", "rank", "count", "entities"]
    ).explode("entities")
    df = pd.concat(  # type: ignore
        [
            raw_entities.drop(columns="entities"),
            pd.json_normalize(raw_entities.entities).set_index(raw_entities.index),  # type: ignore
        ],
        axis=1,
    )
    entities_to_review = df.loc[lambda d: (d.text.notnull()) & (d.kb_id == "")]
    for_review.append(entities_to_review)

In [25]:
for_review

[                                        phrase      rank  count  \
 0     Former Czech prime minister Andrej Babis  0.085349      1   
 3                  former Czech prime minister  0.071176      1   
 5                                    Czech law  0.061685      1   
 6                                         ICIJ  0.060946     11   
 7                                        Czech  0.058808      8   
 8   Former Czech leader’s secret French estate  0.057126      1   
 9        Hungarian Prime Minister Victor Orbán  0.056900      1   
 9        Hungarian Prime Minister Victor Orbán  0.056900      1   
 12          Freedom Party leader Herbert Kickl  0.050929      1   
 12          Freedom Party leader Herbert Kickl  0.050929      1   
 15                  ICIJ’s Czech media partner  0.050260      1   
 15                  ICIJ’s Czech media partner  0.050260      1   
 17                              Pandora Papers  0.049464      2   
 19                              Chateau Bigaud 