In [1]:
import pathlib
from collections import Counter
from typing import TypedDict

import requests
import spacy
from bs4 import BeautifulSoup, SoupStrainer
from spacy import displacy
from spacy.tokens import DocBin

In [2]:
from src.scraper import IcijScraper, SPACY_MODEL

## load docBin with default NER entities

In [3]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy") 

In [4]:
len(doc_bin)

2

In [5]:
scrape_nlp: spacy.Language = spacy.load(SPACY_MODEL)

In [6]:
docs = list(doc_bin.get_docs(scrape_nlp.vocab))

In [7]:
displacy.render(docs[0], style="ent")



## apply a different NER model on DocBin

In [9]:
import spacy
from gliner_spacy.pipeline import (  # noqa: F401 because we need to register the factory with spacy
    GlinerSpacy,
)

# Labels and model name prepared for the GLiNER component. In this cell they are
# only referenced by the commented-out config at the bottom, so the component
# added below does not receive them.
candidate_labels = [
    "persons",
    "address",
    "shell companies",
    "banks or law firms",
]  # NuZero requires labels to be lower-cased

model_name = "numind/NuZero_token"

# Load the English pipeline with its own NER component disabled, then append the
# "gliner_spacy" component so that it is the one producing entities.
nlp = spacy.load("en_core_web_md", disable=["ner"])
# nlp.add_pipe("span_marker", config={"model": "tomaarsen/span-marker-mbert-base-multinerd"})
# NOTE(review): added without a config, so presumably the component's default
# model and labels are used — confirm this is intended.
nlp.add_pipe("gliner_spacy")
# Disabled config that would have passed model_name / candidate_labels to add_pipe:
#     # config={
#     #     "gliner_model": model_name,
#     #     "chunk_size": 250,
#     #     "labels": candidate_labels,
#     #     "style": "ent",
#     #     "threshold": 0.3,
#     # },


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



<gliner_spacy.pipeline.GlinerSpacy at 0x7f457be60dd0>

In [10]:
doc = nlp(docs[0])
displacy.render(doc, style="ent")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## using a dbpedia_trie as input for the zshot entity linker

In [12]:
import spacy
import zshot
from zshot import PipelineConfig, displacy
from zshot.linker import LinkerRegen
from zshot.linker.linker_regen.utils import load_dbpedia_trie, load_wikipedia_trie
from zshot.mentions_extractor import MentionsExtractorSpacy
from zshot.utils.mappings import spans_to_dbpedia, spans_to_wikipedia

dbpedia_trie = load_dbpedia_trie()

  TPL_SCRIPT = """


dbpedia_trie.pkl:   0%|          | 0.00/274M [00:00<?, ?B/s]

In [13]:
nlp_dbpedia = spacy.load("en_core_web_md")
nlp_config = PipelineConfig(
    mentions_extractor=MentionsExtractorSpacy(), linker=LinkerRegen(trie=dbpedia_trie)
)
nlp_dbpedia.add_pipe("zshot", config=nlp_config, last=True)

<zshot.zshot.Zshot at 0x7f4341727fb0>

In [14]:
doc = nlp_dbpedia(
    "CH2O2 is a chemical compound similar to Acetamide used in International Business "
    "Machines Corporation (IBM)."
)
displacy.render(doc, style="ent")
print(list(zip(doc.ents, spans_to_dbpedia(doc._.spans))))

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  return self._call_impl(*args, **kwargs)


dbpedia_map_id.json:   0%|          | 0.00/987M [00:00<?, ?B/s]

[(Acetamide, 'http://dbpedia.org/resource/Acetamide'), (International Business Machines Corporation, 'http://dbpedia.org/resource/IBM'), (IBM, 'http://dbpedia.org/resource/IBM')]


In [15]:
docs[0][:50]

Former Czech leader’s secret French estate, revealed in Pandora Papers, listed for sale.
The luxury Riviera property featured in ICIJ’s investigation into then-Prime Minister Andrej Babis’ secret offshore dealings, and is now part of a money laundering probe by French

In [16]:
doc = nlp_dbpedia(docs[0][:100].text)

In [17]:
displacy.render(doc, style="ent")

In [18]:
doc = nlp_dbpedia(docs[0])

: 

: 

In [32]:
e = doc.ents[0]

In [None]:
e.label_

## Understanding dbpedia_map

In [40]:
from huggingface_hub import hf_hub_download
from zshot.config import MODELS_CACHE_PATH


In [77]:
REPO_ID = "ibm/regen-disambiguation"
WIKIPEDIA_MAP = "wikipedia_map_id.json"
DBPEDIA_MAP = "dbpedia_map_id.json"

dbpedia_map = hf_hub_download(repo_id=REPO_ID,
                              repo_type='model',
                              filename=DBPEDIA_MAP,
                              cache_dir=MODELS_CACHE_PATH)

In [45]:
import json

In [46]:
    # `dbpedia_map` arrives here as the file path returned by hf_hub_download and
    # is replaced by the parsed JSON mapping. The isinstance guard makes the cell
    # safe to re-run: without it, a second run would call open() on the dict.
    if not isinstance(dbpedia_map, dict):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(dbpedia_map, "r", encoding="utf-8") as f:
            dbpedia_map = json.load(f)

In [50]:
spans = doc._.spans

In [51]:
links = [dbpedia_map[s.label] for s in spans if s.label in dbpedia_map]


In [54]:
# Collect the span labels once. The previous form rebuilt a list of labels for
# every key of `dbpedia_map`, which is very slow on a map of this size.
span_labels = {s.label for s in spans}
# Iterate the map (not the labels) so the result keeps the map's key order.
extract = {k: v for k, v in dbpedia_map.items() if k in span_labels}

In [None]:
extract

In [None]:
links

In [65]:
ents = list(zip(doc.ents, spans_to_dbpedia(doc._.spans)))

In [None]:
[(ent, ent.label, link) for ent,link in ents]

In [None]:
dbpedia_map

In [None]:
len(dbpedia_trie.trie_dict)

## Understanding dbpedia_trie_file

In [57]:
REPO_ID = "ibm/regen-disambiguation"
WIKIPEDIA_TRIE_FILE_NAME = "wikipedia_trie.pkl"
DBPEDIA_TRIE_FILE_NAME = "dbpedia_trie.pkl"
dbpedia_trie_file = hf_hub_download(repo_id=REPO_ID,
                                    repo_type='model',
                                    filename=DBPEDIA_TRIE_FILE_NAME,
                                    cache_dir=MODELS_CACHE_PATH)

In [None]:
dbpedia_trie_file

In [61]:
import pickle

with open(dbpedia_trie_file, "rb") as f:
    dbpedia_trie = pickle.load(f)

In [None]:
dbpedia_trie.trie_dict.keys()

In [None]:
dbpedia_trie.trie_dict[11401]

The trie is built with this code:
```python
self.trie = Trie(
    [
        self.tokenizer(e.name, return_tensors="pt")['input_ids'][0].tolist()
        for e in entities
    ]
)
```

## making microsoft ann_linker work on demo data
I could not install the package, and it was last updated four years ago, so I am skipping this approach.

## Spacy entity linker
ref: https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson/notebooks/notebook_video.ipynb

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
for ent in doc.ents:
    print(f"Named Entity '{ent.text}' with label '{ent.label_}'")

In [4]:
import csv
from pathlib import Path

def load_entities(entities_loc: "str | Path | None" = None):
    """Read the entity CSV and return ``(names, descriptions)`` keyed by QID.

    Each non-empty row is read as ``qid, name, description`` (columns 0, 1, 2);
    any further columns are ignored.

    Args:
        entities_loc: Path of the CSV file. When omitted, the file
            ``../data/test-spacy/entities.csv`` relative to the current working
            directory is used, as before.

    Returns:
        A tuple of two dicts: QID -> name and QID -> description.

    Raises:
        IndexError: if a non-empty row has fewer than three columns.
    """
    if entities_loc is None:
        entities_loc = Path.cwd().parent / "data" / "test-spacy" / "entities.csv"  # distributed alongside this notebook

    names = dict()
    descriptions = dict()
    # newline="" is what the csv module expects: it lets the reader handle line
    # endings itself, including newlines embedded in quoted fields.
    with Path(entities_loc).open("r", encoding="utf8", newline="") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline);
                # skip them instead of failing on row[0].
                continue
            qid = row[0]
            names[qid] = row[1]
            descriptions[qid] = row[2]
    return names, descriptions

In [None]:
name_dict, desc_dict = load_entities()
for QID in name_dict.keys():
    print(f"{QID}, name={name_dict[QID]}, desc={desc_dict[QID]}")

In [8]:
from spacy.kb import InMemoryLookupKB
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=300)

In [None]:
# entities.jsonl
# entity_id, optional:entity_name, entity_description, corpus_frequency, optional:label
# {"id":"a6","name":"Statistics","description":"Statistics deals with all aspects of data collection, organization, analysis, interpretation, and presentation.","label":"SKILL"}

# aliases.jsonl
# name, entities, probabilities
# {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

In [10]:
for qid, desc in desc_dict.items():
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here

In [11]:
for qid, name in name_dict.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)

In [None]:
# "Emerson" is ambiguous, so it is linked to every entity in the KB with an equal
# prior. One probability is needed per entity and the sum should be <= 1, so the
# prior is derived from the number of entities instead of a fixed 3-item list.
# With exactly three entities this still yields [0.3, 0.3, 0.3].
emerson_qids = list(name_dict)
emerson_prior = min(0.3, 1.0 / max(len(emerson_qids), 1))
kb.add_alias(
    alias="Emerson",
    entities=emerson_qids,
    probabilities=[emerson_prior] * len(emerson_qids),
)

In [None]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

In [None]:
print(f"Candidates for 'Roy Stanley Emerson': {[c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')]}")
print(f"Candidates for 'Emerson': {[c.entity_ for c in kb.get_alias_candidates('Emerson')]}")
print(f"Candidates for 'Sofie': {[c.entity_ for c in kb.get_alias_candidates('Sofie')]}")

In [15]:
# change the directory and file names to whatever you like
import os  # kept: other cells may rely on `os` being imported here
output_dir = Path.cwd().parent / "data" / "spacy_el_output"
# parents=True also creates a missing ../data directory; exist_ok=True makes the
# cell safe to re-run and removes the check-then-create race of os.path.exists.
output_dir.mkdir(parents=True, exist_ok=True)
kb.to_disk(output_dir / "my_kb")

In [16]:
nlp.to_disk(output_dir / "my_nlp")

In [17]:
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL


In [None]:
def create_kb(vocab) -> InMemoryLookupKB:
    """Skeleton of a function that builds a knowledge base for a given vocab.

    This is a template: the ``...`` arguments below are placeholders that must
    be replaced with real entity and alias data before the function is used.

    Args:
        vocab: passed straight through to ``InMemoryLookupKB``.

    Returns:
        The ``InMemoryLookupKB`` instance created here.
    """
    # NOTE(review): uses entity_vector_length=128, while the KB cell earlier in
    # this notebook uses 300 — confirm which one is intended.
    kb = InMemoryLookupKB(vocab, entity_vector_length=128)
    kb.add_entity(...)  # placeholder arguments
    kb.add_alias(...)  # placeholder arguments
    return kb

In [18]:
from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
config = {
   "labels_discard": [],
   "n_sents": 1,
   "incl_prior": True,
   "incl_context": True,
   "model": DEFAULT_NEL_MODEL,
   "entity_vector_length": 300,
   "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
   "threshold": None,
}
entity_linker = nlp.add_pipe("entity_linker", config=config)

In [20]:
from spacy.ml.models import load_kb

entity_linker.set_kb(lambda vocab: load_kb(output_dir / "my_kb"))
# entity_linker.initialize(lambda: examples, nlp=nlp, kb_loader=my_kb)

In [None]:
from spacy.pipeline import EntityLinker
entity_linker = EntityLinker(nlp.vocab, DEFAULT_NEL_MODEL, name="entity_linker", cds)

In [None]:
from spacy.pipeline import EntityLinker
from spacy.kb import Candidate

# candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)

entity_linker = EntityLinker(
    nlp.vocab,
    DEFAULT_NEL_MODEL,
    entity_vector_length=300,
    get_candidates=lambda kb, span: None,
)

In [None]:


nlp = spacy.load("en_core_web_md")
doc = nlp("Tennis champion Emerson was expected to win Wimbledon.")
entity_linker = nlp.add_pipe("entity_linker")
entity_linker.set_kb(lambda vocab: load_kb(output_dir / "my_kb"))
# This usually happens under the hood
processed = entity_linker(doc)


In [None]:
processed

In [None]:
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)

## implementing my own entity linker based on microsoft/spacy_ann_linker

In [None]:
from src.ann_linker.dag import entities, aliases, nlp, kb

In [2]:
from src.ann_linker.linker import AnnLinker

In [3]:
# Each callable imported from src.ann_linker.dag is invoked and its result is
# bound to the SAME name, so after this cell `entities`, `aliases` and `nlp`
# refer to the returned objects rather than to the imported callables.
# NOTE(review): re-running this cell without first re-running the import cell
# will presumably fail or misbehave, because the names no longer point at the
# original callables — confirm, and consider distinct names for the results.
entities = entities()
aliases = aliases()
nlp = nlp()

In [None]:
kb = kb(entities, aliases)

In [None]:
kb.get_alias_candidates("ML")

In [None]:
candidate_entities = kb.get_entity_candidates("ML")
candidate_entities

In [None]:
[e for e in entities if e.entity_id in candidate_entities]

In [8]:
doc_embedding = kb._embed("Linear regression is one of the first statistical models used by students of ML")

In [None]:
kb.disambiguate(candidate_entities, doc_embedding)

In [None]:
kb.get_alias_candidates("learning")

In [11]:
# Register an entity ruler that tags every known alias, plus the extra surface
# form 'machine learn', with the SKILL label.
ruler = nlp.add_pipe('entity_ruler')
skill_surface_forms = [a.alias for a in aliases]
skill_surface_forms.append('machine learn')
patterns = []
for surface_form in skill_surface_forms:
    patterns.append({"label": "SKILL", "pattern": surface_form})
ruler.add_patterns(patterns)

In [12]:
ann_linker = nlp.add_pipe("ann_linker", last=True)
ann_linker.set_kb(kb)

In [13]:
doc = nlp("NLP is a subset of machine learn.")

In [None]:
doc.ents

In [None]:
kb.get_candidates_batch(doc.ents)

In [None]:
for ent in doc.ents:
    print(ent.kb_id_)

In [None]:
kb.get_alias_candidates("machine learn")

In [None]:
kb.get_entity_candidates("machine learn")

In [None]:
doc_embedding = kb._embed(doc.text)
kb.disambiguate(kb.get_entity_candidates("machine learn"), doc_embedding)

In [None]:
doc.ents[0]._.alias_candidates

In [None]:
doc.ents[0]._.kb_candidates

In [None]:
doc.ents[1]._.alias_candidates

In [None]:
doc.ents[1]._.kb_candidates

In [32]:
scanner = tbl._dataset.scanner(columns=["alias.alias"])

In [34]:
# Gather the distinct values of the "alias.alias" column across all scanner batches.
unique_labels = {
    label
    for batch in scanner.to_batches()
    for label in batch.column("alias.alias").to_pylist()
}

len(unique_labels)

In [None]:
alias_records

In [14]:
import json

In [29]:
import pandas as pd

In [32]:
import numpy as np

In [29]:
class AliasRawData(TypedDict):
    alias: str
    entity: int


def load_aliases(
    icij_path: str | pathlib.Path = "data/ICIJ-entity-report-2024-06-21_12-04-57-std.json",
) -> list[AliasRawData]:
    alias_records: list[AliasRawData] = []

    with open(icij_path, "r", encoding="utf-8") as fp:
        while line := fp.readline():
            dat = json.loads(line.strip())

            # add aliases from resolved entities
            entity: dict = dat["RESOLVED_ENTITY"]
            if not entity["ENTITY_NAME"]:
                continue
            for record in entity["RECORDS"]:
                alias_records.append(
                    {"alias": entity["ENTITY_NAME"], "entity": record["INTERNAL_ID"]}
                )

            # add aliases from related entities
            related_entities: dict = dat["RELATED_ENTITIES"]
            for record in related_entities:
                # MATCH_LEVEL_CODE is either POSSIBLY_SAME or POSSIBLY_RELATED or RESOLVED or DISCLOSED
                # we choose to add an alias record if POSSIBLY_SAME
                if record["MATCH_LEVEL_CODE"] in ["POSSIBLY_SAME", "RESOLVED", "DISCLOSED"]:
                    alias_records.append(
                        {"alias": entity["ENTITY_NAME"], "entity": record["ENTITY_ID"]}
                    )
                # and discard if POSSIBLY_RELATED
                elif record["MATCH_LEVEL_CODE"] == "POSSIBLY_RELATED":
                    continue

    return alias_records

def generate_aliases(raw_aliases: "list[AliasRawData]") -> pd.DataFrame:
    """Aggregate raw alias records into one row per alias with prior probabilities.

    Records are grouped by ``alias``. For each alias the distinct entity ids
    are listed in order of first appearance, together with the share of that
    alias's records pointing at each entity.

    Args:
        raw_aliases: Records with ``alias`` and ``entity`` keys; duplicates
            act as counts.

    Returns:
        A DataFrame with columns ``alias``, ``entities`` (list of entity ids)
        and ``probabilities`` (list of floats, aligned with ``entities``, summing
        to 1 per row). Empty input yields an empty frame with these columns.
    """
    columns = ["alias", "entities", "probabilities"]
    if len(raw_aliases) == 0:
        # from_records([]) has no "alias" column, so groupby would raise KeyError.
        return pd.DataFrame(columns=columns)

    def _priors(counts: Counter) -> list:
        # Relative frequency of each entity, in the Counter's insertion order so
        # the list lines up with list(counts).
        total = sum(counts.values())
        return [count / total for count in counts.values()]

    df = (
        pd.DataFrame.from_records(raw_aliases)
        .groupby("alias")
        .agg(counts=("entity", Counter))
        .assign(entities=lambda d: d.counts.apply(list))
        # The column is spelled "probabilities" to match the aliases.jsonl schema
        # noted earlier in this notebook (it was previously written as "probabilites").
        .assign(probabilities=lambda d: d.counts.apply(_priors))
        .drop(columns="counts")
        .reset_index()
    )
    return df


def write_aliases(
    aliases: pd.DataFrame, filepath: str | pathlib.Path = "data/senzing/aliases.jsonl"
):
    aliases.to_json(filepath, orient="records", lines=True)

In [30]:
raw_aliases = load_aliases()

In [31]:
aliases = generate_aliases(raw_aliases)

In [None]:
aliases.head()

In [None]:
aliases.head()

In [33]:
write_aliases(aliases)

## Cherry pick the entities 

In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
import srsly

from spacy_lancedb_linker.kb import AnnKnowledgeBase
from spacy_lancedb_linker.linker import AnnLinker  # noqa
from spacy_lancedb_linker.types import Alias, Entity
from src.scraper import SPACY_MODEL

In [15]:
from src.senzing_pipeline import load_aliases, load_countries, load_entities

The pipeline is:
- take the DocBin of articles
- collect the named entities (NER) and noun chunks found in them
- pre-filter the Senzing results down to the entities matching those mentions
- build entity summaries and aliases for that pre-filtered set
- load the spaCy model for entity linking (EL) and run EL

In [None]:
countries = load_countries()
raw_entities = load_entities()

In [None]:
raw_aliases = load_aliases()

In [None]:
set(a["type"] for a in raw_aliases)

In [9]:
from src.senzing_pipeline import generate_patterns

In [10]:
patterns = generate_patterns(raw_aliases)

In [17]:
nlp = spacy.load(SPACY_MODEL, exclude=["ner"])

In [9]:
# disabled = nlp.select_pipes(disable="ner")
# doc = nlp("I won't have named entities")
# disabled.restore()

In [13]:
ruler = nlp.add_pipe("entity_ruler")
with nlp.select_pipes(enable="tagger"):
    ruler.add_patterns(patterns)

In [None]:
nlp.pipe_names

In [20]:
nlp = spacy.load(SPACY_MODEL)

In [35]:
doc_bin = DocBin().from_disk(path="data/dataset.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

In [25]:
matched = set(ent.text for doc in nlp.pipe(docs) for ent in doc.ents)

- use aliases.jsonl to define the initial group of entities
- filter aliases.jsonl down to the entities in that group
- filter entities.jsonl down to the entities in that group, plus their friends (directly related entities) and friends of friends

In [36]:
docs = list(nlp.pipe(docs))

In [None]:
displacy.render(docs[1], style="ent")

In [None]:
# Run the pipeline once and collect the ids of all matched entities. Previously
# this set was built inside the comprehension's condition, which re-ran
# nlp.pipe(docs) once per pattern.
matched_ent_ids = {ent.ent_id_ for doc in nlp.pipe(docs) for ent in doc.ents}
[p for p in patterns if p['id'] in matched_ent_ids]

In [29]:
matched_ids = set(p['id'] for p in patterns if p['pattern'] in matched)

In [33]:
filtered_entities = {k: v for k, v in raw_entities.items() if str(k) in matched_ids}

In [37]:
filtered_aliases = [alias for alias in raw_aliases if str(alias["entity"]) in matched_ids]

In [None]:
filtered_aliases

In [None]:
raw_entities[918573]

In [1]:
from src.neo4j import extract_senzing_results

In [None]:
entities = extract_senzing_results("data/ICIJ-entity-report-2024-06-21_12-04-57-std.json")

In [41]:
with open("data/icij-example/suspicious.txt") as file:
    names = [line.rstrip() for line in file]

In [46]:
import pandas as pd

In [None]:
# Index the entities by name once, so that each suspicious name is a dictionary
# lookup instead of a scan over every entity. Insertion order is preserved, so
# each list holds the same entities in the same order as a filter over
# entities.items() would produce.
entities_by_name = {}
for entity in entities.values():
    entities_by_name.setdefault(entity.name, []).append(entity)

df = pd.DataFrame.from_records(
    [(name, entities_by_name.get(name, [])) for name in names],
    columns=['suspicion', 'matches']
)
df

In [None]:
df.matches.explode().dropna().apply(lambda d: d.related.keys()).explode().unique()

In [53]:
rank_0 = df.matches.explode().dropna().apply(lambda d: d.entity_uid).unique()

In [59]:
rank_1 = df.matches.explode().dropna().apply(lambda d: d.related.keys()).explode().unique()

In [69]:
rank_2 = set(ent_id for seed_id in set(rank_0) | set(rank_1) for ent_id in entities[seed_id].related.keys())

In [None]:
from src.senzing_pipeline import filter_senzing

In [None]:
entity_ids = filter_senzing()

In [None]:
entity_ids

In [None]:
    raw_entities = load_entities()
    raw_aliases = load_aliases()

In [79]:
entity_ids = set(str(ent_id) for ent_id in entity_ids)

In [83]:
    filtered_entities = {k: v for k, v in raw_entities.items() if str(k) in entity_ids}
    filtered_aliases = [alias for alias in raw_aliases if str(alias["entity"]) in entity_ids]

In [None]:
    entities = generate_entities(filtered_entities, countries)
    write_entities(entities)