In [1]:
from operator import itemgetter
from pathlib import Path
from typing import Iterator

import numpy as np
import pandas as pd
import prodigy
import rapidfuzz
import spacy
from numpy import dot
from numpy.linalg import norm
from prodigy.components.filters import filter_duplicates
from prodigy.components.loaders import TXT
from prodigy.models.ner import EntityRecognizer
from prodigy.util import set_hashes
from spacy.kb import KnowledgeBase, Candidate  # , get_candidates

from gu_model.trf_tensor_to_vec import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the serialised knowledge base that is read from disk below.
kb_loc='assets/kb_full_2022_10_26'
# Load the packaged NER pipeline, with the listed components disabled.
nlp = spacy.load('gu_model/en_ner_guardian-1.0.3/en_ner_guardian/en_ner_guardian-1.0.3',
                     disable=['transformer', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler'])
# NOTE(review): 'tensor2attr' is not defined in this notebook; presumably the
# star import from gu_model.trf_tensor_to_vec registers it -- confirm.
nlp.add_pipe('tensor2attr')
# Create a knowledge base bound to the pipeline's vocab, then populate it from kb_loc.
# NOTE(review): entity_vector_length=1 looks like a placeholder for the
# constructor; verify it is compatible with the vectors stored on disk.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
kb.from_disk(kb_loc)
# NOTE(review): `model` is not referenced by any other cell visible in this notebook.
model = EntityRecognizer(nlp)



In [3]:
##

In [4]:
##

In [5]:
def get_candidates_from_fuzzy_matching(span, kb, matching_thres=60) -> tuple:
    """
    Collect knowledge-base candidates whose alias fuzzily matches a mention.

    Every alias string in the KB is compared case-insensitively with ``span``
    using ``rapidfuzz.fuzz.WRatio``. Aliases scoring at least
    ``matching_thres`` are kept, and the candidates registered for each kept
    alias are gathered from the KB.

    Args:
        span: Mention text (a string) to look up.
        kb: Knowledge base exposing ``get_alias_strings()`` and
            ``get_alias_candidates(alias)``.
        matching_thres: Minimum fuzzy score an alias must reach to be kept.

    Returns:
        A ``(candidates, matches)`` tuple. ``candidates`` is a list holding
        the candidates of every matching alias, in alias order. ``matches``
        maps each matching alias to its fuzzy score. Both are empty when no
        alias reaches the threshold.
    """
    # Lower-case the mention once instead of once per alias.
    span_lower = span.lower()

    matches = {}
    for alias in kb.get_alias_strings():
        score = rapidfuzz.fuzz.WRatio(span_lower, alias.lower())
        if score >= matching_thres:
            matches[alias] = score

    candidates = []
    for alias in matches:
        candidates.extend(kb.get_alias_candidates(alias))
    return candidates, matches


def order_candidates_fuzzy_score(candidates, matches, candidate_limit=12):
    """
    Order candidates by descending fuzzy name matching score.

    One candidate is returned per entity. When the same entity appears under
    several matching aliases, the candidate whose alias scored highest
    represents it, so the ranking does not depend on the order in which the
    aliases were visited.

    Args:
        candidates: Iterable of objects exposing ``entity_`` (entity id) and
            ``alias_`` (the alias that produced the candidate).
        matches: Mapping from alias to fuzzy score; must contain the alias of
            every candidate.
        candidate_limit: Maximum number of candidates to return.

    Returns:
        A list of at most ``candidate_limit`` candidates, best score first.
        Entities with equal scores keep their first-seen order.
    """
    best_candidate = {}
    best_score = {}
    for candidate in candidates:
        qid = candidate.entity_
        score = matches[candidate.alias_]
        # Keep only the strongest alias per entity; a later, weaker alias must
        # not overwrite an earlier, better one.
        if qid not in best_score or score > best_score[qid]:
            best_candidate[qid] = candidate
            best_score[qid] = score

    # sorted() is stable, so ties preserve insertion (first-seen) order.
    ranked = sorted(best_score.items(), key=itemgetter(1), reverse=True)
    return [best_candidate[qid] for qid, _ in ranked[:candidate_limit]]

In [6]:
# Load the pre-built entity CSV and derive two lookup tables from it:
# one carrying the KB URL of each entity, one carrying its description.
entity_loc = 'kb_datasets/kb_entities_full_2022_10_26.csv'
kb_entities = pd.read_csv(entity_loc, index_col=0)
# Cast every text column to str in a single call.
kb_entities = kb_entities.astype({'id': str, 'name': str, 'desc': str, 'kb_url': str})
kb_entities_url = kb_entities[['id', 'name', 'kb_url']]
kb_entities = kb_entities[['id', 'name', 'desc']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
kb_entities['desc_len']=kb_entities['desc'].str.len()

In [31]:
span = 'Cruz'
candidates, matches = get_candidates_from_fuzzy_matching(span, kb)

In [32]:
matches=pd.DataFrame.from_dict(matches, orient='index').reset_index().rename(columns={'index':'name',0:'score'})

In [33]:
matches

Unnamed: 0,name,score
0,Adrian Păunescu,60.0
1,Rudolf Keller,60.0
2,Rui Namorado,60.0
3,Timothy R Schwarz,60.0
4,Rune J. Skjælaaen,60.0
...,...,...
4960,Adriana Cruz,90.0
4961,Stephan Schwarz,60.0
4962,Ruth Chapman,60.0
4963,Ruslan Abdullaev,60.0


In [34]:
matches=matches.merge(kb_entities, on=['name'], how='left')

In [35]:
score_thres=matches['score'].quantile(0.8)

In [36]:
matches=matches[matches['score']>=score_thres]

In [37]:
matches.shape

(1230, 5)

In [38]:
matches.sort_values(by='score', ascending=False)

Unnamed: 0,name,score,id,desc,desc_len
5102,Adriana Cruz,90.0,371758,Adriana Cruz is a ED Economic Development & To...,181
4488,Melvin Geovanni Alvarenga Cruz,90.0,interpol-red-2018-64668,Melvin Geovanni Alvarenga Cruz is a AGRUPACION...,234
2024,Geraldo Cruz,90.0,Q28678162,Geraldo Cruz is a Brazilian politician. This p...,315
3848,Juan Jose Moreno Cruz,90.0,interpol-red-2018-21242,Juan Jose Moreno Cruz is a HOMICIDIO AGRAVADO ...,200
2027,Heidi Cruz,90.0,Q20165784,Heidi Cruz is a American investment manager at...,248
...,...,...,...,...,...
2151,Sophie Cluzel,67.5,Q3490886,Sophie Cluzel is a French politician. This per...,329
2152,Sophie Cluzel,67.5,us-cia-france-sophie-cluzel-min-for-disabled-p...,This person is called Sophie Cluzel. This pers...,210
2153,Caron Crummey,67.5,100010,"Caron Crummey is a Lobbyist, Hinman Straub. Ca...",1402
2158,Olive Mary Scruton,67.5,Q76102774,Olive Mary Scruton is a Peerage person ID=5324...,107


In [39]:
matches=matches.sort_values(by=['score','desc_len'], ascending=False)

In [40]:
matches=matches.reset_index(drop=True)

In [41]:
matches.shape

(1230, 5)

In [42]:
matches.head()

Unnamed: 0,name,score,id,desc,desc_len
0,Lupe De La Cruz,90.0,278841,Lupe De La Cruz is a Senior Director of Gover...,2555
1,Heidi Nelson Cruz,90.0,133404,"Heidi Nelson Cruz is a Vice President, Goldman...",1986
2,Cecily Myart-Cruz,90.0,404824,Cecily Myart-Cruz is a President United Teache...,1811
3,Juan Cruz,90.0,332624,Juan Cruz is a Superintendent Franklin McKinle...,1768
4,Edward Cruz,90.0,226428,"Edward Cruz is a Founded E. E. Cruz & Company,...",1106


In [43]:
sentence="Will Johnson be any more successful at cracking housing supply issues than previous governments? Many are pinning their hopes on the man who will be a crucial aide in Johnson’s government, his new chief of staff, Sir Edward Lister."

In [44]:
def embed_text(text, nlp):
    """
    Run ``text`` through the ``nlp`` pipeline and return the ``vector``
    attribute of the resulting document.
    """
    doc = nlp(text)
    return doc.vector

In [45]:
matches.shape

(1230, 5)

In [46]:
matches.head()

Unnamed: 0,name,score,id,desc,desc_len
0,Lupe De La Cruz,90.0,278841,Lupe De La Cruz is a Senior Director of Gover...,2555
1,Heidi Nelson Cruz,90.0,133404,"Heidi Nelson Cruz is a Vice President, Goldman...",1986
2,Cecily Myart-Cruz,90.0,404824,Cecily Myart-Cruz is a President United Teache...,1811
3,Juan Cruz,90.0,332624,Juan Cruz is a Superintendent Franklin McKinle...,1768
4,Edward Cruz,90.0,226428,"Edward Cruz is a Founded E. E. Cruz & Company,...",1106


In [47]:
# Boost by 5% the score of rows whose id starts with "Q" followed by a digit.
# The pattern is a raw string so that `\d` is not an invalid string escape,
# and na=False keeps rows with a missing id out of the mask instead of
# breaking the boolean indexing.
# NOTE: this cell is not idempotent -- every re-run multiplies the score again.
has_q_id = matches['id'].str.match(r'Q\d', na=False)
matches.loc[has_q_id, 'score'] = matches.loc[has_q_id, 'score'] * 1.05

In [48]:
max_len=2000
min_len=matches['desc_len'].min()

In [49]:
matches['normalised_desc_len']=(matches['desc_len']-min_len)/(max_len-min_len)

In [50]:
# Cap the normalised description length at 1 (descriptions longer than max_len
# would otherwise exceed it). Vectorised clip instead of a per-row lambda.
matches['normalised_desc_len'] = matches['normalised_desc_len'].clip(upper=1)

In [51]:
matches.sort_values(by=['score'], ascending=False).head(10)

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len
216,Victorino Cruz Campos,94.5,Q111519038,Victorino Cruz Campos is a Mexican politician....,205,0.092518
99,Anthony Renson Novoa Cruzado,94.5,Q84285197,Anthony Renson Novoa Cruzado is a político per...,279,0.129929
92,Santiago José Rivero Cruz,94.5,Q107551116,Santiago José Rivero Cruz is a Spanish politic...,294,0.137513
93,Armando Da Cruz Neto,94.5,Q111937228,Armando Da Cruz Neto is a military officer in ...,293,0.137007
222,Verónica Ramos Cruz,94.5,Q111518044,Verónica Ramos Cruz is a Mexican politician. T...,203,0.091507
96,Esther Mejía Cruz,94.5,Q98397314,María Esther Mejía Cruz This person has held t...,288,0.134479
97,Emilia Molina Cruz,94.5,Q16728826,Emilia Molina Cruz is a Costa Rican politician...,287,0.133974
98,Cinchona Cruz-Gonzales,94.5,Q12962885,Cinchona Cruz-Gonzales is a Philippine politic...,280,0.130435
100,Cruz María Zeta Chunga,94.5,Q108414771,Cruz María Zeta Chunga is a política peruana. ...,279,0.129929
90,Sophie Cruz,94.5,Q62887790,Sophie Cruz is a Frans politica. This person h...,302,0.141557


In [54]:
matches.sort_values(by=['score','normalised_desc_len'], ascending=False).head(50)

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len
9,Pedro Manuel Da Cruz Roseta,94.5,Q10347591,Pedro Manuel Da Cruz Roseta is a Portuguese po...,521,0.252275
11,Eufrosina Cruz,94.5,Q5406856,Eufrosina Cruz is a The first indigenous woman...,473,0.228008
12,Marta Rivera De La Cruz,94.5,Q3394433,Marta Rivera De La Cruz is a Spanish writer. T...,472,0.227503
13,Emilio Sáez Cruz,94.5,Q58875521,Emilio Sáez Cruz is a Spanish politician. This...,449,0.215875
14,Nilsa Cruz-Perez,94.5,Q7037445,Nilsa Cruz-Perez is a American politician. Thi...,446,0.214358
16,Paula Teixeira Da Cruz,94.5,Q3075716,Paula Teixeira Da Cruz is a Portuguese politic...,441,0.21183
17,Geraldo Magela Da Cruz Quintão,94.5,Q5441710,Geraldo Magela Da Cruz Quintão is a Brazilian ...,439,0.210819
18,Manuel Cruz,94.5,Q5992712,Manuel Cruz is a Spanish politician. This pers...,439,0.210819
19,Diego Cruz Torrijos,94.5,Q55553301,Diego Cruz Torrijos is a Spanish politician. T...,439,0.210819
20,Simon Tensing De Cruz,94.5,Q7519877,Simon Tensing De Cruz is a Singaporean diploma...,430,0.206269


In [57]:
matches[matches['name'].str.contains('Ted ' + span)]

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len
6,Ted Cruz,90.0,96583,Ted Cruz is a Republican U.S. Senator from Tex...,687,0.336198


In [55]:
def calculate_cosine_similarity(descriptions_vec, vector_ref_sentence):
    """
    Return a dictionary mapping the kb entity id to the cosine similarity
    between its embedded description and the reference vector.

    Args:
        descriptions_vec: Mapping from entity id to description vector.
        vector_ref_sentence: Reference vector every description is compared to.

    Returns:
        A dict with the same keys (and key order) as ``descriptions_vec``.
        The similarity is 0 when either vector has zero norm or when the
        result would be NaN.
    """
    # The reference norm does not change between entities: compute it once.
    ref_norm = norm(vector_ref_sentence)

    similarity = {}
    for entity_id, vector_desc in descriptions_vec.items():
        denominator = ref_norm * norm(vector_desc)
        if denominator == 0:
            # A zero vector has no direction; skip the 0/0 division (and the
            # RuntimeWarning it raises) and report no similarity.
            similarity[entity_id] = 0.0
            continue
        score = dot(vector_ref_sentence, vector_desc) / denominator
        # `nan` must be passed by keyword: the second positional parameter of
        # nan_to_num is `copy`, not the replacement value.
        similarity[entity_id] = np.nan_to_num(score, nan=0.0)
    return similarity

In [56]:
def get_context_similarity(text, nlp, matches, candidate_limit=100):
    """
    Score candidate entities by how similar their description embedding is
    to the embedding of ``text``.

    Args:
        text: Context text; it is embedded with ``embed_text(text, nlp)``.
        nlp: Pipeline object forwarded to ``embed_text``.
        matches: DataFrame with an ``id`` column and a ``desc_embedding``
            column holding one description vector per row. Columns are read
            by name, so their position in the frame does not matter.
        candidate_limit: Currently unused; kept so existing calls keep
            working. No top-N selection is applied to the result.

    Returns:
        A DataFrame with the columns ``id`` and ``similarity``, one row per
        distinct id in ``matches``.

    Raises:
        KeyError: If ``matches`` lacks the ``id`` or ``desc_embedding`` column.
    """
    missing = [col for col in ('id', 'desc_embedding') if col not in matches.columns]
    if missing:
        raise KeyError(f"matches is missing required column(s): {missing}")

    vector_ref_sentence = embed_text(text, nlp)

    # Rows sharing an id collapse to the last one seen.
    descriptions_vec = dict(zip(matches['id'], matches['desc_embedding']))

    similarity = calculate_cosine_similarity(descriptions_vec, vector_ref_sentence)
    # Build the frame column by column so that an empty result still carries
    # both expected columns.
    return pd.DataFrame({
        'id': list(similarity.keys()),
        'similarity': list(similarity.values()),
    })

In [27]:
similarity_df=get_context_similarity(sentence, nlp,  matches)



In [29]:
matches=matches.merge(similarity_df, on='id')

In [30]:
matches.head()

Unnamed: 0,name,score,id,desc,desc_len,desc_embedding,similarity
0,Jo Johnson,95.0,Q728063,Jo Johnson is a British politician MP (born 19...,672,"[0.2822974, 0.24237156, -0.12491302, 0.2132171...",0.983038
1,Bo Johnson,95.0,116082,"Bo Johnson is a Lobbyist, Corrections Corp of ...",100,"[0.22940476, 0.07866096, -0.23129657, 0.022621...",0.938399
2,Ed Johnson,95.0,315140,This person is called Ed Johnson. This per...,80,"[0.3014737, -0.043383293, -0.17395419, -0.0418...",0.928145
3,Jo Johnson,95.0,55090,Jo Johnson is a Prospective Parliamentary Cand...,66,"[0.38097247, -0.09801138, -0.19058442, 0.15013...",0.924945
4,Rj Johnson,95.0,425078,RJ Johnson is a Top political advisor to Scott...,54,"[0.3856456, -0.16109328, -0.24016757, 0.184732...",0.902954


In [62]:
matches['composit_score']= matches['score'] * matches['similarity'] * matches['normalised_desc_len']

In [63]:
matches.sort_values('composit_score', ascending=False).head(20)

Unnamed: 0,name,id,desc,desc_len,desc_embedding,similarity,score,normalised_desc_len,composit_score
55,Imogene Powers Johnson,15168,"Imogene Powers Johnson, the billionaire widow...",2083,"[0.31178147, 0.3211938, -0.09199102, 0.1636720...",0.990551,90.0,1.0,89.14959
58,Emory Johnson,164375,Emory Johnson is a Professor of Transportation...,2011,"[0.2981705, 0.23666798, -0.14443867, 0.0693922...",0.984195,90.0,1.0,88.577587
34,Sue Clark-Johnson,178887,"Sue Clark-Johnson is a Former Publisher, Arizo...",2375,"[0.30413514, 0.3030189, -0.14652178, 0.1279459...",0.98399,90.0,1.0,88.559128
22,Frederick Ross Johnson,149261,Frederick Ross Johnson is a CEO of RJR Nabisco...,2721,"[0.33529863, 0.3185673, -0.15305309, 0.0719855...",0.983683,90.0,1.0,88.531485
23,Jennifer M Johnson,186127,"Jennifer M Johnson is a President and COO, Fra...",2713,"[0.33056182, 0.24376902, -0.14681502, 0.124611...",0.983338,90.0,1.0,88.500436
7,Crawford Toy Johnson Iii,182518,Crawford Toy Johnson III is a Former Chairman ...,5272,"[0.31047255, 0.32346684, -0.1556873, 0.1069957...",0.982743,90.0,1.0,88.446829
45,Mark Johnson,175844,"Mark Johnson is a COO, Civilian Marksmanship P...",2126,"[0.32258293, 0.26170307, -0.19030425, 0.044983...",0.980089,90.0,1.0,88.207973
56,Ashland Johnson,390453,"Ashland Johnson is a Attorney, National Center...",2068,"[0.2945806, 0.22795784, -0.12959296, 0.0749545...",0.978972,90.0,1.0,88.107525
15,Linda E Johnson,227813,"Linda E Johnson is a President and CEO, Brookl...",3519,"[0.31697896, 0.27082464, -0.17599843, 0.046157...",0.978441,90.0,1.0,88.059728
21,E Christopher Johnson Jr,386811,E Christopher Johnson Jr is a CEO and Co-Found...,2941,"[0.330057, 0.21661668, -0.18811746, 0.07042361...",0.974086,90.0,1.0,87.66775


In [108]:
matches.sort_values(by='score',ascending=False)

Unnamed: 0,name,score,id,desc,desc_len,desc_embedding,similarity
0,Jo Johnson,95.000000,Q728063,Jo Johnson is a British politician MP (born 19...,672,"[0.2822974, 0.24237156, -0.12491302, 0.2132171...",0.983038
2,Ed Johnson,95.000000,315140,This person is called Ed Johnson. This per...,80,"[0.3014737, -0.043383293, -0.17395419, -0.0418...",0.928145
3,Jo Johnson,95.000000,55090,Jo Johnson is a Prospective Parliamentary Cand...,66,"[0.38097247, -0.09801138, -0.19058442, 0.15013...",0.924945
4,Rj Johnson,95.000000,425078,RJ Johnson is a Top political advisor to Scott...,54,"[0.3856456, -0.16109328, -0.24016757, 0.184732...",0.902954
5,Ej Johnson,95.000000,331445,EJ Johnson is a Son of Magic Johnson.,37,"[0.39550623, -0.13490239, -0.26468384, 0.17075...",0.886878
...,...,...,...,...,...,...,...
2182,Johannes Poulsen,69.230769,Q12320116,Johannes Poulsen is a Danish politician. This ...,305,"[0.31842715, 0.2602151, -0.0374893, 0.24306317...",0.984012
2183,Johannes Gilleberg,69.230769,Q6216704,Johannes Gilleberg is a Norwegian politician. ...,304,"[0.30388454, 0.31487128, -0.093770236, 0.23683...",0.968860
2184,Sig Hutchinson,69.230769,315082,Sig Hutchinson is a Sig Hutchinson LLC/Wake Co...,301,"[0.24709877, 0.08559995, -0.19148602, 0.053567...",0.940625
2185,John Okechukwuemeka,69.230769,Q6251174,John Okechukwuemeka is a Nigerian politician. ...,298,"[0.32186395, 0.2897311, -0.0674257, 0.27010944...",0.979908


In [111]:
matches.sort_values(by='similarity',ascending=False).head(20)

Unnamed: 0,name,score,id,desc,desc_len,desc_embedding,similarity
29,Imogene Powers Johnson,90.0,15168,"Imogene Powers Johnson, the billionaire widow...",2083,"[0.31178147, 0.3211938, -0.09199102, 0.1636720...",0.990551
1854,Johanna Quandt,69.230769,76711,Johanna Quandt is a Widow held 17% of BMW. For...,1684,"[0.28511798, 0.32589412, -0.1012195, 0.1773893...",0.989534
1884,John Sd Eisenhower,69.230769,50130,"John SD Eisenhower is a US Army General, Milit...",1178,"[0.27957487, 0.30162582, -0.10646973, 0.191410...",0.989325
86,Gary E Johnson,90.0,34262,Gary E Johnson is a Former Governor of New Mex...,1203,"[0.32614115, 0.2979551, -0.15264152, 0.1700044...",0.989246
98,Clay A Johnson,90.0,36869,Clay A Johnson is a Director of Sunlight Labs....,1105,"[0.31782562, 0.31476963, -0.08849746, 0.170402...",0.988494
1860,John Stanley Pottinger,69.230769,360836,"John Stanley Pottinger is a Attorney, novelist...",1528,"[0.2854411, 0.28591397, -0.12673561, 0.1575661...",0.98785
93,Ronald B Johnson,90.0,2503,Ronald B Johnson is a Senior Vice President of...,1147,"[0.292732, 0.3121277, -0.15763398, 0.14204612,...",0.987792
65,Woody Johnson,90.0,46087,Woody Johnson is a Great-grandson of Johnson &...,1452,"[0.28356323, 0.32903042, -0.08760893, 0.121867...",0.987749
155,Kendra R Johnson,90.0,388307,This person is called Kendra R Johnson. Kendr...,703,"[0.30293167, 0.24478087, -0.16445631, 0.141636...",0.987638
1901,John Saunders,69.230769,182352,John Saunders is a ESPN Studio Host/Play-by-Pl...,1051,"[0.31606877, 0.32837492, -0.16691738, 0.130412...",0.987537


In [121]:
kb_entities[kb_entities['name']=='Boris Johnson']

Unnamed: 0,id,name,desc,desc_len
427360,Q180589,Boris Johnson,Boris Johnson is a Prime Minister of the Unite...,824


In [120]:
matches.sort_values(by=['score','similarity'], ascending=False).to_csv('similarity.csv')

In [25]:
kb_entities[kb_entities['name'].isin(candidates)]

Unnamed: 0,id,name,desc
236143,23759,Barry Strumpf,This person is called Barry Strumpf. This ...
245829,47307,Donald L Trump,"Donald L Trump is a President & CEO, Roswell P..."
266609,79677,Fred C Trump,Fred C Trump is a East coast real estate devel...
322132,176381,Stephanie Trump,"Stephanie Trump is a Co-Founder, Williams Isla..."
347611,219209,Mary Trump,"Mary Trump is a DJT niece, author of ""Too Much..."
347618,219217,Mary Macleod Trump,This person is called Mary Macleod Trump. Mrs...
356389,242036,Edmond Trump,Edmond Trump is a Miami billionaire real estat...
366407,260572,Robert Trumpbour,Robert Trumpbour is a Associate Professor of C...
392603,335179,Jules Trump,Jules Trump is a Billionaire from Miami. Th...
418528,408804,Ann Trump Daniel,Ann Trump Daniel is a Members of the UC Davis ...
