In [1]:
import string
from pathlib import Path
import nltk
import pandas as pd
import numpy as np
import prodigy
import rapidfuzz
import spacy
from nltk.tokenize import word_tokenize
from prodigy.models.ner import EntityRecognizer
from prodigy.util import set_hashes
from spacy.kb import KnowledgeBase, Candidate
from gu_model.trf_tensor_to_vec import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
span = 'Obama'
person_of_interest='Barack Obama'
sentence="Despite the busy run of the past two weeks, and his evident admiration for players such as Woods or the NFL's Tom Brady, Trump is trailing his predecessors in sporting celebrations. He has hosted 16 events, compared with 57 under Obama and 26 under George W Bush at the same point in their presidencies, according to a count by Mark Knoller, a CBS correspondent and keeper of White House statistics.”"


In [3]:
kb_loc='assets/kb_full_2022_10_26'
nlp = spacy.load('gu_model/en_ner_guardian-1.0.3/en_ner_guardian/en_ner_guardian-1.0.3',
                     disable=['transformer', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler'])
nlp.add_pipe('tensor2attr')
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
kb.from_disk(kb_loc)
model = EntityRecognizer(nlp)



In [4]:
def get_candidates_from_fuzzy_matching(span, kb, single_name=False, matching_thres=60):
    """
    Return a list of candidate entities for an alias based on fuzzy string matching.
    Each candidate defines the entity, the original alias,
    and the prior probability of that alias resolving to that entity.
    If the alias is not known in the KB, and empty list is returned.
    """
    aliases = kb.get_alias_strings()
    matches = {}
    for al in aliases:
        if single_name:
            # For single name mentions, e.g. Trump, use partial_ratio
            fuzzy_ratio = rapidfuzz.fuzz.partial_ratio(span.lower(), al.lower())
        else:
            # For multi name mentions, e.g. Donald Trump, use WRatio
            fuzzy_ratio = rapidfuzz.fuzz.WRatio(span.lower(), al.lower())
        if fuzzy_ratio >= matching_thres:
            matches[al] = fuzzy_ratio
    candidates = []
    for match in matches:
        candidates.extend(kb.get_alias_candidates(match))
    return candidates, matches


def relevant_lexicon(text, stopwords):
    # Extract the set of words from a text field
    punctuation_rm = str.maketrans('', '', string.punctuation)
    text = text.translate(punctuation_rm)
    word_set = set(word_tokenize(text))
    return word_set.difference(stopwords)

In [5]:
# Read the pre-defined CSV file into dictionaries mapping QIDs to the full names and descriptions
entity_loc = 'kb_datasets/kb_entities_full_2022_10_26.csv'
kb_entities = pd.read_csv(entity_loc, index_col=0)
# Select and format relevant columns
kb_entities = kb_entities[['id', 'name', 'desc', 'kb_url']]
kb_entities['id'] = kb_entities['id'].astype(str)
kb_entities['name'] = kb_entities['name'].astype(str)
kb_entities['desc'] = kb_entities['desc'].astype(str)
kb_entities['kb_url'] = kb_entities['kb_url'].astype(str)
# Add description length column
kb_entities['desc_len'] = kb_entities['desc'].str.len()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [18]:
kb_entities[kb_entities['name'].str.contains('Razan')]

Unnamed: 0,id,name,desc,kb_url,desc_len
4404,be-fod-logical-82071741372,Ramzan Lechievitch Makhauri,This person is called Ramzan Lechievitch Makha...,https://www.opensanctions.org/entities/be-fod-...,144
33287,interpol-red-2015-74054,Ramzan Yusupov,Ramzan Yusupov is a participation in illegal a...,https://www.opensanctions.org/entities/interpo...,263
33638,interpol-red-2016-45216,Ramzan Umarov,Ramzan Umarov is a participation in an illegal...,https://www.opensanctions.org/entities/interpo...,263
34036,interpol-red-2016-77067,Ramzan Murzaev,Ramzan Murzaev is a participation in an illega...,https://www.opensanctions.org/entities/interpo...,235
34108,interpol-red-2017-11362,Ramzan Usmakhadzhiev,Ramzan Usmakhadzhiev is a participation in the...,https://www.opensanctions.org/entities/interpo...,304
34122,interpol-red-2017-126409,Ramzan Naurbiev,Ramzan Naurbiev is a participation in illegal ...,https://www.opensanctions.org/entities/interpo...,250
34644,interpol-red-2017-2057,Ramzan Iznaurov,Ramzan Iznaurov is a Participation in illegal ...,https://www.opensanctions.org/entities/interpo...,269
34931,interpol-red-2017-3973,Ramzan Hatsiev,Ramzan Hatsiev is a Participation in illegal a...,https://www.opensanctions.org/entities/interpo...,249
34999,interpol-red-2017-7391,Ramzan Taramov,"Ramzan Taramov is a complicity to the persons,...",https://www.opensanctions.org/entities/interpo...,303
35052,interpol-red-2017-9900,Ramzan Magomadov,Ramzan Magomadov is a participation in the act...,https://www.opensanctions.org/entities/interpo...,278


In [6]:
span_text = span
single_name = len(span_text.split(' ')) <= 1
# Retrieve a wide list of candidates based on fuzzy string matching and similarity score
candidates, matches = get_candidates_from_fuzzy_matching(span_text, kb, single_name)

In [10]:
# Convert candidate and scores to df
matches_df = pd.DataFrame.from_dict(matches, orient='index').reset_index().rename(
    columns={'index': 'name', 0: 'score'})
# Merge kb entity description
matches_df = matches_df.merge(kb_entities, on=['name'], how='left')
# Retain candidates with scores in top decile
top_decile = 0.9
score_thres = matches_df['score'].quantile(top_decile)
matches_df = matches_df[matches_df['score'] >= score_thres]
# Sort based on score and length of description text
matches_df = matches_df.sort_values(by=['score', 'desc_len'], ascending=False).reset_index(drop=True)
# Increase score in candidates with a wikiID
matches_df.loc[matches_df['id'].str.match('Q\d'), 'score'] = matches_df.loc[matches_df['id'].str.match(
    'Q\d'), 'score'] * 1.05
# Calculate normalised min/max description length.
# Descriptions over 300 characters long are capped at 1.
max_len = 300
min_len = matches_df['desc_len'].min()
matches_df['normalised_desc_len'] = (matches_df['desc_len'] - min_len) / (max_len - min_len)
matches_df['normalised_desc_len'] = matches_df['normalised_desc_len'].apply(lambda x: 1 if x > 1 else x)

# Find common words between paragraph and candidate descriptions
stopwords = set(nltk.corpus.stopwords.words())
desc_ents_d = dict()
column_order = matches_df.reset_index().columns.values
id_loc, = np.where(column_order == 'id')
desc_loc, = np.where(column_order == 'desc')
for tuple_ in matches_df.itertuples():
    qid = tuple_[id_loc[0]]
    desc = tuple_[desc_loc[0]]
    desc_ents = relevant_lexicon(desc, stopwords)
    desc_ents_d[qid] = desc_ents

common_ent_count_d = {}
text = sentence
sentence_ents = [str(ent) for ent in nlp(text).ents]
sentence_ents = set(' '.join(sentence_ents).split())
for qid, ent in desc_ents_d.items():
    common_ent_counts = len(sentence_ents.intersection(ent))
    common_ent_count_d[qid] = common_ent_counts

common_ent_counts = pd.DataFrame.from_dict(common_ent_count_d, orient='index').reset_index().rename(
    columns={'index': 'id', 0: 'common_ent_counts'})

matches_df = matches_df.merge(common_ent_counts, on='id')

matches_df = matches_df.sort_values(by=['common_ent_counts', 'score', 'normalised_desc_len'],
                                    ascending=False).reset_index(drop=True)

#n_candidates = 10
#matches_df = matches_df.head(n_candidates)

In [13]:
matches_df.head(30)

Unnamed: 0,name,score,id,desc,kb_url,desc_len,normalised_desc_len,common_ent_counts
0,Mike Roman,66.666667,396691,Mike Roman is a Philadelphia based Trump campa...,https://littlesis.org/person/396691-Mike Roman,1166,1.0,4
1,Alma L Golden,66.666667,342250,Alma L Golden is a Trump nominee to be Assista...,https://littlesis.org/person/342250-Alma L Golden,915,1.0,4
2,Walter Beaman Jones Jr,80.0,13369,Walter Beaman Jones Jr is a US Representative ...,https://littlesis.org/person/13369-Walter Beam...,2028,1.0,3
3,Jody Thomas,66.666667,96732,Jody Thomas is a Political Manager. Jody Thoma...,https://littlesis.org/person/96732-Jody Thomas,1150,1.0,3
4,Samantha Dravis,66.666667,255426,Samantha Dravis is a Former associate administ...,https://littlesis.org/person/255426-Samantha D...,919,1.0,3
5,Jason Thomas,66.666667,51441,Jason Thomas is a VP of Research at the Privat...,https://littlesis.org/person/51441-Jason Thomas,107,0.315603,3
6,Mark Okoth Obama Ndesandjo,105.0,Q15982189,Mark Okoth Obama Ndesandjo is a half-brother o...,https://www.opensanctions.org/entities/Q15982189,189,0.606383,2
7,George Hussein Onyango Obama,105.0,Q15982326,George Hussein Onyango Obama is a Bruder von B...,https://www.opensanctions.org/entities/Q15982326,147,0.457447,2
8,Mark Okoth Obama Ndesandjo,100.0,84973,Mark Okoth Obama Ndesandjo is a Barack Obama's...,https://littlesis.org/person/84973-Mark Okoth ...,106,0.312057,2
9,Aditya Bamzai,80.0,325846,"Aditya Bamzai is a Associate Professor of Law,...",https://littlesis.org/person/325846-Aditya Bamzai,1422,1.0,2


In [14]:
matches_df[matches_df['name']=='Barack Obama']

Unnamed: 0,name,score,id,desc,kb_url,desc_len,normalised_desc_len,common_ent_counts
52,Barack Obama,100.0,13503,Barack Obama is a 44th President of the United...,https://littlesis.org/person/13503-Barack Obama,862,1.0,1


In [None]:
def order_candidates_fuzzy_score(candidates, matches, candidate_limit=12):
    """
    Order candidates by descending fuzzy name matching score
    """
    candidate_d = dict()
    fuzzy_scores = dict()
    for candidate in candidates:
        qid = candidate.entity_
        name = candidate.alias_
        candidate_d[qid] = candidate
        fuzzy_scores[qid] = matches[name]
    entities_ordered = dict(sorted(fuzzy_scores.items(), key=itemgetter(1), reverse=True))
    entities_ordered = list(entities_ordered.keys())[:candidate_limit]
    return [candidate_d[entity] for entity in entities_ordered]

In [None]:
matches['candidate']=matches['id'].map({candidate.entity_:candidate for candidate in candidates})

In [None]:
matches.head(10)

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,common_ent_counts
0,Donald Trump,105.0,Q22686,Donald Trump is a president of the United Stat...,476,1.0,0
1,Maryanne Trump Barry,105.0,Q736223,Maryanne Trump Barry is a American lawyer and ...,460,1.0,0
2,"Jean Barker, Baroness Trumpington",105.0,Q1297871,"Jean Barker, Baroness Trumpington is a British...",455,1.0,0
3,Charles S. Trump,105.0,Q27832616,Charles S. Trump is a American politician. Thi...,411,1.0,0
4,Lara Trump,105.0,Q35703322,Lara Trump is a American television producer a...,298,0.992806,0
5,Mary Anne Macleod Trump,105.0,Q22952511,Mary Anne Macleod Trump is a mother of Donald ...,287,0.953237,0
6,Barron Trump,105.0,Q23000814,Barron Trump is a son of Donald and Melania Tr...,274,0.906475,0
7,Mary L. Trump,105.0,Q96475328,Mary L. Trump is a American clinical psycholog...,243,0.794964,0
8,Tiffany Trump,105.0,Q12071552,Tiffany Trump is a American socialite (born 19...,237,0.773381,0
9,Donald Trump Jr.,105.0,Q3713655,Donald Trump Jr. is a American businessman (bo...,234,0.76259,0


In [None]:
matches[matches['name']==person_of_interest]

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,common_ent_counts
0,Donald Trump,105.0,Q22686,Donald Trump is a president of the United Stat...,476,1.0,0


In [None]:
def embed_text(text, nlp):
    """
    Return spaCy embedding of a text.
    """
    return nlp(text).vector

In [None]:
matches['desc_embedding']=matches['desc'].apply(lambda x: embed_text(x,nlp))



In [None]:
def calculate_cosine_similarity(descriptions_vec, vector_ref_sentence):
    """
    Return a dictionary mapping the kb entity id to cosine similarity score
    between kb embedded descriptions and the reference vector.
    """
    similarity = {}
    for entity_id in descriptions_vec.keys():
        vector_desc = descriptions_vec[entity_id]
        score = np.nan_to_num(
            dot(vector_ref_sentence, vector_desc) /
            (norm(vector_ref_sentence) * norm(vector_desc))
            , 0)
        similarity[entity_id] = score
    return similarity

In [None]:
def get_context_similarity(text, nlp, matches, candidate_limit=100):
    """
    Select only the top candidates to surface via the Prodigy UI. Based on
    topmost cosine similarities.
    """
    vector_ref_sentence = embed_text(text, nlp)
    
    names = dict()
    descriptions_vec = dict()
    column_order=matches.reset_index().columns.values
    id_loc,=np.where(column_order=='id')
    name_loc,=np.where(column_order=='name')
    desc_emb_loc,=np.where(column_order=='desc_embedding')
    for tuple_ in matches.itertuples():
        qid = tuple_[id_loc[0]]
        name = tuple_[name_loc[0]]
        desc_enc = tuple_[desc_emb_loc[0]]
        #desc_enc = candidate.entity_vector
        names[qid] = name
        descriptions_vec[qid] = desc_enc

    similarity = calculate_cosine_similarity(descriptions_vec, vector_ref_sentence)
    return pd.DataFrame.from_dict(similarity,orient='index')\
    .reset_index().rename(columns={'index':'id',0:'similarity'})

In [None]:
similarity_df=get_context_similarity(sentence, nlp,  matches)

In [69]:
similarity_df

Unnamed: 0,id,similarity
0,Q581095,0.955535
1,Q3195494,0.959758
2,Q6394699,0.955669
3,Q16187047,0.941213
4,Q6394636,0.933152
...,...,...
195,Q76229610,0.931908
196,Q75538116,0.949967
197,Q76101172,0.931816
198,Q76101163,0.932400


In [71]:
matches=matches.merge(similarity_df, on='id')

In [72]:
matches.head()

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,desc_embedding,similarity
0,Kerry Mccarthy,105.0,Q581095,Kerry Mccarthy is a British politician (born 1...,764,1.0,"[0.26950276, 0.2947241, -0.133328, 0.21762279,...",0.955535
1,Kerry-Lynne Findlay,105.0,Q3195494,Kerry-Lynne Findlay is a Canadian politician. ...,462,1.0,"[0.30053824, 0.26975125, -0.08049413, 0.258529...",0.959758
2,Kerryn Phelps,105.0,Q6394699,Kerryn Phelps is a Australian doctor and polit...,442,1.0,"[0.2647866, 0.30980346, -0.09035116, 0.2168619...",0.955669
3,Kerry Diotte,105.0,Q16187047,Kerry Diotte is a Canadian politician. This pe...,420,1.0,"[0.28495017, 0.36452746, -0.057482373, 0.23180...",0.941213
4,Kerry Roberts,105.0,Q6394636,Kerry Roberts is a Tennessee State Senator. Th...,398,1.0,"[0.27534282, 0.37202474, -0.11694094, 0.197256...",0.933152


In [75]:
matches.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,desc_embedding,similarity
47,Kerry Washington,100.0,250236,Kerry Washington is a Actress. Ms. Washington ...,771,1.0,"[0.34216702, 0.28275317, -0.2301654, 0.0876753...",0.981972
34,Kerry J Preete,100.0,60963,"Kerry J Preete is a EVP of Global Strategy, Mo...",1979,1.0,"[0.2719542, 0.2566482, -0.17158447, 0.05546593...",0.981368
41,Kerry Perry,100.0,279798,"Kerry Perry is a President and CEO, USA Gymnas...",1083,1.0,"[0.3057713, 0.27808952, -0.14171067, 0.1661069...",0.977658
35,Kerry Kennedy,100.0,38333,Kerry Kennedy is a Daughter of Robert F Kenned...,1733,1.0,"[0.28771013, 0.35336745, -0.14681476, 0.162945...",0.975714
45,Kerry Stokes,100.0,81059,Kerry Stokes is a Australian billionaire. Stro...,855,1.0,"[0.3257711, 0.25046766, -0.17361978, 0.1616998...",0.973949
36,Kerry Knott,100.0,21511,"Kerry Knott is a President, C.S. Lewis Institu...",1665,1.0,"[0.30162033, 0.30003712, -0.14818524, 0.136793...",0.973767
33,Kerry Halferty Hardy,100.0,143011,Kerry Halferty Hardy is a Principal and consul...,2529,1.0,"[0.3494941, 0.22872539, -0.18811136, 0.0616118...",0.972611
32,Kerry John Katsorhis,100.0,45631,"Kerry John Katsorhis is a Katsorhis Law Firm, ...",2896,1.0,"[0.31445464, 0.23945971, -0.12993431, 0.113871...",0.972049
39,R Kerry Clark,100.0,1324,R Kerry Clark is a Retired Chairman and Chief ...,1097,1.0,"[0.29704675, 0.30050045, -0.22320616, 0.144830...",0.970065
46,Kerry Mcconnon,100.0,330667,Kerry McConnon is a DDC Public Affairs. With D...,811,1.0,"[0.2904959, 0.3401988, -0.06810058, 0.11846647...",0.969262


In [73]:
matches[matches['name']==person_of_interest]

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,desc_embedding,similarity
37,John Kerry,100.0,13377,"John Kerry is a 68th US Secretary of State, Se...",1588,1.0,"[0.30509615, 0.22645071, -0.25140598, 0.139775...",0.956122


In [35]:
matches.sort_values(by=['similarity'], ascending=False).head(10)

Unnamed: 0,name,score,id,desc,desc_len,normalised_desc_len,desc_embedding,similarity
0,Carina Ståhl Herrstedt,105.0,Q4988712,Carina Ståhl Herrstedt is a Swedish politician...,589,1.0,"[0.2727859, 0.28228948, -0.10635452, 0.1914473...",0.035977
126,James Addison Halsted,100.0,175604,James Addison Halsted is a Nutrition researche...,1972,1.0,"[0.2816307, 0.27608034, -0.13001594, 0.1858776...",0.035977
128,Ted R French,100.0,3735,Ted R French is a Former Chief Financial Offic...,1739,1.0,"[0.30200538, 0.27971175, -0.18464883, 0.175322...",0.035977
129,William Bastedo,100.0,127217,"William Bastedo is a SVP, Booz Allen. As a Sen...",1728,1.0,"[0.2894372, 0.27440315, -0.16353847, 0.0848268...",0.035977
130,Ted Sarandos,100.0,128512,"Ted Sarandos is a Chief Content Officer, Netfl...",1702,1.0,"[0.29125437, 0.2391752, -0.12827414, 0.1505959...",0.035977
131,Shane Tedjarati,100.0,128384,"Shane Tedjarati is a President, Global High Gr...",1678,1.0,"[0.32086444, 0.21766777, -0.16003317, 0.140430...",0.035977
132,Ted Van Der Meid,100.0,26908,Ted Van Der Meid is counsel in our government...,1626,1.0,"[0.30236992, 0.28617683, -0.12945502, 0.149877...",0.035977
133,Shirley Mount Hufstedler,100.0,8420,Shirley Mount Hufstedler is a First US Secreta...,1622,1.0,"[0.33224115, 0.3046495, -0.14053863, 0.1369625...",0.035977
134,Ted W Love,100.0,65550,"Ted W Love is a EVP, Research & Development, O...",1573,1.0,"[0.26212552, 0.23311691, -0.19053678, 0.193209...",0.035977
135,Ted Malloch,100.0,255735,"Ted Malloch is a British Academic, Brexiteer, ...",1453,1.0,"[0.2903265, 0.28811792, -0.108777694, 0.115588...",0.035977


In [62]:
matches['composit_score']= matches['score'] * matches['similarity'] * matches['normalised_desc_len']

In [63]:
matches.sort_values('composit_score', ascending=False).head(20)

Unnamed: 0,name,id,desc,desc_len,desc_embedding,similarity,score,normalised_desc_len,composit_score
55,Imogene Powers Johnson,15168,"Imogene Powers Johnson, the billionaire widow...",2083,"[0.31178147, 0.3211938, -0.09199102, 0.1636720...",0.990551,90.0,1.0,89.14959
58,Emory Johnson,164375,Emory Johnson is a Professor of Transportation...,2011,"[0.2981705, 0.23666798, -0.14443867, 0.0693922...",0.984195,90.0,1.0,88.577587
34,Sue Clark-Johnson,178887,"Sue Clark-Johnson is a Former Publisher, Arizo...",2375,"[0.30413514, 0.3030189, -0.14652178, 0.1279459...",0.98399,90.0,1.0,88.559128
22,Frederick Ross Johnson,149261,Frederick Ross Johnson is a CEO of RJR Nabisco...,2721,"[0.33529863, 0.3185673, -0.15305309, 0.0719855...",0.983683,90.0,1.0,88.531485
23,Jennifer M Johnson,186127,"Jennifer M Johnson is a President and COO, Fra...",2713,"[0.33056182, 0.24376902, -0.14681502, 0.124611...",0.983338,90.0,1.0,88.500436
7,Crawford Toy Johnson Iii,182518,Crawford Toy Johnson III is a Former Chairman ...,5272,"[0.31047255, 0.32346684, -0.1556873, 0.1069957...",0.982743,90.0,1.0,88.446829
45,Mark Johnson,175844,"Mark Johnson is a COO, Civilian Marksmanship P...",2126,"[0.32258293, 0.26170307, -0.19030425, 0.044983...",0.980089,90.0,1.0,88.207973
56,Ashland Johnson,390453,"Ashland Johnson is a Attorney, National Center...",2068,"[0.2945806, 0.22795784, -0.12959296, 0.0749545...",0.978972,90.0,1.0,88.107525
15,Linda E Johnson,227813,"Linda E Johnson is a President and CEO, Brookl...",3519,"[0.31697896, 0.27082464, -0.17599843, 0.046157...",0.978441,90.0,1.0,88.059728
21,E Christopher Johnson Jr,386811,E Christopher Johnson Jr is a CEO and Co-Found...,2941,"[0.330057, 0.21661668, -0.18811746, 0.07042361...",0.974086,90.0,1.0,87.66775


In [108]:
matches.sort_values(by='score',ascending=False)

Unnamed: 0,name,score,id,desc,desc_len,desc_embedding,similarity
0,Jo Johnson,95.000000,Q728063,Jo Johnson is a British politician MP (born 19...,672,"[0.2822974, 0.24237156, -0.12491302, 0.2132171...",0.983038
2,Ed Johnson,95.000000,315140,This person is called Ed Johnson. This per...,80,"[0.3014737, -0.043383293, -0.17395419, -0.0418...",0.928145
3,Jo Johnson,95.000000,55090,Jo Johnson is a Prospective Parliamentary Cand...,66,"[0.38097247, -0.09801138, -0.19058442, 0.15013...",0.924945
4,Rj Johnson,95.000000,425078,RJ Johnson is a Top political advisor to Scott...,54,"[0.3856456, -0.16109328, -0.24016757, 0.184732...",0.902954
5,Ej Johnson,95.000000,331445,EJ Johnson is a Son of Magic Johnson.,37,"[0.39550623, -0.13490239, -0.26468384, 0.17075...",0.886878
...,...,...,...,...,...,...,...
2182,Johannes Poulsen,69.230769,Q12320116,Johannes Poulsen is a Danish politician. This ...,305,"[0.31842715, 0.2602151, -0.0374893, 0.24306317...",0.984012
2183,Johannes Gilleberg,69.230769,Q6216704,Johannes Gilleberg is a Norwegian politician. ...,304,"[0.30388454, 0.31487128, -0.093770236, 0.23683...",0.968860
2184,Sig Hutchinson,69.230769,315082,Sig Hutchinson is a Sig Hutchinson LLC/Wake Co...,301,"[0.24709877, 0.08559995, -0.19148602, 0.053567...",0.940625
2185,John Okechukwuemeka,69.230769,Q6251174,John Okechukwuemeka is a Nigerian politician. ...,298,"[0.32186395, 0.2897311, -0.0674257, 0.27010944...",0.979908


In [111]:
matches.sort_values(by='similarity',ascending=False).head(20)

Unnamed: 0,name,score,id,desc,desc_len,desc_embedding,similarity
29,Imogene Powers Johnson,90.0,15168,"Imogene Powers Johnson, the billionaire widow...",2083,"[0.31178147, 0.3211938, -0.09199102, 0.1636720...",0.990551
1854,Johanna Quandt,69.230769,76711,Johanna Quandt is a Widow held 17% of BMW. For...,1684,"[0.28511798, 0.32589412, -0.1012195, 0.1773893...",0.989534
1884,John Sd Eisenhower,69.230769,50130,"John SD Eisenhower is a US Army General, Milit...",1178,"[0.27957487, 0.30162582, -0.10646973, 0.191410...",0.989325
86,Gary E Johnson,90.0,34262,Gary E Johnson is a Former Governor of New Mex...,1203,"[0.32614115, 0.2979551, -0.15264152, 0.1700044...",0.989246
98,Clay A Johnson,90.0,36869,Clay A Johnson is a Director of Sunlight Labs....,1105,"[0.31782562, 0.31476963, -0.08849746, 0.170402...",0.988494
1860,John Stanley Pottinger,69.230769,360836,"John Stanley Pottinger is a Attorney, novelist...",1528,"[0.2854411, 0.28591397, -0.12673561, 0.1575661...",0.98785
93,Ronald B Johnson,90.0,2503,Ronald B Johnson is a Senior Vice President of...,1147,"[0.292732, 0.3121277, -0.15763398, 0.14204612,...",0.987792
65,Woody Johnson,90.0,46087,Woody Johnson is a Great-grandson of Johnson &...,1452,"[0.28356323, 0.32903042, -0.08760893, 0.121867...",0.987749
155,Kendra R Johnson,90.0,388307,This person is called Kendra R Johnson. Kendr...,703,"[0.30293167, 0.24478087, -0.16445631, 0.141636...",0.987638
1901,John Saunders,69.230769,182352,John Saunders is a ESPN Studio Host/Play-by-Pl...,1051,"[0.31606877, 0.32837492, -0.16691738, 0.130412...",0.987537


In [121]:
kb_entities[kb_entities['name']=='Boris Johnson']

Unnamed: 0,id,name,desc,desc_len
427360,Q180589,Boris Johnson,Boris Johnson is a Prime Minister of the Unite...,824


In [120]:
matches.sort_values(by=['score','similarity'], ascending=False).to_csv('similarity.csv')

In [25]:
kb_entities[kb_entities['name'].isin(candidates)]

Unnamed: 0,id,name,desc
236143,23759,Barry Strumpf,This person is called Barry Strumpf. This ...
245829,47307,Donald L Trump,"Donald L Trump is a President & CEO, Roswell P..."
266609,79677,Fred C Trump,Fred C Trump is a East coast real estate devel...
322132,176381,Stephanie Trump,"Stephanie Trump is a Co-Founder, Williams Isla..."
347611,219209,Mary Trump,"Mary Trump is a DJT niece, author of ""Too Much..."
347618,219217,Mary Macleod Trump,This person is called Mary Macleod Trump. Mrs...
356389,242036,Edmond Trump,Edmond Trump is a Miami billionaire real estat...
366407,260572,Robert Trumpbour,Robert Trumpbour is a Associate Professor of C...
392603,335179,Jules Trump,Jules Trump is a Billionaire from Miami. Th...
418528,408804,Ann Trump Daniel,Ann Trump Daniel is a Members of the UC Davis ...


In [3]:
##