In [None]:
import os, sys
# Make the directory two levels above the current working directory importable
# (presumably where the `entity_matching` package imported below lives — confirm).
_path = os.path.abspath(os.path.join(".", "..", ".."))
sys.path.append(_path)

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing as mp
import sparse_dot_topn
import re

from entity_matching.utilities.ngram_analyzer import ngrams

In [None]:
def load_from_pickle(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode="rb") as stream:
        loaded = pickle.load(stream)
    return loaded

# Model parameters

In [None]:
# Smallest / largest n-gram size handed to the `ngrams` analyzer further down.
n_from = 2
n_to = 4

# Passed as `max_df` to every TfidfVectorizer built in this notebook.
df_max = 0.5

# Tag embedded in the file names of the pickled vectorizers and matrices.
model_suffix = f"n{n_from}_{n_to}_tf_l2_{df_max}"
model_suffix

# Loading data

In [None]:
# Load the cleaned source records from a pickled DataFrame (trusted file: unpickling runs code).
# NOTE(review): no `compression=` is passed and pandas does not infer compression from a
# ".gzip" extension — confirm the file really is an uncompressed pickle.
df = pd.read_pickle('/mnt/data/mediascreen/sharedfolder/em/hrns_clean.gzip')
df.head(3)

In [None]:
# Split the records by `hrn_type`. `.copy()` gives each subset its own data, so the columns
# assigned to them later (`possible_full_names`, `entity_check`) are written to an independent
# frame rather than to something pandas treats as a slice of `df` (SettingWithCopyWarning).
df_persons = df[df['hrn_type'] == 'PERSON'].copy()
df_entities = df[df['hrn_type'] == 'ENTITY'].copy()

In [None]:
# Spot check: show the person rows stored under one hrn_id.
df_persons[df_persons['hrn_id'] == 'F_100000']

# 1.) Persons

## 1a) full name

In [None]:
# Field orderings used to assemble the candidate full-name strings of a person.
full_name_variations = [['last_name'],
                        ['first_name', 'last_name'],
                        ['last_name', 'first_name'],
                        ['middle_name', 'last_name'],
                        ['first_name', 'middle_name', 'last_name']
                       ]

def possible_full_names(_input):
    """Build the distinct full-name spellings for one person.

    Parameters
    ----------
    _input : iterable
        ``(first_name, middle_name, last_name)``. Falsy parts (e.g. ``''`` or ``None``)
        are left out of a variation.

    Returns
    -------
    list of str
        One space-joined string per entry of ``full_name_variations``, with duplicates
        removed. The order follows ``full_name_variations`` (first occurrence wins), so
        the result is the same on every run and in every worker process. If all parts
        are falsy the result is ``['']``.
    """
    input_dict = dict(zip(('first_name', 'middle_name', 'last_name'), _input))

    candidates = (
        ' '.join(input_dict[field] for field in variation if input_dict[field])
        for variation in full_name_variations
    )

    # dict.fromkeys drops duplicates while keeping first-seen order; the previous
    # list(set(...)) returned the variants in an arbitrary, run-dependent order.
    return list(dict.fromkeys(candidates))

In [None]:
# Example: a person whose middle name is empty.
inp = ('matt', '', 'baniar')
possible_full_names(inp)

In [None]:
%%time
# Compute the name variants of every person in parallel. Two cores are left free, but at
# least one worker is always requested: mp.Pool raises ValueError for a process count
# below 1, which `cpu_count() - 2` produces on machines with one or two cores.
with mp.Pool(max(1, mp.cpu_count() - 2)) as pool:
    df_persons['possible_full_names'] = pool.map(possible_full_names, zip(df_persons['first_name'], df_persons['middle_name'], df_persons['last_name']))

In [None]:
# Preview the person rows including the new `possible_full_names` column.
df_persons.head()

In [None]:
%%time
# Step 1: one row per (hrn_id, name variant), keeping the smallest record_id of that pair.
# The aggregation is named by string ('min'); passing the Python builtin is deprecated in
# recent pandas. Assumes the index of `df_persons` is named 'record_id' — TODO confirm.
full_names_df = (
    df_persons.reset_index()[['record_id', 'hrn_id', 'possible_full_names']]
    .explode('possible_full_names')
    .groupby(['hrn_id', 'possible_full_names'])[['record_id']]
    .agg('min')
    .reset_index()
)
# Step 2: one row per distinct name, collecting the hrn_ids and record_ids that share it.
full_names_df = (
    full_names_df.groupby('possible_full_names')[['hrn_id', 'record_id']]
    .agg({'hrn_id': list, 'record_id': list})
    .reset_index()
    .rename(columns={'possible_full_names': 'name', 'hrn_id': 'ids', 'record_id': 'record_ids'})
)
# Step 3: drop the empty name. reset_index() renumbers the rows from 0 and keeps the old
# row labels in an 'index' column.
full_names_df = full_names_df[full_names_df['name'] != ''].reset_index()

In [None]:
# Inspect the name lookup table built above.
full_names_df

In [None]:
# Persist the name lookup table as a gzip-compressed pickle (it must be read back with
# compression="gzip", since the ".pkl" extension does not imply it).
full_names_df.to_pickle('/mnt/data/mediascreen/sharedfolder/em/models/full_names_df.pkl', compression= 'gzip')

# full_names_df = pd.read_pickle("/mnt/data/mediascreen/sharedfolder/em/models/full_names_df.pkl", compression="gzip")

In [None]:
# Example call of the n-gram analyzer with its default arguments.
ngrams('matt baniar')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from functools import partial

In [None]:
# Bind the configured n-gram range so the analyzer can be handed to TfidfVectorizer as a
# single-argument callable.
ngrams_func = partial(ngrams, ngram_from=n_from, ngram_to=n_to)

In [None]:
%%time

# Vectorizer for full names: n-gram counts only (use_idf=False), each row L2-normalised,
# with `df_max` passed as the document-frequency cut-off `max_df`.
vectorizer_fullname = TfidfVectorizer(
    max_df=df_max,
    analyzer=ngrams_func,
    use_idf=False,
    norm='l2'
)

# Fit on the lookup table's names: row i of X_fullname is built from full_names_df['name'] row i.
X_fullname = vectorizer_fullname.fit_transform( full_names_df['name'] )

In [None]:
# Persist the fitted vectorizer and the name matrix. The files are opened in `with` blocks so
# each handle is flushed and closed as soon as its dump finishes (the bare open() calls
# previously left closing to the garbage collector).
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/vectorizer_fullname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(vectorizer_fullname, fh, protocol=4)
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/X_fullname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(X_fullname, fh, protocol=4)

# vectorizer_fullname = load_from_pickle(f"/mnt/data/mediascreen/sharedfolder/em/models/vectorizer_fullname_{model_suffix}.pkl")
# X_fullname = load_from_pickle(f"/mnt/data/mediascreen/sharedfolder/em/models/X_fullname_{model_suffix}.pkl")

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

In [None]:
def sparse_dot_search(full_matrix, search_vector, lower_bound=0.5, n_jobs=30):
    """Score every row of ``full_matrix`` against one query vector and keep the strong matches.

    Parameters
    ----------
    full_matrix : sparse matrix
        Candidate vectors, one per row.
    search_vector : sparse matrix
        The vectorised query; it is transposed before the product.
    lower_bound : float, optional
        Threshold forwarded to ``awesome_cossim_topn``. Defaults to the previously
        hard-coded ``0.5``.
    n_jobs : int, optional
        Thread count forwarded to ``awesome_cossim_topn``. Defaults to the previously
        hard-coded ``30``.

    Returns
    -------
    list
        A one-element list holding a list of ``(score, row_index)`` tuples sorted by
        descending score.
    """
    cos_top_n = awesome_cossim_topn(
        full_matrix, search_vector.transpose(), 1, lower_bound,
        use_threads=True, n_jobs=n_jobs,
    )

    # Take scores and row numbers from the same COO view so they are guaranteed to be
    # paired element by element (assumes the result is a SciPy sparse matrix, which the
    # previous `.data` / `.nonzero()` usage already relied on).
    coo = cos_top_n.tocoo()
    shortlist = zip(coo.data, coo.row)

    return [sorted(shortlist, key=lambda pair: -pair[0])]

In [None]:
%%time

# Example query: vectorise a single name with the fitted full-name vectorizer.
input_name = 'Felix Patasse'

search_features_vec = vectorizer_fullname.transform([input_name])

In [None]:
# Show the query vector.
search_features_vec

In [None]:
%%time

# Match the query against every row of the full-name matrix.
shortlist_name = sparse_dot_search(X_fullname, search_features_vec)

# Number of matches returned.
len(shortlist_name[0])

In [None]:
# The (score, row index) pairs, best match first.
shortlist_name[0]

In [None]:
def process_sparse_results(shortlist_, df_):
    """Turn a shortlist of matched rows into one best score per id.

    Parameters
    ----------
    shortlist_ : list
        One-element list whose first item is a sequence of ``(score, row_index)``
        pairs, as returned by ``sparse_dot_search``.
    df_ : pandas.DataFrame
        Lookup table whose index values are the row indices used in the shortlist and
        whose ``'ids'`` column holds a list of ids per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'`` and ``'score'`` (float): the highest score seen for each id,
        sorted by descending score.
    """
    matches = shortlist_[0]
    row_ids = [int(m[1]) for m in matches]
    scores = [m[0] for m in matches]

    ss_df = pd.DataFrame({'id': row_ids, 'score': scores}).astype({'id': 'int32'}).set_index('id')

    # Boolean-mask selection and join() both return new frames, so no explicit copy is needed.
    res = df_[df_.index.isin(row_ids)].join(ss_df)

    # One row per (matched name, id); no sort is needed before the groupby below.
    res = res[['score', 'ids']].explode('ids').rename(columns={'ids': 'id'})

    # 'max' is given by name: passing np.max to agg is deprecated in recent pandas.
    res = (
        res.groupby('id')
        .agg({'score': 'max'})
        .sort_values(by='score', ascending=False)
        .astype(float)
        .reset_index()
    )

    return res

In [None]:
# Best score per id for the example query.
process_sparse_results(shortlist_name, full_names_df)

In [None]:
# Spot check: show the person rows stored under one hrn_id.
df_persons[df_persons['hrn_id'] == 'F_10183']

## 1b) first name and last name (separate models)

In [None]:
%%time
# Step 1: one row per (hrn_id, first name), keeping the smallest record_id of that pair.
# The aggregation is named by string ('min'); passing the Python builtin is deprecated in
# recent pandas. Assumes the index of `df_persons` is named 'record_id' — TODO confirm.
first_names_df = (
    df_persons.reset_index()[['record_id', 'hrn_id', 'first_name']]
    .groupby(['hrn_id', 'first_name'])[['record_id']]
    .agg('min')
    .reset_index()
)
# Step 2: one row per distinct first name, collecting the hrn_ids and record_ids that share it.
first_names_df = (
    first_names_df.groupby('first_name')[['hrn_id', 'record_id']]
    .agg({'hrn_id': list, 'record_id': list})
    .reset_index()
    .rename(columns={'first_name': 'name', 'hrn_id': 'ids', 'record_id': 'record_ids'})
)
# Step 3: drop the empty name. reset_index() renumbers the rows from 0 and keeps the old
# row labels in an 'index' column.
first_names_df = first_names_df[first_names_df['name'] != ''].reset_index()

In [None]:
%%time
# Step 1: one row per (hrn_id, last name), keeping the smallest record_id of that pair.
# The aggregation is named by string ('min'); passing the Python builtin is deprecated in
# recent pandas. Assumes the index of `df_persons` is named 'record_id' — TODO confirm.
last_names_df = (
    df_persons.reset_index()[['record_id', 'hrn_id', 'last_name']]
    .groupby(['hrn_id', 'last_name'])[['record_id']]
    .agg('min')
    .reset_index()
)
# Step 2: one row per distinct last name, collecting the hrn_ids and record_ids that share it.
last_names_df = (
    last_names_df.groupby('last_name')[['hrn_id', 'record_id']]
    .agg({'hrn_id': list, 'record_id': list})
    .reset_index()
    .rename(columns={'last_name': 'name', 'hrn_id': 'ids', 'record_id': 'record_ids'})
)
# Step 3: drop the empty name. reset_index() renumbers the rows from 0 and keeps the old
# row labels in an 'index' column.
last_names_df = last_names_df[last_names_df['name'] != ''].reset_index()

In [None]:
# Number of distinct first names and last names (both values are echoed because
# ast_node_interactivity is set to "all" at the top of the notebook).
len(first_names_df)
len(last_names_df)

In [None]:
# Persist both lookup tables as gzip-compressed pickles.
first_names_df.to_pickle('/mnt/data/mediascreen/sharedfolder/em/models/first_names_df.pkl', compression= 'gzip')
last_names_df.to_pickle('/mnt/data/mediascreen/sharedfolder/em/models/last_names_df.pkl', compression= 'gzip')

In [None]:
%%time

# Vectorizer for first names: same settings as the full-name model (n-gram counts only,
# L2-normalised rows, `df_max` as `max_df`).
vectorizer_firstname = TfidfVectorizer(
    max_df=df_max,
    analyzer=ngrams_func,
    use_idf=False,
    norm='l2'
)

# Row i of X_firstname is built from first_names_df['name'] row i.
X_firstname = vectorizer_firstname.fit_transform( first_names_df['name'] )

In [None]:
%%time

# Vectorizer for last names: same settings as the full-name model (n-gram counts only,
# L2-normalised rows, `df_max` as `max_df`).
vectorizer_lastname = TfidfVectorizer(
    max_df=df_max,
    analyzer=ngrams_func,
    use_idf=False,
    norm='l2'
)

# Row i of X_lastname is built from last_names_df['name'] row i.
X_lastname = vectorizer_lastname.fit_transform( last_names_df['name'] )

In [None]:
# Persist the first-name and last-name models. The files are opened in `with` blocks so each
# handle is flushed and closed as soon as its dump finishes (the bare open() calls previously
# left closing to the garbage collector).
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/vectorizer_firstname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(vectorizer_firstname, fh, protocol=4)
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/X_firstname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(X_firstname, fh, protocol=4)

with open(f'/mnt/data/mediascreen/sharedfolder/em/models/vectorizer_lastname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(vectorizer_lastname, fh, protocol=4)
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/X_lastname_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(X_lastname, fh, protocol=4)

In [None]:
%%time

# Example query against the first-name model.
first_name = 'Ange Felix'

# Note: this rebinds `search_features_vec`, which earlier held the full-name query vector.
search_features_vec = vectorizer_firstname.transform([first_name])
shortlist_first_name = sparse_dot_search(X_firstname, search_features_vec)

# Number of matches returned.
len(shortlist_first_name[0])

In [None]:
%%time

# Example query against the last-name model.
lastname = 'Patasse'

# Note: this rebinds `search_features_vec` once more, now to the last-name query vector.
search_features_vec = vectorizer_lastname.transform([lastname])
shortlist_lastname = sparse_dot_search(X_lastname, search_features_vec)

# Number of matches returned.
len(shortlist_lastname[0])

# 2.) Entities

In [None]:
import unicodedata as ud

# Memo for is_latin(): maps a character to whether its Unicode name contains 'LATIN'.
latin_letters = {}

def is_latin(uchr):
    """Return True if the Unicode name of ``uchr`` contains 'LATIN'.

    Results are memoised in ``latin_letters``. A character that has no Unicode name
    (for example a control character) counts as non-Latin instead of raising ValueError.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # The '' default is what keeps unnamed characters from raising in ud.name().
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))

def only_roman_chars(unistr):
    """Return True if every alphabetic character of ``unistr`` is Latin.

    Non-alphabetic characters (digits, punctuation, whitespace) are ignored, so a string
    without any alphabetic character — including the empty string — yields True.
    """
    return all(is_latin(uchr) for uchr in unistr if uchr.isalpha())

In [None]:
%%time
# Flag, in parallel, which entity names contain only Latin letters. Two cores are left free,
# but at least one worker is always requested: mp.Pool raises ValueError for a process count
# below 1, which `cpu_count() - 2` produces on machines with one or two cores.
# NOTE(review): `entity_check` is not read by any later cell visible in this notebook —
# confirm whether non-Latin names were meant to be filtered out before vectorising.
with mp.Pool(max(1, mp.cpu_count() - 2)) as pool:
    df_entities['entity_check'] = pool.map(only_roman_chars, df_entities['entity_name'])

In [None]:
%%time
# Step 1: one row per (hrn_id, entity name), keeping the smallest record_id of that pair.
# The aggregation is named by string ('min'); passing the Python builtin is deprecated in
# recent pandas. Assumes the index of `df_entities` is named 'record_id' — TODO confirm.
entities_df = (
    df_entities.reset_index()[['record_id', 'hrn_id', 'entity_name']]
    .groupby(['hrn_id', 'entity_name'])[['record_id']]
    .agg('min')
    .reset_index()
)
# Step 2: one row per distinct entity name, collecting the hrn_ids and record_ids that share it.
entities_df = (
    entities_df.groupby('entity_name')[['hrn_id', 'record_id']]
    .agg({'hrn_id': list, 'record_id': list})
    .reset_index()
    .rename(columns={'entity_name': 'name', 'hrn_id': 'ids', 'record_id': 'record_ids'})
)
# Step 3: drop the empty name. reset_index() renumbers the rows from 0 and keeps the old
# row labels in an 'index' column.
entities_df = entities_df[entities_df['name'] != ''].reset_index()

In [None]:
# Inspect the entity lookup table built above.
entities_df

In [None]:
# Persist the entity lookup table as a gzip-compressed pickle.
entities_df.to_pickle('/mnt/data/mediascreen/sharedfolder/em/models/entities_df.pkl', compression= 'gzip')

In [None]:
# Analyzer for entity names: same n-gram range as for persons, but with clear_digits=False
# (presumably this keeps digits in the text — confirm against the `ngrams` implementation).
ngrams_func_e = partial(ngrams, ngram_from=n_from, ngram_to=n_to, clear_digits=False)

In [None]:
%%time

# Vectorizer for entity names: n-gram counts only (use_idf=False), each row L2-normalised,
# `df_max` as `max_df`, using the entity analyzer defined above.
vectorizer_entities = TfidfVectorizer(
    max_df=df_max,
    analyzer=ngrams_func_e,
    use_idf=False,
    norm='l2'
)

# Row i of X_entities is built from entities_df['name'] row i.
X_entities = vectorizer_entities.fit_transform( entities_df['name'] )

In [None]:
# Persist the fitted entity vectorizer and matrix. The files are opened in `with` blocks so
# each handle is flushed and closed as soon as its dump finishes (the bare open() calls
# previously left closing to the garbage collector).
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/vectorizer_entities_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(vectorizer_entities, fh, protocol=4)
with open(f'/mnt/data/mediascreen/sharedfolder/em/models/X_entities_{model_suffix}.pkl', 'wb') as fh:
    pickle.dump(X_entities, fh, protocol=4)