In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse import vstack

from matchers import metrics, utils, dataset

In [3]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

### Load and process data

In [4]:
# Only need to run this once
# It splits the data into train/test and persists both splits on disk
# dataset.load_split_init(test_size=0.1)

In [5]:
# Load the previously persisted, pre-processed train/test splits.
train, test = dataset.load_process_from_disk()

# Each split unpacks into three parts: input names, weighted relevant names
# and candidate names.
(input_names_train,
 weighted_relevant_names_train,
 all_candidates_train) = train
(input_names_test,
 weighted_relevant_names_test,
 all_candidates_test) = test

# Train candidates first, then test candidates; results are looked up in this
# array by row index, so the order matters.
all_candidates = np.concatenate([all_candidates_train, all_candidates_test])

### Model

In [6]:
# TF-IDF over character n-grams of length 1 to 3, generated only from text
# inside word boundaries ('char_wb'). n-grams that occur in fewer than 10
# documents, or in more than half of the documents, are left out of the
# vocabulary.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 3),
    min_df=10,
    max_df=0.5,
)

In [7]:
# The vocabulary and IDF weights are learned from the training candidates
# only; the test candidates are projected onto that fitted vocabulary.
X_train = vectorizer.fit_transform(all_candidates_train)
X_test = vectorizer.transform(all_candidates_test)

# Stack train rows above test rows, the same order used to build
# `all_candidates`, so row i of X_all corresponds to all_candidates[i].
X_all = vstack([X_train, X_test])

#### Similarity Function

In [8]:
def get_similars(name, k=10, demo_mode=False):
    """Return the `k` candidates that score highest against `name`.

    The query is vectorised with the fitted module-level `vectorizer` and
    scored against every row of `X_all` with a dot product. With
    TfidfVectorizer's default L2 normalisation this is the cosine similarity.

    Parameters
    ----------
    name : str
        The query name.
    k : int, default 10
        Maximum number of candidates to return.
    demo_mode : bool, default False
        When True, the query is passed through `utils.add_padding` before
        vectorising, and every returned candidate is passed through
        `utils.remove_padding`. Presumably the stored names are already
        padded and a raw, human-typed query is not; confirm in
        `matchers.utils`.

    Returns
    -------
    list of (candidate, score) tuples
        Ordered from highest to lowest score.
    """
    query = utils.add_padding(name) if demo_mode else name
    query_vector = vectorizer.transform([query]).toarray()

    # One similarity value per row of X_all, i.e. per entry of all_candidates.
    similarity = safe_sparse_dot(X_all, query_vector.T).flatten()

    # argsort is ascending: reverse it, then keep the first k positions.
    top_positions = np.argsort(similarity)[::-1][:k]
    top_scores = similarity[top_positions]
    top_names = all_candidates[top_positions]

    if demo_mode:
        top_names = [utils.remove_padding(padded) for padded in top_names]

    return list(zip(top_names, top_scores))

#### Demo

In [9]:
# Human-typed query: demo_mode adds padding on the way in and strips it from the results.
get_similars('schumacher', k=10, demo_mode=True)

[('schumacker', 0.8084019391224851),
 ('schuman', 0.7498062862635078),
 ('schumann', 0.7021428274919617),
 ('schumaker', 0.6903799002710765),
 ('schacher', 0.6546075134964433),
 ('schum', 0.6372560334730984),
 ('scheuman', 0.5587998782852258),
 ('hamacher', 0.542549261402101),
 ('amacher', 0.5372510049299706),
 ('stelmacher', 0.5293322424252592)]

### Generate candidates for all test names

In [None]:
k = 100  # Number of candidates to consider

# One ranked list of (candidate, score) pairs per test input name.
name_candidates = [get_similars(test_name, k=k) for test_name in tqdm(input_names_test)]

 68%|██████▊   | 1659/2444 [00:04<00:01, 408.06it/s]

In [None]:
# Sanity check: one candidate list is built per entry of input_names_test.
len(name_candidates)

#### TODO: find a way to build the `[[[str, float64]]]` candidates array directly, without taking `name_candidates` apart and re-assembling it

In [None]:
# Split the (name, score) pairs into two parallel 2-D arrays so that each
# column keeps its own dtype: Python objects for names, float64 for scores.
names = np.array(
    [[cand_name for cand_name, _ in row] for row in name_candidates],
    dtype='O',
)
scores = np.array(
    [[cand_score for _, cand_score in row] for row in name_candidates],
    dtype='f8',
)

# Stack along a third axis: candidates[i, j] is the [name, score] pair of the
# j-th ranked candidate for the i-th test name.
candidates = np.dstack((names, scores))

#### Ugh - how can I specify [[[str,float64]]] without taking apart and re-assembling the array?

### Evaluation

### Average precision @0.65

In [None]:
# Last argument (0.65) is presumably the minimum candidate score that counts
# as a match; verify against matchers.metrics.
metrics.avg_precision_at_threshold(weighted_relevant_names_test, candidates, 0.65)

### Average recall @0.65

In [None]:
# Last argument (0.65) is presumably the minimum candidate score that counts
# as a match; verify against matchers.metrics.
metrics.avg_recall_at_threshold(weighted_relevant_names_test, candidates, 0.65)

### Precision-Recall Curve

In [None]:
# Minimum score threshold to test. Presumably the curve is evaluated for
# thresholds from this value upward; verify against matchers.metrics.
min_threshold = 0.5
metrics.precision_recall_curve_at_threshold(weighted_relevant_names_test, candidates, min_threshold)

### Test

In [None]:
# Spot check: the input name at test index 251.
input_names_test[251]

In [None]:
# The weighted relevant names recorded for test index 251.
weighted_relevant_names_test[251]

In [None]:
# The 10 highest-scoring [name, score] candidates generated for test index 251.
candidates[251][:10]

In [None]:
# Recall for the single test name at index 251, with the threshold argument set to 0.9.
metrics.recall_at_threshold(weighted_relevant_names_test[251], candidates[251], 0.9)

In [None]:
# Same single-name recall with the threshold argument lowered to 0.5, for comparison.
metrics.recall_at_threshold(weighted_relevant_names_test[251], candidates[251], 0.5)