In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 

from simsity.service import Service
from simsity.datasets import fetch_voters
from simsity.indexer import PyNNDescentIndexer

# Don't forget to pip install dirty_cat
from dirty_cat import GapEncoder

df = fetch_voters()

In [3]:
encoder = GapEncoder()

service = Service(
    indexer=PyNNDescentIndexer(metric="euclidean", n_jobs=6),
    encoder=encoder
)

# Index the datapoints
service.train_from_dataf(df)

<simsity.service.Service at 0x7ffb4844e9d0>

In [4]:
import random 

def generate_pair(service, n_consider=10):
    idx = random.randint(0, len(service.storage) - 1)
    query = service.storage[idx]
    df_out = service.query(**query, n_neighbors=n_consider, out='dataframe')
    return df_out.drop(columns='dist').sample(2)

print(generate_pair(service).to_markdown(index=False))

| name            | suburb     |   postcode |
|:----------------|:-----------|-----------:|
| heather stewart | concord    |      28027 |
| darlene smith   | lincolnton |      28092 |


In [5]:
from IPython.display import display

# Don't forget to pip install pigeon-jupyter
from pigeon import annotate

annotations = annotate(
  (generate_pair(service, n_consider=3) for x in range(200)),
  options=['similar', 'not similar'],
  display_fn=display
)

HTML(value='0 examples annotated, 201 examples left')

HBox(children=(Button(description='similar', style=ButtonStyle()), Button(description='not similar', style=But…

Output()

In [7]:
def annot_to_dataf(annotations):
    data = []
    for a in annotations:
        d1, d2 = a[0].to_dict(orient='records')
        d1 = {f'{k}_1': v for k, v in d1.items()}
        d2 = {f'{k}_2': v for k, v in d2.items()}
        data.append({**d1, **d2, 'label': a[1]})
    return pd.DataFrame(data)

def annot_to_X_y(annotations, encoder):
    data1 = []
    data2 = []
    ys = []
    for a in annotations:
        d1, d2 = a[0].to_dict(orient='records')
        data1.append(d1)
        data2.append(d2)
        ys.append(a[1])
    X1 = encoder.transform(pd.DataFrame(data1))
    X2 = encoder.transform(pd.DataFrame(data2))
    return X1, X2, ys

X1, X2, y = annot_to_X_y(annotations, encoder)

In [8]:
print(annot_to_dataf(annotations).head(5).to_markdown(index=False))

| name_1          | suburb_1      | postcode_1   | name_2          | suburb_2      | postcode_2   | label       |
|:----------------|:--------------|:-------------|:----------------|:--------------|:-------------|:------------|
| stacey gore     | maggie avlley | 287s1        | namcy cruse     | rocky mount   | 27871        | not similar |
| kenneth reed    | winston salem | 27101        | bessie smith    | winston salem | 27107        | not similar |
| angela franklin | canton        | 28716        | angla franklin  | canton        | 287|6        | similar     |
| carla macartney | charlotte     | 28227        | caria macartney | charlotte     | 28220        | similar     |
| maryann ca5h    | waxha         | 28173        | jaime cable     | canton        | 28716        | not similar |


In [9]:
X_difference = X1 - X2

In [10]:
import numpy as np
from sklearn.linear_model import LogisticRegression

pred = LogisticRegression().fit(X_difference, y).predict(X_difference)
np.mean(pred == y)

0.7623762376237624