In [None]:
%matplotlib inline
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

import os
from itertools import product
import heapq
import re

import pandas as pd
import numpy as np

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

from IPython import display
from matplotlib import pyplot as plt

from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from fuzzywuzzy.fuzz import ratio, token_set_ratio, token_sort_ratio, partial_ratio

In [None]:
from datasets import load_data
from deduplipy.active_learning import ActiveStringMatchLearner
from deduplipy.string_matcher import StringMatcher
from deduplipy.blocking import Blocking
from deduplipy.clustering import hierarchical_clustering
from deduplipy.deduplicator import Deduplicator

In [None]:
# Raise pandas' display limits so the pair tables shown later in the notebook
# are not truncated when rendered.
pd.set_option('display.max_columns', 100)
# Use the full option name: the abbreviated 'display.max_row' only resolved
# through pandas' option-name pattern matching, which breaks as soon as
# another option matches the same pattern.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)

## Load data 

In [None]:
# Load the labelled pair data set and unpack it into train/test features and
# labels (load_data is called with return_pairs=True).
(
    X_train,
    X_test,
    y_train,
    y_test,
) = load_data(return_pairs=True)

## StringMatcher

In [None]:
# Fit a StringMatcher on the training pairs and keep whatever .fit() returns
# as `myStringMatcher`; the scoring cells below call .score/.predict_proba on it.
myStringMatcher = StringMatcher().fit(X_train, y_train)

In [None]:
# Score of the fitted matcher on the training pairs. The metric is whatever
# StringMatcher.score implements (not visible in this notebook).
myStringMatcher.score(X_train, y_train)

In [None]:
# Same score as the previous cell, but on the held-out test pairs.
myStringMatcher.score(X_test, y_test)

In [None]:
# Average precision on the training pairs, using column 1 of predict_proba
# as the ranking score.
average_precision_score(
    y_true=y_train,
    y_score=myStringMatcher.predict_proba(X_train)[:, 1],
)

In [None]:
# Average precision on the held-out test pairs, using column 1 of
# predict_proba as the ranking score.
average_precision_score(
    y_true=y_test,
    y_score=myStringMatcher.predict_proba(X_test)[:, 1],
)

## Active learning

In [None]:
# Append the labels as a third column next to the two name columns and wrap
# the result in a DataFrame with columns name_1 / name_2 / match.
df = pd.DataFrame(
    np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1),
    columns=['name_1', 'name_2', 'match'],
)

In [None]:
# Configure the active learner with n_queries=20 on column 'name'.
# NOTE(review): presumably 'name' is the prefix of the name_1/name_2 columns
# of `df` — confirm against ActiveStringMatchLearner.
myActiveLearner = ActiveStringMatchLearner(n_queries=20, col='name')

In [None]:
# Run the active-learning fit on the labelled pair frame.
# NOTE(review): this may prompt for labels interactively — confirm, since that
# would affect unattended Restart & Run All.
myActiveLearner.fit(df)

In [None]:
# Stack the parameter vectors recorded by the learner into one 2-D array and
# draw one line per column; the legend names the four similarity features.
fig, ax = plt.subplots()
ax.plot(np.vstack(myActiveLearner.parameters))
ax.legend(['ratio', 'partial_ratio', 'tokenset', 'tokensort']);

## Blocking

In [None]:
# Pull the training examples and labels held by the underlying learner so
# they can be reused to fit the blocking rules.
X_training, y_training = (
    myActiveLearner.learner.X_training,
    myActiveLearner.learner.y_training,
)

In [None]:
# Fit blocking rules on the learner's training pairs. 'col' is the column
# name handed to Blocking; the pairs table produced later exposes
# 'col_1' / 'col_2' columns.
myBlocking = Blocking('col').fit(X_training, y_training)

In [None]:
# Display the `rules_selected` attribute of the fitted Blocking object.
myBlocking.rules_selected

In [None]:
# `X_pool` was never defined in this notebook, so this cell raised NameError
# on a fresh kernel (Restart & Run All). Define it explicitly from the
# training pairs loaded earlier.
# NOTE(review): assumes the intended pool is X_train — confirm.
X_pool = X_train

# Feed the first column of the pool to the fitted blocking rules to build the
# table of candidate pairs, then preview its first two rows.
pairs_table = myBlocking.transform(X_pool[:, 0])
pairs_table.head(2)

## Scoring

In [None]:
# Work on an independent copy so adding the score column leaves
# `pairs_table` untouched.
scored_pairs_table = pairs_table.copy(deep=True)

In [None]:
# Score every candidate pair: column 1 of the learner's predict_proba output
# is stored as the pair's 'score'.
scored_pairs_table['score'] = myActiveLearner.predict_proba(
    pairs_table.loc[:, ['col_1', 'col_2']].values
)[:, 1]

In [None]:
# Pairs whose two values are exactly equal get a score of 1, overriding the
# model's score.
scored_pairs_table.loc[
    scored_pairs_table['col_1'] == scored_pairs_table['col_2'],
    'score',
] = 1

In [None]:
# Preview the first two scored pairs.
scored_pairs_table.head(2)

## Hierarchical clustering

In [None]:
# Pass the scored pairs to hierarchical_clustering and preview the result.
# NOTE(review): the output schema is defined in deduplipy.clustering and is
# not visible in this notebook.
df_clusters = hierarchical_clustering(scored_pairs_table)
df_clusters.head()

## All in one

In [None]:
# Load the 'childcare' data set with return_pairs=False, i.e. not as the
# train/test pair split used at the top of the notebook.
df_train = load_data(kind='childcare', return_pairs=False)

In [None]:
# Build the all-in-one Deduplicator.
# NOTE(review): 'name_address' is presumably the column of df_train to
# deduplicate on — confirm against Deduplicator's signature.
myDedupliPy = Deduplicator('name_address')

In [None]:
# Fit the deduplicator on the loaded frame.
# NOTE(review): this may involve interactive labelling, like the
# active-learning step — confirm.
myDedupliPy.fit(df_train)

In [None]:
# Apply the fitted deduplicator to the same frame it was fitted on and
# preview the first ten rows of the result.
res = myDedupliPy.predict(df_train)
res.head(10)