In [None]:
!pip install recordlinkage

In [None]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import recordlinkage
from recordlinkage.preprocessing import clean
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(COLORS_PALETTE))
import tensorflow as tf
import time
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 80

In [None]:
def _read_input_table(csv_path):
    """Read one input CSV and drop its 'Unnamed: 0' column."""
    return pd.read_csv(csv_path).drop(['Unnamed: 0'], axis=1)


# Product tables for each source, already split into train and test files.
amzn_train = _read_input_table('../input/Amazon_train.csv')
amzn_test = _read_input_table('../input/Amazon_test.csv')
goog_train = _read_input_table('../input/Google_train.csv')
goog_test = _read_input_table('../input/Google_test.csv')

# Known matching (idAmazon, idGoogleBase) pairs.
AG_train = _read_input_table('../input/AG_perfect_matching_train.csv')
AG_test = _read_input_table('../input/AG_perfect_matching_test.csv')

In [None]:
amzn_train.head()

In [None]:
amzn_train.info()

In [None]:
goog_train.head()

In [None]:
goog_train.info()

In [None]:
AG_train.head()

In [None]:
AG_train.info()

## Preprocessing data

In [None]:
# Replace missing text with empty strings: every text column handled here is
# later passed to get_processed_text, which expects a string.
amzn_train['description'] = amzn_train['description'].fillna("")
amzn_train['manufacturer'] = amzn_train['manufacturer'].fillna("")
goog_train['description'] = goog_train['description'].fillna("")
goog_train['manufacturer'] = goog_train['manufacturer'].fillna("")
# Price entries that cannot be parsed as numbers become NaN.
goog_train['price'] = pd.to_numeric(goog_train['price'], errors='coerce')

Cleaning Text
* Remove tags
* Remove stop words
* Remove punctuation
* Convert to lowercase
* Remove morphological affixes from words, leaving only the word stem

In [None]:
# 1. Tags were removed. For eg. “<i>Hello</i> <b>World</b>!” was converted to “Hello World!”
# 2. Repeating whitespace characters (spaces, tabs, line breaks) were removed. Tabs & line 
# breaks were converted to spaces
# 3. Stopwords were removed. Stopwords include the most commonly occurring words in a language 
# like ‘the’, ‘on’, ‘is’ etc. In this case, the English stopword list from NLTK was used
# 4. The text was transformed to lowercase
# 5. Words were lemmatized and stemmed, e.g. running -> run, banning -> ban

import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

def get_processed_text(text=""):
    """
    Normalise free text so that records can be compared on their content.

    Steps: replace every character other than letters, digits, space,
    newline and '.' with a space; tokenise on word characters; lowercase;
    drop stopwords and tokens shorter than two characters; lemmatise; stem.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN yield ``""``; any other non-string value
        is converted with ``str`` first.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Missing values arrive from pandas as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    clean_text = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', str(text))
    # Lowercase BEFORE the stopword test: the stopword list is lowercase, so
    # testing the raw token would let capitalised stopwords ("The") through.
    lowered = (token.lower() for token in tokenizer.tokenize(clean_text))
    tokens = [lemmatizer.lemmatize(token) for token in lowered
              if token not in stopwords and len(token) >= 2]
    return " ".join(stemmer.stem(token) for token in tokens)

# Clean the free-text columns, then give both sources the same column names
# and index each table by its product id.
for column in ('title', 'description', 'manufacturer'):
    amzn_train[column] = amzn_train[column].apply(get_processed_text)
amzn_train.columns = ['idAmazon', 'title', 'description', 'manufacturer', 'price']
amzn_train = amzn_train.set_index('idAmazon')

for column in ('name', 'description', 'manufacturer'):
    goog_train[column] = goog_train[column].apply(get_processed_text)
goog_train.columns = ['idGoogleBase', 'title', 'description', 'manufacturer', 'price']
goog_train = goog_train.set_index('idGoogleBase')

### All Pairs

In [None]:
# Candidate pairs: every Amazon record paired with every Google record.
# recordlinkage.index.Full is the current name of the deprecated
# recordlinkage.FullIndex and matches the recordlinkage.index API used for blocking.
full_indexer = recordlinkage.index.Full()
pairs = full_indexer.index(amzn_train, goog_train)
len(pairs)

### Blocking

In [None]:
block = recordlinkage.index.SortedNeighbourhood('title', window = 251)
block = block.index(amzn_train, goog_train)
len(block)

## Blocking Dataset

### Compare strings using Jaro-Winkler

In [None]:
# Jaro-Winkler similarity for the three text fields, Gaussian similarity for price.
c1 = recordlinkage.Compare()
for field in ('title', 'description', 'manufacturer'):
    c1.string(field, field, method='jarowinkler', label='cmp_' + field)
c1.numeric('price', 'price', method='gauss', scale=5, offset=10, label='cmp_price')

# One row of similarity features per candidate pair in `block`.
feature_vectors1 = c1.compute(block, amzn_train, goog_train)

In [None]:
feature_vectors1

In [None]:
# Known matches as an (idAmazon, idGoogleBase) MultiIndex.
matches = AG_train.set_index(['idAmazon', 'idGoogleBase']).index
# Candidate pairs that are true matches. Index.intersection is the explicit set
# operation; `index & index` is deprecated as a set operation in pandas.
matches_index = feature_vectors1.index.intersection(matches)
len(matches_index)

In [None]:
# Flag each candidate pair: label 1 when the pair is among the known matches, else 0.
match_data = matches_index.to_frame(index=False)
data = (
    feature_vectors1.reset_index()
    .merge(match_data, indicator=True, how='outer')
    .assign(label=lambda merged: np.where(merged['_merge'] == 'both', 1, 0))
    .drop(columns=['_merge'])
)

In [None]:
# Class balance of the labelled candidate pairs.
label = data['label'].value_counts()

# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=label.index, y=label.values, alpha=0.8)
plt.ylabel('Count', fontsize=20)
plt.xlabel('Label', fontsize=20)
plt.savefig("is_duplicate.jpg")
plt.show()

print(label)
positive_share = (data['label'] == 1).mean() * 100.0
negative_share = (data['label'] == 0).mean() * 100.0
print ("Percentage of positive cases: {:.2f} %".format(positive_share))
print ("Percentage of negative cases: {:.2f} %".format(negative_share))

In [None]:
sns.pairplot(data = data, hue = 'label', diag_kind='hist')

In [None]:
data.loc[data['label'] == 1]

In [None]:
data_label0 = data.loc[data['label'] == 0]
data_label1 = data.loc[data['label'] == 1]

In [None]:
# Distribution of each similarity feature among the non-matching pairs.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

for ax, field in zip(axes.flatten(), ('title', 'description', 'manufacturer', 'price')):
    ax.hist(data_label0['cmp_' + field], bins=10, alpha=0.7, rwidth=0.85)
    ax.set_title(f'The similarity of {field} for negative records')

In [None]:
# Distribution of each similarity feature among the matching pairs.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

for ax, field in zip(axes.flatten(), ('title', 'description', 'manufacturer', 'price')):
    ax.hist(data_label1['cmp_' + field], bins=10, alpha=0.7, rwidth=0.85)
    ax.set_title(f'The similarity of {field} for positive records')

## Building Model - Blocking

In [None]:
from sklearn.model_selection import train_test_split

# Identifier and target columns are not model inputs.
non_feature_columns = ['idAmazon', 'idGoogleBase', 'label']

# 80/20 split of the labelled candidate pairs, fixed seed.
train, validation = train_test_split(data, test_size=0.2, random_state=23)

X_train, y_train = train.drop(non_feature_columns, axis=1), train.label
X_val, y_val = validation.drop(non_feature_columns, axis=1), validation.label

In [None]:
X_train.describe()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

### DecisionTree Classifier

In [None]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix

# Hyper-parameter grid for the decision tree.
# - min_samples_split must be at least 2 when given as an integer (1 is rejected).
# - 'auto' was an alias of 'sqrt' for classifiers and has been removed; None
#   (use all features) is searched instead of the duplicate.
params = {"max_depth": [int(x) for x in np.linspace(50, 110, num = 5)],
          "max_features": ['sqrt', None],
          "min_samples_split": [2, 4, 8, 10],
          "min_samples_leaf": [1, 4, 8, 10],
          "criterion": ["gini", "entropy"]}
# KFold arguments are keyword-only in current scikit-learn; seeds make the search
# repeatable. F1 is used for model selection because this is a binary
# classification task; the previous RMSE scorer is a regression metric.
clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=23), params,
                   cv=KFold(n_splits=8, shuffle=True, random_state=23),
                   scoring="f1")
model2 = clf.fit(X_train, y_train)
pred_tree = model2.predict(X_val)

In [None]:
clf.best_estimator_.get_params()

In [None]:
# Validation labels next to the decision-tree predictions.
d_tree = pd.DataFrame({'label': y_val, 'pred': pred_tree})

cm_tree = confusion_matrix(d_tree['label'], d_tree['pred'])
# Divide each row by its total so cells show the share of each true class.
row_totals = cm_tree.sum(axis=1, keepdims=True)

plt.figure(figsize=(8, 6))
ax = plt.subplot()
sns.heatmap(cm_tree / row_totals, annot=True, ax=ax, annot_kws={"size": 20})

# labels, title and ticks
ax.set_xlabel('Predicted labels', fontsize=20)
ax.set_ylabel('True labels', fontsize=20)
ax.set_title('Normalized Confusion Matrix', fontsize=20)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['Negative', 'Positive'], fontsize=20)
print(cm_tree)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

tree_truth, tree_pred = d_tree['label'], d_tree['pred']

accuracy = accuracy_score(tree_truth, tree_pred)    # (tp + tn) / (p + n)
precision = precision_score(tree_truth, tree_pred)  # tp / (tp + fp)
recall = recall_score(tree_truth, tree_pred)        # tp / (tp + fn)
f1 = f1_score(tree_truth, tree_pred)                # 2 tp / (2 tp + fp + fn)

print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)

### K-Means

In [None]:
from sklearn.cluster import KMeans

# Two clusters, intended as "match" / "non-match". Seeded so reruns agree.
model3 = KMeans(n_clusters=2, n_init=10, random_state=23)
model3.fit(X_train)

# K-Means cluster ids are arbitrary: cluster 1 is not necessarily the "match"
# class. Name the cluster with the higher share of true training matches as the
# positive class, so predictions can be compared with the 0/1 labels.
train_labels = np.asarray(y_train)
match_rate = []
for cluster_id in range(2):
    members = model3.labels_ == cluster_id
    match_rate.append(train_labels[members].mean() if members.any() else 0.0)
match_cluster = int(np.argmax(match_rate))

pred_kmeans = (model3.predict(X_val) == match_cluster).astype(int)

In [None]:
d_kmeans = pd.DataFrame(
data = {'label': y_val, 'pred': pred_kmeans}
)

cm_kmeans = confusion_matrix(d_kmeans['label'], d_kmeans['pred'])
plt.figure(figsize=(8, 6))
ax= plt.subplot()
sns.heatmap(cm_kmeans/cm_kmeans.sum(axis=1)[:, np.newaxis], annot=True, ax = ax, annot_kws={"size": 20});

# labels, title and ticks

ax.set_xlabel('Predicted labels', fontsize=20);ax.set_ylabel('True labels', fontsize=20); 
ax.set_title('Normalized Confusion Matrix', fontsize=20); 
ax.xaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20); 
ax.yaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20);
print(cm_kmeans)

In [None]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(d_kmeans['label'], d_kmeans['pred'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(d_kmeans['label'], d_kmeans['pred'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(d_kmeans['label'], d_kmeans['pred'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(d_kmeans['label'], d_kmeans['pred'])
print('F1 score: %f' % f1)

### Deep Learning

In [None]:
from keras.layers import Dense, Input, Activation, Dropout, BatchNormalization
from keras.models import Model, Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
def create_model():
    """Build and compile the binary classifier used in the grid search.

    Two hidden ReLU layers (64 and 32 units), each followed by dropout (0.5)
    and batch normalisation, feed a single sigmoid output. The network takes
    4 input features and is compiled with Adam, binary cross-entropy and
    accuracy.
    """
    network = Sequential([
        Dense(64, input_dim=4, activation='relu'),
        Dropout(0.5),
        BatchNormalization(),
        Dense(32, activation='relu'),
        Dropout(0.5),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    return network
model = KerasClassifier(build_fn=create_model, verbose=0)
batch_size = [500, 1024, 2048]
epochs = [50, 100, 200]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

In [None]:
grid_result.best_score_, grid_result.best_params_

In [None]:
# Metrics passed to `compile` for the networks built below. The `name` values
# are the keys later read from `History.history` by the plotting helpers
# ('precision', 'recall', 'accuracy', and their 'val_' counterparts).
METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'), 
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc')
]

In [None]:
from keras.layers import Dense, Input, Activation, Dropout, BatchNormalization
from keras.models import Model, Sequential

# Same architecture as create_model, but compiled with the full METRICS list
# so precision/recall/AUC are tracked during training.
model4 = Sequential([
    Dense(64, input_dim=4, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
])
model4.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=METRICS)

model4.summary()

In [None]:
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
# Count the training examples of class 0 (neg) and class 1 (pos).
neg, pos = np.bincount(y_train)
total = neg +  pos
# Weight each class inversely to its frequency; the total/2 factor scales the
# two weights relative to the overall number of examples.
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0

# Mapping of class label -> loss weight, passed to `fit(class_weight=...)`.
class_weight = {0: weight_for_0, 1: weight_for_1}
class_weight

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=10,
    mode='min',
    restore_best_weights=True)

batch_size = 1024
epochs = 50
start = time.time()
hist1 = model4.fit(X_train, y_train, 
                   validation_data=(X_val, y_val),  
                   batch_size=batch_size, epochs=epochs, callbacks = [early_stopping], class_weight=class_weight)
end = time.time()
print ("Elapsed time: {:.2f} seconds".format(end - start))

In [None]:
def plot_pression_recall(history):
    """Plot training vs. validation precision (left) and recall (right) per epoch.

    `history` must expose a `history` mapping containing the keys
    'precision', 'val_precision', 'recall' and 'val_recall'.
    """
    curves = {key: history.history[key]
              for key in ('precision', 'val_precision', 'recall', 'val_recall')}
    x = range(1, len(curves['precision']) + 1)

    plt.figure(figsize=(12, 5))
    for position, metric in enumerate(('precision', 'recall'), start=1):
        plt.subplot(1, 2, position)
        plt.plot(x, curves[metric], 'b', label='Training ' + metric)
        plt.plot(x, curves['val_' + metric], 'r', label='Validation ' + metric)
        plt.title('Training and validation ' + metric)
        plt.legend()
plot_pression_recall(hist1)

In [None]:
def plot_acc_loss(history):
    """Plot training vs. validation accuracy (left) and loss (right) per epoch.

    `history` must expose a `history` mapping containing the keys
    'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    curves = {key: history.history[key]
              for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')}
    x = range(1, len(curves['accuracy']) + 1)

    # (history key, short name used in the legend, name used in the title)
    panels = (('accuracy', 'acc', 'accuracy'), ('loss', 'loss', 'loss'))

    plt.figure(figsize=(12, 5))
    for position, (key, legend_name, title_name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(x, curves[key], 'b', label='Training ' + legend_name)
        plt.plot(x, curves['val_' + key], 'r', label='Validation ' + legend_name)
        plt.title('Training and validation ' + title_name)
        plt.legend()
plot_acc_loss(hist1)

In [None]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

# Predicted match probabilities for the validation pairs, flattened to 1-D.
probs_deep = model4.predict(X_val)
val_scores = probs_deep.ravel()

average_precision = average_precision_score(y_val, val_scores)

print('Average precision-recall score: {0:0.4f}'.format(
      average_precision))

precision, recall, _ = precision_recall_curve(y_val, val_scores)
# The legend shows the computed AP instead of a hard-coded number, so it cannot
# go stale when the model or data change.
plt.plot(recall, precision, label='deep-learning (AP={0:0.2f})'.format(
      average_precision))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('2-class Precision-Recall curve: '
                  'AP={0:0.4f}'.format(average_precision))
plt.show()

In [None]:
from sklearn.metrics import roc_curve

def plot_roc(labels, predictions):
    """Draw the ROC curve for `predictions` against `labels`, with the chance diagonal.

    Both arguments are passed unchanged to `roc_curve`.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, predictions)

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC', linewidth=3)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # Diagonal reference line.
    chance_style = dict(linestyle='--', linewidth=2, color='r',
                        label='Chance', alpha=.8)
    plt.plot([0, 1], [0, 1], **chance_style)

    plt.grid(True)
    plt.gca().set_aspect('equal')
    plt.legend(loc="lower right")
plot_roc(y_val, probs_deep)

In [None]:
# Pick the threshold that maximises J = TPR - FPR on the validation split.
fpr, tpr, thresholds = roc_curve(y_val, probs_deep)
J = np.subtract(tpr, fpr)
ix = J.argmax()
best_thresh = thresholds[ix]
print('Best Threshold=%f' % best_thresh)

In [None]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_val, probs_deep)
print('AUC: %.3f' % auc)

In [None]:
d_keras = pd.DataFrame(
    data = {'label': y_val, 'probs':probs_deep.ravel()}
)
d_keras['pred'] = d_keras.probs.apply(lambda pred: 1 if(pred >= best_thresh) else 0)
cm_keras = confusion_matrix(d_keras['label'], d_keras['pred'])
plt.figure(figsize=(8, 6))
ax= plt.subplot()
sns.heatmap(cm_keras/cm_keras.sum(axis=1)[:, np.newaxis], annot=True, ax = ax, annot_kws={"size": 20});

# labels, title and ticks

ax.set_xlabel('Predicted labels', fontsize=20);ax.set_ylabel('True labels', fontsize=20); 
ax.set_title('Normalized Confusion Matrix', fontsize=20); 
ax.xaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20); 
ax.yaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20);
print(cm_keras)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(d_keras['label'], d_keras['pred'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(d_keras['label'], d_keras['pred'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(d_keras['label'], d_keras['pred'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(d_keras['label'], d_keras['pred'])
print('F1 score: %f' % f1)

## Testing Dataset

In [None]:
amzn_test.head()

In [None]:
amzn_test.info()

In [None]:
goog_test.head()

In [None]:
goog_test.info()

### Cleaning Test Dataset

In [None]:
amzn_test = pd.read_csv('../input/Amazon_test.csv')
amzn_test = amzn_test.drop(['Unnamed: 0'], axis=1)

goog_test = pd.read_csv('../input/Google_test.csv')
goog_test = goog_test.drop(['Unnamed: 0'], axis=1)

AG_test = pd.read_csv('../input/AG_perfect_matching_test.csv')
AG_test = AG_test.drop(['Unnamed: 0'], axis=1)

In [None]:
# Replace missing text with empty strings: every text column handled here is
# later passed to get_processed_text, which expects a string.
amzn_test['description'] = amzn_test['description'].fillna("")
amzn_test['manufacturer'] = amzn_test['manufacturer'].fillna("")
goog_test['description'] = goog_test['description'].fillna("")
goog_test['manufacturer'] = goog_test['manufacturer'].fillna("")
# Price entries that cannot be parsed as numbers become NaN.
goog_test['price'] = pd.to_numeric(goog_test['price'], errors='coerce')

In [None]:
# Same cleaning, renaming and indexing as for the training tables.
for column in ('title', 'description', 'manufacturer'):
    amzn_test[column] = amzn_test[column].apply(get_processed_text)
amzn_test.columns = ['idAmazon', 'title', 'description', 'manufacturer', 'price']
amzn_test = amzn_test.set_index('idAmazon')

for column in ('name', 'description', 'manufacturer'):
    goog_test[column] = goog_test[column].apply(get_processed_text)
goog_test.columns = ['idGoogleBase', 'title', 'description', 'manufacturer', 'price']
goog_test = goog_test.set_index('idGoogleBase')

In [None]:
# Every Amazon test record paired with every Google test record.
# recordlinkage.index.Full is the current name of the deprecated
# recordlinkage.FullIndex and matches the recordlinkage.index API used for blocking.
test_indexer = recordlinkage.index.Full()
allpairs = test_indexer.index(amzn_test, goog_test)
len(allpairs)

In [None]:
feature_vectors_test = c1.compute(allpairs, amzn_test, goog_test)
feature_vectors_test

In [None]:
feature_vectors_test = c1.compute(allpairs, amzn_test, goog_test)
data_test = feature_vectors_test.reset_index()
data_test = data_test.merge(AG_test, indicator=True, how='outer')
data_test['label'] = np.where(data_test['_merge']=='both', 1, 0)
data_test = data_test.drop(columns=['_merge'])
actual = data_test.label
X = data_test.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)

In [None]:
actual = data_test.label
X = data_test.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)

In [None]:
probs = model4.predict(X)

In [None]:
# Model output is a column vector; flatten it once for the sklearn metrics.
test_scores = probs.ravel()
average_precision_test = average_precision_score(actual, test_scores)

print('Average precision-recall score: {0:0.4f}'.format(
      average_precision_test))

precision, recall, _ = precision_recall_curve(actual, test_scores)
# The legend shows the computed AP instead of a hard-coded number.
plt.plot(recall, precision, label='deep-learning (AP={0:0.2f})'.format(
      average_precision_test))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('2-class Precision-Recall curve: '
                  'AP={0:0.4f}'.format(average_precision_test))
plt.show()

In [None]:
plot_roc(actual, probs)

In [None]:
# Threshold maximising J = TPR - FPR on the TEST set, reported for reference only.
# `best_thresh` (chosen on the validation split) is deliberately left unchanged:
# tuning the decision threshold on test labels would leak them into the
# reported test metrics.
fpr, tpr, thresholds = roc_curve(actual, probs)
J = tpr - fpr
ix = np.argmax(J)
best_thresh_test = thresholds[ix]
print('Threshold in use (from validation)=%f' % (best_thresh))
print('Best Threshold on test (not used)=%f' % (best_thresh_test))

In [None]:
# `probs` was already computed for the same model and `X` in an earlier cell;
# predicting again over every candidate pair would only repeat that work.
result = pd.DataFrame(
    data = {'label': actual, 'probs':probs.ravel()}
)
# Pairs whose predicted probability reaches the threshold are called matches.
result['pred'] = (result['probs'] >= best_thresh).astype(int)
cm = confusion_matrix(result['label'], result['pred'])
plt.figure(figsize=(8, 6))
ax= plt.subplot()
# Each row is divided by its total so cells show the share of each true class.
sns.heatmap(cm/cm.sum(axis=1)[:, np.newaxis], annot=True, ax = ax, annot_kws={"size": 20});

# labels, title and ticks

ax.set_xlabel('Predicted labels', fontsize=20);ax.set_ylabel('True labels', fontsize=20); 
ax.set_title('Normalized Confusion Matrix', fontsize=20); 
ax.xaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20); 
ax.yaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20);
print(cm)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(result['label'], result['pred'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(result['label'], result['pred'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(result['label'], result['pred'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(result['label'], result['pred'])
print('F1 score: %f' % f1)

### Compare strings using Levenshtein

In [None]:
# Levenshtein similarity for the three text fields, Gaussian similarity for price.
c2 = recordlinkage.Compare()
for field in ('title', 'description', 'manufacturer'):
    c2.string(field, field, method='levenshtein', label='cmp_' + field)
c2.numeric('price', 'price', method='gauss', scale=5, offset=10, label='cmp_price')

# One row of similarity features per candidate pair in `block`.
feature_vectors2 = c2.compute(block, amzn_train, goog_train)

In [None]:
# Candidate pairs that are true matches. Index.intersection is the explicit set
# operation; `index & index` is deprecated as a set operation in pandas.
matches_index2 = feature_vectors2.index.intersection(matches)
match_data2 = matches_index2.to_frame(index=False)
# label = 1 when the pair is among the known matches, else 0.
data2 = feature_vectors2.reset_index()
data2 = data2.merge(match_data2, indicator=True, how='outer')
data2['label'] = np.where(data2['_merge']=='both', 1, 0)
data2 = data2.drop(columns=['_merge'])

In [None]:
data_label20 = data2.loc[data2['label'] == 0]
data_label21 = data2.loc[data2['label'] == 1]

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

ax0, ax1, ax2, ax3 = axes.flatten()

ax0.hist(data_label20.cmp_title, bins=10, alpha=0.7, rwidth=0.85)
ax0.set_title('The similarity of title for negative records')
ax1.hist(data_label20.cmp_description, bins=10, alpha=0.7, rwidth=0.85)
ax1.set_title('The similarity of description for negative records')
ax2.hist(data_label20.cmp_manufacturer, bins=10, alpha=0.7, rwidth=0.85)
ax2.set_title('The similarity of manufacturer for negative records')
ax3.hist(data_label20.cmp_price, bins=10, alpha=0.7, rwidth=0.85)
ax3.set_title('The similarity of price for negative records')

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

ax0, ax1, ax2, ax3 = axes.flatten()

ax0.hist(data_label21.cmp_title, bins=10, alpha=0.7, rwidth=0.85)
ax0.set_title('The similarity of title for positive records')
ax1.hist(data_label21.cmp_description, bins=10, alpha=0.7, rwidth=0.85)
ax1.set_title('The similarity of description for positive records')
ax2.hist(data_label21.cmp_manufacturer, bins=10, alpha=0.7, rwidth=0.85)
ax2.set_title('The similarity of manufacturer for positive records')
ax3.hist(data_label21.cmp_price, bins=10, alpha=0.7, rwidth=0.85)
ax3.set_title('The similarity of price for positive records')

In [None]:
# Create a training and test set
train2, validation2 = train_test_split(data2, test_size=0.2, random_state=23)

y_train2 = train2.label
X_train2 = train2.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)
y_val2 = validation2.label
X_val2 = validation2.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)
X_val2 = scaler.transform(X_val2)

In [None]:
model4 = Sequential()
model4.add(Dense(64, input_dim=4, activation='relu'))
model4.add(Dropout(0.5))
model4.add(BatchNormalization())
model4.add(Dense(32, activation='relu'))
model4.add(Dropout(0.5))
model4.add(BatchNormalization())
model4.add(Dense(1, activation='sigmoid'))
model4.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

model4.summary()

In [None]:
start = time.time()
hist2 = model4.fit(X_train2, y_train2, 
                   validation_data=(X_val2, y_val2),  
                   batch_size=batch_size, epochs=epochs, callbacks = [early_stopping], class_weight=class_weight)
end = time.time()
print ("Elapsed time: {:.2f} seconds".format(end - start))

In [None]:
plot_pression_recall(hist2)

In [None]:
plot_acc_loss(hist2)

In [None]:
probs_deep2 = model4.predict(X_val2)

average_precision2 = average_precision_score(y_val2, probs_deep2.ravel())

print('Average precision-recall score: {0:0.4f}'.format(
      average_precision2))

precision, recall, _ = precision_recall_curve(y_val2, probs_deep2.ravel())
plt.plot(recall, precision, label='deep-learning (AP={0:0.2f})'.format(
      average_precision2))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('2-class Precision-Recall curve: '
                  'AP={0:0.4f}'.format(average_precision2))
plt.show()

In [None]:
plot_roc(y_val2, probs_deep2)

In [None]:
fpr, tpr, thresholds = roc_curve(y_val2, probs_deep2)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

In [None]:
d_keras2 = pd.DataFrame(
    data = {'label': y_val2, 'probs':probs_deep2.ravel()}
)
d_keras2['pred'] = d_keras2.probs.apply(lambda pred: 1 if(pred >= best_thresh) else 0)
cm_keras2 = confusion_matrix(d_keras2['label'], d_keras2['pred'])
plt.figure(figsize=(8, 6))
ax= plt.subplot()
sns.heatmap(cm_keras2/cm_keras2.sum(axis=1)[:, np.newaxis], annot=True, ax = ax, annot_kws={"size": 20});

# labels, title and ticks

ax.set_xlabel('Predicted labels', fontsize=20);ax.set_ylabel('True labels', fontsize=20); 
ax.set_title('Normalized Confusion Matrix', fontsize=20); 
ax.xaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20); 
ax.yaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20);
print(cm_keras2)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(d_keras2['label'], d_keras2['pred'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(d_keras2['label'], d_keras2['pred'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(d_keras2['label'], d_keras2['pred'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(d_keras2['label'], d_keras2['pred'])
print('F1 score: %f' % f1)

### Compare strings using Q-gram

In [None]:
# Q-gram similarity for the three text fields, Gaussian similarity for price.
c3 = recordlinkage.Compare()
for field in ('title', 'description', 'manufacturer'):
    c3.string(field, field, method='qgram', label='cmp_' + field)
c3.numeric('price', 'price', method='gauss', scale=5, offset=10, label='cmp_price')

# One row of similarity features per candidate pair in `block`.
feature_vectors3 = c3.compute(block, amzn_train, goog_train)

In [None]:
# Candidate pairs that are true matches. Index.intersection is the explicit set
# operation; `index & index` is deprecated as a set operation in pandas.
matches_index3 = feature_vectors3.index.intersection(matches)
match_data3 = matches_index3.to_frame(index=False)
data3 = feature_vectors3.reset_index()
# Merge the Q-gram features (data3). Merging data2 here would silently train the
# "Q-gram" model on the Levenshtein features.
data3 = data3.merge(match_data3, indicator=True, how='outer')
data3['label'] = np.where(data3['_merge']=='both', 1, 0)
data3 = data3.drop(columns=['_merge'])

In [None]:
# Split the Q-gram data by its own label column (not the Levenshtein frame's).
data_label30 = data3.loc[data3['label'] == 0]
data_label31 = data3.loc[data3['label'] == 1]

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

ax0, ax1, ax2, ax3 = axes.flatten()

ax0.hist(data_label30.cmp_title, bins=10, alpha=0.7, rwidth=0.85)
ax0.set_title('The similarity of title for negative records')
ax1.hist(data_label30.cmp_description, bins=10, alpha=0.7, rwidth=0.85)
ax1.set_title('The similarity of description for negative records')
ax2.hist(data_label30.cmp_manufacturer, bins=10, alpha=0.7, rwidth=0.85)
ax2.set_title('The similarity of manufacturer for negative records')
ax3.hist(data_label30.cmp_price, bins=10, alpha=0.7, rwidth=0.85)
ax3.set_title('The similarity of price for negative records')

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

ax0, ax1, ax2, ax3 = axes.flatten()

ax0.hist(data_label31.cmp_title, bins=10, alpha=0.7, rwidth=0.85)
ax0.set_title('The similarity of title for positive records')
ax1.hist(data_label31.cmp_description, bins=10, alpha=0.7, rwidth=0.85)
ax1.set_title('The similarity of description for positive records')
ax2.hist(data_label31.cmp_manufacturer, bins=10, alpha=0.7, rwidth=0.85)
ax2.set_title('The similarity of manufacturer for positive records')
ax3.hist(data_label31.cmp_price, bins=10, alpha=0.7, rwidth=0.85)
ax3.set_title('The similarity of price for positive records')

In [None]:
# Create a training and test set
train3, validation3 = train_test_split(data3, test_size=0.2, random_state=23)

y_train3 = train3.label
X_train3 = train3.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)
y_val3 = validation3.label
X_val3 = validation3.drop(['idAmazon', 'idGoogleBase', 'label'] , axis=1)
scaler = StandardScaler()
X_train3 = scaler.fit_transform(X_train3)
X_val3 = scaler.transform(X_val3)

In [None]:
model4 = Sequential()
model4.add(Dense(64, input_dim=4, activation='relu'))
model4.add(Dropout(0.5))
model4.add(BatchNormalization())
model4.add(Dense(32, activation='relu'))
model4.add(Dropout(0.5))
model4.add(BatchNormalization())
model4.add(Dense(1, activation='sigmoid'))
model4.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

model4.summary()

In [None]:
start = time.time()
hist3 = model4.fit(X_train3, y_train3, 
                   validation_data=(X_val3, y_val3),  
                   batch_size=batch_size, epochs=epochs, callbacks = [early_stopping], class_weight=class_weight)
end = time.time()
print ("Elapsed time: {:.2f} seconds".format(end - start))

In [None]:
plot_pression_recall(hist3)

In [None]:
plot_acc_loss(hist3)

In [None]:
probs_deep3 = model4.predict(X_val3)

average_precision3 = average_precision_score(y_val3, probs_deep3.ravel())

print('Average precision-recall score: {0:0.4f}'.format(
      average_precision3))

precision, recall, _ = precision_recall_curve(y_val3, probs_deep3.ravel())
plt.plot(recall, precision, label='deep-learning (AP={0:0.2f})'.format(
      average_precision3))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('2-class Precision-Recall curve: '
                  'AP={0:0.4f}'.format(average_precision3))
plt.show()

In [None]:
plot_roc(y_val3, probs_deep3)

In [None]:
fpr, tpr, thresholds = roc_curve(y_val3, probs_deep3)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

In [None]:
# Validation labels next to the Q-gram model's probabilities and thresholded predictions.
d_keras3 = pd.DataFrame(
    data = {'label': y_val3, 'probs':probs_deep3.ravel()}
)
d_keras3['pred'] = d_keras3.probs.apply(lambda pred: 1 if(pred >= best_thresh) else 0)
cm_keras3 = confusion_matrix(d_keras3['label'], d_keras3['pred'])
plt.figure(figsize=(8, 6))
ax= plt.subplot()
# Each row is divided by its total so cells show the share of each true class.
sns.heatmap(cm_keras3/cm_keras3.sum(axis=1)[:, np.newaxis], annot=True, ax = ax, annot_kws={"size": 20});

# labels, title and ticks

ax.set_xlabel('Predicted labels', fontsize=20);ax.set_ylabel('True labels', fontsize=20); 
ax.set_title('Normalized Confusion Matrix', fontsize=20); 
ax.xaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20); 
ax.yaxis.set_ticklabels(['Negative', 'Positive'], fontsize=20);
# Print the matrix of THIS model (cm_keras3), not the Levenshtein one.
print(cm_keras3)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(d_keras3['label'], d_keras3['pred'])
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(d_keras3['label'], d_keras3['pred'])
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(d_keras3['label'], d_keras3['pred'])
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
# Labels and predictions must both come from d_keras3; mixing in d_keras2's
# labels scores this model against another experiment's frame.
f1 = f1_score(d_keras3['label'], d_keras3['pred'])
print('F1 score: %f' % f1)