# Load libraries and dataset

In [1]:
import warnings
warnings.filterwarnings("ignore")
import torch
import numpy as np
import pandas as pd
import os

from torch import optim
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score

from model.model import TripletModel, TripletNoEmbeddingModel
from model.SelfAttentionModel import StructuredSelfAttention
from utils.data_loader import load_data_set, load_word_to_index, load_bow_data, load_triplet_orders, load_padded_data, load_triplet
from utils.pretrained_glove_embeddings import load_glove_embeddings
import itertools
import time

from sklearn.utils import shuffle


Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/dac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# CSV file handed to load_data_set.
full_generated_data_path = 'generated_labeled_data.csv'

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# retrain=False is forwarded to the loader unchanged
# (presumably reuses a cached dump — confirm in utils.data_loader).
df = load_data_set(full_generated_data_path, retrain=False)
print('Load dataset successfully!')

# Only the 'content' entry of the loader's result is kept.
df_triplet_orders = load_triplet_orders(df, retrain=False)['content']
print('Loading triplet order successfully!')

Load dataset successfully!
Loading triplet order successfully!


In [62]:
# Display a shuffled copy of the content series.
# NOTE(review): `content_df` is not assigned in any cell visible above (this cell ran as In [62]);
# define it before this cell so the notebook survives Restart & Run All.
shuffle(content_df)

28219    inactive - riviera home furnishings pvt. ltd. ...
2320       3pl global 2211 e carson carson usa 90810 90810
4125     inactive -jiangyin snowballion textile industr...
32632    colorland co. ltd. # 545-2 dongduchun-dong str...
36949    aerofil technology inc 225 industrial park dr....
                               ...                        
20455    al-karam textile mills (pvt) limited h.t.11/1 ...
27237    pactiv - can 2480 sommers drive canandaigua us...
48922           jang you inc. 620-5 choji-dong ansansi kr 
7176     elite comfort solutions inc. 24 herring rd new...
16085    delta galil 3601 west 4th street williamsport ...
Name: content, Length: 85064, dtype: object

In [65]:
import spacy

# spaCy pipeline registered under the shortcut name 'en'.
sp = spacy.load('en')

# Print each token with its coarse part-of-speech tag for five randomly drawn entries.
for sample_text in shuffle(content_df)[:5]:
    parsed = sp(sample_text)
    for token in parsed:
        print(token.text, token.pos_)
    

arunima PROPN
sports NOUN
wear VERB
limited ADJ
dewan PROPN
idris PROPN
rd PROPN
. PUNCT
zirabo PROPN
savar PROPN
null PROPN
ashulia PROPN
bd PROPN
null PROPN
nguyen PROPN
toan PROPN
trading PROPN
& CCONJ
apparel PROPN
co. PROPN
ltd PROPN
. PROPN
chau PROPN
son PROPN
industrial PROPN
zone PROPN
chau PROPN
son PROPN
ward PROPN
phu PROPN
ly PROPN
ha INTJ
nam PROPN
1 NUM
guangdong PROPN
midea PROPN
kitchen PROPN
appliances NOUN
manufacturing VERB
no DET
6 NUM
yong NOUN
an DET
road NOUN
beijiao PROPN
shunde PROPN
foshan PROPN
guangdong PROPN
foshan PROPN
cn PROPN
528311 NUM
528311 NUM
pt PROPN
. PUNCT
anugerah PROPN
abadi PROPN
magelang PROPN
dusun PROPN
demesan PROPN
rt.004 PROPN
rw.002 PROPN
desa NOUN
girirejo NOUN
kec.tempuran PROPN
kab.magelang PROPN
inactive ADJ
- PUNCT
fujian PROPN
jinjiang PROPN
city PROPN
fulian PROPN
shoes NOUN
& CCONJ
plastics PROPN
company PROPN
ltd PROPN
huzhong PROPN
industry PROPN
area PROPN
chendai PROPN
362211 NUM


# Building Model

## Load Data

In [3]:
# Notebook-wide settings read by the loading and model cells below.
pad_size = 20  # pad size or timestep
batch_size = 10000  # forwarded as batch_size to load_triplet
embedding = False  # when True, X is pre-embedded via embedding_data(...) before building the loaders

In [4]:
def train_test_split(x_padded=None, n_test_cids=10):
    """Build train and test triplet loaders, split by unique ``cid`` value.

    The last ``n_test_cids`` unique ``cid`` values of the notebook-level ``df``
    form the test set; every earlier one goes to the training set.

    Args:
        x_padded: data handed to ``load_triplet`` as its first argument.
            Defaults to the notebook-level ``X`` (the result of
            ``load_padded_data``); the notebook never defines a global
            named ``x_padded``.
        n_test_cids: number of trailing unique ``cid`` values held out for testing.

    Returns:
        ``((train_anc, train_pos, train_neg), (test_anc, test_pos, test_neg))``
        exactly as returned by the two ``load_triplet`` calls.

    Raises:
        ValueError: if ``n_test_cids`` is smaller than 1 (``[:-0]`` would
            silently produce an empty training set).
    """
    if n_test_cids < 1:
        raise ValueError('n_test_cids must be at least 1')
    if x_padded is None:
        x_padded = X

    print('Train/test split')
    # Compute the unique ids once; both slices come from the same ordering.
    unique_cids = df['cid'].unique()
    train_cids, test_cids = unique_cids[:-n_test_cids], unique_cids[-n_test_cids:]

    train_X_df = df[df['cid'].isin(train_cids)]
    train_df_triplet_orders = load_triplet_orders(train_X_df, dump_path='triplet_samples_id_train.csv',
                                                  retrain=False)['content']
    train_anc_loader, train_pos_loader, train_neg_loader = load_triplet(x_padded, train_df_triplet_orders,
                                                                        dump_path='embedding/triplet_data_train.pkl',
                                                                        batch_size=batch_size, retrain=True)

    test_X_df = df[df['cid'].isin(test_cids)]
    test_df_triplet_orders = load_triplet_orders(test_X_df, dump_path='triplet_samples_id_test.csv',
                                                 retrain=False)['content']
    test_anc_loader, test_pos_loader, test_neg_loader = load_triplet(x_padded, test_df_triplet_orders,
                                                                     dump_path='embedding/triplet_data_test.pkl',
                                                                     batch_size=batch_size, retrain=True)
    print('All train/test sets are ready!')

    return (train_anc_loader, train_pos_loader, train_neg_loader), (test_anc_loader, test_pos_loader, test_neg_loader)


In [5]:
# Build the vocabulary index, the pretrained 300-d GloVe embeddings and the padded inputs
# for the whole dataset, then wrap the triplets in three parallel loaders.
word_to_index = load_word_to_index(df, retrain=False)
print('Load word to index successfully!')
embeddings = load_glove_embeddings(word_to_index, embedding_dim=300, retrain=False)
print('Load pretrained embedding')
X = load_padded_data(df, word_to_index, pad_size=pad_size, retrain=False)
print('Load padded data successfully!')
if embedding: 
    # Pre-embed the data so the embedding is not updated during training.
    # NOTE(review): `embedding_data` is not imported or defined in any visible cell;
    # this branch raises NameError as soon as `embedding` is set to True.
    X = embedding_data(X, embeddings)
    print('Embedding data sucessfully!')
# Anchor / positive / negative loaders, iterated together with zip() by the training cells.
anc_loader, pos_loader, neg_loader = load_triplet(X, df_triplet_orders,
                                                  batch_size=batch_size, 
                                                  retrain=False)
print('Load triplet data successfully!')


Load word to index successfully!
Load pretrained embedding
Load padded data successfully!
Load triplet data successfully!


## Self-Attention triplet model with embedding inside model    

In [12]:
# Self-attention triplet model
lr = 0.1
margin = 0.2
attentive_features = 10  # forwarded to the model as `r`
# Build model & optimizer. `.to(device)` is used instead of `.cuda(device)` so the cell
# also works when `device` is the CPU fallback chosen in the dataset cell.
model = StructuredSelfAttention(embeddings=embeddings, max_len=pad_size,
                                r=attentive_features, margin=margin,
                                cuda=device).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
model


StructuredSelfAttention(
  (embeddings): Embedding(1890, 300)
  (lstm): LSTM(300, 120, batch_first=True)
  (linear_first): Linear(in_features=120, out_features=100, bias=True)
  (linear_second): Linear(in_features=100, out_features=10, bias=True)
  (linear_final): Linear(in_features=120, out_features=50, bias=True)
  (linear_distance): Linear(in_features=100, out_features=1, bias=True)
  (tanh): Tanh()
)

In [28]:
# Load model and optimizer state from the saved checkpoint.
self_attention_model_path = '/data/dac/dedupe-project/model/sam_nls_300d_40p_notest'

# torch.load takes the checkpoint path (not the model object); map_location remaps the
# stored tensors onto the current device, so a GPU-saved file also loads on the CPU fallback.
checkpoint = torch.load(self_attention_model_path, map_location=device)
# NOTE(review): the layer sizes of `model` must match the checkpoint; the file name says
# '40p' while the build cell sets attentive_features = 10 — confirm before loading.
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

model.eval()


StructuredSelfAttention(
  (embeddings): Embedding(1890, 300)
  (lstm): LSTM(300, 120, batch_first=True)
  (linear_first): Linear(in_features=120, out_features=100, bias=True)
  (linear_second): Linear(in_features=100, out_features=40, bias=True)
  (linear_final): Linear(in_features=120, out_features=50, bias=True)
  (linear_distance): Linear(in_features=100, out_features=1, bias=True)
  (tanh): Tanh()
)

In [13]:
# Train the self-attention triplet model.
epochs = 10
best_lost = None  # lowest epoch-average loss seen so far
clipping_value = 1  # NOTE(review): never used — this loop contains no gradient-clipping call
loss_list = []  # epoch-average loss history
average_list = []  # epoch-average accuracy history

model.train()

start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = 0
    avg_acc = 0
    for batch, [anc_x, pos_x, neg_x] in enumerate(zip(anc_loader, pos_loader, neg_loader)):
        # Take element 0 of each loader item and move it to the training device.
        anc_x, pos_x, neg_x = anc_x[0].to(device), pos_x[0].to(device), neg_x[0].to(device)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        # "Accuracy" here is the share of model outputs that are exactly zero
        # (presumably margin losses that are already satisfied — confirm in the model).
        corrects = torch.sum(pos_pred == 0) + torch.sum(neg_pred == 0)
        accuracy = float(corrects) / (2 * len(anc_x))
        avg_acc += accuracy
        avg_loss += float(loss)
        # Gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()  # Empty cuda cache
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(batch, float(loss),
                                                                      round(accuracy, 4)), end='')
    # Average loss and accuracy over the batches of this epoch
    avg_acc = avg_acc / len(anc_loader)
    avg_loss = avg_loss / len(anc_loader)
    loss_list.append(avg_loss)
    average_list.append(avg_acc)
    print('\rEpoch:{}\t\t\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))
    # Track the best epoch by its average loss (a plain float), not by the last batch's
    # loss tensor, which is noisy and keeps a graph-attached tensor alive across epochs.
    if best_lost is None or best_lost > avg_loss:
        best_lost = avg_loss
print("--- %s seconds ---"%(time.time() - start_time))

HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

Epoch:0				Average Loss:	0.1934		Avg Accuracy:	0.548753				
Epoch:1				Average Loss:	0.0995		Avg Accuracy:	0.85989				


KeyboardInterrupt: 

In [9]:
# Destination file for the self-attention checkpoint.
self_attention_model_path = '/data/dac/dedupe-project/model/sam_nls_300d_40p_notest'

# Store both state dicts under the keys the loading cell reads back ('model', 'optimizer').
torch.save(
    dict(model=model.state_dict(), optimizer=optimizer.state_dict()),
    self_attention_model_path,
)

## TripletModel with embedding inside model

In [6]:
# Triplet model with the embedding layer inside the model (this cell builds TripletModel,
# not the self-attention model).
# Checkpoint name variants, as labelled by the author:
# triplet_300d_20p_dynamic_embedding -- glove embedding
# triplet_300d_20p_own_embedding -- initiate embedding
# triplet_300d_20p_own_embedding_bi_gru -- gru
# triplet_300d_20p_own_embedding_bi -- lstm
triplet_model_path = '/data/dac/dedupe-project/model/triplet_300d_20p_own_embedding_bi_gru'

lr = 0.0002
margin = 0.2
# Build model & optimizer; this rebinds the notebook-level `model` and `optimizer`.
model = TripletModel(embeddings=embeddings, max_len=pad_size,
                     margin=margin, cuda=device).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
model


TripletModel(
  (embeddings): Embedding(1890, 300)
  (lstm): GRU(300, 120, batch_first=True, bidirectional=True)
  (linear_final): Linear(in_features=240, out_features=50, bias=True)
  (linear_distance): Linear(in_features=100, out_features=1, bias=True)
  (tanh): Tanh()
)

In [7]:
# Load model and optimizer state from `triplet_model_path`.
# map_location=device remaps the stored tensors onto the current device.
checkpoint = torch.load(triplet_model_path, map_location=device)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

# eval() returns the module, so the cell displays the model summary.
model.eval()


TripletModel(
  (embeddings): Embedding(1890, 300)
  (lstm): GRU(300, 120, batch_first=True, bidirectional=True)
  (linear_final): Linear(in_features=240, out_features=50, bias=True)
  (linear_distance): Linear(in_features=100, out_features=1, bias=True)
  (tanh): Tanh()
)

In [None]:
# Train the TripletModel with early stopping on the epoch-average loss.
epochs, early_stopping_steps = 10, 7
best_lost, forward_index = None, 0
loss_list, average_list = [], []

model.train()

start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = avg_acc = 0
    for batch, triplet in enumerate(zip(anc_loader, pos_loader, neg_loader)):
        # Element 0 of every loader item is the tensor; move all three to the device.
        anc_x, pos_x, neg_x = (part[0].to(device) for part in triplet)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        batch_loss = float(loss)
        # Outputs that are exactly zero are counted as correct.
        corrects = (pos_pred == 0).sum() + (neg_pred == 0).sum()
        accuracy = float(corrects) / (2 * len(anc_x))
        avg_acc += accuracy
        avg_loss += batch_loss

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()  # release cached CUDA memory after every batch
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(batch, round(batch_loss, 4),
                                                                      round(accuracy, 4)), end='')

    # Epoch averages over the number of batches
    avg_acc /= len(anc_loader)
    avg_loss /= len(anc_loader)
    loss_list.append(avg_loss)
    average_list.append(avg_acc)
    print('\rEpoch:\t{}\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))

    # Stop as soon as the average accuracy passes the hard-coded 0.87 target.
    if avg_acc > 0.87:
        break

    if best_lost is not None and not best_lost > avg_loss:
        # No improvement: stop after `early_stopping_steps` such epochs in a row.
        forward_index += 1
        if forward_index == early_stopping_steps:
            break
    else:
        # New best epoch (checkpoint saving is disabled in this cell).
        best_lost = avg_loss
        forward_index = 0

print("--- %s seconds ---"%(time.time() - start_time))


HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

Epoch:	0		Average Loss:	0.9152		Avg Accuracy:	0.5332		
Batch:	96		Loss:	0.9049		Accuracy:	0.4618		

## Triplet Model without embedding inside

In [None]:
# Triplet model without an embedding layer inside the model.
triplet_ne_model_path = '/data/dac/dedupe-project/model/triplet_300d_20p_ne'

lr = 0.002
margin = 0.1
# Build model & optimizer on the notebook-wide `device` (`cuda0` is not defined yet at this point).
# NOTE(review): TripletModelExEmb is not among the classes imported in the first cell
# (TripletModel, TripletNoEmbeddingModel) — confirm the class name / import.
train_ne_model = TripletModelExEmb(embeddings=embeddings, max_len=pad_size,
                                   margin=margin, cuda=device).to(device)
# The load and train cells of this section operate on `model`, so bind it to the network
# whose parameters the optimizer updates; `train_ne_model` is kept as an alias.
model = train_ne_model
optimizer = optim.Adam(model.parameters(), lr=lr)
model


In [None]:
# Load model and optimizer state from `triplet_ne_model_path`.
# map_location=device (as in the TripletModel loading cell) lets a checkpoint saved on a GPU
# load when `device` is the CPU fallback.
checkpoint = torch.load(triplet_ne_model_path, map_location=device)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

model.eval()


In [None]:
# Train the no-embedding triplet model with early stopping on the epoch-average loss.
epochs = 40
best_lost = None
early_stopping_steps = 5
forward_index = 0  # consecutive epochs without improvement

loss_list = []  # epoch-average loss history
acc_list = []  # epoch-average accuracy history
model.train()


start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = 0
    avg_acc = 0
    for batch, [anc_x, pos_x, neg_x] in enumerate(zip(anc_loader, pos_loader, neg_loader)):
        # Take element 0 of each loader item and move it to the training device.
        anc_x, pos_x, neg_x = anc_x[0].to(device), pos_x[0].to(device), neg_x[0].to(device)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        # Outputs that are exactly zero are counted as correct.
        corrects = torch.sum(pos_pred == 0) + torch.sum(neg_pred == 0)
        accuracy = float(corrects) / (2 * len(anc_x))
        avg_acc += accuracy
        avg_loss += float(loss)

        # Gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(batch, round(float(loss), 4),
                                                                      round(accuracy, 4)), end='')
    # Average loss and accuracy over the batches of this epoch
    avg_acc = avg_acc / len(anc_loader)
    avg_loss = avg_loss / len(anc_loader)
    loss_list.append(avg_loss)
    # Record into the list created by this cell rather than `average_list` left over
    # from an earlier training cell.
    acc_list.append(avg_acc)
    print('\rEpoch:\t{}\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))
    if best_lost is None or best_lost > avg_loss:
        best_lost = avg_loss
        forward_index = 0

        # Save the best model to this section's own path; `triplet_model_path` still points
        # at the TripletModel checkpoint and would be overwritten.
        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }, triplet_ne_model_path)
    else:
        # Early stopping after {early_stopping_steps} epochs without improvement
        forward_index += 1
        if forward_index == early_stopping_steps:
            break

print("--- %s seconds ---"%(time.time() - start_time))


## Triplet Bag-of-Words Model

In [None]:
# Bag-of-words inputs. NOTE: this rebinds the notebook-level `batch_size` (10000 -> 200)
# and the three triplet loaders used by every training cell.
batch_size = 200
x_train_bow = np.array(load_bow_data(df, word_to_index, retrain=True))
# Keep only the last-axis entries whose axis-0 sum (row 0 of that sum) exceeds 250.
x_train_bow = x_train_bow[:, :, x_train_bow.sum(0)[0]>250]
print('Load bag of word data successfully!')

anc_loader, pos_loader, neg_loader = load_triplet(x_train_bow, df_triplet_orders,
                                                  batch_size=batch_size,
                                                  dump_path='embedding/triplet__bow_data.pkl',
                                                  embedded=True,
                                                  retrain=True)
# NOTE(review): x_train_bow is indexed with three axes above, so len(x_train_bow[0]) is the
# size of axis 1, not of the filtered last axis — confirm which one the model's max_len expects.
vocabulary_size = len(x_train_bow[0])
print('Load triplet data successfully!')

Load bag of word data successfully!


HBox(children=(IntProgress(value=0, description='Load triplets', max=1644502, style=ProgressStyle(description_…




In [None]:
# Triplet bag-of-words model
triplet_model_path = '/data/dac/dedupe-project/model/triplet_bow'
# Reuse the notebook-wide `device` (CUDA when available, otherwise CPU) instead of a
# hard-coded 'cuda:0'; the name `cuda0` is kept because later cells read it.
cuda0 = device

lr = 0.0001
margin = 0.2
# Build model & optimizer.
# NOTE(review): TripletBoWModel is not among the classes imported in the first cell — confirm the import.
model = TripletBoWModel(max_len=vocabulary_size,
                        margin=margin, cuda=cuda0).to(cuda0)
optimizer = optim.Adam(model.parameters(), lr=lr)
model


In [None]:
# Train the bag-of-words triplet model with early stopping on the epoch-average loss.
epochs, early_stopping_steps = 40, 3
best_lost, forward_index = None, 0
loss_list, average_list = [], []

model.train()

start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = avg_acc = 0
    for batch, triplet in enumerate(zip(anc_loader, pos_loader, neg_loader)):
        # Element 0 of every loader item is the tensor; move all three to `cuda0`.
        anc_x, pos_x, neg_x = (part[0].to(cuda0) for part in triplet)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        batch_loss = float(loss)
        # Outputs that are exactly zero are counted as correct.
        corrects = (pos_pred == 0).sum() + (neg_pred == 0).sum()
        accuracy = float(corrects) / (2 * len(anc_x))
        avg_acc += accuracy
        avg_loss += batch_loss

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()  # release cached CUDA memory after every batch
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(batch, round(batch_loss, 4),
                                                                      round(accuracy, 4)), end='')

    # Epoch averages over the number of batches
    avg_acc /= len(anc_loader)
    avg_loss /= len(anc_loader)
    loss_list.append(avg_loss)
    average_list.append(avg_acc)
    print('\rEpoch:\t{}\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))

    if best_lost is not None and not best_lost > avg_loss:
        # No improvement: stop after `early_stopping_steps` such epochs in a row.
        forward_index += 1
        if forward_index == early_stopping_steps:
            break
    else:
        # New best epoch (checkpoint saving is disabled in this cell).
        best_lost = avg_loss
        forward_index = 0

print("--- %s seconds ---"%(time.time() - start_time))


## Stacking Embedding

In [6]:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, BertEmbeddings

In [7]:
# Instantiate the flair embedders and move each one to `device`.
# NOTE(review): `glove_embedding` and `bert_embedding` are created here but are not part of
# `document_embeddings` below — drop them if nothing else uses them.
glove_embedding = WordEmbeddings('glove').cuda(device)
flair_embedding_news_forward = FlairEmbeddings('news-forward-fast').cuda(device)
flair_embedding_news_backward = FlairEmbeddings('news-backward-fast').cuda(device)
bert_embedding = BertEmbeddings().cuda(device)

# Pooled document embedding built from the forward and backward flair models only.
document_embeddings = DocumentPoolEmbeddings([
    flair_embedding_news_forward, 
    flair_embedding_news_backward
])


# Width that stack_embedding reshapes every document vector to
# (presumably the combined size of the two flair models — TODO confirm).
emb_dim = 2048

def stack_embedding(row, document_embeddings):
    """Embed one text and return its document vector as a ``(1, emb_dim)`` tensor.

    Args:
        row: text wrapped in a flair ``Sentence``.
        document_embeddings: embedder whose ``embed(sentence)`` call fills
            ``sentence.embedding``.

    Returns:
        ``sentence.embedding`` viewed as shape ``(1, emb_dim)``, where ``emb_dim``
        is the notebook-level constant; the view fails if the sizes disagree.
    """
    sentence = Sentence(row)
    # The embedding is only read after embed() has run; reading it beforehand
    # (as an earlier, unused `z = ...` line did) serves no purpose.
    document_embeddings.embed(sentence)

    return sentence.embedding.view(1, emb_dim)

In [8]:
# Triplet model fed with pre-computed stacked embeddings (no embedding layer inside the model).
se_model_path = '/data/dac/dedupe-project/model/triplet_stacked_embedding'

lr = 0.01
margin = 0.1
# Build model & optimizer; this rebinds the notebook-level `model` and `optimizer`.
# max_len=1 with emb_dim=emb_dim: each input is a single vector of width emb_dim.
model = TripletNoEmbeddingModel(max_len=1,
                     margin=margin, cuda=device, emb_dim=emb_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
model


TripletNoEmbeddingModel(
  (lstm): LSTM(2048, 120, batch_first=True)
  (linear_final): Linear(in_features=120, out_features=50, bias=True)
  (linear_distance): Linear(in_features=100, out_features=1, bias=True)
  (tanh): Tanh()
)

In [None]:
# Train the stacked-embedding triplet model, embedding every triplet on the fly.
batch_size = 500
best_lost = None
epochs = 1
# Defined here so the early-stopping branch does not depend on values leaked from earlier cells.
early_stopping_steps = 3
forward_index = 0


def _iter_embedded_batches(triplet_orders, size):
    """Yield (anchor, positive, negative) CPU tensors with at most `size` rows each.

    Rows are collected in lists and concatenated once per batch; the trailing
    partial batch is yielded too, so no triplet is silently dropped.
    """
    anc_rows, pos_rows, neg_rows = [], [], []
    for _, row in triplet_orders.iterrows():
        anc_loc, pos_loc, neg_loc = row[['anchor', 'pos', 'neg']]
        # NOTE(review): `[0]` takes the first element of whatever .loc returns — this assumes
        # .loc yields a sequence of texts (not a single string); TODO confirm.
        anc_rows.append(stack_embedding(df.loc[anc_loc, 'content'][0], document_embeddings).cpu())
        pos_rows.append(stack_embedding(df.loc[pos_loc, 'content'][0], document_embeddings).cpu())
        neg_rows.append(stack_embedding(df.loc[neg_loc, 'content'][0], document_embeddings).cpu())
        if len(anc_rows) == size:
            yield torch.cat(anc_rows, 0), torch.cat(pos_rows, 0), torch.cat(neg_rows, 0)
            anc_rows, pos_rows, neg_rows = [], [], []
    if anc_rows:
        yield torch.cat(anc_rows, 0), torch.cat(pos_rows, 0), torch.cat(neg_rows, 0)


model.train()

loss_list = []  # epoch-average loss history
acc_list = []  # epoch-average accuracy history
start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = 0
    avg_acc = 0
    count = 0  # number of batches trained in this epoch

    for anc_x, pos_x, neg_x in _iter_embedded_batches(df_triplet_orders, batch_size):
        # Training model per batch
        anc_x, pos_x, neg_x = anc_x.to(device), pos_x.to(device), neg_x.to(device)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        # Outputs that are exactly zero are counted as correct.
        corrects = torch.sum(pos_pred == 0) + torch.sum(neg_pred == 0)
        accuracy = float(corrects) / (2 * len(anc_x))
        avg_acc += accuracy
        avg_loss += float(loss)
        # Gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()  # Empty cuda cache
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(count, round(float(loss), 4),
                                                                      round(accuracy, 4)), end='')
        count += 1

    # Average over the batches actually trained (the loop iterates over the triplet
    # orders, so len(df) / batch_size is not the batch count).
    n_batches = max(count, 1)
    avg_acc = avg_acc / n_batches
    avg_loss = avg_loss / n_batches
    loss_list.append(avg_loss)
    acc_list.append(avg_acc)
    print('\rEpoch:\t{}\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))
    if best_lost is None or best_lost > avg_loss:
        best_lost = avg_loss
        forward_index = 0
        # Checkpointing is disabled here; if enabled, save to `se_model_path`.
    else:
        # Early stopping after {early_stopping_steps} epochs without improvement
        forward_index += 1
        if forward_index == early_stopping_steps:
            break

print("--- %s seconds ---"%(time.time() - start_time))


HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

Batch:	2042		Loss:	0.0968		Accuracy:	0.956		

# Evaluate

In [None]:
# Evaluation on the held-out triplet loaders.
model.eval()
avg_loss = 0
avg_acc = 0

y_true = np.array([])
y_pred = np.array([])

# NOTE(review): test_anc_loader / test_pos_loader / test_neg_loader are not assigned in any
# visible cell; they are the second tuple returned by train_test_split(), which is never called.
for batch, [anc_x, pos_x, neg_x] in enumerate(zip(test_anc_loader, test_pos_loader, test_neg_loader)):
    # Send data to the notebook-wide device (`cuda0` only exists after the BoW cell has run).
    anc_x, pos_x, neg_x = anc_x[0].to(device), pos_x[0].to(device), neg_x[0].to(device)
    with torch.no_grad():
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)
        pos_pred, neg_pred = pos_pred.cpu(), neg_pred.cpu()

        # Ground truth: positives are labelled 1, negatives 0.
        y_true_curr = np.concatenate([np.ones(len(pos_pred)), np.zeros(len(neg_pred))])
        y_true = np.concatenate([y_true, y_true_curr])

        # Start from the ground truth and flip every entry whose model output is non-zero.
        y_pred_curr = np.concatenate([np.ones(len(pos_pred)), np.zeros(len(neg_pred))])
        y_pred_curr[np.where(pos_pred != 0)[0]] = 0
        y_pred_curr[np.where(neg_pred != 0)[0] + len(pos_pred)] = 1
        y_pred = np.concatenate([y_pred, y_pred_curr])
        print('\rBatch:\t{}\t\tAccuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
            batch, round(accuracy_score(y_true_curr, y_pred_curr), 4),
            round(f1_score(y_true_curr, y_pred_curr), 4)), end='')
print('\nAvg Accuracy:\t{}\t\t\tAvg F1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4), round(f1_score(y_true, y_pred), 4)))

# Test

In [9]:
def _joined_content(frame, name_col, address_col):
    """Lower-case and join two text columns, then turn newlines and commas into spaces
    and collapse runs of spaces into one."""
    joined = frame[name_col].str.lower() + ' ' + frame[address_col].str.lower()
    joined = joined.str.replace('\n', ' ')
    joined = joined.str.replace(',' ,' ')
    return joined.str.replace(r'[ ]+', ' ', regex=True)


# Ground-truth spreadsheet of (record, duplicated record) pairs; missing cells become ''.
path = '/data/dac/dedupe-project/test/'
test_df = pd.read_excel(path + 'GT_added.xls')
test_df.fillna('', inplace=True)
# Row order is kept as read (shuffling is disabled).

# Left side of each pair: original name/address.
test_df_1 = test_df.loc[:, ['name', 'address']]
test_df_1['content'] = _joined_content(test_df_1, 'name', 'address')

# Right side of each pair: duplicated name/address.
test_df_2 = test_df.loc[:, ['duplicated_name', 'duplicated_address']]
test_df_2['content'] = _joined_content(test_df_2, 'duplicated_name', 'duplicated_address')

## True Test

In [10]:
# Pad both sides of the ground-truth pairs with the training vocabulary.
x1 = load_padded_data(pd.DataFrame(test_df_1), word_to_index, dump_path=None,
                                   pad_size=pad_size, retrain=True)
x2 = load_padded_data(pd.DataFrame(test_df_2), word_to_index, dump_path=None, 
                                   pad_size=pad_size, retrain=True)

def create_data_loader(array, batch_size=batch_size):
    """Wrap a numpy array in an unshuffled DataLoader of int64 tensors.

    The default batch size is the value `batch_size` holds when this cell runs.
    """
    data = TensorDataset(torch.from_numpy(array).type(torch.LongTensor))
    loader = DataLoader(data, batch_size=batch_size, drop_last=False)
    return loader

x1 = create_data_loader(x1)
x2 = create_data_loader(x2)

pred_list = np.array([])  # raw model outputs
y_true = np.array([])
y_pred = np.array([])
for a, b in tqdm(zip(x1, x2)):
    # Send data to the evaluation device
    a, b = a[0].to(device), b[0].to(device)
    with torch.no_grad():
        pred = model(a, b)
        pred = pred.cpu()
        # Every ground-truth pair is a duplicate, so all labels are 1.
        y_true_curr = np.ones(len(pred))
        y_true = np.concatenate([y_true, y_true_curr])

        # Predict "duplicate" (1) unless the model output is <= 0.
        y_pred_curr = np.ones(len(pred))
        y_pred_curr[np.where(pred <= 0)[0]] = 0
        y_pred = np.concatenate([y_pred, y_pred_curr])

        # reshape(-1) keeps a 1-D array even for a one-row batch; squeeze() would return
        # a 0-d array there, which np.concatenate rejects.
        pred_list = np.concatenate([pred_list, pred.reshape(-1).numpy()])

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4),
    round(f1_score(y_true, y_pred), 4)), end='')


HBox(children=(IntProgress(value=0, description='Padding', max=447, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='Padding', max=447, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.9732		F1-score:	0.9864		

In [15]:
def create_test(n, source_df=None):
    """Build two row-aligned frames of record pairs for the duplicate-detection tests.

    The first ``n`` positional 2-combinations of the source rows form the
    "different records" half; the second half pairs each left-hand row with itself.

    Args:
        n: number of distinct pairs to take (``None`` takes every combination).
            Must not be negative.
        source_df: frame to draw rows from. Defaults to the notebook-level
            ``test_df_1``.

    Returns:
        ``(test_df_1a, test_df_1b)``: two frames of equal length (``2 * n`` when
        enough combinations exist). Rows ``[0, n)`` hold the distinct pairs,
        rows ``[n, 2n)`` hold identical pairs. The original row labels are kept
        in an ``index`` column, as ``reset_index()`` does.
    """
    if source_df is None:
        source_df = test_df_1

    # islice stops after n pairs instead of materialising every combination first.
    pairs = list(itertools.islice(itertools.combinations(range(len(source_df)), 2), n))
    left_rows = source_df.iloc[[first for first, _ in pairs]]
    right_rows = source_df.iloc[[second for _, second in pairs]]

    # One concat per frame replaces the row-by-row DataFrame.append calls
    # (quadratic, and removed in pandas 2.0).
    test_df_1a = pd.concat([left_rows, left_rows]).reset_index()
    test_df_1b = pd.concat([right_rows, left_rows]).reset_index()

    return test_df_1a, test_df_1b

## Test 1

In [16]:
# Test 1: 500 distinct pairs followed by 500 identical pairs.
test1_n = 500
test_df_1a, test_df_1b = create_test(test1_n)

def create_data_loader(array, batch_size=batch_size):
    """Return an unshuffled DataLoader over `array` converted to an int64 tensor."""
    as_long = torch.from_numpy(array).long()
    return DataLoader(TensorDataset(as_long), batch_size=batch_size, drop_last=False)

In [17]:
# Pad both sides of the Test 1 pairs with the training vocabulary.
x1 = load_padded_data(pd.DataFrame(test_df_1a), word_to_index, dump_path=None,
                                   pad_size=pad_size, retrain=True)
x2 = load_padded_data(pd.DataFrame(test_df_1b), word_to_index, dump_path=None, 
                                   pad_size=pad_size, retrain=True)

x1 = create_data_loader(x1)
x2 = create_data_loader(x2)

pred_list = np.array([])  # raw model outputs
y_true = np.array([])
y_pred = np.array([])
rows_seen = 0  # rows processed before the current batch
for a, b in tqdm(zip(x1, x2)):
    # Send data to the evaluation device
    a, b = a[0].to(device), b[0].to(device)
    with torch.no_grad():
        pred = model(a, b)
        pred = pred.cpu()
        # Rows [0, test1_n) are distinct pairs (label 0), later rows are identical pairs
        # (label 1). The label depends on the row's position in the whole set, not inside
        # the batch, so it stays correct when the data spans several batches.
        positions = np.arange(rows_seen, rows_seen + len(pred))
        y_true_curr = (positions >= test1_n).astype(float)
        rows_seen += len(pred)
        y_true = np.concatenate([y_true, y_true_curr])

        # Predict "duplicate" (1) unless the model output is <= 0.
        y_pred_curr = np.ones(len(pred))
        y_pred_curr[np.where(pred <= 0)[0]] = 0
        y_pred = np.concatenate([y_pred, y_pred_curr])

        # reshape(-1) keeps a 1-D array even for a one-row batch.
        pred_list = np.concatenate([pred_list, pred.reshape(-1).numpy()])

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4),
    round(f1_score(y_true, y_pred), 4)), end='')


HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.66		F1-score:	0.7463		

## Test 2

In [18]:
# Test 2: same construction as Test 1 with 1000 pairs requested from create_test.
# This rebinds test_df_1a / test_df_1b.
test2_n = 1000
test_df_1a, test_df_1b = create_test(test2_n)

In [19]:
x1 = load_padded_data(pd.DataFrame(test_df_1a), word_to_index, dump_path=None,
                                   pad_size=pad_size, retrain=True)
x2 = load_padded_data(pd.DataFrame(test_df_1b), word_to_index, dump_path=None, 
                                   pad_size=pad_size, retrain=True)

def create_data_loader(array, batch_size=batch_size):
    """Wrap a NumPy array in a ``DataLoader`` of int64 tensors.

    Args:
        array: NumPy array accepted by ``torch.from_numpy``; it is cast to
            64-bit integers.
        batch_size: Rows per batch. The default is whatever the global
            ``batch_size`` held when this ``def`` was executed.

    Returns:
        A ``DataLoader`` over a single-tensor ``TensorDataset``; no shuffling
        is requested and the final, possibly smaller, batch is kept
        (``drop_last=False``). Each yielded item is a one-element list
        holding the batch tensor.
    """
    index_tensor = torch.from_numpy(array).long()
    return DataLoader(TensorDataset(index_tensor), batch_size=batch_size, drop_last=False)

# Wrap both padded arrays in sequential loaders: batch k of x1 is paired with
# batch k of x2 by the zip() below.
x1 = create_data_loader(x1)
x2 = create_data_loader(x2)

# Inference only: put layers such as dropout / batch-norm in evaluation mode.
model.eval()

pred_batches = []
for a, b in tqdm(zip(x1, x2)):
    # Send data to graphic card - Cuda0
    a, b = a[0].to(device), b[0].to(device)
    with torch.no_grad():
        pred = model(a, b)
        pred = pred.cpu()
    # reshape(-1) instead of squeeze(): a batch holding a single pair must stay
    # 1-D, otherwise np.concatenate rejects the resulting 0-d array.
    pred_batches.append(pred.detach().numpy().reshape(-1))

# The empty float64 seed keeps the result well defined (and float64) even when
# the loaders yield no batch at all.
pred_list = np.concatenate([np.array([])] + pred_batches)

# Every score is classified as 1 unless it is <= 0.
y_pred = np.where(pred_list <= 0, 0.0, 1.0)

# Ground truth is positional over the WHOLE test set: the first `test2_n`
# pairs are labelled 0 and every later pair 1. Building it per batch (as this
# cell used to) is only correct when the entire set fits into one batch.
y_true = np.zeros(len(pred_list))
y_true[test2_n:] = 1

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4),
    round(f1_score(y_true, y_pred), 4)), end='')


HBox(children=(IntProgress(value=0, description='Padding', max=2000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=2000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.758		F1-score:	0.8052		

## Test case for stacked embedding

In [None]:
# One pass of triplet training on stacked document embeddings.
# NOTE(review): `batch`, `best_lost` and `epochs` are assigned here but never
# read in this cell; they are kept in case another cell relies on them.
batch = torch.zeros([0, emb_dim])
batch_size = 500
best_lost = None
epochs = 1

model.train()

loss_list = []
acc_list = []
start_time = time.time()
avg_loss = 0
avg_acc = 0
count = 0  # number of batches that were actually trained


def _iter_triplet_batches(size):
    """Yield (anchor, positive, negative) CPU tensors of up to `size` triplets.

    Rows of `df_triplet_orders` provide the 'anchor', 'pos' and 'neg' labels
    that are looked up in `df`; each looked-up entry is embedded with
    `stack_embedding`. Embeddings are gathered in lists and concatenated once
    per batch. The trailing batch may be smaller than `size` and is still
    yielded, so no triplet is silently dropped.
    """
    anc_rows, pos_rows, neg_rows = [], [], []
    for _, row in df_triplet_orders.iterrows():
        anc_loc, pos_loc, neg_loc = row[['anchor', 'pos', 'neg']]

        # NOTE(review): `[0]` takes the first element of the 'content' cell -
        # confirm the cell holds a sequence of documents and not a plain string.
        anc = stack_embedding(df.loc[anc_loc, 'content'][0], document_embeddings)
        pos = stack_embedding(df.loc[pos_loc, 'content'][0], document_embeddings)
        neg = stack_embedding(df.loc[neg_loc, 'content'][0], document_embeddings)

        anc_rows.append(anc.cpu())
        pos_rows.append(pos.cpu())
        neg_rows.append(neg.cpu())

        if len(anc_rows) == size:
            yield torch.cat(anc_rows, 0), torch.cat(pos_rows, 0), torch.cat(neg_rows, 0)
            anc_rows, pos_rows, neg_rows = [], [], []

    if anc_rows:
        yield torch.cat(anc_rows, 0), torch.cat(pos_rows, 0), torch.cat(neg_rows, 0)


for anc_x, pos_x, neg_x in _iter_triplet_batches(batch_size):
    # Training model per batch
    anc_x, pos_x, neg_x = anc_x.to(device), pos_x.to(device), neg_x.to(device)
    pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

    loss = (pos_pred + neg_pred).mean()
    # An output that is exactly 0 is counted as a correct prediction.
    corrects = torch.sum(pos_pred == 0) + torch.sum(neg_pred == 0)
    accuracy = float(corrects) / (2 * len(anc_x))
    avg_acc += accuracy
    avg_loss += float(loss)
    # Gradient
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    torch.cuda.empty_cache()  # Empty cuda cache
    print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(count, round(float(loss), 4),
                                                                  round(accuracy, 4)), end='')
    count += 1

# Average loss and accuracy over the batches that were really trained
# (guarded so an empty triplet table does not divide by zero).
trained_batches = max(count, 1)
avg_acc = avg_acc / trained_batches
avg_loss = avg_loss / trained_batches
loss_list.append(avg_loss)
acc_list.append(avg_acc)
print('\rAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(round(avg_loss, 4),
                                                                        round(avg_acc, 4)))

print("--- %s seconds ---"%(time.time() - start_time))


# Test with another dataset

In [20]:
# Build the evaluation frame from the first 132 rows of fd_content.csv:
#   * missing values become empty strings,
#   * `content` is the lower-cased name and address joined by a space, with
#     newlines and commas turned into spaces and runs of spaces collapsed,
#   * only `content` and the 'Unnamed: 0' column (renamed to `cid`) are kept.
fd_df = (
    pd.read_csv(path + 'fd_content.csv')
    .iloc[:132, :]
    .fillna('')
    .assign(content=lambda frame: (
        (frame['name'].str.lower() + ' ' + frame['address'].str.lower())
        .str.replace('\n', ' ')
        .str.replace(',', ' ')
        .str.replace(r'[ ]+', ' ', regex=True)
    ))
    .loc[:, ['content', 'Unnamed: 0']]
    .rename(columns={'Unnamed: 0': 'cid'})
)

# Padding fd_df
fd_arr = load_padded_data(pd.DataFrame(fd_df), word_to_index, dump_path=None,
                          pad_size=pad_size, retrain=True)

# Enumerate every unordered pair of rows: the first member goes to the anchor
# list, the second to the object list, and the label is 1 when both rows carry
# the same `cid` (column 1 of fd_df), else 0.
fd_anchor_arr, fd_object_arr, y = [], [], []
for i, j in tqdm(list(itertools.combinations(range(len(fd_arr)), 2))):
    fd_anchor_arr.append(fd_arr[i])
    fd_object_arr.append(fd_arr[j])
    y.append(int(fd_df.iloc[i, 1] == fd_df.iloc[j, 1]))

fd_anchor_arr = np.array(fd_anchor_arr)
fd_object_arr = np.array(fd_object_arr)
y = np.array(y)

HBox(children=(IntProgress(value=0, description='Padding', max=132, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, max=8646), HTML(value='')))




In [21]:
import torch.utils.data as data_utils

# Three sequential loaders with the same batch size: batch k of the anchors,
# the objects and the labels all refer to the same pairs.
fd_anchor = data_utils.TensorDataset(torch.from_numpy(fd_anchor_arr).type(torch.LongTensor))
fd_anchor = data_utils.DataLoader(fd_anchor, batch_size=batch_size, drop_last=False)

fd_object = data_utils.TensorDataset(torch.from_numpy(fd_object_arr).type(torch.LongTensor))
fd_object = data_utils.DataLoader(fd_object, batch_size=batch_size, drop_last=False)

# The label loader now wraps the TensorDataset built on the previous line
# (it used to wrap the raw `y` array, leaving that dataset unused).
y_true = data_utils.TensorDataset(torch.from_numpy(y).type(torch.DoubleTensor))
y_true = data_utils.DataLoader(y_true, batch_size=batch_size, drop_last=False)

# Inference only: put layers such as dropout / batch-norm in evaluation mode.
model.eval()

fd_pred_batches = []
fd_true_batches = []
for anc, obj, y_t in tqdm(zip(fd_anchor, fd_object, y_true)):
    # Predict for each batch; every loader item is a one-element list.
    anc, obj, y_t = anc[0].to(device), obj[0].to(device), y_t[0]

    with torch.no_grad():
        pred = model(anc, obj)
        pred = pred.cpu()

    # reshape(-1) instead of squeeze(): a batch holding a single pair must stay
    # 1-D, otherwise np.concatenate rejects the resulting 0-d array.
    fd_pred_batches.append(pred.detach().numpy().reshape(-1))
    fd_true_batches.append(y_t.numpy())

# The empty float64 seed keeps the results well defined (and float64) even
# when there are no pairs at all.
fd_pred_list = np.concatenate([np.array([])] + fd_pred_batches)
fd_y_true = np.concatenate([np.array([])] + fd_true_batches)

# Every score is classified as 1 unless it is <= 0.
fd_y_pred = np.where(fd_pred_list <= 0, 0.0, 1.0)

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(fd_y_true, fd_y_pred), 4),
    round(f1_score(fd_y_true, fd_y_pred), 4)), end='')


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.2451		F1-score:	0.0015		

In [32]:
print(pred.view(-1)[:100])

tensor([-0.9580,  0.8658,  0.7956, -0.7032, -0.7825, -0.9492,  0.9857,  0.2551,
        -0.8189, -0.9438, -0.9957, -0.2437,  0.7037, -0.8359, -0.9853, -0.0531,
        -0.8023, -0.9999, -0.4756, -0.9884, -0.9199,  0.3313,  0.8672, -0.6696,
        -0.9626,  0.9931, -0.9924, -0.9847,  0.8892,  0.9809, -0.9619, -0.3206,
         0.9818, -1.0000, -0.9550,  0.8916, -0.9987, -0.6110, -0.2472, -0.9990,
        -0.9927,  0.9552, -0.1526, -1.0000,  0.3961, -0.9918,  0.9762,  0.9557,
        -0.9377,  0.8395,  0.9892, -0.9999, -0.7036,  0.5103, -0.9135, -0.6444,
        -0.9998, -0.9998,  0.5157, -0.5695, -0.8669, -0.7765, -0.4961, -0.9981,
         0.3463, -0.6963, -0.9613, -0.9985, -0.2059, -1.0000,  0.5956, -0.9695,
        -0.9947, -0.9957, -0.2103, -0.9456, -1.0000,  0.9994, -0.8633,  0.9705,
         0.9993, -0.9973,  0.9895, -0.5695, -0.9974,  0.9278, -0.9167, -0.9941,
        -0.9915, -0.9871, -0.9487,  0.4247,  0.9987, -0.9971, -0.9999, -0.9993,
        -0.9845, -0.3585, -0.9939,  0.37

In [46]:
fd_y_true[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [42]:
fd_anchor_arr[fd_y_true != fd_y_pred][:5]

array([[   0,    0,    0,    0, 1024,    0,    0, 1568,    0, 1494,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 1024,    0,    0, 1568,    0, 1494,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 1024,    0,    0, 1568,    0, 1494,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 1024,    0,    0, 1568,    0, 1494,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 1024,    0,    0, 1568,    0, 1494,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [43]:
fd_object_arr[fd_y_true != fd_y_pred][:5]

array([[   0,    0, 1416,    0,  630,    0,    0,    0,    0, 1416,    0,
         630,    0,    0,    0, 1774,    0,    0,    0,    0],
       [   0,  502,    0,    0, 1494,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  984,    0,    0,    0,  980,    0,  493,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,  911,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 465, 1251,    0,    0, 1621,    0, 1251,    0,  985,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)