# Load library and dataset

In [1]:
import warnings

warnings.filterwarnings("ignore")
import torch
import numpy as np
import pandas as pd
import os
import itertools
import time

from tqdm import tqdm_notebook as tqdm
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from model.model_new import TripletModel
from model.SelfAttentionModel import StructuredSelfAttention
from utils.data_loader_new import (
    load_data_set,
    load_word_to_index,
    load_triplet_orders,
    load_padded_data,
    load_triplet,
    generate_embedding
)
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score


In [2]:
full_generated_data_path = 'new_generated_labeled_data.csv'
# device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Load dataset
df = load_data_set(full_generated_data_path, retrain=False)
df.fillna("", inplace=True)
df = df[df['cid'] != 50]
df.reset_index(inplace=True)
print('Load dataset successfully!')
# get word to index and embedding whole dataset
word_to_index = load_word_to_index(df, retrain=False)
print('Load word to index successfully!')
embeddings = generate_embedding(word_to_index, embedding_dim=300)
print('Load pretrained embedding')
X, X_len = load_padded_data(df, word_to_index, retrain=False)
print('Load padded data successfully!')

def truncate_non_string(X, X_len):
    # Drop rows that have length of word vector = 0
    truncate_index = [i for i in range(0, len(X_len)) if X_len[i] <= 0]
    X, X_len = np.delete(X, truncate_index, axis=0), np.delete(X_len, truncate_index, axis=0)
    
    return X, X_len, truncate_index

X, X_len, truncate_index = truncate_non_string(X, X_len)
df.drop(index=truncate_index, inplace=True)
df.reset_index(inplace=True)


Load dataset successfully!
Load word to index successfully!
Load pretrained embedding
Load padded data successfully!


# Building Model

## Generate DataLoader and Triplet orders

In [3]:
batch_size = 10000
df_triplet_orders = load_triplet_orders(shuffle(df), retrain=False)['content']
print('Loading triplet order successfully!')
anc_loader, pos_loader, neg_loader = load_triplet(np.array(X), X_len, df_triplet_orders,
                                                  batch_size=batch_size,
                                                  retrain=False)
print('Load triplet data successfully!')


Loading triplet order successfully!
Load triplet data successfully!


In [4]:
def to_cuda(loader):
    return [load.to(device) for load in loader]

## TripletModel with embedding inside model

In [24]:
# Self-attention triplet model
# triplet_300d_20p_dynamic_embedding -- glove embedding
# triplet_300d_20p_own_embedding -- initiate embedding
# triplet_300d_20p_own_embedding_bi_gru -- gru
# triplet_300d_20p_own_embedding_bi -- lstm
# triplet_300d_20p_own_embedding_bi_lstm
triplet_model_path = '/data/dac/dedupe-project/new/model/triplet_300d_no_embedding_bi_gru'

lr = 0.015
margin = 0.2
# Load model & optimizer
model = TripletModel(embeddings=embeddings, margin=margin, cuda=device).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
model


TripletModel(
  (embeddings): Embedding(1335, 300)
  (gru): GRU(300, 120, batch_first=True, bidirectional=True)
  (linear_final): Linear(in_features=240, out_features=10, bias=True)
  (linear_distance): Linear(in_features=20, out_features=1, bias=True)
  (tanh): Tanh()
)

In [None]:
# Load model and optimizer
checkpoint = torch.load(triplet_model_path, map_location=device)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

model.eval()


In [25]:
# Train model
epochs = 1
best_lost = None
early_stopping_steps = 10

loss_list = []
average_list = []
model.train()

start_time = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    avg_loss = 0
    avg_acc = 0
    for batch, [anc_x, pos_x, neg_x] in enumerate(zip(anc_loader, pos_loader, neg_loader)):
        # Training model per batch
        # Send data to graphic card - Cuda0
        anc_x, pos_x, neg_x = to_cuda(anc_x), to_cuda(pos_x), to_cuda(neg_x)
        pos_pred, neg_pred = model(anc_x, pos_x, neg_x)

        loss = (pos_pred + neg_pred).mean()
        corrects = torch.sum(pos_pred == 0) + torch.sum(neg_pred == 0)
        accuracy = float(corrects) / (2 * len(anc_x[0]))
        avg_acc += accuracy
        avg_loss += float(loss)

        # Gradient
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()  # Empty cuda cache
        print('\rBatch:\t{}\t\tLoss:\t{}\t\tAccuracy:\t{}\t\t'.format(batch, round(float(loss), 4),
                                                                      round(accuracy, 4)), end='')
    # Average loss and accuracy
    avg_acc = avg_acc / len(anc_loader)
    avg_loss = avg_loss / len(anc_loader)
    loss_list.append(avg_loss)
    average_list.append(avg_acc)
    print('\rEpoch:\t{}\t\tAverage Loss:\t{}\t\tAvg Accuracy:\t{}\t\t'.format(epoch, round(avg_loss, 4),
                                                                            round(avg_acc, 4)))
    if avg_acc > 0.85:
        break
    if best_lost is None or best_lost > avg_loss:
        best_lost = avg_loss
        forward_index = 0
        
#         Save model
        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }, triplet_model_path)
    else:
        # Early stopping after reachs {early_stopping_steps} steps
        forward_index += 1
        if forward_index == early_stopping_steps:
            break
        
print("--- %s seconds ---"%(time.time() - start_time))


HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

Epoch:	0		Average Loss:	0.5083		Avg Accuracy:	0.7739		

--- 120.92139029502869 seconds ---


In [54]:
#Save model
torch.save({
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict()
}, triplet_model_path)

# Test

In [26]:
path = '/data/dac/dedupe-project/test/'
test_df = pd.read_excel(path + 'GT_added.xls')
test_df.fillna('', inplace=True)
test_df.reset_index(inplace=True)
test_df_1 = test_df.loc[:, ['address']]
test_df_1['content'] = test_df_1['address'].str.lower().str.replace('\n', ' ').str.replace(r'[ ]+', ' ', regex=True)
test_df_2 = test_df.loc[:, ['duplicated_address']]
test_df_2['content'] = test_df_2['duplicated_address'].str.lower().str.replace('\n', ' ').str.replace(r'[ ]+', ' ', regex=True)

In [27]:
def data_loader(test_df_1, test_df_2):
    # Make data loader
    X1, X1_lens = load_padded_data(pd.DataFrame(test_df_1), 
                               word_to_index, 
                               dump_path=None, 
                               retrain=True)

    X2, X2_lens = load_padded_data(pd.DataFrame(test_df_2),
                                   word_to_index, 
                                   dump_path=None, 
                                   retrain=True)

    # Drop rows that have length of word vector = 0
    truncate_index = [i for i in range(0, len(X1_lens)) if (X1_lens[i] <= 0 or X2_lens[i] <= 0)]
    X1, X1_lens = np.delete(X1, truncate_index, axis=0), np.delete(X1_lens, truncate_index, axis=0)
    X2, X2_lens = np.delete(X2, truncate_index, axis=0), np.delete(X2_lens, truncate_index, axis=0)

    def create_data_loader(X, batch_size=batch_size):
        X, X_lens = np.array(X[0]), np.array(X[1])

        # Create data loader
        data = TensorDataset(torch.from_numpy(X).type(torch.LongTensor),
                             torch.ByteTensor(X_lens))
        loader = DataLoader(data, batch_size=batch_size, drop_last=False)
        return loader

    return create_data_loader([X1, X1_lens]), create_data_loader([X2, X2_lens])


def create_test(n):
    # Generate small test based on ground truth
    test_df_1a = pd.DataFrame()
    test_df_1b = pd.DataFrame()

    for i1, i2 in shuffle(list(itertools.combinations(test_df_1.index, 2)))[:n]:
        test_df_1a = test_df_1a.append(test_df_1.iloc[i1, :])
        test_df_1b = test_df_1b.append(test_df_1.iloc[i2, :])

    test_df_1b = test_df_1b.append(test_df_1a)
    test_df_1a = test_df_1a.append(test_df_1a)

    test_df_1a.reset_index(inplace=True)
    test_df_1b.reset_index(inplace=True)
    
    return test_df_1a, test_df_1b

## True Test

In [28]:
X1, X2 = data_loader(test_df_1, test_df_2)

pred_list = np.array([])
y_true = np.array([])
y_pred = np.array([])
att1_list = []
att2_list = []
for a, b in tqdm(zip(X1, X2)):
    # Send data to graphic card - Cuda0
    a, b = to_cuda(a), to_cuda(b)
    with torch.no_grad():
        pred = model(a, b)
        pred = pred.cpu()
#         att1 = att1.cpu()
#         att2 = att2.cpu()
        y_true_curr = np.ones(len(pred))
        y_true = np.concatenate([y_true, y_true_curr])

        y_pred_curr = np.ones(len(pred))
        y_pred_curr[np.where(pred <= 0)[0]] = 0
        y_pred = np.concatenate([y_pred, y_pred_curr])
        
        pred_list = np.concatenate([pred_list, pred.squeeze().data.numpy()])
#         att1_list.append(att1)
#         att2_list.append(att2)

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4),
    round(f1_score(y_true, y_pred), 4)), end='')

HBox(children=(IntProgress(value=0, description='Padding', max=428, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='Padding', max=428, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.8067		F1-score:	0.893		

In [15]:
test_df_1.iloc[np.where(pred<=0)[0]]

Unnamed: 0,address,content
1,"12124 Rojas Dr, STE Suite 1200, El Paso, US 79936","12124 rojas dr, ste suite 1200, el paso, us 79936"
4,"SheYang Economic Development Zone, , Yancheng,...","sheyang economic development zone, , yancheng,..."
6,"458 2ND AVEN, TIFFIN, US 44883","458 2nd aven, tiffin, us 44883"
8,"Chuangye Road , Chengnan Development Area, Yua...","chuangye road , chengnan development area, yua..."
9,"56/1, Block # C, Section # 13, Mirpur, Dhaka-1...","56/1, block # c, section # 13, mirpur, dhaka-1..."
...,...,...
389,"Garden town, Buttar road, daska road, Sialkot.","garden town, buttar road, daska road, sialkot."
411,"PHUM PREY RORKA, KHUM CHORKCHENEANG, SORK ANGS...","phum prey rorka, khum chorkcheneang, sork angs..."
413,"79 North Qingyang Road, Changzhou City, Jiangs...","79 north qingyang road, changzhou city, jiangs..."
415,"#1 Fuyou rd. Guangling area, Yangzhou","#1 fuyou rd. guangling area, yangzhou"


In [13]:
test_df_2.iloc[np.where(pred<=0)[0]]

Unnamed: 0,duplicated_address,content
26,"181 Freeman Ave, Islip, US 11751","181 freeman ave, islip, us 11751"
54,"NO.175, XIANYOU, PUTIAN, CN","no.175, xianyou, putian, cn"
63,"No.76 Sunwu Road, Xukou Town, Wuzhong District...","no.76 sunwu road, xukou town, wuzhong district..."
73,"1 Tsvi Borenstein, PO Box 16, Yeruham, IL 80500","1 tsvi borenstein, po box 16, yeruham, il 80500"
74,"No.495, Quyuan North Road, Wukang Town, Deqing...","no.495, quyuan north road, wukang town, deqing..."
113,"1980 Industrial Drive, Sterling,USA","1980 industrial drive, sterling,usa"
156,Missing Address,missing address
195,"NO.7 QILIANG ROAD ,HUAFU INDUSTRIAL & TRADING ...","no.7 qiliang road ,huafu industrial & trading ..."
208,"No.15, Fengzu Road STE Paitou town, zhuji, CN ...","no.15, fengzu road ste paitou town, zhuji, cn ..."
217,"No.135 Zhaofeng Road, Leyu Town, Zhangjiagang,...","no.135 zhaofeng road, leyu town, zhangjiagang,..."


## Test 1

In [29]:
test1_n = 500
total_acc = 0
total_f1 = 0
for i in range (0, 10):
    test_df_1a, test_df_1b = create_test(test1_n)
    X1, X2 = data_loader(test_df_1a, test_df_1b)

    pred_list = np.array([])
    y_true = np.array([])
    y_pred = np.array([])
    # att1_list = []
    # att2_list = []
    for a, b in tqdm(zip(X1, X2)):
        # Send data to graphic card - Cuda0
        a, b = to_cuda(a), to_cuda(b)
        with torch.no_grad():
            pred = model(a, b)
            pred = pred.cpu()
    #         att1 = att1.cpu()
    #         att2 = att2.cpu()
            y_true_curr = np.zeros(len(pred))
            y_true_curr[test1_n:] = 1
            y_true = np.concatenate([y_true, y_true_curr])

            y_pred_curr = np.ones(len(pred))
            y_pred_curr[np.where(pred <= 0)[0]] = 0
            y_pred = np.concatenate([y_pred, y_pred_curr])

            pred_list = np.concatenate([pred_list, pred.squeeze().data.numpy()])
    #         att1_list.append(att1)
    #         att2_list.append(att2)
    total_acc += accuracy_score(y_true, y_pred)
    total_f1 += f1_score(y_true, y_pred)
    print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
        round(accuracy_score(y_true, y_pred), 4),
        round(f1_score(y_true, y_pred), 4)), end='')
    
print('\nAccuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
        round(total_acc/10, 4),
        round(total_f1/10, 4)))

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6653		F1-score:	0.7148		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6653		F1-score:	0.7165		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6639		F1-score:	0.7104		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6886		F1-score:	0.7351		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6735		F1-score:	0.7241		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6505		F1-score:	0.7043		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6578		F1-score:	0.7098		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6565		F1-score:	0.7131		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6595		F1-score:	0.7121		

HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=1000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.6643		F1-score:	0.7135		Accuracy:	0.6645		F1-score:	0.7154		

## Test 2

In [16]:
test2_n = 1000
test_df_1a, test_df_1b = create_test(test2_n)

In [17]:
X1, X2 = data_loader(test_df_1a, test_df_1b)

pred_list = np.array([])
y_true = np.array([])
y_pred = np.array([])
# att1_list = []
# att2_list = []
for a, b in tqdm(zip(X1, X2)):
    # Send data to graphic card - Cuda0
    a, b = to_cuda(a), to_cuda(b)
    with torch.no_grad():
        pred = model(a, b)
        pred = pred.cpu()
#         att1 = att1.cpu()
#         att2 = att2.cpu()
        y_true_curr = np.zeros(len(pred))
        y_true_curr[test2_n:] = 1
        y_true = np.concatenate([y_true, y_true_curr])

        y_pred_curr = np.ones(len(pred))
        y_pred_curr[np.where(pred <= 0)[0]] = 0
        y_pred = np.concatenate([y_pred, y_pred_curr])
        
        pred_list = np.concatenate([pred_list, pred.squeeze().data.numpy()])
#         att1_list.append(att1)
#         att2_list.append(att2)

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(y_true, y_pred), 4),
    round(f1_score(y_true, y_pred), 4)), end='')


HBox(children=(IntProgress(value=0, description='Padding', max=2000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Padding', max=2000, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.8848		F1-score:	0.8958		

# Test with another dataset

In [16]:
fd_df = pd.read_csv(path + 'fd_content.csv').iloc[:132, :]
fd_df.fillna('', inplace=True)

# test_df = shuffle(test_df)
fd_df['content'] = fd_df['address'].str.lower()
fd_df['content'] = fd_df['content'].str.replace('\n', ' ').str.replace(',' ,' ').str.replace(r'[ ]+', ' ', regex=True)

fd_df = fd_df.loc[:, ['Unnamed: 0', 'content']]
fd_df['cid'] = fd_df.loc[:, 'Unnamed: 0']
del(fd_df['Unnamed: 0'])

# Padding all data of fd
fd_arr, fd_lens = load_padded_data(pd.DataFrame(fd_df), word_to_index, dump_path=None, retrain=True)
fd_arr, fd_lens, _ = truncate_non_string(fd_arr, fd_lens)
# Split data set to anchor and object and y_true for testing model
fd_anchor_arr = []
fd_object_arr = []
fd_anchor_lens = []
fd_object_lens = []

y = []
for i, j in tqdm(list(itertools.combinations(range(0, len(fd_arr)), 2))):
    # Loop through combination of fd dataset
    fd_anchor_arr.append(fd_arr[i])
    fd_object_arr.append(fd_arr[j])
    
    fd_anchor_lens.append(fd_lens[i])
    fd_object_lens.append(fd_lens[j])
    # Get true label by cid (column 1)
    y.append(1 if fd_df.iloc[i, 1]==fd_df.iloc[j, 1] else 0)

    
# convert to numpy array
fd_anchor_lens, fd_object_lens = np.array(fd_anchor_lens), np.array(fd_object_lens)
y = np.array(y)

HBox(children=(IntProgress(value=0, description='Padding', max=132, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, max=8128), HTML(value='')))




In [17]:
def to_data_loader(data, batch_size=batch_size):
    temp = TensorDataset(torch.cat(fd_anchor_arr).view(8128, -1).type(torch.LongTensor), 
                                    torch.ByteTensor(data[1]))
    return DataLoader(temp, batch_size=batch_size, drop_last=False)

In [18]:
import torch.utils.data as data_utils

fd_anchor = to_data_loader([fd_anchor_arr, fd_anchor_lens], batch_size=batch_size)
fd_object = to_data_loader([fd_object_arr, fd_object_lens], batch_size=batch_size)
y_true = DataLoader(y, batch_size=batch_size, drop_last=False)

fd_pred_list = np.array([])
fd_y_pred = np.array([])
fd_y_true = np.array([])
for anc, obj, y_t in tqdm(zip(fd_anchor, fd_object, y_true)):
    # Predict for each batch
    anc, obj, y_t  = to_cuda(anc), to_cuda(obj), y_t
    
    with torch.no_grad():
        pred = model(anc, obj)
        pred = pred.cpu()

        y_pred_curr = np.ones(len(pred))
        y_pred_curr[np.where(pred <= 0)[0]] = 0
        fd_y_pred = np.concatenate([fd_y_pred, y_pred_curr])
        
        fd_pred_list = np.concatenate([fd_pred_list, pred.squeeze().data.numpy()])
        fd_y_true = np.concatenate([fd_y_true, y_t])

print('Accuracy:\t{}\t\tF1-score:\t{}\t\t'.format(
    round(accuracy_score(fd_y_true, fd_y_pred), 4),
    round(f1_score(fd_y_true, fd_y_pred), 4)), end='')


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Accuracy:	0.0416		F1-score:	0.001		

In [19]:
print(pred.view(-1)[:100])

tensor([0.8776, 0.8357, 0.7150, 0.7150, 0.7150, 0.8357, 0.7150, 0.1008, 0.4291,
        0.1008, 0.7722, 0.7150, 0.7150, 0.8862, 0.7150, 0.8386, 0.1008, 0.8357,
        0.1008, 0.1008, 0.4291, 0.8737, 0.7150, 0.4291, 0.7150, 0.4291, 0.8776,
        0.8357, 0.7150, 0.8357, 0.7150, 0.8357, 0.8357, 0.8862, 0.7150, 0.4291,
        0.4291, 0.7150, 0.8776, 0.8357, 0.8862, 0.8357, 0.8357, 0.8862, 0.7722,
        0.8776, 0.8776, 0.8357, 0.8862, 0.4291, 0.8776, 0.4291, 0.8357, 0.8357,
        0.7150, 0.7150, 0.4291, 0.7150, 0.8776, 0.8862, 0.8776, 0.8357, 0.7150,
        0.1008, 0.8357, 0.7150, 0.7150, 0.7150, 0.8357, 0.8357, 0.4291, 0.4291,
        0.8862, 0.8776, 0.7150, 0.4291, 0.8737, 0.8862, 0.4291, 0.7150, 0.4291,
        0.1008, 0.8776, 0.8862, 0.8357, 0.7150, 0.7150, 0.8357, 0.2334, 0.4291,
        0.8386, 0.7150, 0.8776, 0.8357, 0.8737, 0.8357, 0.4291, 0.8357, 0.8862,
        0.8776])


In [20]:
fd_y_true[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
np.array(fd_obj)[fd_y_true != fd_y_pred][:5]

NameError: name 'fd_obj' is not defined

In [122]:
np.array(fd_anc)[fd_y_true != fd_y_pred][:5]

array([[1185,  342,  365,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1185,  342,  365,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1185,  342,  365,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1185,  342,  365,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1185,  342,  365,    0,    0,    0,    0,    0,    0,    0,    0,
           0]])

In [123]:
fd_y_pred[fd_y_pred != fd_y_true]

array([1., 1., 1., ..., 1., 1., 1.])