In [1]:
import sys
sys.path.append('/source/main')

In [2]:
import os
import logging
from datetime import datetime
import time
from itertools import chain

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from naruto_skills.new_voc import Voc
from torch.utils.data import DataLoader, Subset, Dataset
import matplotlib.pyplot as plt

from model_def.simple_cnn import SimpleCNN
from model_def.siamese_model_ import SiameseModel
from model_def.siamese_core import SiameseModelCore
from data_for_train.pool import PoolDocs
from utils import pytorch_utils
from preprocess import preprocessor
from data_for_train.index_dataset import IndexDataset
from data_for_train.positive_dataset import PositiveDataset
from naruto_skills.training_checker import TrainingChecker

In [3]:
logging.basicConfig(level=logging.INFO)
# Show full column contents. `None` means "no limit"; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected with a ValueError by pandas >= 2.0.
pd.set_option('display.max_colwidth', None)

In [4]:
def docs2input_tensors(docs, device, max_len=100):
    """Turn raw documents into a batch of vocabulary-index tensors on `device`.

    Each document is run through `preprocessor.infer_preprocess`, truncated to
    at most `max_len` whitespace-separated tokens, and then converted by the
    module-level `voc` via `docs2idx(..., equal_length=max_len)`.

    Args:
        docs: iterable of raw document strings.
        device: torch device the resulting tensor is moved to.
        max_len: token limit per document, also passed to `voc.docs2idx` as
            `equal_length`. Defaults to 100, the value that used to be
            hard-coded in this function.

    Returns:
        A torch tensor built from the index array, placed on `device`.
        NOTE(review): presumably of shape (len(docs), max_len) — confirm
        against `Voc.docs2idx`.
    """
    truncated_docs = [
        ' '.join(preprocessor.infer_preprocess(doc).split()[:max_len])
        for doc in docs
    ]
    word_input = voc.docs2idx(truncated_docs, equal_length=max_len)
    return torch.from_numpy(np.array(word_input)).to(device)

def predict_batch(docs):
    """Run the global `model` on one batch of raw documents, gradients disabled.

    The documents are converted with `docs2input_tensors` on the global
    `device`; the model output is moved to the CPU and column 1 of it is
    returned as a NumPy array.
    NOTE(review): column 1 is presumably the positive-class score — confirm
    against the model definition. A later cell redefines `predict_batch`.
    """
    with torch.no_grad():
        scores = model(docs2input_tensors(docs, device)).cpu().numpy()
    return scores[:, 1]

def predict_docs(docs, batch_size):
    """Apply `predict_batch` to `docs` in consecutive slices of `batch_size`.

    `predict_batch` is looked up at call time, so whichever definition is
    current in the notebook is the one that gets used.

    Returns a flat list holding the concatenated per-batch results.
    """
    batch_results = []
    for start in tqdm(range(0, len(docs), batch_size)):
        batch_results.append(predict_batch(docs[start: start + batch_size]))
    return list(chain.from_iterable(batch_results))

# 1. Data loading

## 1.1 Pool

In [5]:
def collate_fn(list_data):
    """
    Collate a list of samples into one tensor per column.

    Each sample is a tuple of array-likes; the i-th entries of all samples are
    stacked along a new leading axis, giving
    [tensor(batch_size, *col1_shape), tensor(batch_size, *col2_shape), ...].
    """
    batched = []
    for column in zip(*list_data):
        batched.append(torch.from_numpy(np.stack(column, axis=0)))
    return batched
# Vocabulary object used for every text <-> index conversion in this notebook.
voc = Voc.load('/source/main/vocab/output/voc.pkl')
# Value passed as `equal_length` when the IndexDatasets are built.
MAX_LENGTH = 100
# Batch size of the training DataLoader.
BATCH_SIZE = 256
# Experiment id; used for the checkpoint directory and the exported sample CSV.
EXP_ID = '22a'

In [6]:
# df_neg = pd.read_csv('/source/main/data_for_train/output/huge_pool/wiki.csv', nrows=1e6, usecols=['target'])
# df_neg.rename(columns={'target': 'mention'}, inplace=True)
# df_neg.dropna(inplace=True, subset=['mention'])
# df_neg.drop_duplicates(inplace=True, subset=['mention'])
# df_neg = df_neg.iloc[:794323, ]

In [7]:
# print(df_neg.shape)
# neg = IndexDataset(voc, list(df_neg['mention']), equal_length=MAX_LENGTH)
# neg = PoolDocs(neg)

In [8]:
# Read at most 1e6 pool rows, keep unique non-null mentions, cap at 794323 rows.
df_pool = (
    pd.read_csv('/source/main/data_for_train/output/train/pool.csv', nrows=1e6)
    .dropna(subset=['mention'])
    .drop_duplicates(subset=['mention'])
    .iloc[:794323, :]
)

In [9]:
print(df_pool.shape)
# Wrap the pool mentions in an IndexDataset, passing MAX_LENGTH as `equal_length`.
pool = IndexDataset(voc, list(df_pool['mention']), equal_length=MAX_LENGTH)
# pool = PoolDocs(pool)

(794323, 2)


In [10]:
len(pool)

794323

## 1.2 Positive data

In [11]:
# Name of the positive class; it selects the CSV file read here and in later cells.
POSITIVE_NAME = 'positive_class_9'
df_pos = pd.read_csv('/source/main/data_for_train/output/train/%s.csv' % POSITIVE_NAME)
# Keep only rows that have a mention, one row per distinct mention text.
df_pos.dropna(inplace=True, subset=['mention'])
df_pos.drop_duplicates(inplace=True, subset=['mention'])

print(df_pos.shape)
# Wrap the positive mentions in an IndexDataset, passing MAX_LENGTH as `equal_length`.
pos = IndexDataset(voc, list(df_pos['mention']), equal_length=MAX_LENGTH)
# pos = PositiveDataset(pos)

(33539, 15)


In [12]:
# pool_data, _ = zip(*pool)
# pool_data = [str(item) for item in pool_data]
# pos_data, _ = zip(*pos)
# pos_data = [str(item) for item in pos_data]
# len(set(pool_data).intersection(set(pos_data)))

In [13]:
# Same positive file as df_pos, cleaned the same way, then row-shuffled with a
# fixed seed so the anchor order differs from the positive order.
df_anchor = (
    pd.read_csv('/source/main/data_for_train/output/train/%s.csv' % POSITIVE_NAME)
    .dropna(subset=['mention'])
    .drop_duplicates(subset=['mention'])
    .pipe(lambda frame: frame.sample(frame.shape[0], random_state=43))
)
print(df_anchor.shape)
anchor = IndexDataset(voc, list(df_anchor['mention']), equal_length=MAX_LENGTH)
# anchor = PositiveDataset(anchor)

(33539, 15)


In [14]:
class TripletDataset(Dataset):
    """Dataset of (anchor, positive, pool) triplets, one per pool item.

    The length equals the pool's length. Anchor and positive items are picked
    with the pool index wrapped modulo `len(pos)`, so they are reused
    cyclically when the pool is larger than the positive set.
    """

    def __init__(self, anchor, pos, pool):
        super().__init__()
        self.anchor, self.pos, self.pool = anchor, pos, pool
        # Cached because it is needed on every __getitem__ call.
        self.len_pos = len(pos)

    def __len__(self):
        return len(self.pool)

    def __getitem__(self, idx):
        wrapped = idx % self.len_pos
        return self.anchor[wrapped], self.pos[wrapped], self.pool[idx]


In [15]:
ds = TripletDataset(anchor, pos, pool)

In [16]:
# Shuffled batches of BATCH_SIZE triplets, loaded in the main process
# (num_workers=0) and assembled by the notebook's own collate_fn.
data_loader = DataLoader(dataset=ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)

In [17]:
data = next(iter(data_loader))

In [18]:
voc.idx2docs(data[0].cpu().numpy()[:3, :])

['bé mình uống nan __d__ năm rồi , thấy khỏe mạnh và phát triển tốt lắm . mình hoàn toàn yên tâm về nan',
 'sữa friso mát lắm , bé uống thích lắm',
 'trần hồng quí mom cứ chịu khó nấu , con ăn đc bao nhiêu thì ăn thôi mom , ko ép , để cho chơi nhiều và giảm cử ăn vặt là mấy ảnh sẽ chịu ăn nhiều hơn thôi . bé nhà mình có dùng thêm pediasure , uống ngày __d__ cử thấy con ăn uống ngon miệng hơn nhiều ấy mom .']

In [19]:
voc.idx2docs(data[1].cpu().numpy()[:3, :])

['trộm vía con tiêu hóa tốt nên thích lắm vì uống sữa nan đó chị',
 'bé em cũng dùng friso ne , vậy mà giờ mới biết đến cuộc thi',
 'optimum mình dùng mấy hộp ko tăng lạng nào luôn']

In [20]:
voc.idx2docs(data[2].cpu().numpy()[:3, :])

['loa loa __o__ ... aaaaaa ngon tuyệt vời mn ơiii 😭😭😭 sấu ngâm mắm gừng ớt 😻😻😻 bác e làm đảm bảo sạch sẽ ngon tuyệt vời luôn ấy huhu nước mắm nam ngư đun lên , sấu cũng trần qua nước sôi để k bị váng và để được __o__ sạch sẽ vừa ngon luôn __o__ hộp đầy như kia __d__k nha mn 😋😋😋 k ăn thì sẽ tiếc lắm ấy ... ăn ngấm vào tận trong hột sấu luôn = ) ) ) ) bác nào ăn báo e luôn nàoooo __o__ sđt __d__ hoặc __d__',
 'lê hà trang tiếc cái t k dùng kotex',
 'tại sao lại như vậy ?']

In [21]:
# Build the siamese core from the vocabulary's embedding weights, then wrap it
# in SiameseModel, the object the training and distance calls below go through.
core_model = SiameseModelCore(voc.get_embedding_weights())
model = SiameseModel(core_model)

In [23]:
# Put the model in training mode.
model.train()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [24]:
# PRE_TRAINED_MODEL='/source/main/train/output/saved_models/12bb/None.pt'
# checkpoint = torch.load(PRE_TRAINED_MODEL, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])

In [25]:
pytorch_utils.count_parameters(model)

13886910

In [26]:
def train_step(inputs):
    """Feed one collated (anchor, positive, pool) batch to the global `model`.

    Switches the model to training mode, forwards the first three entries of
    `inputs` to `model.train_batch`, and returns whatever that call returns.
    """
    model.train()
    anchor_batch, pos_batch, pool_batch = inputs[0], inputs[1], inputs[2]
    return model.train_batch(anchor_batch, pos_batch, pool_batch)

In [27]:
len(data_loader)

3103

In [28]:
# model.eval()
# # import pdb; pdb.set_trace()
# predict_docs(['giá bao tiền', 'ee', 'Giảm giá sốc'], batch_size=2)

In [29]:
# Single-sentence probes for a quick distance sanity check before training.
# NOTE(review): `anchor` and `pos` rebind the names of the IndexDatasets built
# earlier. `ds` keeps its own references, but re-running the TripletDataset
# cell after this one would pass tensors instead of datasets.
anchor = docs2input_tensors(['giá bao tiền'], device)
pos = docs2input_tensors(['bao nhiêu thế ?'], device)
neg = docs2input_tensors(['hôm nay tôi đi học'], device)

In [30]:
print('dis_1', model.get_distance(anchor, pos))
print('dis_2', model.get_distance(anchor, neg))

dis_1 tensor([0.0874], device='cuda:0', grad_fn=<NormBackward1>)
dis_2 tensor([0.2014], device='cuda:0', grad_fn=<NormBackward1>)


# 2. Training

## 2.1 Epoch 1

In [None]:
# Train for one pass over the triplet loader, logging the loss every 100 steps.
model.build_stuff_for_training(device)
for epoch_idx in range(1):
    start = time.time()
    # Wrap the loader itself (not the enumerate generator) so tqdm can read
    # len(data_loader) and show a real progress bar with total and ETA.
    for idx, inputs in enumerate(tqdm(data_loader)):
        inputs = [tensor.to(device) for tensor in inputs]
        step_loss = train_step(inputs)
        if idx % 100 == 0:
            logging.info('\t Step: %s Loss: %.5f ', idx, step_loss)
    duration = time.time() - start
    logging.info('Epoch %s took %.2f s', epoch_idx, duration)

0it [00:00, ?it/s]INFO:root:	 Step: 0 Loss: 10.00150 
100it [01:12,  1.39it/s]INFO:root:	 Step: 100 Loss: 0.72637 
200it [02:24,  1.39it/s]INFO:root:	 Step: 200 Loss: 1.23698 
300it [03:36,  1.39it/s]INFO:root:	 Step: 300 Loss: 0.51996 
400it [04:48,  1.39it/s]INFO:root:	 Step: 400 Loss: 0.27985 
500it [06:00,  1.39it/s]INFO:root:	 Step: 500 Loss: 0.49068 
600it [07:12,  1.39it/s]INFO:root:	 Step: 600 Loss: 0.60918 
700it [08:24,  1.39it/s]INFO:root:	 Step: 700 Loss: 0.17859 
800it [09:36,  1.39it/s]INFO:root:	 Step: 800 Loss: 0.13675 
900it [10:48,  1.38it/s]INFO:root:	 Step: 900 Loss: 0.14004 
1000it [12:03,  1.39it/s]INFO:root:	 Step: 1000 Loss: 0.26835 
1100it [13:15,  1.39it/s]INFO:root:	 Step: 1100 Loss: 0.38971 
1200it [14:28,  1.38it/s]INFO:root:	 Step: 1200 Loss: 0.37492 
1300it [15:40,  1.38it/s]INFO:root:	 Step: 1300 Loss: 0.22061 
1400it [16:52,  1.39it/s]INFO:root:	 Step: 1400 Loss: 0.40308 
1500it [18:19,  1.10s/it]INFO:root:	 Step: 1500 Loss: 0.08176 
1600it [20:08,  1.0

In [None]:
# for epoch_idx in range(10):
#     start = time.time()
#     model.train()
#     for idx, inputs in tqdm(enumerate(data_loader)):    
#         inputs = [i.to(device) for i in inputs]
#         l = train_step(inputs)
#     duration = time.time() - start
#     logging.info('Epoch %s took %.2f s', epoch_idx, duration)
    
#     model.eval()    
#     df_pos['pred'] = predict_docs(df_pos['mention'], batch_size=256)
#     df_pool['pred'] = predict_docs(df_pool['mention'], batch_size=256)
    
#     logging.info('Recall: %s/%s=%.4f', (df_pos['pred']>=0.5).sum(), df_pos.shape[0], 
#                  (df_pos['pred']>=0.5).sum()/df_pos.shape[0])
#     logging.info('Ratio on pool: %s/%s=%.4f', (df_pool['pred']>=0.5).sum(), df_pool.shape[0], 
#                  (df_pool['pred']>=0.5).sum()/df_pool.shape[0])
    

In [None]:
# fig = plt.figure(figsize=(10, 5))

# ax = fig.add_subplot(1, 2, 1)
# df_pos.loc[:500, 'pred'].hist(bins=100, ax=ax)
# ax.set_title('Spy')
# ax.set_xlim(0, 0.9)
# ax.set_ylim(0, 100)

# ax = fig.add_subplot(1, 2, 2)
# df_pos.loc[500:, 'pred'].hist(bins=100, ax=ax)
# ax.set_title('Positive')
# ax.set_xlim(0, 0.9)
# ax.set_ylim(0, 100)

# plt.show()


In [None]:
# df_pool_social = pd.read_csv('/source/main/data_for_train/output/train/pool.csv')
# df_pool_social.rename(columns={'target': 'mention'}, inplace=True)
# df_pool_social.dropna(inplace=True, subset=['mention'])
# df_pool_social.drop_duplicates(inplace=True, subset=['mention'])
# df_pool_social = df_pool_social.iloc[:794323, ]

In [None]:
EXP_ID

In [None]:
# Create a checker rooted at the directory for this experiment id and ask it
# to save the current model.
# NOTE(review): init_score=-10000 is presumably a "worse than any real score"
# starting value — confirm against TrainingChecker.
training_checker = TrainingChecker(model, root_dir='/source/main/train/output/saved_models/%s/' % EXP_ID,
                                   init_score=-10000)
training_checker.save_model()

# Analyse

In [None]:
model.eval()

In [None]:
anchor = docs2input_tensors([df_pos['mention'].iloc[20]], device)
pos = docs2input_tensors([df_pos['mention'].iloc[40]], device)
neg = docs2input_tensors(['hôm nay tôi đi học'], device)
print('dis_1', model.get_distance(anchor, pos))
print('dis_2', model.get_distance(anchor, neg))

In [None]:
df_pos['mention'].sample(3)

In [None]:
# Reference positives that every document is compared against. Seeded (same
# seed as the anchor shuffle) so THRESHOLD and all downstream metrics are
# reproducible across kernel restarts.
POS = list(df_pos['mention'].sample(5, random_state=43))

def get_distance(docs_1, docs_2):
    """Return the model distance between two document lists as a NumPy array.

    Both lists are converted with `docs2input_tensors` on the global `device`
    and handed to `model.get_distance` with gradient tracking disabled.
    """
    with torch.no_grad():
        left, right = (docs2input_tensors(side, device) for side in (docs_1, docs_2))
        distances = model.get_distance(left, right)
        return distances.cpu().numpy()

def predict_batch(docs):
    """Score each document by its mean model distance to the references in `POS`.

    NOTE: this replaces the `predict_batch` defined earlier in the notebook.
    `predict_docs` resolves the name at call time, so from this cell on it
    returns distances instead of column 1 of the model output.

    Args:
        docs: sequence of raw document strings (list, tuple, pandas Series, ...).

    Returns:
        1-D NumPy array of length len(docs); entry j is the mean, over all
        documents in `POS`, of the distance between that reference and docs[j].
    """
    # Materialise first: `*` repeats a list, but on a pandas Series or NumPy
    # array it would be applied element-wise instead of repeating the batch.
    docs = list(docs)
    len_docs = len(docs)
    # Layout: POS[0] repeated len_docs times, then POS[1], ... paired with the
    # whole batch repeated len(POS) times, so the flat distance vector
    # reshapes to (len(POS), len_docs).
    pos_docs = [pos_doc for pos_doc in POS for _ in range(len_docs)]
    repeated_docs = docs * len(POS)

    dis = np.asarray(get_distance(pos_docs, repeated_docs))
    dis = dis.reshape((len(POS), len_docs))
    return dis.mean(axis=0)

# def get_distance_to_pos(docs, batch_size):
#     return list(chain(*[predict_batch(docs[i: i+batch_size]) for i in tqdm(range(0, len(docs), batch_size))]))


In [None]:
model.eval()
predict_docs(['giá bao tiền', 'ee', 'Giảm giá sốc'], batch_size=1)

In [None]:
import ast

import pandas as pd
# `None` = unlimited width; `-1` is rejected by pandas >= 2.0.
pd.set_option('display.max_colwidth', None)
from sklearn import metrics

from data_for_train.index_dataset import IndexDataset
from data_for_train.positive_dataset import PositiveDataset
from data_for_train import pool
from naruto_skills.new_voc import Voc

## Recall

### Eval

In [None]:
df_pos['pred'] = predict_docs(list(df_pos['mention']), batch_size=256)

In [None]:
df_pos['pred'].describe()

In [None]:
# Decision threshold: the mean `pred` score of the training positives. The
# cells below count a document as predicted-positive when pred <= THRESHOLD.
THRESHOLD = df_pos['pred'].mean()

In [None]:
THRESHOLD

In [None]:
df_pos_eval = pd.read_csv('/source/main/data_for_train/output/eval/%s.csv' % POSITIVE_NAME)
df_pos_eval = df_pos_eval.drop_duplicates(subset=['mention'])

In [None]:
df_pos_eval['mention'].shape

In [None]:
df_pos_eval.dropna(subset=['mention'], inplace=True)

In [None]:
df_pos_eval.shape

In [None]:
df_pos_eval['pred'] = predict_docs(list(df_pos_eval['mention']), batch_size=256)
# print(sum(df_pos_eval['pred']>=0.5)/df_pos_eval.shape[0])
# print(df_pos_eval.shape)

In [None]:
(df_pos_eval['pred']<=THRESHOLD).sum()/df_pos_eval.shape[0]

### Test

In [None]:
df_pos_test = pd.read_csv('/source/main/data_for_train/output/test/%s.csv' % POSITIVE_NAME)
# Clean like the train and eval positives: unique, non-null mentions only.
# drop_duplicates alone can still leave one NaN mention, which would then be
# passed to the preprocessor as a float by predict_docs.
df_pos_test = df_pos_test.drop_duplicates(subset=['mention']).dropna(subset=['mention'])

In [None]:
df_pos_test['pred'] = predict_docs(list(df_pos_test['mention']), batch_size=256)


In [None]:
print(sum(df_pos_test['pred']<=THRESHOLD)/df_pos_test.shape[0])
print(df_pos_test.shape)

## Score: pr/P(y=1)

### Eval

In [None]:
df_pool_eval = pd.read_csv('/source/main/data_for_train/output/eval/pool.csv')

In [None]:
df_pool_eval['pred'] = predict_docs(list(df_pool_eval['mention']), batch_size=256)

# print(sum(df_pool_eval['pred']>=0.5)/df_pool_eval.shape[0])
# print(df_pool_eval.shape)

In [None]:
sum(df_pool_eval['pred']<=THRESHOLD)/df_pool_eval.shape[0]

In [None]:
# df_pool_eval['pred2'] = predict_docs(list(df_pool_eval['mention']), batch_size=64)
# sum(df_pool_eval['pred2'] <= 5.871464)/df_pool_eval.shape[0]

In [None]:
# sum(df_pool_eval['pred2'] <= 5.871464)/df_pool_eval.shape[0]

In [None]:
df_pool_eval[df_pool_eval['pred']<=THRESHOLD].sample(100)

### Test

In [None]:
df_pool_test = pd.read_csv('/source/main/data_for_train/output/test/pool.csv')

In [None]:
df_pool_test['pred'] = predict_docs(list(df_pool_test['mention']), batch_size=256)

In [None]:
print(sum(df_pool_test['pred']<=THRESHOLD)/df_pool_test.shape[0])
print(df_pool_test.shape)

In [None]:
EXP_ID

In [None]:
# Boolean mask of pool-test rows predicted positive (score at or below THRESHOLD).
is_pred_pos = df_pool_test['pred'] <= THRESHOLD
# Vectorised count instead of a Python-level sum() over the Series.
total_pos_pred = int(is_pred_pos.sum())
# Export up to 100 random predicted positives, without the index column, for manual review.
df_pool_test[is_pred_pos].sample(min(100, total_pos_pred)).to_csv('%s.csv' % EXP_ID, index=False)

In [None]:
total_pos_pred

In [None]:
df_pos[['mention']].sample(10)

In [None]:
df_pool_test[df_pool_test['pred']<=THRESHOLD].sample(10)