In [1]:
import sys
sys.path.append('/source/main')

In [2]:
import os
import logging
from datetime import datetime
import time
from itertools import chain

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from naruto_skills.new_voc import Voc
from torch.utils.data import DataLoader, Subset, Dataset
import matplotlib.pyplot as plt

from model_def.siamese_model_8 import SiameseModel
from model_def.siamese_core import SiameseModelCore
from utils import pytorch_utils
from preprocess import preprocessor
from naruto_skills.training_checker import TrainingChecker

In [3]:
logging.basicConfig(level=logging.INFO)
pd.set_option('display.max_colwidth', -1)

In [4]:
def docs2input_tensors(docs, device):
    preprocessed_docs = [preprocessor.infer_preprocess(doc) for doc in docs]
    max_len = 100
    preprocessed_docs = [' '.join(doc.split()[:max_len]) for doc in preprocessed_docs]
    word_input = voc.docs2idx(preprocessed_docs, equal_length=max_len)
    inputs = np.array(word_input)
    input_tensors = torch.from_numpy(inputs)
    input_tensors = input_tensors.to(device)
    return input_tensors

def predict_docs(docs, batch_size):
    return list(chain(*[predict_batch(docs[i: i+batch_size]) for i in tqdm(range(0, len(docs), batch_size))]))

In [5]:
voc = Voc.load('/source/main/vocab/output/voc.pkl')
MAX_LENGTH = 100
BATCH_SIZE = 256

In [6]:
core_model = SiameseModelCore(voc.get_embedding_weights())
model = SiameseModel(core_model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu')
model = model.to(device)
model = model.eval()

In [7]:
PRE_TRAINED_MODEL='/source/main/train/output/saved_models/15.2/5800.pt'
checkpoint = torch.load(PRE_TRAINED_MODEL, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [8]:
def predict_batch(docs):
    return get_distance_anchors_(docs)

def get_distance_anchors_(docs):
    with torch.no_grad():
        docs = docs2input_tensors(docs, device)
        return model.get_distance_anchors(POS_IDX, docs).cpu().numpy().mean(axis=0)

def get_distance_(doc1, doc2):
    with torch.no_grad():
        docs = docs2input_tensors([doc1, doc2], device)
        return model.get_distance(docs[0:1], docs[1:]).cpu().numpy()
    
def get_vector(docs):
    with torch.no_grad():
        docs = docs2input_tensors(docs, device)
        return model._get_inner_repr(docs).cpu().numpy()
    
def get_distance_from_center(docs):
    docs = get_vector(docs)
    return np.linalg.norm(docs-CENTER_VECTOR, axis=1)


In [9]:
df_train_pos = pd.read_csv('/source/main/data_for_train/output/train/positive_class_1.csv')
POS = list(df_train_pos['mention'].sample(10))
POS_IDX = docs2input_tensors(POS, device)
# CENTER_VECTOR = get_vector(POS).mean(axis=0)

In [10]:
df_train_pos['pred'] = predict_docs(list(df_train_pos['mention']), batch_size=32)

100%|██████████| 81/81 [00:05<00:00, 16.14it/s]


In [11]:
df_train_pos['pred'].describe()

count    2582.000000
mean     19.617797  
std      13.308473  
min      14.338962  
25%      14.893758  
50%      16.252757  
75%      19.226352  
max      209.486053 
Name: pred, dtype: float64

In [12]:
THRESHOLD=df_train_pos['pred'].mean()

## Eval

In [13]:
df_pos_eval = pd.read_csv('/source/main/data_for_train/output/eval/positive_class_1.csv')
df_pos_eval = df_pos_eval.drop_duplicates(subset=['mention'])
df_pos_eval.dropna(subset=['mention'], inplace=True)

In [14]:
df_pos_eval['pred'] = predict_docs(list(df_pos_eval['mention']), batch_size=32)

100%|██████████| 10/10 [00:00<00:00, 17.16it/s]


In [15]:
(df_pos_eval['pred']<=THRESHOLD).sum()/df_pos_eval.shape[0]

0.7492163009404389

### Test

In [16]:
df_pos_test = pd.read_csv('/source/main/data_for_train/output/test/positive_class_1.csv')
df_pos_test = df_pos_test.drop_duplicates(subset=['mention'])

In [17]:
df_pos_test['pred'] = predict_docs(list(df_pos_test['mention']), batch_size=32)

100%|██████████| 11/11 [00:00<00:00, 15.20it/s]


In [18]:
print(sum(df_pos_test['pred']<=THRESHOLD)/df_pos_test.shape[0])
print(df_pos_test.shape)

0.6666666666666666
(321, 4)


## Score: pr/P(y=1)

### Test

In [19]:
df_pool_test = pd.read_csv('/source/main/data_for_train/output/test/pool.csv')

In [20]:
df_pool_test['pred'] = predict_docs(list(df_pool_test['mention']), batch_size=32)

100%|██████████| 3125/3125 [03:23<00:00, 15.39it/s]


In [21]:
print(sum(df_pool_test['pred']<=THRESHOLD)/df_pool_test.shape[0])
print(df_pool_test.shape)

0.00535
(100000, 3)


In [22]:
df_pool_test[df_pool_test['pred']<=THRESHOLD].sample(20)

Unnamed: 0,mention_type,mention,pred
19898,2,size nb cho bé mấy kg vậy . báo giá mình nhé,14.47014
3547,2,loai tu nay bn tien vay,16.551823
90723,2,bn v c,15.99589
80851,2,giá nhiu,15.191274
57635,2,giá nhiêu __d__ bịch ạ,15.047525
57827,2,gửi cho xin báo giá nhé,14.503909
84429,2,bobby xxl quần bn b,18.622555
83285,2,máy giặt panasonic __d__kg giá nhiêu bạn,15.723895
21550,2,lon friso __d__g đó nay có giá nhiêu chị ơi,16.722418
3668,2,abbott grow __d__ có giá bao nhiêu à .,14.384077
