In [1]:
import os
import sys
import numpy as np
import argparse
from collections import defaultdict

sys.path.append("../")

import torch
from torch.utils.data import random_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels_v3, get_word_embs

torch.set_num_threads(8)



In [15]:
def evaluate_Decoder(config, model, vocabulary, word_embeddings, X_test, y_test):
    results = defaultdict(list)
        
    # predict all data
    pred = torch.abs(torch.tensor(model.predict(X_test)))
    y = torch.abs(torch.tensor(y_test))

    # Semantic Prcision
    precision_scores, word_result = semantic_precision_all(pred, y, word_embeddings, vocabulary, k=config['topk'], th=config['threshold'])
    for k, v in precision_scores.items():
        results['Semantic Precision@{}'.format(k)].append(v)

    # Precision
    precision_scores = retrieval_precision_all(pred, y, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision@{}'.format(k)].append(v)
    
    # NDCG
    ndcg_scores = retrieval_normalized_dcg_all(pred, y, k=config["topk"])
    for k, v in ndcg_scores.items():
        results['ndcg@{}'.format(k)].append(v)
        
    for k in results:
        results[k] = np.mean(results[k])

    return results

In [3]:
config = {
    'model': 'KNN',
    'dataset': '20news',
    'use_pos': True,
    'min_df': 1,
    'max_df:': 1.0,
    'vocab_size': 0,
    'min_doc_word': 15,
    'encoder': 'mpnet',
    'target': 'yake',
    'seed': 123,
    'ratio': 0.8,
    'topk': [5, 10, 15],
    'threshold': 0.5,
    'n_neighbors': 20
}
same_seeds(config["seed"])

In [4]:
# Parameter
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 50, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 100, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

In [5]:
# Decode target & Vocabulary
labels, vocabularys= get_preprocess_document_labels_v3(config)
id2token = {k: v for k, v in zip(range(0, len(vocabularys[config['target']])), vocabularys[config['target']])}

Getting preprocess documents labels


In [6]:
# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]

Getting preprocess documents: 20news
min_df: 50 max_df: 1.0 vocabulary_size: None min_doc_word: 15




In [7]:
print(len(labels[config['target']][1]))
print(len(vocabularys[config['target']]))

5680
5680


In [8]:
# generating document embedding
doc_embs, doc_model = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 1 for training...


Batches:   0%|          | 0/95 [00:00<?, ?it/s]

In [9]:
# word embedding preparation
word_embeddings = get_word_embs(vocabularys[config['target']], id2token=id2token, data_type='tensor')

0it [00:00, ?it/s]

Number of words:400000
Getting [tensor] word embeddings


In [10]:
# show setting
config['vocab_size'] = len(vocabularys[config['target']])
show_settings(config)

-------- Info ---------
model: KNN
dataset: 20news
use_pos: True
min_df: 50
max_df:: 1.0
vocab_size: 5680
min_doc_word: 15
encoder: mpnet
target: yake
seed: 123
ratio: 0.8
topk: [5, 10, 15]
threshold: 0.5
n_neighbors: 20
max_df: 1.0

-----------------------


In [11]:
X_train, X_test, y_train, y_test = train_test_split(doc_embs, labels[config["target"]], test_size=0.2, random_state=config["seed"])
X_train, X_test, y_train, y_test = train_test_split(doc_embs, labels[config["target"]], test_size=0.2, random_state=config["seed"])
model = KNeighborsRegressor(n_neighbors=config["n_neighbors"])
model.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=20)

In [16]:
res = evaluate_Decoder(config, model, vocabularys[config['target']], word_embeddings, X_test, y_test)
for key,val in res.items():
    print(f"{key}:{val:.4f}")

Semantic Precision@5:0.8307
Semantic Precision@10:0.7179
Semantic Precision@15:0.6391
precision@5:0.7856
precision@10:0.6360
precision@15:0.5442
ndcg@5:0.7451
ndcg@10:0.6369
ndcg@15:0.5711
ndcg@all:0.7122


In [12]:
pred = torch.abs(torch.tensor(model.predict(X_test)))
y = torch.abs(torch.tensor(y_test))

In [13]:
idx = 50
print(pred[0])

tensor([0.0000, 0.0000, 0.0113,  ..., 0.0000, 0.0000, 0.0000],
       dtype=torch.float64)


In [14]:
print(y[50])

tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64)
