In [1]:
import os
import sys
import numpy as np
import argparse
from collections import defaultdict

sys.path.append("../")

import torch
from torch.utils.data import random_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs, merge_targets
from load_pretrain_label import load_preprocess_document_labels
torch.set_num_threads(8)

In [2]:
def knn_evaluate(config, model, vocabulary, word_embeddings, X_test, y_test):
    results = defaultdict(list)
        
    # predict all data
    if config['target'] == 'yake':
        pred = torch.abs(torch.Tensor(model.predict(X_test)))
        y = torch.abs(torch.Tensor(y_test))
    else:
        pred = torch.Tensor(model.predict(X_test))
        y = torch.Tensor(y_test)
    # Semantic Prcision
    precision_scores, word_result = semantic_precision_all(pred, y, word_embeddings, vocabulary, k=config['topk'], th=config['threshold'])
    for k, v in precision_scores.items():
        results['Semantic Precision v1@{}'.format(k)].append(v)
    precision_scores, word_result = semantic_precision_all_v2(pred, y, word_embeddings, vocabulary, k=config['topk'], th=config['threshold'])
    for k, v in precision_scores.items():
        results['Semantic Precision_v2@{}'.format(k)].append(v)

    # Precision
    precision_scores = retrieval_precision_all(pred, y, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision v1@{}'.format(k)].append(v)
    precision_scores = retrieval_precision_all_v2(pred, y, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision v2@{}'.format(k)].append(v)
    
    # NDCG
    ndcg_scores = retrieval_normalized_dcg_all(pred, y, k=config["topk"])
    for k, v in ndcg_scores.items():
        results['ndcg@{}'.format(k)].append(v)
        
    for k in results:
        results[k] = np.mean(results[k])

    return results

def mean_evaluate(config, preds, labels, vocabulary, word_embeddings):
    results = defaultdict(list)
        
    # predict all data
    if config['target'] == 'yake':
        pred = torch.abs(torch.Tensor(preds))
        y = torch.abs(torch.Tensor(labels))
    else:
        pred = torch.Tensor(preds)
        y = torch.Tensor(labels)
    # Semantic Prcision
    precision_scores, word_result = semantic_precision_all(pred, y, word_embeddings, vocabulary, k=config['topk'], th=config['threshold'])
    for k, v in precision_scores.items():
        results['Semantic Precision v1@{}'.format(k)].append(v)
    precision_scores, word_result = semantic_precision_all_v2(pred, y, word_embeddings, vocabulary, k=config['topk'], th=config['threshold'])
    for k, v in precision_scores.items():
        results['Semantic Precision_v2@{}'.format(k)].append(v)

    # Precision
    precision_scores = retrieval_precision_all(pred, y, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision v1@{}'.format(k)].append(v)
    precision_scores = retrieval_precision_all_v2(pred, y, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision v2@{}'.format(k)].append(v)
    
    # NDCG
    ndcg_scores = retrieval_normalized_dcg_all(pred, y, k=config["topk"])
    for k, v in ndcg_scores.items():
        results['ndcg@{}'.format(k)].append(v)
        
    for k in results:
        results[k] = np.mean(results[k])

    return results

# Single Dataset

In [3]:
config = {
    'model': 'mean',
    'dataset': '20news',
    'use_pos': True,
    'min_df': 1,
    'max_df:': 1.0,
    'vocab_size': 0,
    'min_doc_word': 15,
    'encoder': 'mpnet',
    'target': 'yake',
    'seed': 123,
    'ratio': 0.8,
    'topk': [5, 10, 15],
    'threshold': 0.5,
    'n_neighbors': 20
}
same_seeds(config["seed"])

In [4]:
# Parameter
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

In [5]:
# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]

Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15




In [5]:
# Decode target & Vocabulary
if config['target'] == 'keybert' or config['target'] == 'yake':
    labels, vocabularys= load_preprocess_document_labels(config)
    label = labels[config['target']].toarray()
else:
    labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
    label = labels[config['target']]
    vocabularys = vocabularys[config['target']]

Load preprocess documents labels


In [13]:
print(label.shape)

(18589, 4823)


In [17]:
for id in range(label.shape[0]):
    for wid in range(label.shape[1]):
        if label[id][wid] < 0:
            print(label[id][wid])

In [7]:
# generating document embedding
doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/1162 [00:00<?, ?it/s]

In [8]:
# word embedding preparation
word_embeddings = get_word_embs(vocabularys, data_type='tensor')

0it [00:00, ?it/s]

Number of words:400000
Getting [tensor] word embeddings


  word_embs = torch.Tensor(word_embs)


In [9]:
# show setting
config['vocab_size'] = len(vocabularys)
show_settings(config)

-------- Info ---------
model: mean
dataset: 20news
use_pos: True
min_df: 62
max_df:: 1.0
vocab_size: 4823
min_doc_word: 15
encoder: mpnet
target: yake
seed: 123
ratio: 0.8
topk: [5, 10, 15]
threshold: 0.5
n_neighbors: 20
max_df: 1.0

-----------------------


## KNN baseline

In [10]:
import pandas as pd

In [11]:
X_train, X_test, y_train, y_test = train_test_split(doc_embs, label, test_size=0.2, random_state=config["seed"])
model = KNeighborsRegressor(n_neighbors=config["n_neighbors"])
model.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=20)

In [12]:
res = knn_evaluate(config, model, vocabularys, word_embeddings, X_test, y_test)
for key,val in res.items():
    print(f"{key}:{val:.4f}")

Semantic Precision v1@5:0.8704
Semantic Precision v1@10:0.7486
Semantic Precision v1@15:0.6640
Semantic Precision_v2@5:0.1666
Semantic Precision_v2@10:0.2750
Semantic Precision_v2@15:0.3328
precision v1@5:0.8257
precision v1@10:0.6710
precision v1@15:0.5726
precision v2@5:0.1377
precision v2@10:0.2241
precision v2@15:0.2683
ndcg@5:0.8247
ndcg@10:0.7192
ndcg@15:0.6476
ndcg@all:0.7439


In [19]:
print(res['Semantic Precision v1@5'])

0.8704142011834319


In [32]:
df = pd.DataFrame.from_dict(res, orient='index').T
print(df)

   Semantic Precision v1@5  Semantic Precision v1@10  \
0                 0.870414                  0.748628   

   Semantic Precision v1@15  Semantic Precision_v2@5  \
0                  0.663995                 0.166595   

   Semantic Precision_v2@10  Semantic Precision_v2@15  precision v1@5  \
0                   0.27504                  0.332795        0.825713   

   precision v1@10  precision v1@15  precision v2@5  precision v2@10  \
0         0.671033         0.572566        0.137708         0.224072   

   precision v2@15    ndcg@5   ndcg@10   ndcg@15  ndcg@all  
0         0.268316  0.824674  0.719165  0.647621  0.743946  


In [38]:
df = pd.concat([df, pd.DataFrame.from_dict(res, orient='index').T], axis=0)

In [39]:
df

Unnamed: 0,Semantic Precision v1@5,Semantic Precision v1@10,Semantic Precision v1@15,Semantic Precision_v2@5,Semantic Precision_v2@10,Semantic Precision_v2@15,precision v1@5,precision v1@10,precision v1@15,precision v2@5,precision v2@10,precision v2@15,ndcg@5,ndcg@10,ndcg@15,ndcg@all
0,0.870414,0.748628,0.663995,0.166595,0.27504,0.332795,0.825713,0.671033,0.572566,0.137708,0.224072,0.268316,0.824674,0.719165,0.647621,0.743946
0,0.870414,0.748628,0.663995,0.166595,0.27504,0.332795,0.825713,0.671033,0.572566,0.137708,0.224072,0.268316,0.824674,0.719165,0.647621,0.743946
0,0.870414,0.748628,0.663995,0.166595,0.27504,0.332795,0.825713,0.671033,0.572566,0.137708,0.224072,0.268316,0.824674,0.719165,0.647621,0.743946


In [40]:
df.to_csv('./test_dataframe.csv', index=False)

## Mean baseline

In [42]:
# predic
predict = label.mean(axis=0)
predict = np.tile(predict, (label.shape[0], 1))

In [43]:
res = mean_evaluate(config, predict, label, vocabularys, word_embeddings)
df = pd.DataFrame(res)
print(df)
#for key,val in res.items():
#    print(f"{key}:{val:.4f}")

Semantic Precision v1@5:0.7103
Semantic Precision v1@10:0.6211
Semantic Precision v1@15:0.4953
Semantic Precision_v2@5:0.0420
Semantic Precision_v2@10:0.1556
Semantic Precision_v2@15:0.1937
precision v1@5:0.6821
precision v1@10:0.4914
precision v1@15:0.3846
precision v2@5:0.0349
precision v2@10:0.0937
precision v2@15:0.1253
ndcg@5:0.7152
ndcg@10:0.5735
ndcg@15:0.4876
ndcg@all:0.6373


# Cross Domain

In [3]:
config = {
    'model': 'mean',
    'dataset': '20news',
    'dataset2': 'wiki',
    'use_pos': True,
    'min_df': 1,
    'max_df:': 1.0,
    'vocab_size': 0,
    'min_doc_word': 15,
    'encoder': 'mpnet',
    'target': 'tf-idf',
    'seed': 123,
    'ratio': 0.8,
    'topk': [5, 10, 15],
    'threshold': 0.5,
    'n_neighbors': 20
}
same_seeds(config["seed"])

In [4]:
### Dataset
# Parameter
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)

# Decode target & Vocabulary
if config['target'] == 'keybert' or config['target'] == 'yake':
    labels, vocabularys= load_preprocess_document_labels(config)
    label = labels[config['target']].toarray()
else:
    labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
    label = labels[config['target']]
    vocabularys = vocabularys[config['target']]

# generating document embedding
doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15




Getting preprocess documents labels
Getting preprocess documents embeddings
Using cuda 3 for training...


Batches:   0%|          | 0/1162 [00:00<?, ?it/s]

In [None]:
### Dataset2
# Parameter
config['dataset'] = config['dataset2']
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

# data preprocessing
unpreprocessed_corpus2 ,preprocessed_corpus2 = get_preprocess_document(**config)

# Decode target & Vocabulary
if config['target'] == 'keybert' or config['target'] == 'yake':
    labels2, vocabularys2= load_preprocess_document_labels(config)
    label2 = labels2[config['target']].toarray()
else:
    labels2, vocabularys2= get_preprocess_document_labels(preprocessed_corpus2)
    label2 = labels2[config['target']]
    vocabularys2 = vocabularys2[config['target']]

# generating document embedding
doc_embs2, doc_model2, device2 = get_preprocess_document_embs(preprocessed_corpus2, config['encoder'])

Getting preprocess documents: wiki
min_df: 2872 max_df: 1.0 vocabulary_size: None min_doc_word: 15


Reusing dataset wikitext (/dhome/casimir0304/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [None]:
# merge two dataset
targets1, targets2, new_vocabularys = merge_targets(label, label2, vocabularys, vocabularys2)

In [None]:
# word embedding preparation
word_embeddings = get_word_embs(new_vocabularys, data_type='tensor')

## KNN baseline

In [None]:
model = KNeighborsRegressor(n_neighbors=config["n_neighbors"])
model.fit(doc_embs, targets1)

In [None]:
res = knn_evaluate(config, model, new_vocabularys, word_embeddings, doc_embs2, targets2)
for key,val in res.items():
    print(f"{key}:{val:.4f}")

## Mean baseline

In [None]:
# predic
predict = targets1.mean(axis=0)
predict = np.tile(predict, (targets2.shape[0], 1))

In [None]:
res = mean_evaluate(config, predict, targets2, new_vocabularys, word_embeddings)
for key,val in res.items():
    print(f"{key}:{val:.4f}")