In [1]:
import sys
import os
import torch
import nltk
import argparse
from torch.utils.data import DataLoader, random_split

sys.path.append("../")
from model.ide_topic_decoder import IDEDataset, IDETopicDecoder
#from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs


os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(8)



In [2]:
import os
import re
import sys
import nltk
import torch
import numpy as np
import json
from math import log
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from torch.utils.data import DataLoader, random_split
from scipy import sparse

sys.path.append("../")
from utils.preprocessing import WhiteSpacePreprocessing, WhiteSpacePreprocessingStopwords, WhiteSpacePreprocessing_v2
from utils.data_loader import load_document, load_word2emb
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_document(raw_documents):
    sp = WhiteSpacePreprocessingStopwords(raw_documents, stopwords_list=['english'], vocabulary_size=10000, min_words=15)
    preprocessed_documents, unpreprocessed_corpus, vocab, _ = sp.preprocess()
    delete_non_eng_documents = delete_non_eng(preprocessed_documents)
    noun_documents = pos(delete_non_eng_documents)
    delete_documents = []
    for idx in range(len(noun_documents)):
        if len(noun_documents[idx]) == 0:
            delete_documents.append(idx)
    delete_documents = sorted(delete_documents, reverse=True)
    for idx in delete_documents:
        del unpreprocessed_corpus[idx]
    noun_documents = list(filter(None, noun_documents))
    texts = [text.split() for text in noun_documents]
    return noun_documents, unpreprocessed_corpus, texts, vocab

def generate_document_embedding(model, documents):
    if model == 'roberta':
        model = SentenceTransformer("paraphrase-distilroberta-base-v1", device=get_free_gpu())
    else:
        model = SentenceTransformer("all-mpnet-base-v2", device=get_free_gpu())

    return np.array(model.encode(documents, show_progress_bar=True, batch_size=200))

def tokenizer_eng(text):
        text = re.sub(r'[^A-Za-z ]+', '', text)
        text = text.strip().split()
        return text

def delete_non_eng(documents):
    preprocessed_documents = []
    for text in documents:
        selected_word = []
        for word in tokenizer_eng(text):
            selected_word.append(word)
        preprocessed_documents.append(" ".join(selected_word))
    return preprocessed_documents


def pos(documents):
    is_noun = lambda pos: pos[:2] == 'NN'
    is_verb = lambda pos: pos[:2] == 'VB'

    preprocessed_documents = []
    for text in documents:
        tokenized = nltk.word_tokenize(text)
        noun_word = []
        for (word, pos) in nltk.pos_tag(tokenized):
            if is_noun(pos) or is_verb(pos):
                noun_word.append(word)
        preprocessed_documents.append(" ".join(noun_word))
    return preprocessed_documents

def calculate_word_embeddings_tensor(word2embedding, vocab, idx2token):
    word_embeddings = torch.zeros(len(vocab), len(word2embedding['a']))
    for k in idx2token:
        if idx2token[k] not in word2embedding:
            # print('not found word embedding', idx2token[k])
            continue
        word_embeddings[k] = torch.tensor(word2embedding[idx2token[k]])

    return word_embeddings

def get_free_gpu():
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free > tmp')
    memory_available = [int(x.split()[2])
                        for x in open('tmp', 'r').readlines()]
    os.system('rm -f tmp')
    print('Using cuda {} for training...'.format(int(np.argmax(memory_available))))
    torch.cuda.device(int(np.argmax(memory_available)))
    return "cuda:{}".format(int(np.argmax(memory_available)))


def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def show_settings(config):
    print('-------- Info ---------')
    settings = ""
    for key in list(config.keys()):
        settings += "{}: {}\n".format(key, config.get(key))
    print(settings)
    print('-----------------------')

def record_settings(config):
    record = open('./'+config['dataset']+'_'+config['model']+'_'+config['encoder']+'_'+config['target']+'.txt', 'a')
    record.write('-------- Info ---------\n')
    settings = ""
    for key in list(config.keys()):
        settings += "{}: {}\n".format(key, config.get(key))
    record.write(settings)
    record.write('-----------------------\n')

def split_data(dataset, config):
    train_length = int(len(dataset)*0.6)
    valid_length = int(len(dataset)*0.2)
    test_length = len(dataset) - train_length - valid_length

    train_dataset, valid_dataset, test_dataset = random_split(
        dataset, lengths=[train_length, valid_length, test_length],
        generator=torch.Generator().manual_seed(42)
    )

    train_loader = DataLoader(
        train_dataset, batch_size=config["batch_size"],
        shuffle=True, pin_memory=True,
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=config["batch_size"], shuffle=False, pin_memory=True)
    test_loader = DataLoader(
        test_dataset, batch_size=config["batch_size"], shuffle=False)

    return train_loader, valid_loader, test_loader

def doc_filter(raw_document, vocab):
    PATTERN = r"(?u)\b\w\w+\b"
    doc = re.findall(PATTERN, raw_document.lower())
    return [x for x in doc if x in vocab]

def generate_graph(doc_list, word2index, index2word):
    window_size = 10
    windows = []

    # Traverse Each Document & Move window on each of them
    for doc in doc_list:
        length = len(doc)
        if length <= window_size:
            windows.append(doc)
        else:
            for i in range(length-window_size+1):
                window = doc[i: i+window_size]
                windows.append(window)
    
    word_freq = {}
    word_pair_count = {}
    for window in tqdm(windows, desc='Calculate word pair: '):
        appeared = set()
        for i in range(len(window)):
            if window[i] not in appeared:
                if window[i] in word_freq:
                    word_freq[window[i]] += 1
                else:
                    word_freq[window[i]] = 1
                appeared.add(window[i])
            if i != 0:
                for j in range(0, i):
                    word_i = window[i]
                    word_i_id = word2index[word_i]
                    word_j = window[j]
                    word_j_id = word2index[word_j]
                    if word_i_id == word_j_id:
                        continue
                    word_pair_str = str(word_i_id) + ',' + str(word_j_id)
                    if word_pair_str in word_pair_count:
                        word_pair_count[word_pair_str] += 1
                    else:
                        word_pair_count[word_pair_str] = 1
                    word_pair_str = str(word_j_id) + ',' + str(word_i_id)
                    if word_pair_str in word_pair_count:
                        word_pair_count[word_pair_str] += 1
                    else:
                        word_pair_count[word_pair_str] = 1
    
    row = []
    col = []
    edge = []
    weight = []
    # pmi as weights

    num_window = len(windows)
    # count_mean = np.array(list(word_pair_count.values())).mean()
    for key in tqdm(word_pair_count, desc='Construct Edge: '):
        temp = key.split(',')
        i = int(temp[0])
        j = int(temp[1])
        count = word_pair_count[key]
        word_freq_i = word_freq[index2word[i]]
        word_freq_j = word_freq[index2word[j]]
        pmi = log((1.0 * count / num_window) /
                (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
        if pmi <= 0:
            continue
        row.append(i)
        col.append(j)
        if count >= 15:
            edge.append([i, j])
            edge.append([j, i])
        weight.append(pmi)

    print('# of Node: {}\n# of Edge: {}'.format(len(word2index), len(edge)))

    return edge

def get_preprocess_document(dataset, min_df=1, max_df=1.0, vocabulary_size=None, min_doc_word=15, use_pos=True, **kwargs):
    '''
    Returns preprocessed_docs & unpreprocessed_docs of the dataset

            Parameters:
                    dataset (str): For data_loader
                    min_df, max_df, vocabulary_size: For CountVectorizer in CTM preprocess
                    min_doc_word: Minimum doc length
            Returns:
                    unpreprocessed_docs (list):
                    preprocessed_docs (list):
    '''
    print('Getting preprocess documents:', dataset)
    print(f'min_df: {min_df} max_df: {max_df} vocabulary_size: {vocabulary_size} min_doc_word: {min_doc_word}')
    raw_documents = load_document(dataset)["documents"]
    # CTM preprocess
    sp = WhiteSpacePreprocessing_v2(raw_documents, stopwords_language='english',\
                                    min_df=min_df, max_df=max_df, vocabulary_size=vocabulary_size)

    preprocessed_docs, unpreprocessed_docs, vocabulary, _ = sp.preprocess()
    # filter special character
    preprocessed_docs = delete_non_eng(preprocessed_docs)
    # select nouns & verbs
    if use_pos:
        preprocessed_docs = pos(preprocessed_docs)
    # delete short articles
    delete_docs_idx = []
    for idx in range(len(preprocessed_docs)):
        # length > min_doc_word
        if len(preprocessed_docs[idx]) == 0 or len(preprocessed_docs[idx]) < min_doc_word:
            delete_docs_idx.append(idx)
    delete_docs_idx = sorted(delete_docs_idx, reverse=True)
    for idx in delete_docs_idx:
        del preprocessed_docs[idx]
        del unpreprocessed_docs[idx]
    
    return unpreprocessed_docs ,preprocessed_docs

def get_preprocess_document_labels(preprocessed_docs, preprocess_config='../chris/parameters/preprocess_config.json'):
    '''
    Returns labels for document decoder

            Parameters:
                    preprocessed_docs (list): 
            Returns:
                    labels (dict): bow, tf-idf
                    vocabulary (dict): bow, tf-idf
    '''
    print('Getting preprocess documents labels')
    vectorizer = TfidfVectorizer()
    # covert sparse matrix to numpy array
    tf_idf_vector = vectorizer.fit_transform(preprocessed_docs).toarray()
    bow_vector = tf_idf_vector.copy()
    bow_vector[bow_vector > 0] = 1
    bow_vector[bow_vector < 0] = 0
    vocabulary = vectorizer.get_feature_names()

    labels = {}
    labels['tf-idf'] = tf_idf_vector
    labels['bow'] = bow_vector
    
    vocabularys = {}
    vocabularys['tf-idf'] = vocabulary
    vocabularys['bow'] = vocabulary

    return labels, vocabularys

def get_preprocess_document_labels_v2(preprocessed_docs, preprocess_config, preprocess_config_dir, ngram=1):
    '''
    Returns labels for document decoder

            Parameters:
                    preprocessed_docs (list): 
            Returns:
                    labels (dict): bow, tf-idf
                    vocabulary (dict): bow, tf-idf
    '''
    print('Getting preprocess documents labels')
    print('Finding precompute_keyword by preprocess_config', preprocess_config)

    config_dir = os.path.join('../data/precompute_keyword', preprocess_config_dir, \
                              '{}_ngram_{}'.format(preprocess_config['dataset'], ngram))
    # check preprocess config the same when loading precompute labels
    with open(os.path.join(config_dir, 'preprocess_config.json'), 'r') as f:
        preprocess_config2 = json.load(f)
        assert preprocess_config == preprocess_config2

    tf_idf_vector = sparse.load_npz(os.path.join(config_dir, 'TFIDF.npz'))
    bow_vector = sparse.load_npz(os.path.join(config_dir, 'BOW.npz'))
    try:
        keybert_vector = sparse.load_npz(os.path.join(config_dir, 'KeyBERT.npz'))
        yake_vector = sparse.load_npz(os.path.join(config_dir, 'YAKE.npz'))
    except:
        print('no precompute keyword')
        keybert_vector = None
        yake_vector = None

    vocabulary = np.load(os.path.join(config_dir, 'vocabulary.npy'))

    labels = {}
    labels['tf-idf'] = tf_idf_vector
    labels['bow'] = bow_vector
    labels['keybert'] = keybert_vector
    labels['yake'] = yake_vector
    
    return labels, vocabulary
    
def merge_targets(targets, targets2, vocabularys, vocabularys2):
    # ref: https://numpy.org/doc/stable/reference/generated/numpy.put_along_axis.html#numpy.put_along_axis
    vocabularys_all = list(vocabularys)
    vocabularys2_map_idx = []
    
    for s in vocabularys2:
        if s in vocabularys_all:
            idx = vocabularys_all.index(s)
            vocabularys2_map_idx.append(idx)
        else:
            vocabularys_all.append(s)
            vocabularys2_map_idx.append(len(vocabularys_all)-1)
            
    print('vocabularys len', len(vocabularys))
    print('vocabularys2 len', len(vocabularys2))
    print('merge len', len(vocabularys_all))
    mr = (len(vocabularys_all) - len(vocabularys)) / len(vocabularys2)
    print('vocabularys2 missing ratio', mr)
    
    new_targets = np.zeros((targets.shape[0], len(vocabularys_all)))
    new_targets2 = np.zeros((targets2.shape[0], len(vocabularys_all)))
    
    idx1 = np.arange(targets.shape[1])
    idx1 = np.repeat(idx1.reshape(1, -1), targets.shape[0], axis=0)
    np.put_along_axis(new_targets, idx1, targets, axis=1)
    
    assert len(vocabularys2_map_idx) == len(vocabularys2)
    idx2 = np.array(vocabularys2_map_idx)
    idx2 = np.repeat(idx2.reshape(1, -1), targets2.shape[0], axis=0)
    np.put_along_axis(new_targets2, idx2, targets2, axis=1)
    
    assert new_targets.shape[1] == new_targets2.shape[1] == len(vocabularys_all)
    
    return new_targets, new_targets2, np.array(vocabularys_all)

def get_preprocess_document_embs(preprocessed_docs, model_name):
    '''
    Returns embeddings(input) for document decoder

            Parameters:
                    preprocessed_docs (list): 
                    model_name (str):
            Returns:
                    doc_embs (array): 
                    model (class): 
    '''
    print('Getting preprocess documents embeddings')
    device = get_free_gpu()
    if model_name == 'bert':
        model = SentenceTransformer("bert-base-uncased", device=device)
        doc_embs = np.array(model.encode(preprocessed_docs, show_progress_bar=True, batch_size=50))
    elif model_name == 'mpnet':
        model = SentenceTransformer("all-mpnet-base-v2", device=device)
        doc_embs = np.array(model.encode(preprocessed_docs, show_progress_bar=True, batch_size=50))
    elif model_name == 'average':
        model = SentenceTransformer("average_word_embeddings_glove.840B.300d", device=device)
        doc_embs = np.array(model.encode(preprocessed_docs, show_progress_bar=True, batch_size=50))
    elif model_name == 'doc2vec':
        doc_embs = []
        preprocessed_docs_split = [doc.split() for doc in preprocessed_docs]
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_docs_split)]
        model = Doc2Vec(documents, vector_size=200, workers=4)
        for idx in range(len(preprocessed_docs_split)):
            doc_embs.append(model.infer_vector(preprocessed_docs_split[idx]))
        doc_embs = np.array(doc_embs)

    return doc_embs, model, device  

def get_word_embs(vocabularys, id2token=None, word_emb_file='../data/glove.6B.300d.txt', data_type='ndarray'):
    '''
    Returns word_embs array for semantic precision

            Parameters:
                    vocabularys (list): 
                    word_emb_file (str): 
            Returns:
                    word_embs (array): 
    '''
    
    word2emb = load_word2emb(word_emb_file)
    dim = len(list(word2emb.values())[0])
    word_embs = []
    for word in vocabularys:
        if word not in word2emb:
            emb = np.zeros(dim)
        else:
            emb = word2emb[word]
        word_embs.append(emb) 

    if data_type == 'tensor':
        print('Getting [tensor] word embeddings')
        word_embs = torch.Tensor(word_embs)
    else:
        print('Getting [ndarray] word embeddings')
        word_embs = np.array(word_embs)
    return word_embs


In [3]:
config = {
    'model': 'ZTM',
    'architecture': 'after',
    'activation': 'sigmoid',
    'dataset': 'tweet',
    'dataset_name': 'tweet',
    'vocabulary_size':100,
    'encoder': 'bert',
    'target': 'tf-idf',
    'topic_num': 50,
    'seed': 123,
    'epochs': 10,
    'lr': 1e-4,
    'loss': 'listnet',
    'batch_size': 8,
    'weight_decay': 0,
    'ratio': 0.8,
    'topk': [10, 30, 50],
    'save': False,
    'threshold': 0.7,
}

#show_settings(config)
same_seeds(config['seed'])

In [4]:
# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]

Getting preprocess documents: tweet
min_df: 1 max_df: 1.0 vocabulary_size: 100 min_doc_word: 15


Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [5]:
# generating document embedding
doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 2 for training...


Some weights of the model checkpoint at /dhome/casimir0304/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [6]:
print(device)

cuda:2


In [7]:
# Decode target & Vocabulary
labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
id2token = {k: v for k, v in zip(range(0, len(vocabularys[config['target']])), vocabularys[config['target']])}

Getting preprocess documents labels


In [8]:
print(len(vocabularys[config['target']]))

74


In [9]:
# word embedding preparation
word_embeddings = get_word_embs(vocabularys[config['target']], data_type='tensor')

0it [00:00, ?it/s]

Number of words:400000
Getting [tensor] word embeddings


  word_embs = torch.Tensor(word_embs)


In [10]:
import sys
import random
import datetime
import wordcloud
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import multiprocessing as mp
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
# from tqdm.auto import tqdm

sys.path.append("./")
from utils.loss import ListNet, MythNet
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2
from utils.eval_topic import CoherenceNPMI, TopicDiversity, InvertedRBO
from utils.toolbox import get_free_gpu, record_settings
from model.inference_network import ContextualInferenceNetwork

class IDEDataset(Dataset):
    def __init__(self, corpus, emb, target):
        
        assert len(emb) == len(target)
        self.corpus = corpus
        self.emb = torch.FloatTensor(emb)
        self.target = torch.FloatTensor(target)        
        
    def __getitem__(self, idx):
        return self.corpus[idx], self.emb[idx], self.target[idx]

    def __len__(self):
        return len(self.emb)

class DecoderNetwork(nn.Module):
    def __init__(self, config, device, vocab_size, contextual_size, glove_word_embeddings, n_components, hidden_sizes=(100,100), activation='softplus', dropout=0.2, learn_priors=True):
        super(DecoderNetwork, self).__init__()

        assert activation in ['softplus', 'relu']

        self.config = config
        self.device = device
        self.vocab_size = vocab_size
        self.contextual_size = contextual_size
        self.glove_word_embeddings = glove_word_embeddings
        self.n_components = n_components
        self.hidden_sizes = hidden_sizes
        self.activation = activation
        self.dropout = dropout
        self.learn_priors = learn_priors
        self.topic_word_matrix = None

        # decoder architecture
        self.batch_norm = nn.BatchNorm1d(vocab_size)
        self.word_embedding =  nn.Parameter(torch.randn(vocab_size*4, vocab_size))
        self.full_decoder = nn.Sequential(
            nn.Linear(vocab_size, vocab_size*4),
            nn.BatchNorm1d(vocab_size*4),
            nn.Sigmoid(),
            nn.Linear(vocab_size*4, vocab_size),
            nn.BatchNorm1d(vocab_size),
            nn.Sigmoid(),
        )
        self.half_decoder_tanh = nn.Sequential(
            nn.Linear(vocab_size+contextual_size, vocab_size*4),
            nn.BatchNorm1d(vocab_size*4),
            nn.Tanh(),
            nn.Dropout(p=0.2),
        )
        self.half_decoder_sigmoid = nn.Sequential(
            nn.Linear(vocab_size+contextual_size, vocab_size*4),
            nn.BatchNorm1d(vocab_size*4),
            nn.Sigmoid(),
            nn.Dropout(p=0.2),
        )
        self.share_wieght_decoder = nn.Sequential(
            nn.Linear(contextual_size, contextual_size*4),
            nn.BatchNorm1d(contextual_size*4),
            nn.Sigmoid(),
            nn.Linear(contextual_size*4, vocab_size),
            nn.BatchNorm1d(vocab_size),
            nn.Sigmoid(),
        )
        self.glove_emb_decoder = nn.Sequential(
            nn.Linear(vocab_size+contextual_size, vocab_size*4),
            nn.BatchNorm1d(vocab_size*4),
            nn.Sigmoid(),
            nn.Linear(vocab_size*4, glove_word_embeddings.shape[1]),
            nn.BatchNorm1d(glove_word_embeddings.shape[1]),
            nn.Sigmoid(),
        )
        # topic model architecture
        self.inf_net = ContextualInferenceNetwork(vocab_size, contextual_size, n_components, hidden_sizes, activation, label_size=0)
        
        topic_prior_mean = 0.0
        self.prior_mean = torch.tensor([topic_prior_mean] * n_components).to(device)
        if self.learn_priors:
            self.prior_mean = nn.Parameter(self.prior_mean)

        topic_prior_variance = 1. - (1. / self.n_components)
        self.prior_variance = torch.tensor([topic_prior_variance] * n_components).to(device)
        if self.learn_priors:
            self.prior_variance = nn.Parameter(self.prior_variance)

        self.beta = torch.Tensor(n_components, vocab_size).to(device)
        self.beta = nn.Parameter(self.beta)
        
        nn.init.xavier_uniform_(self.beta)
        
        self.beta_batchnorm = nn.BatchNorm1d(vocab_size, affine=False)
        
        self.drop_theta = nn.Dropout(p=self.dropout)
    
    @staticmethod
    def reparameterize(mu, logvar):
        """Reparameterize the theta distribution."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def forward(self, emb, target, labels=None):
        """Forward pass."""
        posterior_mu, posterior_log_sigma = self.inf_net(target, emb, labels)
        posterior_sigma = torch.exp(posterior_log_sigma)

        # generate samples from theta
        theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1)
        theta = self.drop_theta(theta)

        # prodLDA
        # in: batch_size x input_size x n_components
        word_dist = F.softmax(self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1)
        # word_dist: batch_size x input_size
        self.topic_word_matrix = self.beta
        
        if self.config['activation'] == 'tanh':
            emb_word_dist = torch.cat((word_dist, emb), dim=1)
            decoded_word_dist = self.half_decoder_tanh(emb_word_dist)
            recon_dist = torch.sigmoid(self.batch_norm((torch.matmul(decoded_word_dist, self.word_embedding))))
        else:
            emb_word_dist = torch.cat((word_dist, emb), dim=1)
            decoded_word_dist = self.half_decoder_sigmoid(emb_word_dist)
            recon_dist = torch.sigmoid(self.batch_norm((torch.matmul(decoded_word_dist, self.word_embedding))))

        return self.prior_mean, self.prior_variance, posterior_mu, posterior_sigma, posterior_log_sigma, word_dist, recon_dist
    
    def get_theta(self, target, emb, labels=None):
        with torch.no_grad():
            posterior_mu, posterior_log_sigma = self.inf_net(target, emb, labels)
            theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1)

            return theta

class IDETopicDecoder:
    def __init__(self, config, texts=None, vocab = None, idx2token=None, device=None, contextual_size=768, word_embeddings=None, 
                n_components=10, hidden_sizes=(100, 100), activation='softplus', dropout=0.2, learn_priors=True,
                momentum=0.99, reduce_on_plateau=False, num_data_loader_workers=mp.cpu_count(), loss_weights=None):
        self.config = config
        self.texts = texts
        self.vocab = vocab
        self.idx2token = idx2token
        self.device = device
        self.contextual_size = contextual_size
        self.word_embeddings = word_embeddings
        self.n_components = n_components
        self.hidden_sizes = hidden_sizes
        self.activation = activation
        self.dropout = dropout
        self.learn_priors = learn_priors
        self.reduce_on_plateau = reduce_on_plateau
        self.momentum = momentum
        self.num_data_loader_workers = num_data_loader_workers
        self.training_doc_topic_distributions = None
        self.distribution_cache = None
        self.num_epochs = config['epochs']
        if config['loss'] == 'mse':
            self.loss_funct = torch.nn.MSELoss(reduction='mean')
        else:
             self.loss_funct = MythNet
        if loss_weights:
            self.weights = loss_weights
        else:
            self.weights = {"beta": 1}

        self.model = DecoderNetwork(
                    config, device, len(vocab), contextual_size, word_embeddings, n_components, hidden_sizes, activation,
                    dropout, learn_priors)
        
        self.optimizer = torch.optim.Adam(
                        self.model.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])

        if self.reduce_on_plateau:
            self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10)
        
        self.best_components = None

    def loss(self, inputs, word_dists, recon_dists, prior_mean, prior_variance,
              posterior_mean, posterior_variance, posterior_log_variance):
        var_division = torch.sum(posterior_variance / prior_variance, dim=1)
        diff_means = prior_mean - posterior_mean
        diff_term = torch.sum((diff_means * diff_means) / prior_variance, dim=1)
        logvar_det_division = prior_variance.log().sum() - posterior_log_variance.sum(dim=1)

        KL = 0.5 * (var_division + diff_term - self.n_components + logvar_det_division)

        RL = torch.sum(-inputs * torch.log(word_dists + 1e-10), dim=1)

        DL = self.loss_funct(recon_dists, inputs)

        return KL, RL, DL

    def training(self, loader):
        """Train epoch."""
        self.model.train()
        train_loss = 0
        samples_processed = 0

        for batch, (corpus, emb, target) in enumerate(loader):
            target = target.reshape(target.shape[0], -1)
            emb, target = emb.to(self.device), target.to(self.device)

            self.model.zero_grad()
            prior_mean, prior_variance, posterior_mean, posterior_variance,\
            posterior_log_variance, word_dists, recon_dists = self.model(emb, target)

            kl_loss, rl_loss, dl_loss = self.loss(
                target, word_dists, recon_dists, prior_mean, prior_variance,
                posterior_mean, posterior_variance, posterior_log_variance)
            loss = self.weights["beta"] * kl_loss + rl_loss + dl_loss
            loss = loss.sum()

            loss.backward()
            self.optimizer.step()

            samples_processed += target.size()[0]
            train_loss += loss.item()

        train_loss /= samples_processed

        return samples_processed, train_loss
    
    def validation(self, loader):
        """Validation epoch."""
        self.model.eval()
        val_loss = 0
        samples_processed = 0

        results = defaultdict(list)
        dists = defaultdict(list)

        for batch, (corpus, emb, target) in enumerate(loader):
            target = target.reshape(target.shape[0], -1)
            emb, target = emb.to(self.device), target.to(self.device)

            self.model.zero_grad()
            prior_mean, prior_variance, posterior_mean, posterior_variance,\
            posterior_log_variance, word_dists, recon_dists = self.model(emb, target)
            
            kl_loss, rl_loss, dl_loss = self.loss(target, word_dists, recon_dists, prior_mean, prior_variance,
                              posterior_mean, posterior_variance, posterior_log_variance)

            loss = self.weights["beta"] * kl_loss + rl_loss + dl_loss
            loss = loss.sum()

            samples_processed += target.size()[0]
            val_loss += loss.item()

             # Semantic Prcision for reconstruct
            precision_scores, word_result = semantic_precision_all(recon_dists, target, self.word_embeddings, self.vocab, k=self.config['topk'], th = self.config['threshold'])
            for k, v in precision_scores.items():
                results['[Recon] Semantic Precision v1@{}'.format(k)].append(v)

            precision_scores, word_result = semantic_precision_all_v2(recon_dists, target, self.word_embeddings, self.vocab, k=self.config['topk'], th = self.config['threshold'])
            for k, v in precision_scores.items():
                results['[Recon] Semantic Precision v2@{}'.format(k)].append(v)
                
            # Precision for reconstruct
            precision_scores = retrieval_precision_all(recon_dists, target, k=self.config['topk'])
            for k, v in precision_scores.items():
                results['[Recon] Precision v1@{}'.format(k)].append(v)
            
            precision_scores = retrieval_precision_all_v2(recon_dists, target, k=self.config['topk'])
            for k, v in precision_scores.items():
                results['[Recon] Precision v2@{}'.format(k)].append(v)

            # NDCG for reconstruct
            ndcg_scores = retrieval_normalized_dcg_all(recon_dists, target, k=self.config['topk'])
            for k, v in ndcg_scores.items():
                results['[Recon] ndcg@{}'.format(k)].append(v)

            # Semantic Prcision for word dist
            precision_scores, word_result = semantic_precision_all(word_dists, target, self.word_embeddings, self.vocab, k=self.config['topk'], th = self.config['threshold'])
            for k, v in precision_scores.items():
                dists['[Word Dist] Semantic Precision v1@{}'.format(k)].append(v)

            precision_scores, word_result = semantic_precision_all_v2(word_dists, target, self.word_embeddings, self.vocab, k=self.config['topk'], th = self.config['threshold'])
            for k, v in precision_scores.items():
                dists['[Word Dist] Semantic Precision v2@{}'.format(k)].append(v)
                
            # Precision for word dist
            precision_scores = retrieval_precision_all(word_dists, target, k=self.config['topk'])
            for k, v in precision_scores.items():
                dists['[Word Dist] Precision v1@{}'.format(k)].append(v)

            precision_scores = retrieval_precision_all_v2(word_dists, target, k=self.config['topk'])
            for k, v in precision_scores.items():
                dists['[Word Dist] Precision v2@{}'.format(k)].append(v)

            # NDCG for word dist
            ndcg_scores = retrieval_normalized_dcg_all(word_dists, target, k=self.config['topk'])
            for k, v in ndcg_scores.items():
                dists['[Word Dist] ndcg@{}'.format(k)].append(v)
        
        for k in results:
            results[k] = np.mean(results[k])
        
        for k in dists:
            dists[k] = np.mean(dists[k])

        val_loss /= samples_processed

        return samples_processed, val_loss, results, dists

    def fit(self, training_set, validation_set, n_samples=20):
        self.model.to(self.device)
        train_loader = DataLoader(training_set, batch_size=self.config['batch_size'], shuffle=True, num_workers=self.num_data_loader_workers)
        validation_loader = DataLoader(validation_set, batch_size=self.config['batch_size'], shuffle=True, num_workers=self.num_data_loader_workers)
        
        train_loss = 0
        samples_processed = 0
        
        pbar = tqdm(self.num_epochs, position=0, leave=True)
        record_settings(self.config)

        for epoch in range(self.config['epochs']):
            s = datetime.datetime.now()
            sp, train_loss = self.training(train_loader)
            samples_processed += sp
            e = datetime.datetime.now()
            pbar.update(1)

            if  (epoch + 1) % 10 == 0:
                s = datetime.datetime.now()
                val_samples_processed, val_loss, val_res, dist_res = self.validation(validation_loader)
                e = datetime.datetime.now()

                pbar.set_description("Epoch: [{}/{}]\t Seen Samples: [{}/{}]\tTrain Loss: {}\tValid Loss: {}\tTime: {}".format(
                    epoch + 1, self.num_epochs, samples_processed,
                    len(training_set) * self.num_epochs, train_loss, val_loss, e - s))
                
                npmi = CoherenceNPMI(texts=self.texts, topics=self.get_topic_lists(10))
                diversity = InvertedRBO(topics=self.get_topic_lists(10))
                #record = open('./'+self.config['experiment']+'_'+self.config['dataset']+'_'+self.config['model']+'_'+self.config['architecture']+'_'+self.config['activation']+'_'+self.config['encoder']+'_'+self.config['target']+'_loss_'+self.config['loss']+'_lr'+str(self.config['lr'])+'_batch'+str(self.config['batch_size'])+'_weightdecay'+str(self.config['weight_decay'])+'.txt', 'a')
                print('---------------------------------------')
                #record.write('-------------------------------------------------\n')
                print('EPOCH', epoch + 1)
                #record.write('EPOCH '+ str(epoch + 1) + '\n')
                for key,val in val_res.items():
                    print(f"{key}:{val:.4f}")
                    #record.write(f"{key}:{val:.4f}\n")
                for key,val in dist_res.items():
                    print(f"{key}:{val:.4f}")
                    #record.write(f"{key}:{val:.4f}\n")
                print('NPMI: ', npmi.score())
                print('IRBO: ', diversity.score())
                #record.write('NPMI: '+ str(npmi.score()) + '\n')
                #record.write('IRBO: '+ str(diversity.score()) + '\n')

            self.best_components = self.model.beta
            pbar.set_description("Epoch: [{}/{}]\t Seen Samples: [{}/{}]\tTrain Loss: {}\tTime: {}".format(
                epoch + 1, self.num_epochs, samples_processed,
                len(training_set) * self.num_epochs, train_loss, e - s))
        pbar.close()
    
    def get_topic_lists(self, k=10):
        """
        Retrieve the lists of topic words.

        :param k: (int) number of words to return per topic, default 10.
        """
        assert k <= len(self.vocab), "k must be <= input size."
        # TODO: collapse this method with the one that just returns the topics
        component_dists = self.best_components
        topics = []
        for i in range(self.n_components):
            _, idxs = torch.topk(component_dists[i], k)
            component_words = [self.idx2token[idx]
                               for idx in idxs.cpu().numpy()]
            topics.append(component_words)
        return topics


In [11]:
# prepare dataset
dataset = IDEDataset(unpreprocessed_corpus, doc_embs, labels[config['target']])
training_length = int(len(dataset) * config['ratio'])
validation_length = len(dataset) - training_length
training_set, validation_set = random_split(dataset, lengths=[training_length, validation_length],generator=torch.Generator().manual_seed(42))

In [12]:
model = IDETopicDecoder(config, texts=texts, vocab = vocabularys[config['target']], idx2token=id2token, device=device, contextual_size=doc_embs.shape[1], word_embeddings=word_embeddings)
model.fit(training_set, validation_set)

Epoch: [10/10]	 Seen Samples: [4820/4820]	Train Loss: 20.03272578132598	Valid Loss: 11.53534818286738	Time: 0:00:17.283672: : 10it [02:06, 10.65s/it]

---------------------------------------
EPOCH 10
[Recon] Semantic Precision v1@10:0.2773
[Recon] Semantic Precision v1@30:0.1039
[Recon] Semantic Precision v1@50:0.0630
[Recon] Semantic Precision v2@10:0.2773
[Recon] Semantic Precision v2@30:0.1039
[Recon] Semantic Precision v2@50:0.0630
[Recon] Precision v1@10:0.2719
[Recon] Precision v1@30:0.1036
[Recon] Precision v1@50:0.0630
[Recon] Precision v2@10:0.2719
[Recon] Precision v2@30:0.1036
[Recon] Precision v2@50:0.0630
[Recon] ndcg@10:0.8406
[Recon] ndcg@30:0.8748
[Recon] ndcg@50:0.8773
[Recon] ndcg@all:0.8773
[Word Dist] Semantic Precision v1@10:0.0789
[Word Dist] Semantic Precision v1@30:0.0594
[Word Dist] Semantic Precision v1@50:0.0495
[Word Dist] Semantic Precision v2@10:0.0789
[Word Dist] Semantic Precision v2@30:0.0594
[Word Dist] Semantic Precision v2@50:0.0495
[Word Dist] Precision v1@10:0.0461
[Word Dist] Precision v1@30:0.0438
[Word Dist] Precision v1@50:0.0427
[Word Dist] Precision v2@10:0.0461
[Word Dist] Precision v2@30:

Epoch: [10/10]	 Seen Samples: [4820/4820]	Train Loss: 20.03272578132598	Time: 0:00:17.283672: : 10it [02:16, 13.68s/it]                              

NPMI:  -0.47155388081625854
IRBO:  0.8990302476769841





In [None]:
###

In [None]:
import random
doc_idx = []
print(len(validation_set))
for idx in range(200):
    doc_idx.append(random.randint(0, len(validation_set)))
print(doc_idx)

In [None]:
import numpy as np
import random
# visualize documents
check_nums = 10
for idx in doc_idx:
    # get recontruct result
    recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

    # get ranking index
    recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

        # show info
    doc_topics_distribution = model.get_doc_topic_distribution(validation_set)
    doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[idx])]
    print('Documents ', idx)
    print(doc_list[idx])
    print('---------------------------------------')
    print('Topic of Document: ')
    print(doc_topics)
    print('---------------------------------------')
    print('[Predict] Top 10 Words in Document: ')
    for word_idx in range(10):
        print(dataset.idx2token[recon_rank_list[idx][word_idx]])
    print('---------------------------------------')
    print('[Label] Top 10 Words in Document: ')
    for idx in range(10):
        print(dataset.idx2token[target_rank_list[idx][word_idx]])
        print('---------------------------------------\n')

Sampling: [5/20]: : 5it [00:25,  5.08s/it]

In [7]:
recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

In [8]:
print(target_list.shape)

(3770, 2000)


In [None]:
import numpy as np
recon_list = recon_list
recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

In [10]:
doc_idx = 1698
print(recon_rank_list[doc_idx])

[ 179. 1844. 1907. ...  491.  577.  979.]


In [11]:
print(recon_rank_list)

[[ 950. 1379.   76. ... 1402.  709. 1320.]
 [ 107.  310.  793. ... 1815. 1387. 1539.]
 [1072. 1269. 1006. ...  980.  893.   46.]
 ...
 [ 761. 1626. 1996. ... 1014. 1373. 1739.]
 [ 670.  693.  865. ...  582.   46.  772.]
 [ 290.  928.  904. ... 1205.  476.  463.]]


In [12]:
print(doc_list[doc_idx])

From: dennisk@cs.uoregon.edu (Dennis Kennedy)
Subject: '72 Chevelle SS forsale
Organization: University of Oregon
Lines: 11
Distribution: usa
NNTP-Posting-Host: fp2-cc-25.uoregon.edu

I don't want to sell this car, but I need money for college.
1972 Chevelle Super Sport
Rebuilt 402, four speed, 12 Bolt positrac
Numbers match
110,000 original miles
no rust
Looks and runs excellent
$5995 or best offer.
Call Dennis at (503)343-3759
or email dennisk@cs.uoregon.edu


In [13]:
for idx in range(10):
    print(dataset.idx2token[recon_rank_list[doc_idx][idx]])

already
tv
video
late
card
display
3t
1t
asked
games


In [14]:
for idx in range(10):
    print(dataset.idx2token[target_rank_list[doc_idx][idx]])

cs
72
miles
excellent
runs
numbers
offer
sell
four
looks


In [12]:
model.get_topic_lists()

[['windows',
  'drive',
  'card',
  'disk',
  'help',
  'mac',
  'dos',
  'mouse',
  'problem',
  'pc'],
 ['god',
  'jesus',
  'sin',
  'rutgers',
  'christ',
  'faith',
  'athos',
  'truth',
  'sandvik',
  'church'],
 ['jpeg',
  'edu',
  'gif',
  'image',
  'quality',
  'format',
  'images',
  'get',
  'programs',
  'color'],
 ['gov',
  'access',
  'hst',
  'nasa',
  'shuttle',
  'digex',
  'net',
  'jpl',
  'pat',
  'mission'],
 ['bike', 'dog', 'ca', 'com', 'ride', 'riding', 'dod', 'bnr', 'car', 'bmw'],
 ['10', '46', 'van', '12', '25', 'nj', '11', '64', '28', '60'],
 ['ax', 'max', 'giz', 'bhj', 'writes', 'g9v', '75u', 'pl', 'b8f', '2tm'],
 ['one',
  'people',
  'would',
  'like',
  'see',
  'even',
  'time',
  'lord',
  'said',
  'us'],
 ['article',
  'writes',
  'muslims',
  'islam',
  'turkey',
  'edu',
  'greek',
  'muslim',
  'turks',
  'turkish'],
 ['window',
  'problem',
  'program',
  'help',
  'nl',
  'thanks',
  'error',
  'create',
  'table',
  'screen'],
 ['cx', 'mv', 'ax'

In [33]:
doc_topics_distribution = model.get_doc_topic_distribution(validation_set)

Sampling: [20/20]: : 20it [03:04,  9.25s/it]


In [37]:
doc_topics_distribution[doc_idx]

array([0.00264944, 0.00154069, 0.00157822, 0.00240728, 0.00557763,
       0.00427778, 0.00276508, 0.00872498, 0.04012846, 0.00356483,
       0.00288309, 0.00270965, 0.00298711, 0.00746678, 0.00177612,
       0.00248846, 0.00091876, 0.00518265, 0.00199116, 0.01588894,
       0.08754831, 0.00166128, 0.00302516, 0.00350954, 0.00638637,
       0.00344035, 0.00659486, 0.00204372, 0.00246886, 0.01168864,
       0.01562314, 0.00923532, 0.00320664, 0.00575135, 0.02873103,
       0.00292833, 0.0047355 , 0.00574315, 0.00327179, 0.00780392,
       0.00383813, 0.00213848, 0.00608836, 0.00335248, 0.00340084,
       0.00218007, 0.00324257, 0.63300758, 0.00526384, 0.00458329])

In [40]:
doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[doc_idx])]
print(doc_topics)

['israel', 'israeli', 'arab', 'jewish', 'jews', 'arabs', 'adam', 'policy', 'attacks', 'peace']


In [27]:
test = []
for i in range(2):
    for j in range(2):
        test.append(unpreprocessed_corpus[i+j])

In [9]:
raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)

Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[\w+|\-]+\b')
decode_target = vectorizer.fit_transform(preprocessed_documents)
vocabulary = vectorizer.get_feature_names()
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [11]:
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [20]:
print(target.shape)

(4614, 1719)
