In [1]:
import sys
import os
import torch
import nltk
import argparse
from torch.utils.data import DataLoader, random_split

sys.path.append("../")
from model.contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from model.contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, calculate_word_embeddings_tensor, load_word2emb
from model.contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs


os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(8)



In [2]:
config = {
    'model': 'ZTM',
    'dataset': 'tweet',
    'dataset_name': 'tweet',
    'vocabulary_size':100,
    'encoder': 'roberta',
    'target': 'tf-idf',
    'topic_num': 50,
    'seed': 123,
    'epochs': 1,
    'ratio': 0.8,
    'topk': [10, 30, 50],
    'save': False,
    'threshold': 0.7,
}

show_settings(config)
same_seeds(config['seed'])

-------- Info ---------
model: ZTM
dataset: tweet
dataset_name: tweet
vocabulary_size: 100
encoder: roberta
target: tf-idf
topic_num: 50
seed: 123
epochs: 1
ratio: 0.8
topk: [10, 30, 50]
save: False
threshold: 0.7

-----------------------


In [3]:
# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]

Getting preprocess documents: tweet
min_df: 1 max_df: 1.0 vocabulary_size: 100 min_doc_word: 15


Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [4]:
# generating document embedding
doc_embs, doc_model = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Decode target & Vocabulary
labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
id2token = {k: v for k, v in zip(range(0, len(vocabularys[config['target']])), vocabularys[config['target']])}

Getting preprocess documents labels


In [6]:
# prepare dataset
tp = TopicModelDataPreparation(contextualized_model=doc_embs, target=config['target'])
dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_corpus, text_for_doc2vec=texts, decode_target=labels[config['target']], vocab=vocabularys[config['target']], id2token=id2token)
training_length = int(len(dataset) * config['ratio'])
validation_length = len(dataset) - training_length
training_set, validation_set = random_split(dataset, lengths=[training_length, validation_length],generator=torch.Generator().manual_seed(42))

In [7]:
print(len(tp.vocab))
print(len(vocabularys[config['target']]))

74
74


In [8]:
# word embedding preparation
word_embeddings = calculate_word_embeddings_tensor(load_word2emb("../data/glove.6B.300d.txt"), tp)

0it [00:00, ?it/s]

Number of words:400000


In [9]:
# Define document embeddings dimension
if config['encoder'] == 'doc2vec':
    contextual_size = 200
elif config['encoder'] == 'average':
    contextual_size = 300
else:
    contextual_size = 768

In [10]:
if config['model'] == 'CombinedTM':
    model = CombinedTM(bow_size=len(tp.vocab), contextual_size=contextual_size, n_components=config['topic_num'], num_epochs=config['epochs'], config=config, texts=texts, vocab = tp.vocab, word_embeddings=word_embeddings, idx2token=dataset.idx2token)
elif config['model'] == 'mlp':
    model = MLPDecoder(bow_size=len(tp.vocab), contextual_size=contextual_size, num_epochs=config['epochs'], config=config, texts=texts,vocab = tp.vocab, word_embeddings=word_embeddings, idx2token=dataset.idx2token)
else:
    model = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=contextual_size, n_components=config['topic_num'], num_epochs=config['epochs'], config=config, texts=texts, vocab = tp.vocab, word_embeddings=word_embeddings, idx2token=dataset.idx2token)
model.fit(training_set, validation_set)

Using cuda 2 for training...


Epoch: [1/1]	 Seen Samples: [482/482]	Train Loss: 53.37462847064639	Time: 0:00:04.788851: : 1it [00:11,  6.80s/it]                               

---------------------------------------
EPOCH 1
[Recon] Semantic Precision@10:0.1832
[Recon] Semantic Precision@30:0.0932
[Recon] Semantic Precision@50:0.0605
[Recon] Precision@10:0.1742
[Recon] Precision@30:0.0897
[Recon] Precision@50:0.0602
[Recon] ndcg@10:0.4970
[Recon] ndcg@30:0.5787
[Recon] ndcg@50:0.6034
[Recon] ndcg@all:0.6097
[Word Dist] Semantic Precision@10:0.0720
[Word Dist] Semantic Precision@30:0.0514
[Word Dist] Semantic Precision@50:0.0482
[Word Dist] Precision@10:0.0447
[Word Dist] Precision@30:0.0368
[Word Dist] Precision@50:0.0438
[Word Dist] ndcg@10:0.0958
[Word Dist] ndcg@30:0.1653
[Word Dist] ndcg@50:0.2522
[Word Dist] ndcg@all:0.3154
NPMI:  -0.4655091663437667
IRBO:  0.9054914960380933


Epoch: [1/1]	 Seen Samples: [482/482]	Train Loss: 53.37462847064639	Time: 0:00:04.788851: : 1it [00:28, 28.75s/it]


In [None]:
###

In [None]:
import random
doc_idx = []
print(len(validation_set))
for idx in range(200):
    doc_idx.append(random.randint(0, len(validation_set)))
print(doc_idx)

In [None]:
import numpy as np
import random
# visualize documents
check_nums = 10
for idx in doc_idx:
    # get recontruct result
    recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

    # get ranking index
    recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

        # show info
    doc_topics_distribution = model.get_doc_topic_distribution(validation_set)
    doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[idx])]
    print('Documents ', idx)
    print(doc_list[idx])
    print('---------------------------------------')
    print('Topic of Document: ')
    print(doc_topics)
    print('---------------------------------------')
    print('[Predict] Top 10 Words in Document: ')
    for word_idx in range(10):
        print(dataset.idx2token[recon_rank_list[idx][word_idx]])
    print('---------------------------------------')
    print('[Label] Top 10 Words in Document: ')
    for idx in range(10):
        print(dataset.idx2token[target_rank_list[idx][word_idx]])
        print('---------------------------------------\n')

Sampling: [5/20]: : 5it [00:25,  5.08s/it]

In [7]:
recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

In [8]:
print(target_list.shape)

(3770, 2000)


In [None]:
import numpy as np
recon_list = recon_list
recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

In [10]:
doc_idx = 1698
print(recon_rank_list[doc_idx])

[ 179. 1844. 1907. ...  491.  577.  979.]


In [11]:
print(recon_rank_list)

[[ 950. 1379.   76. ... 1402.  709. 1320.]
 [ 107.  310.  793. ... 1815. 1387. 1539.]
 [1072. 1269. 1006. ...  980.  893.   46.]
 ...
 [ 761. 1626. 1996. ... 1014. 1373. 1739.]
 [ 670.  693.  865. ...  582.   46.  772.]
 [ 290.  928.  904. ... 1205.  476.  463.]]


In [12]:
print(doc_list[doc_idx])

From: dennisk@cs.uoregon.edu (Dennis Kennedy)
Subject: '72 Chevelle SS forsale
Organization: University of Oregon
Lines: 11
Distribution: usa
NNTP-Posting-Host: fp2-cc-25.uoregon.edu

I don't want to sell this car, but I need money for college.
1972 Chevelle Super Sport
Rebuilt 402, four speed, 12 Bolt positrac
Numbers match
110,000 original miles
no rust
Looks and runs excellent
$5995 or best offer.
Call Dennis at (503)343-3759
or email dennisk@cs.uoregon.edu


In [13]:
for idx in range(10):
    print(dataset.idx2token[recon_rank_list[doc_idx][idx]])

already
tv
video
late
card
display
3t
1t
asked
games


In [14]:
for idx in range(10):
    print(dataset.idx2token[target_rank_list[doc_idx][idx]])

cs
72
miles
excellent
runs
numbers
offer
sell
four
looks


In [12]:
model.get_topic_lists()

[['windows',
  'drive',
  'card',
  'disk',
  'help',
  'mac',
  'dos',
  'mouse',
  'problem',
  'pc'],
 ['god',
  'jesus',
  'sin',
  'rutgers',
  'christ',
  'faith',
  'athos',
  'truth',
  'sandvik',
  'church'],
 ['jpeg',
  'edu',
  'gif',
  'image',
  'quality',
  'format',
  'images',
  'get',
  'programs',
  'color'],
 ['gov',
  'access',
  'hst',
  'nasa',
  'shuttle',
  'digex',
  'net',
  'jpl',
  'pat',
  'mission'],
 ['bike', 'dog', 'ca', 'com', 'ride', 'riding', 'dod', 'bnr', 'car', 'bmw'],
 ['10', '46', 'van', '12', '25', 'nj', '11', '64', '28', '60'],
 ['ax', 'max', 'giz', 'bhj', 'writes', 'g9v', '75u', 'pl', 'b8f', '2tm'],
 ['one',
  'people',
  'would',
  'like',
  'see',
  'even',
  'time',
  'lord',
  'said',
  'us'],
 ['article',
  'writes',
  'muslims',
  'islam',
  'turkey',
  'edu',
  'greek',
  'muslim',
  'turks',
  'turkish'],
 ['window',
  'problem',
  'program',
  'help',
  'nl',
  'thanks',
  'error',
  'create',
  'table',
  'screen'],
 ['cx', 'mv', 'ax'

In [33]:
doc_topics_distribution = model.get_doc_topic_distribution(validation_set)

Sampling: [20/20]: : 20it [03:04,  9.25s/it]


In [37]:
doc_topics_distribution[doc_idx]

array([0.00264944, 0.00154069, 0.00157822, 0.00240728, 0.00557763,
       0.00427778, 0.00276508, 0.00872498, 0.04012846, 0.00356483,
       0.00288309, 0.00270965, 0.00298711, 0.00746678, 0.00177612,
       0.00248846, 0.00091876, 0.00518265, 0.00199116, 0.01588894,
       0.08754831, 0.00166128, 0.00302516, 0.00350954, 0.00638637,
       0.00344035, 0.00659486, 0.00204372, 0.00246886, 0.01168864,
       0.01562314, 0.00923532, 0.00320664, 0.00575135, 0.02873103,
       0.00292833, 0.0047355 , 0.00574315, 0.00327179, 0.00780392,
       0.00383813, 0.00213848, 0.00608836, 0.00335248, 0.00340084,
       0.00218007, 0.00324257, 0.63300758, 0.00526384, 0.00458329])

In [40]:
doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[doc_idx])]
print(doc_topics)

['israel', 'israeli', 'arab', 'jewish', 'jews', 'arabs', 'adam', 'policy', 'attacks', 'peace']


In [27]:
test = []
for i in range(2):
    for j in range(2):
        test.append(unpreprocessed_corpus[i+j])

In [9]:
raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)

Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[\w+|\-]+\b')
decode_target = vectorizer.fit_transform(preprocessed_documents)
vocabulary = vectorizer.get_feature_names()
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [11]:
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [20]:
print(target.shape)

(4614, 1719)
