In [1]:
import os
import sys
import nltk
import time
import torch
import random
import argparse
import numpy as np
from torch.utils.data import DataLoader, random_split

sys.path.append("../")
from load_pretrain_label import load_preprocess_document_labels
from model.ide_topic_decoder import IDEDataset, IDETopicDecoder
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs




In [2]:
config = {
    'model': 'ZTM',
    'architecture': 'after',
    'activation': 'sigmoid',
    'dataset': '20news',
    'vocab_size':0,
    'encoder': 'bert',
    'target': 'tf-idf-gensim',
    'topic_num': 50,
    'seed': 123,
    'epochs': 10,
    'lr': 1e-4,
    'loss': 'listnet',
    'batch_size': 8,
    'weight_decay': 0,
    'ratio': 0.8,
    'topk': [10, 30, 50],
    'save': False,
    'threshold': 0.7,
}

#show_settings(config)
same_seeds(config['seed'])

In [3]:
# Parameter
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

In [4]:
# data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]

Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15




In [6]:
# generating document embedding
doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])

Getting preprocess documents embeddings
Using cuda 1 for training...


Some weights of the model checkpoint at /dhome/casimir0304/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/372 [00:00<?, ?it/s]

In [6]:
print(device)

cuda:4


In [5]:
# Decode target & Vocabulary
if config['target'] == 'keybert' or config['target'] == 'yake':
    labels, vocabularys= load_preprocess_document_labels(config)
    label = labels[config['target']].toarray()
    if config['target'] == 'yake':
        label = np.abs(label)
else:
    labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
    label = labels[config['target']]
    vocabularys = vocabularys[config['target']]
id2token = {k: v for k, v in zip(range(0, len(vocabularys)), vocabularys)}

Getting preprocess documents labels


In [7]:
print(len(vocabularys))
print(np.array(label)

4829


AttributeError: 'list' object has no attribute 'type'

In [26]:
# word embedding preparation
word_embeddings = get_word_embs(vocabularys, data_type='tensor')

0it [00:00, ?it/s]

Number of words:400000
Getting [tensor] word embeddings


  word_embs = torch.Tensor(word_embs)


In [31]:
# prepare dataset
dataset = IDEDataset(unpreprocessed_corpus, doc_embs, label)
training_length = int(len(dataset) * config['ratio'])
validation_length = len(dataset) - training_length
training_set, validation_set = random_split(dataset, lengths=[training_length, validation_length],generator=torch.Generator().manual_seed(42))

In [33]:
model = IDETopicDecoder(config, texts=texts, vocab = vocabularys, idx2token=id2token, device=device, contextual_size=doc_embs.shape[1], word_embeddings=word_embeddings)
model.fit(training_set, validation_set)

0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 412.00 MiB (GPU 1; 23.70 GiB total capacity; 4.19 GiB already allocated; 369.81 MiB free; 4.79 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
###

In [None]:
import random
doc_idx = []
print(len(validation_set))
for idx in range(200):
    doc_idx.append(random.randint(0, len(validation_set)))
print(doc_idx)

In [None]:
import numpy as np
import random
# visualize documents
check_nums = 10
for idx in doc_idx:
    # get recontruct result
    recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

    # get ranking index
    recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
    for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

        # show info
    doc_topics_distribution = model.get_doc_topic_distribution(validation_set)
    doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[idx])]
    print('Documents ', idx)
    print(doc_list[idx])
    print('---------------------------------------')
    print('Topic of Document: ')
    print(doc_topics)
    print('---------------------------------------')
    print('[Predict] Top 10 Words in Document: ')
    for word_idx in range(10):
        print(dataset.idx2token[recon_rank_list[idx][word_idx]])
    print('---------------------------------------')
    print('[Label] Top 10 Words in Document: ')
    for idx in range(10):
        print(dataset.idx2token[target_rank_list[idx][word_idx]])
        print('---------------------------------------\n')

Sampling: [5/20]: : 5it [00:25,  5.08s/it]

In [7]:
recon_list, target_list, doc_list = model.get_reconstruct(validation_set)

In [8]:
print(target_list.shape)

(3770, 2000)


In [None]:
import numpy as np
recon_list = recon_list
recon_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
target_rank_list = np.zeros((len(recon_list), len(tp.vocab)), dtype='float32')
for i in range(len(recon_list)):
        recon_rank_list[i] = np.argsort(recon_list[i])[::-1]
        target_rank_list[i] = np.argsort(target_list[i])[::-1]

In [10]:
doc_idx = 1698
print(recon_rank_list[doc_idx])

[ 179. 1844. 1907. ...  491.  577.  979.]


In [11]:
print(recon_rank_list)

[[ 950. 1379.   76. ... 1402.  709. 1320.]
 [ 107.  310.  793. ... 1815. 1387. 1539.]
 [1072. 1269. 1006. ...  980.  893.   46.]
 ...
 [ 761. 1626. 1996. ... 1014. 1373. 1739.]
 [ 670.  693.  865. ...  582.   46.  772.]
 [ 290.  928.  904. ... 1205.  476.  463.]]


In [12]:
print(doc_list[doc_idx])

From: dennisk@cs.uoregon.edu (Dennis Kennedy)
Subject: '72 Chevelle SS forsale
Organization: University of Oregon
Lines: 11
Distribution: usa
NNTP-Posting-Host: fp2-cc-25.uoregon.edu

I don't want to sell this car, but I need money for college.
1972 Chevelle Super Sport
Rebuilt 402, four speed, 12 Bolt positrac
Numbers match
110,000 original miles
no rust
Looks and runs excellent
$5995 or best offer.
Call Dennis at (503)343-3759
or email dennisk@cs.uoregon.edu


In [13]:
for idx in range(10):
    print(dataset.idx2token[recon_rank_list[doc_idx][idx]])

already
tv
video
late
card
display
3t
1t
asked
games


In [14]:
for idx in range(10):
    print(dataset.idx2token[target_rank_list[doc_idx][idx]])

cs
72
miles
excellent
runs
numbers
offer
sell
four
looks


In [12]:
model.get_topic_lists()

[['windows',
  'drive',
  'card',
  'disk',
  'help',
  'mac',
  'dos',
  'mouse',
  'problem',
  'pc'],
 ['god',
  'jesus',
  'sin',
  'rutgers',
  'christ',
  'faith',
  'athos',
  'truth',
  'sandvik',
  'church'],
 ['jpeg',
  'edu',
  'gif',
  'image',
  'quality',
  'format',
  'images',
  'get',
  'programs',
  'color'],
 ['gov',
  'access',
  'hst',
  'nasa',
  'shuttle',
  'digex',
  'net',
  'jpl',
  'pat',
  'mission'],
 ['bike', 'dog', 'ca', 'com', 'ride', 'riding', 'dod', 'bnr', 'car', 'bmw'],
 ['10', '46', 'van', '12', '25', 'nj', '11', '64', '28', '60'],
 ['ax', 'max', 'giz', 'bhj', 'writes', 'g9v', '75u', 'pl', 'b8f', '2tm'],
 ['one',
  'people',
  'would',
  'like',
  'see',
  'even',
  'time',
  'lord',
  'said',
  'us'],
 ['article',
  'writes',
  'muslims',
  'islam',
  'turkey',
  'edu',
  'greek',
  'muslim',
  'turks',
  'turkish'],
 ['window',
  'problem',
  'program',
  'help',
  'nl',
  'thanks',
  'error',
  'create',
  'table',
  'screen'],
 ['cx', 'mv', 'ax'

In [33]:
doc_topics_distribution = model.get_doc_topic_distribution(validation_set)

Sampling: [20/20]: : 20it [03:04,  9.25s/it]


In [37]:
doc_topics_distribution[doc_idx]

array([0.00264944, 0.00154069, 0.00157822, 0.00240728, 0.00557763,
       0.00427778, 0.00276508, 0.00872498, 0.04012846, 0.00356483,
       0.00288309, 0.00270965, 0.00298711, 0.00746678, 0.00177612,
       0.00248846, 0.00091876, 0.00518265, 0.00199116, 0.01588894,
       0.08754831, 0.00166128, 0.00302516, 0.00350954, 0.00638637,
       0.00344035, 0.00659486, 0.00204372, 0.00246886, 0.01168864,
       0.01562314, 0.00923532, 0.00320664, 0.00575135, 0.02873103,
       0.00292833, 0.0047355 , 0.00574315, 0.00327179, 0.00780392,
       0.00383813, 0.00213848, 0.00608836, 0.00335248, 0.00340084,
       0.00218007, 0.00324257, 0.63300758, 0.00526384, 0.00458329])

In [40]:
doc_topics = model.get_topic_lists()[np.argmax(doc_topics_distribution[doc_idx])]
print(doc_topics)

['israel', 'israeli', 'arab', 'jewish', 'jews', 'arabs', 'adam', 'policy', 'attacks', 'peace']


In [27]:
test = []
for i in range(2):
    for j in range(2):
        test.append(unpreprocessed_corpus[i+j])

In [9]:
raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)

Reusing dataset tweet_eval (/dhome/casimir0304/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

raw_documents = load_document(config['dataset'])["documents"]
preprocessed_documents, unpreprocessed_corpus, texts = preprocess_document(raw_documents)
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[\w+|\-]+\b')
decode_target = vectorizer.fit_transform(preprocessed_documents)
vocabulary = vectorizer.get_feature_names()
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [11]:
id2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

In [20]:
print(target.shape)

(4614, 1719)


In [7]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
dct = Dictionary(texts)  # fit dictionary
corpus = [dct.doc2bow(line) for line in texts]  # convert corpus to BoW format


In [22]:
print(corpus[0])
print(texts[0])
print(len(dct))

[(0, 1), (1, 3), (2, 1), (3, 1), (4, 4), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1)]
['berkeley', 'edu', 'cubs', 'article', 'organization', 'university', 'california', 'berkeley', 'lines', 'posting', 'host', 'berkeley', 'pilot', 'net', 'writes', 'era', 'run', 'year', 'cubs', 'think', 'pitcher', 'season', 'helped', 'lead', 'era', 'rotation', 'cubs', 'era', 'braves', 'know', 'season', 'cubs', 'fans', 'learned']
4829


In [28]:
model = TfidfModel(corpus, normalize=False)  # fit model
vector = model[corpus]

In [64]:
from gensim.matutils import corpus2dense, corpus2csc
corpus_tfidf_dense = corpus2dense(vector, num_terms=len(dct.keys()), num_docs=dct.num_docs)
#corpus_tfidf_sparse = corpus2csc(vector, num_terms=len(dct.keys()), num_docs=dct.num_docs)

In [69]:
target = np.array(corpus_tfidf_dense).T.tolist()
print(len(target.shape))


AttributeError: 'list' object has no attribute 'shape'

In [74]:
vocabularys

['aaron',
 'ab',
 'abc',
 'abiding',
 'ability',
 'abortion',
 'absence',
 'absolute',
 'abstract',
 'absurd',
 'abuse',
 'abz',
 'ac',
 'academic',
 'acc',
 'accelerator',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accident',
 'accidents',
 'accomplish',
 'accomplished',
 'according',
 'account',
 'accounts',
 'accuracy',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'acid',
 'acns',
 'acpub',
 'across',
 'acs',
 'acsu',
 'act',
 'acting',
 'action',
 'actions',
 'activities',
 'activity',
 'acts',
 'ad',
 'adam',
 'adams',
 'adapter',
 'add',
 'added',
 'adding',
 'addition',
 'address',
 'addressed',
 'addresses',
 'addressing',
 'adds',
 'adequate',
 'adjust',
 'admin',
 'administration',
 'administrator',
 'admit',
 'admitted',
 'adobe',
 'ads',
 'adult',
 'adults',
 'advance',
 'advanced',
 'advantage',
 'advantages',
 'advertising',
 'advice',
 'advocate',
 'af',
 'affairs',
 'affect',
 'affected',
 'afford',
 'afraid',
 'africa',
 'aft

In [87]:
list(zip(*dct.items()))[0]

(0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [89]:
print(len(vocabularys))

4823


In [90]:
print(len(dct))

4829


In [88]:
list(zip(*dct.items()))[1]

('article',
 'berkeley',
 'braves',
 'california',
 'cubs',
 'edu',
 'era',
 'fans',
 'helped',
 'host',
 'know',
 'lead',
 'learned',
 'lines',
 'net',
 'organization',
 'pilot',
 'pitcher',
 'posting',
 'rotation',
 'run',
 'season',
 'think',
 'university',
 'writes',
 'year',
 'annoying',
 'apple',
 'boot',
 'buttons',
 'cable',
 'come',
 'complain',
 'computer',
 'connected',
 'control',
 'convenient',
 'deal',
 'dealer',
 'decided',
 'drive',
 'ethernet',
 'exchange',
 'feature',
 'functions',
 'get',
 'got',
 'keyboard',
 'literature',
 'location',
 'machine',
 'made',
 'mentioned',
 'nelson',
 'oh',
 'order',
 'ordered',
 'others',
 'physics',
 'pissed',
 'pitched',
 'power',
 'price',
 'reset',
 'rutgers',
 'saw',
 'screen',
 'scsi',
 'seems',
 'seen',
 'shipping',
 'store',
 'thanks',
 'time',
 'took',
 'try',
 'univ',
 'wanted',
 'way',
 'weeks',
 'yesterday',
 'ac',
 'address',
 'age',
 'agency',
 'ai',
 'alan',
 'algorithm',
 'algorithms',
 'alt',
 'america',
 'analysis',


In [76]:
id2token

{0: 'aaron',
 1: 'ab',
 2: 'abc',
 3: 'abiding',
 4: 'ability',
 5: 'abortion',
 6: 'absence',
 7: 'absolute',
 8: 'abstract',
 9: 'absurd',
 10: 'abuse',
 11: 'abz',
 12: 'ac',
 13: 'academic',
 14: 'acc',
 15: 'accelerator',
 16: 'accept',
 17: 'acceptable',
 18: 'acceptance',
 19: 'accepted',
 20: 'accepting',
 21: 'access',
 22: 'accident',
 23: 'accidents',
 24: 'accomplish',
 25: 'accomplished',
 26: 'according',
 27: 'account',
 28: 'accounts',
 29: 'accuracy',
 30: 'accurate',
 31: 'accused',
 32: 'achieve',
 33: 'achieved',
 34: 'acid',
 35: 'acns',
 36: 'acpub',
 37: 'across',
 38: 'acs',
 39: 'acsu',
 40: 'act',
 41: 'acting',
 42: 'action',
 43: 'actions',
 44: 'activities',
 45: 'activity',
 46: 'acts',
 47: 'ad',
 48: 'adam',
 49: 'adams',
 50: 'adapter',
 51: 'add',
 52: 'added',
 53: 'adding',
 54: 'addition',
 55: 'address',
 56: 'addressed',
 57: 'addresses',
 58: 'addressing',
 59: 'adds',
 60: 'adequate',
 61: 'adjust',
 62: 'admin',
 63: 'administration',
 64: 'adm

In [79]:
dct.get()

TypeError: get() missing 1 required positional argument: 'key'