In [1]:
import os
import sys
import nltk
import time
import math
import torch
import random
import argparse
import numpy as np
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset

sys.path.append("../")
from load_pretrain_label import load_preprocess_document_labels
from model.ide_gan_decoder import IDEDataset, IDEGanDecoder
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_word_embs, merge_targets
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2

# Turn off the HuggingFace tokenizers' own thread pool
# (presumably to avoid fork warnings with DataLoader workers -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Cap the number of CPU threads torch uses for intra-op parallelism.
torch.set_num_threads(15)

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from sklearn.feature_extraction.text import TfidfVectorizer
def get_preprocess_document_labels(preprocessed_docs, preprocess_config='../chris/parameters/preprocess_config.json'):
    '''
    Build the decoder targets (TF-IDF and bag-of-words) for a preprocessed corpus.

    Two independent vocabularies are produced: one from scikit-learn's
    TfidfVectorizer (used for 'tf-idf' and 'bow') and one from a gensim
    Dictionary built on whitespace-split tokens (used for 'tf-idf-gensim').
    Their sizes and orderings are not guaranteed to match.

            Parameters:
                    preprocessed_docs (list): documents as whitespace-separated strings
                    preprocess_config (str): not used by this function; kept so
                            existing callers that pass it keep working
            Returns:
                    labels (dict): 'tf-idf', 'tf-idf-gensim', 'bow' -> 2-D float array,
                            one row per document
                    vocabularys (dict): same keys -> sequence of tokens, index == column
                    gensim_dct (gensim.corpora.Dictionary): dictionary fitted on the corpus
    '''
    print('Getting preprocess documents labels')

    # --- scikit-learn view: normalised TF-IDF and the binary bag-of-words derived from it.
    vectorizer = TfidfVectorizer()
    # convert sparse matrix to numpy array
    sklearn_tf_idf_vector = vectorizer.fit_transform(preprocessed_docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = vectorizer.get_feature_names()
    # TF-IDF weights are non-negative, so "present in the document" is simply "> 0".
    bow_vector = (sklearn_tf_idf_vector > 0).astype(sklearn_tf_idf_vector.dtype)

    # --- gensim view: un-normalised TF-IDF over whitespace tokens.
    docs = [doc.split() for doc in preprocessed_docs]
    gensim_dct = Dictionary(docs)
    gensim_corpus = [gensim_dct.doc2bow(doc) for doc in docs]
    model = TfidfModel(gensim_corpus, normalize=False)
    # corpus2dense returns (num_terms, num_docs); transpose to one row per document.
    # Going straight to a contiguous float64 array gives the same values as the
    # previous ndarray -> list -> ndarray round-trip without the Python-level copy.
    dense_terms_by_docs = corpus2dense(model[gensim_corpus], num_terms=len(gensim_dct.keys()), num_docs=gensim_dct.num_docs)
    gensim_tf_idf_vector = np.ascontiguousarray(np.asarray(dense_terms_by_docs).T, dtype=np.float64)

    labels = {}
    labels['tf-idf'] = sklearn_tf_idf_vector
    labels['tf-idf-gensim'] = gensim_tf_idf_vector
    labels['bow'] = bow_vector

    vocabularys = {}
    vocabularys['tf-idf'] = vocabulary
    # Tokens ordered by gensim id so that position i names column i of the dense
    # matrix (also works for an empty dictionary).
    vocabularys['tf-idf-gensim'] = tuple(gensim_dct[token_id] for token_id in sorted(gensim_dct.keys()))
    vocabularys['bow'] = vocabulary

    return labels, vocabularys, gensim_dct

In [3]:
# Experiment configuration shared by data generation and the decoder.
# Keys are kept in their original order because the whole dict is also
# forwarded as keyword arguments to get_preprocess_document().
config = dict(
    # experiment identity (also used to build the results file name)
    experiment='with_classifier',
    model='IDE_GAN',
    architecture='concatenate',
    activation='sigmoid',
    # data / target selection
    dataset='20news',
    vocab_size=0,
    encoder='mpnet',
    target='tf-idf-gensim',
    seed=123,
    # optimisation
    epochs=3000,
    lr=1e-4,
    optim='AdamW',
    scheduler=True,
    warmup='linear',
    warmup_proportion=0.1,
    loss='listnet',
    batch_size=32,
    weight_decay=0,
    # fraction of training documents that keep their label
    ratio=0.1,
    # cut-offs for the retrieval metrics: 5, 10, ..., 50
    topk=list(range(5, 51, 5)),
    save=False,
    threshold=0.7,
)
same_seeds(config["seed"])

In [4]:
def generate_dataset(config, balance=False):
    """Build the training and validation ``IDEDataset`` objects for one experiment.

    Steps: load the corpus, embed every document, build the decode targets,
    shuffle and split 80/20, mark a ``config['ratio']`` fraction of the training
    documents as labelled and, optionally, oversample those labelled documents.

    Parameters:
        config (dict): experiment settings. Forwarded in full as keyword
            arguments to ``get_preprocess_document``; this function itself
            reads 'encoder', 'target' and 'ratio'.
        balance (bool): when True (and ``ratio != 1``) every labelled training
            document is appended ``max(1, int(log2(int(1 / ratio))))`` extra times.

    Returns:
        tuple: ``(training_set, validation_set, vocabularys, id2token,
        gensim_dct, device)`` where ``device`` is whatever
        ``get_preprocess_document_embs`` reported.
    """
    # Data preprocessing
    unpreprocessed_corpus, preprocessed_corpus = get_preprocess_document(**config)
    print('[INFO] Load corpus done.')

    # Generating document embedding. Retry forever on failure (the original intent
    # is to wait for GPU memory), but only swallow ordinary exceptions so that a
    # kernel interrupt / KeyboardInterrupt can still break out of the loop.
    while True:
        try:
            # The second return value (named doc_model by the original code) is not needed here.
            doc_embs, _, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])
            break
        except Exception as err:
            print('[Error] CUDA Memory Insufficient, retry after 15 secondes.')
            # Show the real cause: not every failure is an out-of-memory error.
            print('[Error] {}: {}'.format(type(err).__name__, err))
            time.sleep(15)
    print('[INFO] Generate embedding done.')

    # Generate Decode target & Vocabulary
    if config['target'] in ('keybert', 'yake'):
        # NOTE(review): unlike the branch below, `vocabularys` is not indexed by
        # target here -- confirm what load_preprocess_document_labels returns.
        labels, vocabularys, gensim_dct = load_preprocess_document_labels(config)
        label = labels[config['target']].toarray()
    else:
        labels, vocabularys, gensim_dct = get_preprocess_document_labels(preprocessed_corpus)
        label = labels[config['target']]
        vocabularys = vocabularys[config['target']]
    print('[INFO] Load label done.')

    # generate idx to token
    id2token = dict(enumerate(vocabularys))
    print('[INFO] Generate id2token done.')

    # Shuffled 80/20 train/validation split (uses numpy's global RNG).
    num_docs = len(unpreprocessed_corpus)
    shuffled_idx = np.arange(num_docs)
    np.random.shuffle(shuffled_idx)
    train_length = int(num_docs * 0.8)
    train_idx = shuffled_idx[:train_length]
    valid_idx = shuffled_idx[train_length:]

    corpus_array = np.array(unpreprocessed_corpus)
    emb_array = np.array(doc_embs)
    label_array = np.array(label)
    train_unpreprocessed_corpus = list(corpus_array[train_idx])
    valid_unpreprocessed_corpus = list(corpus_array[valid_idx])
    train_embs = emb_array[train_idx]
    valid_embs = emb_array[valid_idx]
    train_label = label_array[train_idx]
    valid_label = label_array[valid_idx]

    # Generate labeled mask: rejection-sample distinct rows with the stdlib RNG
    # (kept as-is so a given seed still selects the same documents).
    num_train = train_embs.shape[0]
    label_masks = np.zeros((num_train, 1), dtype=bool)
    # Clamp so a ratio above 1 cannot ask for more rows than exist (endless loop).
    num_labeled_data = min(int(num_train * config['ratio']), num_train)
    while num_labeled_data > 0:
        candidate = random.randrange(0, num_train)
        if not label_masks[candidate, 0]:
            label_masks[candidate, 0] = True
            num_labeled_data -= 1
    print('[INFO] mask labels done.')
    # Remaining quota; always 0 once the loop above has finished.
    print(num_labeled_data)

    # Balance data if required: append copies of every labelled row, in index
    # order, each repeated `repeats` times.
    if config['ratio'] != 1 and balance:
        labeled_idx = np.flatnonzero(label_masks[:, 0])
        if labeled_idx.size:
            repeats = max(1, int(math.log(max(1, int(1 / config['ratio'])), 2)))
            dup_idx = np.repeat(labeled_idx, repeats)
            # The original referenced an undefined `train_preprocessed_corpus`
            # here; the list being extended is the un-preprocessed training corpus.
            duplicated_docs = [train_unpreprocessed_corpus[i] for i in dup_idx]
            train_unpreprocessed_corpus.extend(duplicated_docs)
            # One concatenate per array instead of one per duplicated row.
            train_embs = np.concatenate((train_embs, train_embs[dup_idx]), axis=0)
            train_label = np.concatenate((train_label, train_label[dup_idx]), axis=0)
            label_masks = np.concatenate((label_masks, label_masks[dup_idx]), axis=0)

    training_set = IDEDataset(train_unpreprocessed_corpus, train_embs, train_label, label_masks)
    # Every validation document counts as labelled.
    validation_set = IDEDataset(valid_unpreprocessed_corpus, valid_embs, valid_label, np.ones((valid_embs.shape[0], 1), dtype=bool))

    return training_set, validation_set, vocabularys, id2token, gensim_dct, device

In [5]:
# Parameter
# Per-dataset preprocessing thresholds. Only min_df differs between datasets;
# max_df and min_doc_word are 1.0 and 15 everywhere. A dataset that is not
# listed leaves `config` untouched.
config.update({
    dataset_name: {'min_df': min_df, 'max_df': 1.0, 'min_doc_word': 15}
    for dataset_name, min_df in (
        ('20news', 62),
        ('agnews', 425),
        ('IMDB', 166),
        ('wiki', 2872),
        ('tweet', 5),
    )
}.get(config['dataset'], {}))

In [6]:
# Build the train/validation datasets, vocabulary and id->token map once.
# balance=False: labelled training documents are not oversampled.
training_set, validation_set, vocabularys, id2token, gensim_dct, device = generate_dataset(config, balance=False)

Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15




[INFO] Load corpus done.
Getting preprocess documents embeddings
Using cuda 4 for training...


Batches:   0%|          | 0/1162 [00:00<?, ?it/s]

[INFO] Generate embedding done.
Getting preprocess documents labels
[INFO] Load label done.
[INFO] Generate id2token done.
[INFO] mask labels done.
0


In [7]:
def Singular_MythNet(y_pred, y_true, eps=1e-10):
    """ListNet-style loss with the softmax replaced by L1 normalisation.

    Args:
        y_pred: decoded scores, one row per document (e.g. the predicted
            tf-idf score of every word). Passed through a sigmoid, then each
            row is L1-normalised.
        y_true: target scores with the same layout; each row is L1-normalised.
        eps: small constant added before the log so that log(0) never occurs.

    Returns:
        1-D tensor with one cross-entropy value per row (no reduction over the batch).
    """
    # Row-wise distributions: sigmoid keeps predictions positive before normalising.
    pred_dist = torch.nn.functional.normalize(torch.sigmoid(y_pred), p=1, dim=1)
    true_dist = torch.nn.functional.normalize(y_true, p=1, dim=1)

    log_pred = torch.log(pred_dist + eps)
    return torch.sum(-true_dist * log_pred, dim=1)

In [10]:
import sys
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import multiprocessing as mp
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup, BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM, AlbertTokenizer, AlbertForMaskedLM
# from tqdm.auto import tqdm

sys.path.append("./")
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2
from utils.toolbox import get_free_gpu, record_settings

class IDEDataset(Dataset):
    """Map-style dataset of (raw document, embedding, decode target, labelled flag).

    Embeddings and targets are stored as float tensors and the mask as a bool
    tensor; the corpus is kept as given. Only the embedding and target lengths
    are checked against each other.
    """

    def __init__(self, corpus, emb, target, mask):
        assert len(emb) == len(target)

        self.corpus = corpus
        self.emb, self.target = torch.FloatTensor(emb), torch.FloatTensor(target)
        self.mask = torch.BoolTensor(mask)

    def __len__(self):
        # The embedding tensor defines the dataset size.
        return len(self.emb)

    def __getitem__(self, idx):
        fields = (self.corpus, self.emb, self.target, self.mask)
        return tuple(field[idx] for field in fields)

class Generator(nn.Module):
    """Two-layer MLP mapping a noise vector to an embedding-sized vector.

    Linear -> LeakyReLU(0.2) -> Dropout -> Linear, with no output activation.
    """

    def __init__(self, input_dim=100, output_dim=768, hidden_dim=512, dropout=0.2):
        super().__init__()
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        # Positional Sequential keeps the parameter names `layers.0.*` / `layers.3.*`.
        self.layers = nn.Sequential(*stack)

    def forward(self, noise):
        generated = self.layers(noise)
        return generated

class Discriminator(nn.Module):
    """Decoder head that reconstructs a vocabulary-sized score vector from an embedding.

    Linear(input_dim, 4 * input_dim) -> BatchNorm -> Sigmoid -> Dropout ->
    Linear(4 * input_dim, output_dim) -> BatchNorm. ``num_labels`` is accepted
    but not used by this module.
    """

    def __init__(self, input_dim=768, output_dim=100, num_labels=2, dropout=0.2):
        super().__init__()
        hidden_dim = input_dim * 4
        blocks = (
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Sigmoid(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, output_dim),
            nn.BatchNorm1d(output_dim),
        )
        self.decoder = nn.Sequential(*blocks)

    def forward(self, embs):
        return self.decoder(embs)
    
class Classifier(nn.Module):
    """Linear real/fake classification head: Dropout followed by one Linear layer.

    Returns raw logits (no softmax).
    """

    def __init__(self, input_dim=768, output_dim=2, dropout=0.2):
        super().__init__()
        head = [nn.Dropout(p=dropout), nn.Linear(input_dim, output_dim)]
        self.classifier = nn.Sequential(*head)

    def forward(self, embs):
        logits = self.classifier(embs)
        return logits

class BertFamily(nn.Module):
    """Wraps a pretrained 'bert-base-uncased' masked-LM to embed raw text.

    Calling the module on a list of strings returns, for each string, the final
    hidden state at sequence position 0.
    """

    def __init__(self, device):
        super().__init__()
        self.device = device
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        masked_lm = BertForMaskedLM.from_pretrained('bert-base-uncased')
        self.model = masked_lm.to(device)

    def get_docvec(self, documents):
        # Pad to the longest document in the batch, truncate at 128 tokens.
        encoded = self.tokenizer(documents, return_tensors='pt', padding=True,
                                 truncation=True, max_length=128)
        encoded = encoded.to(self.device)
        # Use only the encoder (`.bert`), not the masked-LM head.
        hidden_states = self.model.bert(**encoded).last_hidden_state
        return hidden_states[:, 0, :]

    def forward(self, documents):
        return self.get_docvec(documents)
    
class IDEGanDecoder:
    """Semi-supervised decoder trained against BERT-embedded random "noise" documents.

    Three trainable parts share one training loop:
      * ``discriminator`` -- reconstructs the vocabulary-sized target from an embedding;
      * ``classifier``    -- tells real document embeddings (class 1) from fake ones (class 0);
      * ``bert``          -- embeds synthetic documents made of random vocabulary tokens
                             and plays the generator role.

    All state needed for training (config, id2token, gensim dictionary, device)
    is taken from the instance, not from notebook globals.

    NOTE(review): the fake targets have ``len(gensim_dct)`` columns while the
    discriminator outputs ``len(vocab)`` columns; these only line up when
    ``vocab`` was derived from ``gensim_dct`` -- confirm for other targets.
    """

    # Number of random vocabulary tokens in each synthetic noise document.
    NOISE_DOC_LENGTH = 100

    def __init__(self, config, train_set, valid_set, vocab = None, id2token=None, gensim_dct=None ,device=None, contextual_dim=768, noise_dim=100, word_embeddings=None, dropout=0.2, momentum=0.99, num_data_loader_workers=mp.cpu_count(), loss_weights=None, eps=1e-8):
        """Create the sub-networks, their optimizers and (optionally) LR schedulers.

        Args:
            config (dict): reads 'optim', 'lr', 'weight_decay', 'scheduler',
                'batch_size', 'epochs', 'warmup', 'warmup_proportion' here;
                'topk' and the naming keys are read during validation / fit.
            train_set, valid_set: datasets yielding (corpus, emb, target, mask).
            vocab: sequence of tokens; its length is the decoder output size.
            id2token (dict): index -> token; derived from ``vocab`` when omitted.
            gensim_dct: gensim Dictionary used to build TF-IDF targets for noise documents.
            device: torch device all modules and batches are moved to.
            contextual_dim (int): embedding size fed to discriminator / classifier.
            momentum (float): first Adam beta, used only when config['optim'] != 'AdamW'.
            eps (float): optimizer epsilon (AdamW) and numerical guard in the losses.
            noise_dim, word_embeddings, loss_weights: stored but not used by this class.
        """
        self.config = config
        self.train_set = train_set
        self.valid_set = valid_set
        self.vocab = vocab
        # Fall back to positional ids so training does not depend on a global id2token.
        if id2token is None and vocab is not None:
            id2token = dict(enumerate(vocab))
        self.id2token = id2token
        self.gensim_dct = gensim_dct
        self.device = device
        self.contextual_dim = contextual_dim
        self.noise_dim = noise_dim
        self.word_embeddings = word_embeddings
        self.dropout = dropout
        self.momentum = momentum
        self.num_data_loader_workers = num_data_loader_workers
        self.loss_weights = loss_weights
        self.eps = eps
        self.cls_loss = torch.nn.CrossEntropyLoss()
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

        # Construction order is kept (discriminator, classifier, bert) so random
        # initialisation under a fixed seed is unchanged.
        self.discriminator = Discriminator(input_dim=contextual_dim, output_dim=len(vocab), num_labels=2, dropout=dropout)
        self.classifier = Classifier(input_dim=contextual_dim, output_dim=2)
        self.bert = BertFamily(device)

        # One optimizer per sub-network, all of the type selected by config['optim'].
        if config['optim'] == 'AdamW':
            def make_optimizer(module):
                return AdamW(module.parameters(), lr=config['lr'], eps=eps)
        else:
            # Adam was referenced without ever being imported; import it where it is needed.
            from torch.optim import Adam

            def make_optimizer(module):
                return Adam(module.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])

        self.dis_optimizer = make_optimizer(self.discriminator)
        self.cls_optimizer = make_optimizer(self.classifier)
        self.bert_optimizer = make_optimizer(self.bert)

        if config['scheduler']:
            num_training_steps = int(len(train_set) / config['batch_size'] * config['epochs'])
            num_warmup_steps = int(num_training_steps * config['warmup_proportion'])
            # The schedulers wrap the optimizers built above. (Previously fresh AdamW
            # optimizers were created here, silently overriding config['optim'].)
            if config['warmup'] == 'linear':
                def make_scheduler(optimizer):
                    return get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
            else:
                def make_scheduler(optimizer):
                    return get_constant_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps)

            self.dis_scheduler = make_scheduler(self.dis_optimizer)
            self.cls_scheduler = make_scheduler(self.cls_optimizer)
            self.bert_scheduler = make_scheduler(self.bert_optimizer)

    def _sample_noise_batch(self, batch_size):
        """Return (noise_docs, fake_labels) for ``batch_size`` random-token documents."""
        # Uniform random vocabulary indices, drawn on CPU with torch's global RNG.
        noise = torch.empty([batch_size, self.NOISE_DOC_LENGTH], dtype=torch.long).random_(len(self.vocab))
        noise_docs = [[self.id2token[token_id] for token_id in row] for row in noise.tolist()]

        # TF-IDF targets for the noise documents; the IDF statistics are fitted on
        # this batch only, not on the real corpus.
        gensim_corpus = [self.gensim_dct.doc2bow(doc) for doc in noise_docs]
        tfidf_model = TfidfModel(gensim_corpus, normalize=False)
        dense_terms_by_docs = corpus2dense(tfidf_model[gensim_corpus], num_terms=len(self.gensim_dct.keys()), num_docs=batch_size)
        # (num_terms, batch) -> (batch, num_terms) float tensor without a Python-list detour.
        fake_labels = torch.from_numpy(np.ascontiguousarray(dense_terms_by_docs.T)).float().to(self.device)
        return noise_docs, fake_labels

    def _record_path(self):
        """File that validation metrics are appended to; name encodes the main settings."""
        cfg = self.config
        return ('./ide_gan_' + cfg['experiment'] + '_' + cfg['dataset'] + '_' + cfg['encoder'] + '_' + cfg['target']
                + '_loss_' + cfg['loss'] + '_lr' + str(cfg['lr']) + '_optim' + cfg['optim']
                + '_batch' + str(cfg['batch_size']) + '_weightdecay' + str(cfg['weight_decay']) + '.txt')

    def training(self, epoch, loader):
        """Run one training epoch.

        Returns:
            tuple: average (generator/BERT, discriminator, classifier) loss per batch.
        """
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, self.config['epochs']))
        print('Training...')

        gen_train_loss, dis_train_loss, cls_train_loss = 0, 0, 0

        self.discriminator.train()
        self.classifier.train()
        self.bert.train()

        for corpus, embs, labels, masks in loader:
            real_embs, labels, masks = embs.to(self.device), labels.to(self.device), masks.to(self.device)
            cur_batch_size = embs.shape[0]

            # Fake documents: random tokens, embedded by the trainable BERT.
            noise_docs, fake_labels = self._sample_noise_batch(cur_batch_size)
            noise_corpus = [" ".join(doc) for doc in noise_docs]
            fake_embs = self.bert(noise_corpus).to(self.device)

            # Real rows first, fake rows second; split back after each forward pass.
            mixed_embs = torch.cat((real_embs, fake_embs), dim=0)

            logits = self.classifier(mixed_embs)
            D_real_logits, D_fake_logits = torch.split(logits, cur_batch_size)

            recons = self.discriminator(mixed_embs)
            D_real_recons, D_fake_recons = torch.split(recons, cur_batch_size)

            # Classifier's LOSS: real -> class 1, fake -> class 0.
            c_loss_r = self.cls_loss(D_real_logits, torch.ones(cur_batch_size, dtype=torch.long).to(self.device))
            c_loss_f = self.cls_loss(D_fake_logits, torch.zeros(cur_batch_size, dtype=torch.long).to(self.device))
            cls_loss = c_loss_r + c_loss_f

            # BERT's LOSS: large when the classifier spots fakes easily, plus a
            # regulariser pulling the mean fake embedding towards the mean real one.
            g_loss_d = torch.reciprocal(c_loss_f + self.eps)
            # Clamped cosine between batch-mean embeddings; reused as the weight of
            # the fake reconstruction loss below.
            g_recon_weight = self.relu(torch.nn.functional.cosine_similarity(torch.mean(real_embs, dim=0), torch.mean(fake_embs, dim=0), dim=0))
            g_recon_reg = -1 * torch.log(g_recon_weight + self.eps)
            gen_loss = g_loss_d + g_recon_reg

            # Discriminator's LOSS: reconstruction on labelled real rows plus the
            # weighted reconstruction on fake rows, plus the classifier loss.
            recon_loss = torch.masked_select(Singular_MythNet(D_real_recons, labels), torch.flatten(masks))
            fake_recon_loss = Singular_MythNet(D_fake_recons, fake_labels) * g_recon_weight
            if recon_loss.numel() == 0:
                # No labelled document in this batch: mean of an empty tensor would be NaN.
                D_L_Supervised = torch.mean(fake_recon_loss)
            else:
                D_L_Supervised = torch.mean(recon_loss) + torch.mean(fake_recon_loss)
            dis_loss = D_L_Supervised + cls_loss

            self.bert_optimizer.zero_grad()
            self.dis_optimizer.zero_grad()
            self.cls_optimizer.zero_grad()

            # NOTE(review): the three backward passes accumulate into the same
            # parameters (cls_loss is also part of dis_loss, and gen_loss reaches the
            # classifier through c_loss_f). Order and content kept exactly as before.
            gen_loss.backward(retain_graph=True)
            cls_loss.backward(retain_graph=True)
            dis_loss.backward()

            self.bert_optimizer.step()
            self.dis_optimizer.step()
            self.cls_optimizer.step()

            if self.config['scheduler']:
                self.bert_scheduler.step()
                self.dis_scheduler.step()
                self.cls_scheduler.step()

            gen_train_loss += gen_loss.item()
            dis_train_loss += dis_loss.item()
            cls_train_loss += cls_loss.item()

        # Guard against an empty loader.
        num_batches = max(len(loader), 1)
        avg_gen_train_loss = gen_train_loss / num_batches
        avg_dis_train_loss = dis_train_loss / num_batches
        avg_cls_train_loss = cls_train_loss / num_batches

        print("")
        print("  Average training loss generetor: {0:.3f}".format(avg_gen_train_loss))
        print("  Average training loss discriminator: {0:.3f}".format(avg_dis_train_loss))
        print("  Average training loss classifier: {0:.3f}".format(avg_cls_train_loss))
        return avg_gen_train_loss, avg_dis_train_loss, avg_cls_train_loss

    def validation(self, loader):
        """Score reconstructions on ``loader``.

        Returns:
            defaultdict: metric name -> mean over batches, for precision (v1, v2)
            and NDCG at every cut-off in ``config['topk']``.
        """
        self.discriminator.eval()
        self.classifier.eval()
        self.bert.eval()

        topk = self.config['topk']
        results = defaultdict(list)
        with torch.no_grad():
            for corpus, embs, labels, masks in loader:
                embs, labels = embs.to(self.device), labels.to(self.device)
                recons = self.discriminator(embs)

                # Precision for reconstruct
                precision_scores = retrieval_precision_all(recons, labels, k=topk)
                for k, v in precision_scores.items():
                    results['[Recon] Precision v1@{}'.format(k)].append(v)

                precision_scores = retrieval_precision_all_v2(recons, labels, k=topk)
                for k, v in precision_scores.items():
                    results['[Recon] Precision v2@{}'.format(k)].append(v)

                # NDCG for reconstruct
                ndcg_scores = retrieval_normalized_dcg_all(recons, labels, k=topk)
                for k, v in ndcg_scores.items():
                    results['[Recon] ndcg@{}'.format(k)].append(v)

        for k in results:
            results[k] = np.mean(results[k])

        return results

    def fit(self):
        """Train for ``config['epochs']`` epochs; validate and log every 10th epoch."""
        self.discriminator.to(self.device)
        self.classifier.to(self.device)
        self.bert.to(self.device)

        train_loader = DataLoader(self.train_set, batch_size=self.config['batch_size'], shuffle=True, num_workers=self.num_data_loader_workers)
        valid_loader = DataLoader(self.valid_set, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.num_data_loader_workers)

        for epoch in range(self.config['epochs']):
            self.training(epoch, train_loader)
            if (epoch + 1) % 10 == 0:
                val_res = self.validation(valid_loader)
                # Append to the record file and close it again each time, so results
                # are flushed to disk and no file handle is leaked.
                with open(self._record_path(), 'a') as record:
                    print('---------------------------------------')
                    record.write('-------------------------------------------------\n')
                    for key, val in val_res.items():
                        print(f"{key}:{val:.4f}")
                        record.write(f"{key}:{val:.4f}\n")

In [None]:
# Build the decoder from the datasets, vocabulary and device prepared above, then train.
# fit() runs config['epochs'] epochs and validates after every 10th epoch.
model = IDEGanDecoder(config, training_set, validation_set, vocabularys, id2token, gensim_dct, device)
model.fit()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Training...

  Average training loss generetor: 13.162
  Average training loss discriminator: 9.617
  Average training loss classifier: 1.237

Training...

  Average training loss generetor: 4.246
  Average training loss discriminator: 10.198
  Average training loss classifier: 1.394

Training...

  Average training loss generetor: 3.706
  Average training loss discriminator: 10.508
  Average training loss classifier: 1.401

Training...

  Average training loss generetor: 3.594
  Average training loss discriminator: 10.502
  Average training loss classifier: 1.386

Training...

  Average training loss generetor: 3.569
  Average training loss discriminator: 10.443
  Average training loss classifier: 1.382

Training...

  Average training loss generetor: 3.565
  Average training loss discriminator: 10.479
  Average training loss classifier: 1.376

Training...


In [48]:
# Scratch check: draw a 32 x 100 matrix of random vocabulary indices,
# the same sampling IDEGanDecoder.training uses for its noise documents.
noise = torch.empty([32, 100], dtype=torch.long).random_(len(vocabularys))

In [49]:
# Map every sampled index back to its token: one token list per noise document.
noise_docs = [[id2token[int(token_id)] for token_id in row] for row in noise]

In [50]:
# Join each noise document's tokens into one string (the form BertFamily expects).
# `docs` is not defined anywhere at notebook scope; the token lists built in the
# previous cell are `noise_docs`.
new_docs = [" ".join(doc) for doc in noise_docs]

In [51]:
# TF-IDF targets for the noise documents, fitted on the noise batch itself.
gensim_corpus = [gensim_dct.doc2bow(doc) for doc in noise_docs]
# Use a dedicated name: rebinding `model` here would overwrite the IDEGanDecoder
# instance created earlier in the notebook.
noise_tfidf_model = TfidfModel(gensim_corpus, normalize=False)
gensim_vector = noise_tfidf_model[gensim_corpus]
# corpus2dense yields (num_terms, num_docs); size it from the data instead of a literal 32.
gensim_tf_idf_vector = corpus2dense(gensim_vector, num_terms=len(gensim_dct.keys()), num_docs=len(noise_docs))
gensim_tf_idf_vector = np.array(gensim_tf_idf_vector).T.tolist()
labels = torch.FloatTensor(gensim_tf_idf_vector)

In [53]:
# Expect (number of noise documents, gensim vocabulary size).
labels.shape

torch.Size([32, 4829])