In [1]:
import os
import sys
import nltk
import time
import math
import torch
import random
import argparse
import numpy as np
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset

sys.path.append("../")
from load_pretrain_label import load_preprocess_document_labels
#from model.ide_ae_decoder import IDEDataset, IDEAEDecoder
from utils.toolbox import same_seeds, show_settings, record_settings, get_preprocess_document, get_preprocess_document_embs, get_preprocess_document_labels, get_word_embs, merge_targets
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2

os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(15)

In [2]:
from base64 import encode
import sys
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import multiprocessing as mp
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup, BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM, AlbertTokenizer, AlbertForMaskedLM
# from tqdm.auto import tqdm

sys.path.append("./")
from utils.loss import MythNet, ContrastiveLoss
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, retrieval_precision_all_v2, semantic_precision_all_v2
from utils.toolbox import get_free_gpu, record_settings
from model.inference_network import ContextualInferenceNetwork

class IDEDataset(Dataset):
    def __init__(self, docs, corpus, emb, target, real):
        
        assert len(emb) == len(target)
        self.docs = docs
        self.corpus = corpus
        self.emb = torch.FloatTensor(emb)
        self.target = torch.FloatTensor(target)
        self.real = torch.LongTensor(real)
        
    def __getitem__(self, idx):
        return self.docs[idx], self.corpus[idx], self.emb[idx], self.target[idx], self.real[idx]

    def __len__(self):
        return len(self.emb)

class Encoder(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.device = device
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

    def forward(self, documents):
        return self.get_docvec(documents)

    def get_docvec(self, documents):
        inputs = self.tokenizer(documents, return_tensors='pt', padding=True,
                                truncation=True, max_length=128).to(self.device)
        embedding = self.model.bert(**inputs).last_hidden_state[:, 0, :]
        return embedding

class Decoder(nn.Module):
    def __init__(self, input_dim=768, output_dim=100, dropout=0.2):
        super(Decoder, self).__init__()
        self.decoder = nn.Sequential(
            nn.Linear(input_dim, input_dim*4),
            nn.BatchNorm1d(input_dim*4),
            nn.Sigmoid(),
            nn.Dropout(dropout),
            nn.Linear(input_dim*4, output_dim),
            nn.BatchNorm1d(output_dim),
        )

    def forward(self, embs):
        recons = self.decoder(embs)
        return recons

class Classifier(nn.Module):
    def __init__(self, input_dim=768, output_dim=2):
        super(Classifier, self).__init__()
        self.logit = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, embs):
        logits = self.logit(embs)
        probs = self.softmax(logits)
        return logits, probs

class FeatureExtractor(nn.Module):
    def __init__(self, input_dim, feature_dim, hidden_dim=1024):
        super(FeatureExtractor, self).__init__()
        self.input_dim = input_dim
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.cnn = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, 1, stride=2),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim*2, 1, stride=2),
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, feature_dim),
        )
        self.fc_q = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Sigmoid(),
        )
        self.fc_k = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Sigmoid(),
        )
        self.attention = nn.MultiheadAttention(input_dim, 8)

    def forward(self, embs, reals):
        # 1D Convolution
        conv_embs = self.cnn(embs.unsqueeze(dim=-1))
        feature = self.fc(conv_embs)
        feature = reals * embs + (1 - reals) * feature#.unsqueeze(dim=-1)

        # attention block
        # q = self.fc_q(embs)
        # k = self.fc_k(q)
        # feature, _ = self.attention(q, k, embs)

        return feature

class IDEDADecoder:
    def __init__(self, config, label_set, unlabel_set, valid_set, vocab = None, id2token=None, device=None, contextual_dim=768, encoded_dim=768, noise_dim=100, word_embeddings=None, dropout=0.2, momentum=0.99, num_data_loader_workers=mp.cpu_count(), loss_weights=None, eps=1e-8):
        self.config = config
        self.label_set = label_set
        self.unlabel_set = unlabel_set
        self.valid_set = valid_set
        self.merge_set = None
        self.vocab = vocab
        self.id2token = id2token
        self.device = device
        self.contextual_dim = contextual_dim
        self.encoded_dim = encoded_dim
        self.noise_dim = noise_dim
        self.word_embeddings = word_embeddings
        self.dropout = dropout
        self.momentum = momentum
        self.num_data_loader_workers = num_data_loader_workers
        self.loss_weights = loss_weights
        self.eps = eps
        self.relu = torch.nn.ReLU()
        self.cross_entropy = torch.nn.CrossEntropyLoss()
        self.mse_loss = torch.nn.MSELoss()
        self.kl_loss = torch.nn.KLDivLoss(reduction="batchmean")

        # model
        self.encoder = Encoder(device)
        self.decoder = Decoder(input_dim=contextual_dim, output_dim=len(vocab))
        self.classifier = Classifier(input_dim=encoded_dim, output_dim=2)
        self.extractor = FeatureExtractor(input_dim=contextual_dim, feature_dim=encoded_dim)
        
        # optimizer
        if config['optim'] == 'AdamW':
            self.en_optimizer = AdamW(self.encoder.parameters(), lr=config['lr'], eps=eps)
            self.de_optimizer = AdamW(self.decoder.parameters(), lr=config['lr'], eps=eps)
            self.cls_optimizer = AdamW(self.classifier.parameters(), lr=config['lr'], eps=eps)
            self.ex_optimizer = AdamW(self.extractor.parameters(), lr=config['lr'], eps=eps)
        else:
            self.en_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])
            self.de_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])
            self.cls_optimizer = torch.optim.Adam(self.classifier.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])
            self.ex_optimizer = torch.optim.Adam(self.extractor.parameters(), lr=config['lr'], betas=(self.momentum, 0.99), weight_decay=config['weight_decay'])
                   
        # scheduler
        if config['scheduler']:
            num_en_training_steps = int(len(label_set) / config['batch_size'] * config['en_epochs'])   
            num_de_training_steps = int((len(label_set) + len(unlabel_set)) / config['batch_size'] * config['de_epochs'])
            num_en_warmup_steps = int(num_en_training_steps * config['warmup_proportion'])
            num_de_warmup_steps = int(num_de_training_steps * config['warmup_proportion'])    
            if config['warmup'] == 'linear':
                self.en_scheduler = get_linear_schedule_with_warmup(self.en_optimizer, num_warmup_steps=num_en_warmup_steps, num_training_steps=num_en_training_steps)
                self.de_scheduler = get_linear_schedule_with_warmup(self.de_optimizer, num_warmup_steps=num_de_warmup_steps, num_training_steps=num_de_training_steps)
                self.cls_scheduler = get_linear_schedule_with_warmup(self.cls_optimizer, num_warmup_steps=num_de_warmup_steps, num_training_steps=num_de_training_steps)
                self.ex_scheduler = get_linear_schedule_with_warmup(self.ex_optimizer, num_warmup_steps=num_de_warmup_steps, num_training_steps=num_de_training_steps)
            else:
                self.en_scheduler = get_constant_schedule_with_warmup(self.en_optimizer, num_warmup_steps=num_en_warmup_steps)
                self.de_scheduler = get_constant_schedule_with_warmup(self.de_optimizer, num_warmup_steps=num_de_warmup_steps)
                self.cls_scheduler = get_constant_schedule_with_warmup(self.cls_optimizer, num_warmup_steps=num_de_warmup_steps)
                self.ex_scheduler = get_constant_schedule_with_warmup(self.ex_optimizer, num_warmup_steps=num_de_warmup_steps)
                
    def en_training(self, epoch, loader):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, self.config['en_epochs']))
        print('Encoder Training...')
        
        en_train_loss = 0
        en_train_dis = 0
        en_train_cos = 0
        
        self.encoder.train()

        for batch, (docs, corpus, embs, labels, reals) in enumerate(loader):
            real_embs = embs.to(self.device)
            
            real_embs_t = real_embs
            
            # fake label from BERT            
            fake_embs = self.encoder(corpus).to(self.device)

            # Encoder's LOSS
            # e_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + self.eps))
            # e_cos = torch.nn.functional.cosine_similarity(torch.mean(real_embs_t, dim=0), torch.mean(fake_embs, dim=0), dim=0)
            e_cos = torch.mean(torch.nn.functional.cosine_similarity(real_embs_t, fake_embs))
            # e_feat_emb = torch.mean(torch.pow(torch.mean(real_embs_t, dim=0) - torch.mean(fake_embs, dim=0), 2))
            # e_feat_emb =  ContrastiveLoss(real_embs_t, fake_embs, torch.eye(real_embs_t.shape[0], requires_grad=True).to(self.device))
            e_feat_emb = torch.mean(torch.mean(torch.cdist(fake_embs, real_embs_t, p=2), dim=0), dim=0).squeeze()
            # e_feat_emb = self.kl_loss(fake_embs, real_embs_t)
            # e_feat_emb = self.mse_loss(fake_embs, real_embs_t)
            # en_loss = e_feat_emb + (1 - e_cos)
            en_loss = e_feat_emb + embs.shape[0] * (1 - e_cos)
            

            self.en_optimizer.zero_grad()
            en_loss.backward()
            self.en_optimizer.step()
            if self.config['scheduler']:
                self.en_scheduler.step()

            en_train_loss += en_loss.item()
            en_train_dis += e_feat_emb
            en_train_cos += e_cos

        avg_en_train_loss = en_train_loss / len(loader)        
        avg_en_train_dis = en_train_dis / len(loader) 
        avg_en_train_cos = en_train_cos / len(loader) 

        return avg_en_train_loss, avg_en_train_dis, avg_en_train_cos

    def en_validation(self, loader):
        
        en_val_loss = 0
        en_val_dis = 0
        en_val_cos = 0
        
        self.encoder.eval()
        # self.classifier.eval()
        
        with torch.no_grad():
            for batch, (docs, corpus, embs, labels, reals) in enumerate(loader):
                embs = embs.to(self.device)
                real_embs_t = embs
                
                fake_embs = self.encoder(corpus).to(self.device)

                # e_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + self.eps))
                # e_cos = torch.nn.functional.cosine_similarity(torch.mean(real_embs_t, dim=0), torch.mean(fake_embs, dim=0), dim=0)
                e_cos = torch.mean(torch.nn.functional.cosine_similarity(real_embs_t, fake_embs))
                # e_feat_emb = torch.mean(torch.pow(torch.mean(real_embs_t, dim=0) - torch.mean(fake_embs, dim=0), 2))
                # e_feat_emb =  ContrastiveLoss(real_embs_t, fake_embs, torch.eye(real_embs_t.shape[0], requires_grad=True).to(self.device))
                e_feat_emb = torch.mean(torch.mean(torch.cdist(fake_embs, real_embs_t, p=2), dim=0), dim=0).squeeze()
                # e_feat_emb = self.kl_loss(fake_embs, real_embs_t)
                # e_feat_emb = self.mse_loss(fake_embs, real_embs_t)
                # en_loss = e_feat_emb + (1 - e_cos)
                en_loss = e_feat_emb + embs.shape[0] * (1 - e_cos)
                
                en_val_loss += en_loss
                en_val_dis += e_feat_emb
                en_val_cos += e_cos
               
            avg_en_val_loss = en_val_loss / len(loader)
            avg_en_val_dis = en_val_dis / len(loader)
            avg_en_val_cos = en_val_cos / len(loader)
        
        return avg_en_val_loss, avg_en_val_dis, avg_en_val_cos
        
    def generate_fake_data(self, loader):
        self.encoder.eval()
        doc_list = []
        corpus_list = []
        emb_list = []
        label_list = []

        for batch, (doc, corpus, embs, labels, reals) in enumerate(loader):
            fake_embs = self.encoder(corpus).detach().cpu()
            for raw_doc, pro_doc, emb, label in zip(doc, corpus, fake_embs, labels):
                doc_list.append(raw_doc)
                corpus_list.append(pro_doc)
                emb_list.append(emb.numpy())
                label_list.append(label.numpy())

        return IDEDataset(doc_list, corpus_list, np.array(emb_list), np.array(label_list), np.zeros((len(emb_list), 1)))

    def en_fit(self):
        self.encoder.to(self.device)

        label_loader = DataLoader(self.label_set, batch_size=self.config['batch_size'], shuffle=True, num_workers=self.num_data_loader_workers)
        unlabel_loader = DataLoader(self.unlabel_set, batch_size=self.config['batch_size'], shuffle=True, num_workers=self.num_data_loader_workers)
        valid_loader = DataLoader(self.valid_set, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.num_data_loader_workers)

        for epoch in range(self.config['en_epochs']):
            en_train_loss, en_train_dis, en_train_cos = self.en_training(epoch, label_loader)
            
            en_val_loss, en_val_dis, en_val_cos = self.en_validation(unlabel_loader)
            
            print('---------------------------------------')
            print("Training: ")
            print(" [Encoder] Average training loss: {0:.3f}".format(en_train_loss))
            print(" [Encoder] Average training distance: {0:.3f}".format(en_train_dis))
            print(" [Encoder] Average training cosine similarity: {0:.3f}".format(en_train_cos))
            print("Validation:")
            print(" [Encoder] Average validation loss: {0:.3f}".format(en_val_loss))
            print(" [Encoder] Average validation dis: {0:.3f}".format(en_val_dis))
            print(" [Encoder] Average validation cosine similarity: {0:.3f}".format(en_val_cos))

        self.merge_set = torch.utils.data.ConcatDataset([self.generate_fake_data(unlabel_loader), self.label_set])

    def cls_training(self, epoch, loader):        
        cls_train_loss = 0
        
        self.classifier.train()
        self.decoder.eval()
        self.extractor.eval()

        for batch, (docs, corpus, embs, labels, reals) in enumerate(loader):
            embs, reals = embs.to(self.device), reals.to(self.device)   

            # Extract features
            features = self.extractor(embs, reals)
            # features = embs
            
            # Classifier discrimiate features
            logits, probs = self.classifier(features)
            cls_loss = self.cross_entropy(probs, torch.flatten(reals))
            
            self.cls_optimizer.zero_grad()
            
            cls_loss.backward()
            
            self.cls_optimizer.step()
            if self.config['scheduler']:
                self.cls_scheduler.step()

            cls_train_loss += cls_loss.item()
  
        avg_cls_train_loss = cls_train_loss / len(loader)    

        return avg_cls_train_loss

    def de_training(self, epoch, loader):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, self.config['de_epochs']))
        print('Decoder Training...')
        
        de_train_loss = 0
        cls_train_loss = 0

        de_loss_weight = 1
        cls_loss_weight = 10
        
        self.classifier.eval()
        self.decoder.train()
        self.extractor.train()

        for batch, (docs, corpus, embs, labels, reals) in enumerate(loader):
            embs, labels, reals = embs.to(self.device), labels.to(self.device), reals.to(self.device)   

            # Extract features
            features = self.extractor(embs, reals)
            # features = embs
            
            # Classifier discrimiate features
            logits, probs = self.classifier(features)
            # cls_loss = self.cross_entropy(probs, torch.ones([embs.shape[0]], dtype=torch.long).to(self.device))
            cls_loss = self.cross_entropy(probs, torch.flatten(1 - reals))
            # cls_loss = torch.reciprocal(self.cross_entropy(probs, torch.flatten(reals)) + self.eps)

            # Decoder reconstruct embs
            recons = self.decoder(features)
            
            # ListNet Loss
            de_loss = de_loss_weight * MythNet(recons, labels) + cls_loss_weight * cls_loss
            
            self.de_optimizer.zero_grad()
            self.ex_optimizer.zero_grad()
            
            de_loss.backward()
            
            self.de_optimizer.step()
            self.ex_optimizer.step()

            if self.config['scheduler']:
                self.de_scheduler.step()
                self.ex_scheduler.step()

            de_train_loss += de_loss.item()

        avg_de_train_loss = de_train_loss / len(loader)      

        return avg_de_train_loss
    
    def de_validation(self, loader):
        de_val_loss = 0
        cls_val_loss = 0

        de_loss_weight = 1
        cls_loss_weight = 10
        
        self.classifier.eval()
        self.decoder.eval()
        self.extractor.eval()
        
        results = defaultdict(list)
        with torch.no_grad():
            for batch, (docs, corpus, embs, labels, reals) in enumerate(loader):
                embs, labels, reals = embs.to(self.device), labels.to(self.device), reals.to(self.device)
                
                # Extract features
                features = self.extractor(embs, reals)
                # features = embs

                # Clssifier
                logits, probs = self.classifier(features)
                cls_loss = self.cross_entropy(probs, torch.flatten(reals))
                # de_cls_loss = self.cross_entropy(probs, torch.ones([embs.shape[0]], dtype=torch.long).to(self.device))
                # de_cls_loss = torch.reciprocal(self.cross_entropy(probs, torch.flatten(reals)) + self.eps)
                de_cls_loss = self.cross_entropy(probs, torch.flatten(1 - reals))

                # Decoder reconstruct
                recons = self.decoder(features)
                
                # ListNet Loss
                de_loss = de_loss_weight * MythNet(recons, labels) + cls_loss_weight * de_cls_loss
                de_val_loss += de_loss.item()
                cls_val_loss += cls_loss.item()
                
                # Precision for reconstruct
                precision_scores = retrieval_precision_all(recons, labels, k=self.config['topk'])
                for k, v in precision_scores.items():
                    results['[Recon] Precision v1@{}'.format(k)].append(v)
                
                precision_scores = retrieval_precision_all_v2(recons, labels, k=self.config['topk'])
                for k, v in precision_scores.items():
                    results['[Recon] Precision v2@{}'.format(k)].append(v)

                # NDCG for reconstruct
                ndcg_scores = retrieval_normalized_dcg_all(recons, labels, k=self.config['topk'])
                for k, v in ndcg_scores.items():
                    results['[Recon] ndcg@{}'.format(k)].append(v)
        
        avg_de_val_loss = de_val_loss / len(loader)
        avg_cls_val_loss = cls_val_loss / len(loader)
        
        for k in results:
            results[k] = np.mean(results[k])
                
        return avg_de_val_loss, results, avg_cls_val_loss
    
    def de_fit(self):
        self.decoder.to(self.device)
        self.extractor.to(self.device)
        self.classifier.to(self.device)

        fake_weight = [len(self.label_set)] * len(self.unlabel_set)
        real_weight = [len(self.unlabel_set)] * len(self.label_set)

        sampler = WeightedRandomSampler(fake_weight+real_weight, len(self.merge_set), replacement=True)

        train_loader = DataLoader(self.merge_set, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.num_data_loader_workers, sampler=sampler)
        valid_loader = DataLoader(self.valid_set, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.num_data_loader_workers)
        
        for epoch in range(self.config['de_epochs']):
            de_train_loss = self.de_training(epoch, train_loader)
            cls_train_loss = self.cls_training(epoch, train_loader)

            print('---------------------------------------')
            print("Training: ")
            print(" [Decoder] Average training loss: {0:.3f}".format(de_train_loss))
            print(" [Classifier] Average training loss: {0:.3f}".format(cls_train_loss))
            if (epoch + 1) % 10 == 0:
                de_val_loss, de_val_res, cls_val_loss = self.de_validation(valid_loader)
            
                print("Validation:")
                print(" [Decoder] Average validation loss: {0:.3f}".format(de_val_loss))
                print(" [Classifier] Average validation loss: {0:.3f}".format(cls_val_loss))
                print(" [Decoder] Average validation result:")

                for key,val in de_val_res.items():
                    print(f"{key}:{val:.4f}")
    

In [3]:
config = {
    'experiment': 'autoencoder_testting',
    'model': 'VAE',
    'architecture': 'concatenate',
    'activation': 'sigmoid',
    'dataset': '20news',
    'vocab_size':0,
    'encoder': 'mpnet',
    'target': 'tf-idf-gensim',
    'seed': 123,
    'epochs': 20,
    'de_epochs': 20,
    'en_epochs':0,
    'lr': 1e-4,
    'ae_lr':1e-4,
    'optim': 'AdamW',
    'scheduler': False,
    'warmup': 'linear',
    'warmup_proportion': 0.1, 
    'loss': 'listnet',
    'batch_size': 32,
    'weight_decay': 0,
    'ratio': 0.1,
    'topk': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'save': False,
    'threshold': 0.7,
    'balance': False,
}
same_seeds(config["seed"])

In [4]:
# Parameter
if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

In [5]:
# Data preprocessing
unpreprocessed_corpus ,preprocessed_corpus = get_preprocess_document(**config)
texts = [text.split() for text in preprocessed_corpus]
print('[INFO] Load corpus done.')

# Generating document embedding
while True:
    try:
        doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])
        break
    except:
        print('[Error] CUDA Memory Insufficient, retry after 15 secondes.')
        time.sleep(15)
print('[INFO] Generate embedding done.')

# Generate Decode target & Vocabulary
if config['target'] == 'keybert' or config['target'] == 'yake':
    labels, vocabularys= load_preprocess_document_labels(config)
    label = labels[config['target']].toarray()
else:
    labels, vocabularys= get_preprocess_document_labels(preprocessed_corpus)
    label = labels[config['target']]
    vocabularys = vocabularys[config['target']]
print('[INFO] Load label done.')

# generate idx to token
id2token = {k: v for k, v in zip(range(0, len(vocabularys)), vocabularys)}
print('[INFO] Generate id2token done.')

reals = np.ones((doc_embs.shape[0], 1))

dataset = IDEDataset(unpreprocessed_corpus, preprocessed_corpus, doc_embs, label, reals)
label_length = int(len(dataset) * config['ratio'])
unlabel_length = int(len(dataset) * (0.8 - config['ratio']))
validation_length = len(dataset) - label_length - unlabel_length
label_set, unlabel_set, validation_set = random_split(dataset, lengths=[label_length, unlabel_length, validation_length], generator=torch.Generator().manual_seed(42))


Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15




[INFO] Load corpus done.
Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/1162 [00:00<?, ?it/s]

[INFO] Generate embedding done.
Getting preprocess documents labels
[INFO] Load label done.
[INFO] Generate id2token done.


In [52]:
model = IDEDADecoder(config, label_set, unlabel_set, validation_set, vocabularys, id2token, device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model.en_fit()
model.de_fit()


Encoder Training...
---------------------------------------
Training: 
 [Encoder] Average training loss: 28.778
 [Encoder] Average training distance: 10.829
 [Encoder] Average training cosine similarity: 0.429
Validation:
 [Encoder] Average validation loss: 26.473
 [Encoder] Average validation dis: 10.463
 [Encoder] Average validation cosine similarity: 0.499


In [47]:
loss_ft = torch.nn.KLDivLoss()

In [105]:
input = torch.rand(32, 768)
output = torch.rand(32, 768)
target = torch.ones(input.shape[0])

In [106]:
input = input.unsqueeze(dim=-1)

In [107]:
m = nn.Conv1d(768, 1024, 1, stride=2)
n = nn.Conv1d(1024, 2048, 1, stride=2)
p = nn.Linear(2048, 1024)

In [108]:
f_i = m(input)

In [90]:
f_i.shape

torch.Size([32, 1024, 1])

In [91]:
f_ii = n(f_i)

In [99]:
f_ii = f_ii.squeeze(dim=-1)

In [100]:
f_ii.shape

torch.Size([32, 2048])

In [101]:
p_i = p(f_ii)

In [49]:
r_i[0]

tensor([[0.7780],
        [0.7515],
        [0.0189],
        [0.2546],
        [0.2149],
        [0.0862],
        [0.5484],
        [0.2309],
        [0.4746],
        [0.7762],
        [0.1945],
        [0.3249],
        [0.9649],
        [0.2566],
        [0.2249],
        [0.1953],
        [0.8901],
        [0.9546],
        [0.4946],
        [0.6430],
        [0.8602],
        [0.3492],
        [0.5560],
        [0.8231],
        [0.2780],
        [0.1194],
        [0.9353],
        [0.0206],
        [0.6764],
        [0.5586],
        [0.4602],
        [0.7814],
        [0.8910],
        [0.0398],
        [0.4550],
        [0.1149],
        [0.3009],
        [0.4356],
        [0.4032],
        [0.0020],
        [0.4546],
        [0.7572],
        [0.8089],
        [0.0862],
        [0.3820],
        [0.0703],
        [0.7150],
        [0.5383],
        [0.5395],
        [0.9188],
        [0.5681],
        [0.5385],
        [0.5496],
        [0.9235],
        [0.2063],
        [0

In [47]:
f_i[0]

tensor([[-0.5675],
        [-0.0274],
        [ 0.3374],
        ...,
        [ 0.1751],
        [-0.2532],
        [-0.3159]], grad_fn=<SelectBackward0>)

In [48]:
p_i[0]

tensor([[-0.5675],
        [-0.0274],
        [ 0.3374],
        ...,
        [ 0.1751],
        [-0.2532],
        [-0.3159]], grad_fn=<SelectBackward0>)

In [21]:
class FeatureExtractor(nn.Module):
    def __init__(self, input_dim, feature_dim, hidden_dim=1024):
        super(FeatureExtractor, self).__init__()
        self.input_dim = input_dim
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.cnn = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, 1, stride=2),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim*2, 1, stride=2),
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, feature_dim),
        )

    def forward(self, embs):
        conv_embs = self.cnn(embs.unsqueeze(dim=-1))
        feature = self.fc(conv_embs.squeeze(dim=-1))
        return feature

In [29]:
model = FeatureExtractor(768, 768)
train_loader = DataLoader(label_set, batch_size=config['batch_size'], shuffle=False)
for batch, (docs, corpus, embs, labels) in enumerate(train_loader):
    features = model(embs)
    print(features.shape)

torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
torch.Size

In [3]:
real = torch.rand([32, 768])
fake = torch.rand([32, 768])
margin = 2.0

In [20]:
euclidean_distance = F.pairwise_distance(real, fake, keepdim=True)

In [21]:
euclidean_distance.shape

torch.Size([32, 1])

In [22]:
euclidean_distance = torch.cdist(fake, real, p=2)

In [23]:
dist.shape

torch.Size([32, 32])

In [30]:
label = torch.eye(32)

In [31]:
loss_contrastive = torch.mean((1-label) * torch.pow(dist, 2) +
                                      (label) * torch.pow(torch.clamp(margin - dist, min=0.0), 2))


In [32]:
label.shape

torch.Size([32, 32])

In [33]:
loss_contrastive

tensor(123.8685)

In [4]:
loss_ce = torch.nn.CrossEntropyLoss()

In [16]:
input = torch.rand([32, 2])
soft = torch.nn.Softmax(dim=-1)
soft_input = soft(input)

In [17]:
loss = loss_ce(soft_input, torch.ones([input.shape[0]], dtype=torch.long))

In [18]:
loss

tensor(0.6956)

In [15]:
torch.ones([input.shape[0]], dtype=torch.long).shape

torch.Size([32])

In [3]:
embs1 = torch.rand([32, 768])
embs2 = torch.rand([32, 768])
embs3 = torch.rand([32, 768])
embs4 = torch.rand([32, 768])

In [5]:
embs5 = torch.stack((embs1, embs2, embs3), dim=1)

In [6]:
embs5.shape

torch.Size([32, 3, 768])

In [15]:
multihead_attn = nn.MultiheadAttention(768, 3)

In [17]:
output, weight = multihead_attn(embs3, embs2, embs1)

In [29]:
transformer_model = nn.Transformer(d_model=768, nhead=16, num_encoder_layers=12)

In [30]:
out = transformer_model(embs1, embs2)

In [31]:
out

tensor([[ 0.6732, -0.1739,  0.7569,  ...,  0.3597,  2.2979, -0.2133],
        [-0.0127, -0.1261, -0.1806,  ...,  0.1698,  3.0161, -0.1863],
        [ 0.5780, -0.2042,  0.3258,  ...,  0.7780,  1.7690, -0.0397],
        ...,
        [-0.9630, -0.9636,  0.8446,  ...,  0.2313,  1.2824, -0.5504],
        [-0.8126, -0.2329,  0.5636,  ...,  0.0285,  2.2121,  0.1297],
        [-0.1163, -0.2724,  0.5043,  ...,  0.6184,  1.7682, -0.0203]],
       grad_fn=<NativeLayerNormBackward0>)

In [33]:
out.shape

torch.Size([32, 768])

In [38]:
x = embs1.unsqueeze(dim=-1)
y = embs2.unsqueeze(dim=-1)
z = embs3.unsqueeze(dim=-1)

In [39]:
output, weight = multihead_attn(x, y, z)

AssertionError: was expecting embedding dimension of 768, but got 1

In [4]:
cnn = nn.Sequential(
        nn.Conv2d(768, 1024, 2, stride=2),
        nn.ReLU(),
        nn.Conv2d(1024, 1024*2, 2, stride=2),
        nn.ReLU(),
    )
fc= nn.Sequential(
        nn.Linear(1024*2, 1024),
        nn.BatchNorm1d(1024),
        nn.Sigmoid(),
        nn.Linear(1024, 768),
)

In [21]:
embs5.unsqueeze(dim=1).shape

torch.Size([32, 1, 3, 768])

In [51]:
m = nn.Conv2d(1, 3, (3, 1), stride=1)
n = nn.Conv2d(3, 6, 2, stride=2)

In [52]:
f = m(embs5.unsqueeze(dim=1))

In [53]:
f.shape

torch.Size([32, 3, 1, 768])

In [35]:
g = n(f)

RuntimeError: Calculated padded input size per channel: (1 x 384). Kernel size: (2 x 2). Kernel size can't be greater than actual input size