In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [99]:
path = "/home/keonwoo/anaconda3/envs/KoDiffCSE/data/ko_sts_test.txt"

data = pd.read_csv(path)
data = data[data['score'] > 4.0]
data['sentence1'].to_csv("sent1.csv", index=False)
data['sentence2'].to_csv("sent2.csv", index=False)

In [60]:
def load_model(path):
    tok = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(path)
    return tok, model

ko_sbert_multitask = 'jhgan/ko-sbert-multitask'
KoSbert_tok, KoSbert_model = load_model(ko_sbert_multitask)

In [101]:
class contentDataset(Dataset):
    def __init__(self, file, tok, max_len, pad_index=None):
        super().__init__()
        self.tok =tok
        self.max_len = max_len
        self.content = pd.read_csv(file)
        self.len = self.content.shape[0]
        self.pad_index = self.tok.pad_token
        self.column = self.content.columns[0]
    
    def add_padding_data(self, inputs, max_len):
        if len(inputs) < max_len:
            pad = np.array([0] * (max_len - len(inputs)))
            inputs = np.concatenate([inputs, pad])
            return inputs
        else:
            inputs = inputs[:max_len]
            return inputs
    
    def __getitem__(self,idx):
        instance = self.content.iloc[idx]
        text = instance[self.column]
        input_ids = self.tok.encode(text)
        
        input_ids = self.add_padding_data(input_ids, max_len=self.max_len)
        return {"encoder_input_ids" : np.array(input_ids, dtype=np.int_)}        

    def __len__(self):
        return self.len

In [102]:
data_setup_1 = contentDataset(file = "sent1.csv", tok = KoSbert_tok, max_len = 128)
data_setup_2 = contentDataset(file = "sent2.csv", tok = KoSbert_tok, max_len = 128)

dataloader_1 = DataLoader(data_setup_1, batch_size=32, shuffle=False)
dataloader_2 = DataLoader(data_setup_2, batch_size=32, shuffle=False)

In [103]:
def align_loss(x, y, alpha=2):    
    return (x - y).norm(p=2, dim=1).pow(alpha).mean()

def uniform_loss(x, t=2):
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()

In [106]:
device = torch.device("cuda:2") if torch.cuda.is_available() else torch.device("cpu")

KoSbert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [132]:
align_all = []
unif_all = []

for batch1, batch2 in zip(dataloader_1, dataloader_2):
    batch1 = {k: v.to(device) for k, v in batch1.items()}
    batch2 = {k: v.to(device) for k, v in batch2.items()}

    encoder_attention_mask1 = batch1["encoder_input_ids"].ne(0).float().to(device)
    encoder_attention_mask2 = batch2["encoder_input_ids"].ne(0).float().to(device)

    with torch.no_grad():
        outputs1 = KoSbert_model(batch1['encoder_input_ids'], attention_mask=encoder_attention_mask1)
        outputs2 = KoSbert_model(batch2['encoder_input_ids'], attention_mask=encoder_attention_mask2)

        pooler_output1 = outputs1.pooler_output
        pooler_output2 = outputs2.pooler_output

        pooler_output1 = F.normalize(pooler_output1,p=2,dim=1)
        pooler_output2 = F.normalize(pooler_output2,p=2,dim=1)

        align_all.append(align_loss(pooler_output1, pooler_output2, alpha=2))

        pooler_cat = torch.cat((pooler_output1, pooler_output2))
        unif_all.append(uniform_loss(pooler_cat, t=2))

In [133]:
alignment = sum(align_all) / len(align_all)
alignment

tensor(0.1995, device='cuda:2')

In [134]:
uniformity = sum(unif_all) / len(unif_all)
uniformity

tensor(-2.2073, device='cuda:2')

In [4]:
tok = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
model = AutoModel.from_pretrained("beomi/KcELECTRA-base")

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
if 'ELECTRA' in 'KcELECTRA':
    print('d')

d
