In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import argparse
from kobert_tokenizer import KoBertTokenizer
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import math
import logging
from tqdm import tqdm
import numpy as np
from numpy import ndarray
import pandas as pd
import torch
from torch import Tensor, device
import transformers
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from typing import List, Dict, Tuple, Type, Union
from kobert_tokenizer import KoBertTokenizer

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Timestamped, INFO-level logging for the whole notebook; `logger` is the
# module-level logger that PLM_MODEL writes its progress messages to.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class PLM_MODEL(object):
    """
    A class for embedding sentences, calculating similarities, and retrieving sentences by DiffCSE. The code here is provided by SimCSE.

    The tokenizer and encoder are loaded with ``AutoTokenizer`` / ``AutoModel``
    from ``model_name_or_path``. Sentence embeddings are taken either from the
    model's ``pooler_output`` (``pooler="cls"``) or from the first token of the
    last hidden state (``pooler="cls_before_pooler"``, the default).
    """
    def __init__(self, model_name_or_path: str, 
                device: str = None,
                num_cells: int = 100,
                num_cells_in_search: int = 10,
                pooler = None):
        """
        Args:
            model_name_or_path: checkpoint name or directory passed to ``from_pretrained``.
            device: device string; defaults to ``"cuda:0"`` when CUDA is available, else ``"cpu"``.
            num_cells: upper bound on the number of IVF cells when ``faiss_fast`` indexing is used.
            num_cells_in_search: upper bound on ``nprobe`` set on the built index.
            pooler: ``"cls"``, ``"cls_before_pooler"`` or ``None`` (falls back to ``"cls_before_pooler"``).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # self.tokenizer = KoBertTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Populated by build_index(): {"sentences": [...], "index": ndarray | faiss index}
        self.index = None
        self.is_faiss_index = False
        self.num_cells = num_cells
        self.num_cells_in_search = num_cells_in_search

        if pooler is not None:
            self.pooler = pooler
        else:
            logger.info("Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.")
            self.pooler = "cls_before_pooler"
    
    def encode(self, sentence: Union[str, List[str]], 
                device: str = None, 
                return_numpy: bool = False,
                normalize_to_unit: bool = True,
                keepdim: bool = False,
                batch_size: int = 2,
                max_length: int = 128) -> Union[ndarray, Tensor]:
        """
        Embed one sentence or a list of sentences.

        Args:
            sentence: a single string or a list of strings.
            device: device to run the model on; defaults to ``self.device``.
            return_numpy: return a NumPy array instead of a CPU tensor.
            normalize_to_unit: divide every embedding by its L2 norm.
            keepdim: for a single string, keep the leading batch dimension.
            batch_size: number of sentences tokenized and encoded per step.
            max_length: tokenizer truncation length.

        Returns:
            A CPU tensor (or ndarray) with one row per sentence; a single string
            yields a 1-D vector unless ``keepdim`` is set.

        Raises:
            NotImplementedError: if ``self.pooler`` is not a supported policy.
        """
        target_device = self.device if device is None else device
        self.model = self.model.to(target_device)
        
        single_sentence = False
        if isinstance(sentence, str):
            sentence = [sentence]
            single_sentence = True

        embedding_list = [] 
        with torch.no_grad():
            # Ceiling division: the last batch may be smaller than batch_size.
            total_batch = (len(sentence) + batch_size - 1) // batch_size
            for batch_id in tqdm(range(total_batch)):
                inputs = self.tokenizer(
                    sentence[batch_id*batch_size:(batch_id+1)*batch_size], 
                    padding=True, 
                    truncation=True, 
                    max_length=max_length, 
                    return_tensors="pt"
                )
                inputs = {k: v.to(target_device) for k, v in inputs.items()}
                outputs = self.model(**inputs, return_dict=True)
                if self.pooler == "cls":
                    embeddings = outputs.pooler_output
                elif self.pooler == "cls_before_pooler":
                    embeddings = outputs.last_hidden_state[:, 0]
                else:
                    raise NotImplementedError
                if normalize_to_unit:
                    embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
                # Collect on CPU so accumulated batches do not stay on the accelerator.
                embedding_list.append(embeddings.cpu())
        embeddings = torch.cat(embedding_list, 0)
        
        if single_sentence and not keepdim:
            embeddings = embeddings[0]
        
        if return_numpy and not isinstance(embeddings, ndarray):
            return embeddings.numpy()
        return embeddings
    
    def similarity(self, queries: Union[str, List[str]], 
                    keys: Union[str, List[str], ndarray], 
                    device: str = None) -> Union[float, ndarray]:
        """
        Cosine similarity between queries and keys.

        ``keys`` may already be an embedding array, in which case it is used as is.

        Returns:
            A float for one query against one key, a 1-D array for one query
            against several keys, otherwise an N x M array.
        """
        query_vecs = self.encode(queries, device=device, return_numpy=True) # suppose N queries
        
        if not isinstance(keys, ndarray):
            key_vecs = self.encode(keys, device=device, return_numpy=True) # suppose M keys
        else:
            key_vecs = keys

        # check whether N == 1 or M == 1
        single_query, single_key = len(query_vecs.shape) == 1, len(key_vecs.shape) == 1 
        if single_query:
            query_vecs = query_vecs.reshape(1, -1)
        if single_key:
            key_vecs = key_vecs.reshape(1, -1)
        
        # returns an N*M similarity array
        similarities = cosine_similarity(query_vecs, key_vecs)
        
        if single_query:
            similarities = similarities[0]
            if single_key:
                similarities = float(similarities[0])
        
        return similarities
    
    def build_index(self, sentences_or_file_path: Union[str, List[str]], 
                        use_faiss: bool = None,
                        faiss_fast: bool = False,
                        device: str = None,
                        batch_size: int = 64):
        """
        Encode a sentence collection and store it for later ``search`` calls.

        Args:
            sentences_or_file_path: a list of sentences, or the path of a text
                file holding one sentence per line.
            use_faiss: ``None``/``True`` tries to use faiss and falls back to
                brute-force search when it cannot be imported; ``False`` skips faiss.
            faiss_fast: build an ``IndexIVFFlat`` instead of a flat inner-product index.
            device: device used for encoding (defaults to ``self.device``).
            batch_size: encoding batch size.
        """
        if use_faiss is None or use_faiss:
            try:
                import faiss
                assert hasattr(faiss, "IndexFlatIP")
                use_faiss = True 
            except Exception:
                # Best-effort import: any failure falls back to brute force, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warning("Fail to import faiss. If you want to use faiss, install faiss through PyPI. Now the program continues with brute force search.")
                use_faiss = False
        
        # if the input sentence is a string, we assume it's the path of file that stores various sentences
        if isinstance(sentences_or_file_path, str):
            sentences = []
            with open(sentences_or_file_path, "r") as f:
                logger.info("Loading sentences from %s ..." % (sentences_or_file_path))
                for line in tqdm(f):
                    sentences.append(line.rstrip())
            sentences_or_file_path = sentences
        
        logger.info("Encoding embeddings for sentences...")
        embeddings = self.encode(sentences_or_file_path, device=device, batch_size=batch_size, normalize_to_unit=True, return_numpy=True)

        logger.info("Building index...")
        self.index = {"sentences": sentences_or_file_path}
        
        if use_faiss:
            quantizer = faiss.IndexFlatIP(embeddings.shape[1])  
            if faiss_fast:
                index = faiss.IndexIVFFlat(quantizer, embeddings.shape[1], min(self.num_cells, len(sentences_or_file_path))) 
            else:
                index = quantizer

            # Compare by prefix: __init__ defaults to "cuda:0", which an exact
            # match against "cuda" would never recognise as a GPU device.
            target_device = str(self.device if device is None else device)
            if target_device.startswith("cuda"):
                if hasattr(faiss, "StandardGpuResources"):
                    logger.info("Use GPU-version faiss")
                    # Place the index on the same GPU ordinal the encoder uses.
                    gpu_id = int(target_device.split(":", 1)[1]) if ":" in target_device else 0
                    res = faiss.StandardGpuResources()
                    res.setTempMemory(20 * 1024 * 1024 * 1024)
                    index = faiss.index_cpu_to_gpu(res, gpu_id, index)
                else:
                    logger.info("Use CPU-version faiss")
            else: 
                logger.info("Use CPU-version faiss")

            if faiss_fast:            
                index.train(embeddings.astype(np.float32))
            index.add(embeddings.astype(np.float32))
            index.nprobe = min(self.num_cells_in_search, len(sentences_or_file_path))
            self.is_faiss_index = True
        else:
            # Brute force: keep the raw embedding matrix as the "index".
            index = embeddings
            self.is_faiss_index = False
        self.index["index"] = index
        logger.info("Finished")
    
    def search(self, queries: Union[str, List[str]], 
                device: str = None, 
                threshold: float = 0,
                top_k: int = 5) -> Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]:
        """
        Retrieve the indexed sentences most similar to the queries.

        Args:
            queries: a single query string or a list of query strings.
            device: device used to encode the queries.
            threshold: minimum score a hit must reach to be returned.
            top_k: maximum number of hits per query.

        Returns:
            For a single query, a list of ``(sentence, score)`` tuples sorted by
            decreasing score; for a list of queries, one such list per query.
        """
        if not self.is_faiss_index:
            if isinstance(queries, list):
                # Forward threshold/top_k as well; otherwise each query would
                # silently fall back to the default values.
                return [
                    self.search(query, device=device, threshold=threshold, top_k=top_k)
                    for query in queries
                ]
            
            similarities = self.similarity(queries, self.index["index"], device=device).tolist()
            id_and_score = []
            for i, s in enumerate(similarities):
                if s >= threshold:
                    id_and_score.append((i, s))
            id_and_score = sorted(id_and_score, key=lambda x: x[1], reverse=True)[:top_k]
            results = [(self.index["sentences"][idx], score) for idx, score in id_and_score]
            return results
        else:
            query_vecs = self.encode(queries, device=device, normalize_to_unit=True, keepdim=True, return_numpy=True)

            distance, idx = self.index["index"].search(query_vecs.astype(np.float32), top_k)
            
            def pack_single_result(scores, labels):
                # A label of -1 marks a padded slot (fewer than top_k neighbours);
                # skip it so it cannot be read as sentences[-1].
                return [
                    (self.index["sentences"][i], s)
                    for i, s in zip(labels, scores)
                    if i >= 0 and s >= threshold
                ]
            
            if isinstance(queries, list):
                return [pack_single_result(distance[i], idx[i]) for i in range(len(queries))]
            else:
                return pack_single_result(distance[0], idx[0])

In [8]:
def euclidean_distance(x,y):
    """Return the Euclidean distance between two equally indexed sequences.

    Elements are paired with ``zip``, so any surplus elements of the longer
    sequence are ignored.
    """
    squared_gaps = [(left - right) ** 2 for left, right in zip(x, y)]
    return math.sqrt(sum(squared_gaps))

def squared_sum(x):
    """Return the L2 norm of ``x`` (square root of the sum of squares), rounded to 3 decimals."""
    norm = math.sqrt(sum(value * value for value in x))
    return round(norm, 3)
  
def cos_similarity(x,y):
    """Return the cosine similarity between two sequences, rounded to 3 decimals.

    The norms are computed at full precision and only the final ratio is
    rounded. Rounding the norms first distorts the result for small-magnitude
    vectors (two identical vectors could score 2.0) and can round a non-zero
    norm down to zero.

    Raises:
        ZeroDivisionError: if either vector has zero norm.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    norm_x = math.sqrt(sum(a * a for a in x))
    norm_y = math.sqrt(sum(b * b for b in y))
    return round(numerator / float(norm_x * norm_y), 3)

def evaluation(eval_dataset,model):
    """Correlate embedding-based similarity scores with the gold STS scores.

    Both sentence columns are encoded with ``model.encode``. Four pairwise
    scores are computed (cosine similarity, negated Manhattan distance, negated
    Euclidean distance, dot product), and each is compared against the
    ``score`` column with Pearson and Spearman correlation.

    Args:
        eval_dataset: table with ``sentence1``, ``sentence2`` and ``score`` columns.
        model: object exposing ``encode(list_of_sentences)``.

    Returns:
        dict mapping ``eval_{pearson,spearman}_{cosine,manhattan,euclidean,dot}``
        to the corresponding correlation coefficient.
    """
    first_embeddings = model.encode(eval_dataset['sentence1'].tolist())
    second_embeddings = model.encode(eval_dataset['sentence2'].tolist())
    gold_scores = eval_dataset['score']

    def correlations(predicted):
        # (pearson r, spearman rho) of the predicted scores against the gold scores
        pearson, _ = pearsonr(gold_scores, predicted)
        spearman, _ = spearmanr(gold_scores, predicted)
        return pearson, spearman

    # Distances are negated so that, like cosine and dot, larger means more similar.
    cosine = 1 - (paired_cosine_distances(first_embeddings, second_embeddings))
    manhattan = -paired_manhattan_distances(first_embeddings, second_embeddings)
    euclidean = -paired_euclidean_distances(first_embeddings, second_embeddings)
    dot = [np.dot(left, right) for left, right in zip(first_embeddings, second_embeddings)]

    pearson_cosine, spearman_cosine = correlations(cosine)
    pearson_manhattan, spearman_manhattan = correlations(manhattan)
    pearson_euclidean, spearman_euclidean = correlations(euclidean)
    pearson_dot, spearman_dot = correlations(dot)

    return {
        'eval_pearson_cosine': pearson_cosine,
        'eval_spearman_cosine': spearman_cosine,
        'eval_pearson_manhattan': pearson_manhattan,
        'eval_spearman_manhattan': spearman_manhattan,
        'eval_pearson_euclidean': pearson_euclidean,
        'eval_spearman_euclidean': spearman_euclidean,
        'eval_pearson_dot': pearson_dot,
        'eval_spearman_dot': spearman_dot,
    }

In [9]:
# Checkpoint identifiers that the evaluation cells below hand to PLM_MODEL,
# which forwards them to AutoTokenizer / AutoModel.from_pretrained.
ko_sbert_multitask = 'jhgan/ko-sbert-multitask'
msbert = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
kcelectra = "beomi/KcELECTRA-base"
kosimcse = "BM-K/KoSimCSE-roberta"
kobert = 'monologg/kobert'
# Local checkpoint directory (absolute path, machine-specific).
kodiffcse = "/home/keonwoo/anaconda3/envs/KoDiffCSE/sroberta_change_lr"
koroberta = "klue/roberta-base"

# Evaluation file (absolute path, machine-specific).
path = "/home/keonwoo/anaconda3/envs/KoDiffCSE/data/ko_sts_test.txt"

# `evaluation` reads the `sentence1`, `sentence2` and `score` columns of this frame.
# NOTE(review): parsed with pandas' default delimiter although the file is .txt — confirm it is comma-separated.
data = pd.read_csv(path)

koroberta

In [10]:
# Bind the loaded model to its own name: overwriting `koroberta` (the checkpoint
# string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
koroberta_model = PLM_MODEL(koroberta)
evaluation(data, koroberta_model)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

{'eval_pearson_cosine': 0.25679844613822966,
 'eval_spearman_cosine': 0.322253040444909,
 'eval_pearson_manhattan': 0.30918020338289237,
 'eval_spearman_manhattan': 0.3230152408737565,
 'eval_pearson_euclidean': 0.3081440348576169,
 'eval_spearman_euclidean': 0.3222551174547771,
 'eval_pearson_dot': 0.2567991498556915,
 'eval_spearman_dot': 0.3222567713268123}

: 

ko_sbert_multitask

In [8]:
# Bind the loaded model to its own name: overwriting `ko_sbert_multitask` (the
# checkpoint string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
ko_sbert_multitask_model = PLM_MODEL(ko_sbert_multitask)
evaluation(data, ko_sbert_multitask_model)

08/17/2022 19:52:40 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.
100%|██████████| 688/688 [00:13<00:00, 49.79it/s]
100%|██████████| 688/688 [00:12<00:00, 55.47it/s]


{'eval_pearson_cosine': 0.8282746047533913,
 'eval_spearman_cosine': 0.8331369159176246,
 'eval_pearson_manhattan': 0.8263263709233586,
 'eval_spearman_manhattan': 0.8327156016896692,
 'eval_pearson_euclidean': 0.8269972815899188,
 'eval_spearman_euclidean': 0.8331352218673969,
 'eval_pearson_dot': 0.8282746052892088,
 'eval_spearman_dot': 0.8331339952185075}

: 

msbert

In [6]:
# Bind the loaded model to its own name: overwriting `msbert` (the checkpoint
# string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
msbert_model = PLM_MODEL(msbert)
evaluation(data, msbert_model)

08/17/2022 19:54:21 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.
100%|██████████| 688/688 [00:09<00:00, 71.01it/s]
100%|██████████| 688/688 [00:08<00:00, 85.84it/s]


{'eval_pearson_cosine': 0.7344746573351436,
 'eval_spearman_cosine': 0.7437088820329356,
 'eval_pearson_manhattan': 0.7513232675402886,
 'eval_spearman_manhattan': 0.7471425832200496,
 'eval_pearson_euclidean': 0.7480378062050028,
 'eval_spearman_euclidean': 0.7437068734137403,
 'eval_pearson_dot': 0.7344746304708472,
 'eval_spearman_dot': 0.7437101591465074}

: 

kcelectra

In [5]:
# Bind the loaded model to its own name: overwriting `kcelectra` (the checkpoint
# string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
kcelectra_model = PLM_MODEL(kcelectra)
evaluation(data, kcelectra_model)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
08/17/2022 19:55:35 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.
100%|██████████| 688/688 [00:11<00:00, 59.86it/s]
100%|██████████| 688/688 [00:12<00:00, 56.4

{'eval_pearson_cosine': 0.14947429068230578,
 'eval_spearman_cosine': 0.22748011671833054,
 'eval_pearson_manhattan': 0.19911186599993233,
 'eval_spearman_manhattan': 0.22932989363658562,
 'eval_pearson_euclidean': 0.1990849249670487,
 'eval_spearman_euclidean': 0.22747870750095237,
 'eval_pearson_dot': 0.1494743415402462,
 'eval_spearman_dot': 0.22747942966845075}

: 

kosimcse

In [5]:
# Bind the loaded model to its own name: overwriting `kosimcse` (the checkpoint
# string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
kosimcse_model = PLM_MODEL(kosimcse)
evaluation(data, kosimcse_model)

08/17/2022 19:56:39 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.
100%|██████████| 688/688 [00:10<00:00, 63.62it/s]
100%|██████████| 688/688 [00:12<00:00, 55.08it/s]


{'eval_pearson_cosine': 0.8318715147053124,
 'eval_spearman_cosine': 0.8348851282018802,
 'eval_pearson_manhattan': 0.8300508476463244,
 'eval_spearman_manhattan': 0.8346963758335499,
 'eval_pearson_euclidean': 0.8301738518983001,
 'eval_spearman_euclidean': 0.8348824759936001,
 'eval_pearson_dot': 0.8318715139464892,
 'eval_spearman_dot': 0.8348828650876597}

: 

kodiffcse

In [5]:
# Bind the loaded model to its own name: overwriting `kodiffcse` (the checkpoint
# path) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
kodiffcse_model = PLM_MODEL(kodiffcse)
evaluation(data, kodiffcse_model)

Some weights of the model checkpoint at /home/keonwoo/anaconda3/envs/KoDiffCSE/sroberta_change_lr were not used when initializing RobertaModel: ['discriminator.encoder.layer.0.attention.self.value.bias', 'discriminator.encoder.layer.10.attention.self.key.weight', 'discriminator.encoder.layer.1.output.LayerNorm.weight', 'discriminator.encoder.layer.5.attention.self.value.weight', 'generator.roberta.encoder.layer.1.attention.self.key.bias', 'generator.roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'discriminator.encoder.layer.11.attention.self.key.bias', 'discriminator.embeddings.LayerNorm.weight', 'discriminator.encoder.layer.10.output.LayerNorm.bias', 'discriminator.encoder.layer.1.attention.self.query.bias', 'discriminator.encoder.layer.9.intermediate.dense.weight', 'discriminator.encoder.layer.5.attention.output.LayerNorm.weight', 'discriminator.encoder.layer.7.attention.self.key.bias', 'discriminator.encoder.layer.0.attention.self.query.weight', 'generator.roberta.encoder

{'eval_pearson_cosine': 0.839635876989272,
 'eval_spearman_cosine': 0.8445370018525205,
 'eval_pearson_manhattan': 0.8422651782455157,
 'eval_spearman_manhattan': 0.8445933927701841,
 'eval_pearson_euclidean': 0.842234823517936,
 'eval_spearman_euclidean': 0.8445351939944561,
 'eval_pearson_dot': 0.8396358754378573,
 'eval_spearman_dot': 0.8445377247537453}

: 

kobert

In [7]:
# Bind the loaded model to its own name: overwriting `kobert` (the checkpoint
# string) would make a second run of this cell pass a PLM_MODEL to PLM_MODEL.
kobert_model = PLM_MODEL(kobert)
evaluation(data, kobert_model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.
08/17/2022 19:59:18 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.
100%|██████████| 688/688 [00:13<00:00, 50.61it/s]
100%|██████████| 688/688 [00:10<00:00, 64.96it/s]


{'eval_pearson_cosine': 0.21849038706847992,
 'eval_spearman_cosine': 0.25617634499135167,
 'eval_pearson_manhattan': 0.26066839293480387,
 'eval_spearman_manhattan': 0.2542042615476074,
 'eval_pearson_euclidean': 0.25948163369055666,
 'eval_spearman_euclidean': 0.25617530966095187,
 'eval_pearson_dot': 0.2184903961732708,
 'eval_spearman_dot': 0.2561754521316946}

: 

Scratch checks (temporary)


In [2]:
from transformers import AutoModel, AutoTokenizer

# Scratch: load a checkpoint through AutoModel so its config can be inspected below.
ckpt = 'princeton-nlp/sup-simcse-bert-base-uncased'
model = AutoModel.from_pretrained(ckpt)

In [7]:
# Scratch: hidden size reported by the loaded checkpoint's config.
model.config.hidden_size

768

In [8]:
from transformers import BertModel

# Scratch: load the plain BERT encoder via the concrete BertModel class.
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Scratch: display the class object to see its fully qualified module path.
BertModel

transformers.models.bert.modeling_bert.BertModel

In [11]:
# Scratch: display the class object to see its fully qualified module path.
AutoModel

transformers.models.auto.modeling_auto.AutoModel