In [1]:
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import math

In [2]:
def euclidean_distance(x,y):
  """ Euclidean (L2) distance between two numeric sequences.

  The sequences are paired element-wise with `zip`, so any surplus elements
  of the longer one are ignored.
  """

  squared_gaps = [(p - q) ** 2 for p, q in zip(x, y)]
  return math.sqrt(sum(squared_gaps))

def squared_sum(x):
  """ L2 norm of a numeric sequence, rounded to 3 decimal places.

  Despite the name, the value returned is the square root of the sum of
  squares, not the sum of squares itself.
  """

  norm = math.sqrt(sum(value * value for value in x))
  return round(norm, 3)
  
def cos_similarity(x,y):
  """ return cosine similarity between two lists, rounded to 3 decimals

  The vector norms are computed at full precision and only the final
  result is rounded. Rounding the norms first distorts the ratio for
  small-magnitude vectors (a vector compared with itself could score 2.0)
  and can round a non-zero norm down to 0.

  Raises:
      ZeroDivisionError: if either vector has zero norm, for which the
          cosine similarity is undefined.
  """

  numerator = sum(a*b for a,b in zip(x,y))
  norm_x = math.sqrt(sum(a*a for a in x))
  norm_y = math.sqrt(sum(b*b for b in y))
  return round(numerator/(norm_x*norm_y),3)

In [3]:
import logging
from tqdm import tqdm
import numpy as np
from numpy import ndarray
import pandas as pd
import torch
from torch import Tensor, device
import transformers
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from typing import List, Dict, Tuple, Type, Union
from kobert_tokenizer import KoBertTokenizer

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Timestamped, INFO-level logging for the whole process; `logger` is the
# logger used by the definitions further down in this notebook.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
# Directory holding the Korean STS splits. Edit this single line to run the
# notebook on another machine.
DATA_DIR = "/home/keonwoo/anaconda3/envs/KoDiffCSE/data"

# `evaluation` reads the `sentence1`, `sentence2` and `score` columns of these frames.
sts_dev = pd.read_csv(f"{DATA_DIR}/ko_sts_dev.txt")
sts_test = pd.read_csv(f"{DATA_DIR}/ko_sts_test.txt")

In [33]:
class SE_model(object):
    """
    A class for embedding sentences, calculating similarities, and retrieving sentences by DiffCSE. The code here is provided by SimCSE.

    Typical use: construct once, then call `encode` / `similarity` directly,
    or call `build_index` followed by `search` for nearest-sentence retrieval.
    """
    def __init__(self, 
                seq2seq,
                model_name_or_path: str, 
                device: str = None,
                num_cells: int = 100,
                num_cells_in_search: int = 10,
                pooler = None):
        """
        Load tokenizer and model and record the retrieval settings.

        Args:
            seq2seq: truthy for encoder-decoder checkpoints; only the
                `.encoder` sub-module of the loaded model is then kept.
            model_name_or_path: identifier handed to `from_pretrained`.
            device: device string used by `encode`. When None, a CUDA device
                is picked if available, otherwise "cpu".
            num_cells: upper bound on the number of IVF cells used by
                `build_index(faiss_fast=True)`.
            num_cells_in_search: upper bound on the `nprobe` value set on the
                faiss index.
            pooler: "cls" (use `pooler_output`) or "cls_before_pooler" (use
                the first position of `last_hidden_state`). Defaults to
                "cls_before_pooler".
        """

        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # self.tokenizer = KoBertTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        if seq2seq:
            self.model = self.model.encoder
        if device is None:
            if torch.cuda.is_available():
                # Keep the original preference for the second GPU, but fall
                # back to the first one on single-GPU machines where
                # "cuda:1" does not exist.
                device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
            else:
                device = "cpu"
        self.device = device

        self.index = None
        self.is_faiss_index = False
        self.num_cells = num_cells
        self.num_cells_in_search = num_cells_in_search

        if pooler is not None:
            self.pooler = pooler
        else:
            logger.info("Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.")
            self.pooler = "cls_before_pooler"
    
    def encode(self, sentence: Union[str, List[str]], 
                device: str = None, 
                return_numpy: bool = False,
                normalize_to_unit: bool = True,
                keepdim: bool = False,
                batch_size: int = 64,
                max_length: int = 128) -> Union[ndarray, Tensor]:
        """
        Embed one sentence or a list of sentences.

        Args:
            sentence: a single string or a list of strings.
            device: overrides `self.device` for this call; the model is moved
                to that device.
            return_numpy: return a NumPy array instead of a tensor.
            normalize_to_unit: divide every embedding by its L2 norm.
            keepdim: for a single string, keep the leading dimension instead
                of returning a 1-D vector.
            batch_size: number of sentences tokenized and encoded per step.
            max_length: truncation length passed to the tokenizer.

        Returns:
            Embeddings moved to the CPU, one row per sentence (a 1-D vector
            for a single string unless `keepdim` is set).

        Raises:
            NotImplementedError: if `self.pooler` is not "cls" or
                "cls_before_pooler".
        """

        target_device = self.device if device is None else device
        self.model = self.model.to(target_device)
        
        single_sentence = False
        if isinstance(sentence, str):
            sentence = [sentence]
            single_sentence = True

        embedding_list = [] 
        with torch.no_grad():
            # Ceiling division: a final partial batch still counts as a batch.
            total_batch = len(sentence) // batch_size + (1 if len(sentence) % batch_size > 0 else 0)
            for batch_id in tqdm(range(total_batch)):
                inputs = self.tokenizer(
                    sentence[batch_id*batch_size:(batch_id+1)*batch_size], 
                    padding=True, 
                    truncation=True, 
                    max_length=max_length, 
                    return_tensors="pt"
                )
                inputs = {k: v.to(target_device) for k, v in inputs.items()}
                outputs = self.model(**inputs, return_dict=True)
                if self.pooler == "cls":
                    embeddings = outputs.pooler_output
                elif self.pooler == "cls_before_pooler":
                    # Hidden state at the first token position of every sequence.
                    embeddings = outputs.last_hidden_state[:, 0]
                else:
                    raise NotImplementedError
                if normalize_to_unit:
                    embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
                embedding_list.append(embeddings.cpu())
        embeddings = torch.cat(embedding_list, 0)
        
        if single_sentence and not keepdim:
            embeddings = embeddings[0]
        
        if return_numpy and not isinstance(embeddings, ndarray):
            return embeddings.numpy()
        return embeddings
    
    def similarity(self, queries: Union[str, List[str]], 
                    keys: Union[str, List[str], ndarray], 
                    device: str = None) -> Union[float, ndarray]:
        """
        Cosine similarity between queries and keys.

        Args:
            queries: a string or list of strings (N queries).
            keys: a string, a list of strings, or an array of precomputed
                embeddings (M keys).
            device: forwarded to `encode`.

        Returns:
            An N*M array; a length-M array when `queries` is a single
            string; a float when the key side is a single vector as well.
        """
        
        query_vecs = self.encode(queries, device=device, return_numpy=True) # suppose N queries
        
        if not isinstance(keys, ndarray):
            key_vecs = self.encode(keys, device=device, return_numpy=True) # suppose M keys
        else:
            key_vecs = keys

        # check whether N == 1 or M == 1
        single_query, single_key = len(query_vecs.shape) == 1, len(key_vecs.shape) == 1 
        if single_query:
            query_vecs = query_vecs.reshape(1, -1)
        if single_key:
            key_vecs = key_vecs.reshape(1, -1)
        
        # returns an N*M similarity array
        similarities = cosine_similarity(query_vecs, key_vecs)
        
        if single_query:
            similarities = similarities[0]
            if single_key:
                similarities = float(similarities[0])
        
        return similarities
    
    def build_index(self, sentences_or_file_path: Union[str, List[str]], 
                        use_faiss: bool = None,
                        faiss_fast: bool = False,
                        device: str = None,
                        batch_size: int = 64):
        """
        Encode a sentence collection and store it in `self.index` for `search`.

        Args:
            sentences_or_file_path: a list of sentences, or the path of a
                text file holding one sentence per line.
            use_faiss: None or True tries to use faiss and falls back to
                brute-force search if it cannot be imported; False forces
                brute force.
            faiss_fast: build an `IndexIVFFlat` instead of a flat
                inner-product index.
            device: forwarded to `encode`; also consulted when deciding
                whether to move the faiss index to the GPU.
            batch_size: forwarded to `encode`.
        """

        if use_faiss is None or use_faiss:
            try:
                import faiss
                assert hasattr(faiss, "IndexFlatIP")
                use_faiss = True 
            except Exception:
                # Any failure while importing/probing faiss means "not usable";
                # KeyboardInterrupt and SystemExit still propagate.
                logger.warning("Fail to import faiss. If you want to use faiss, install faiss through PyPI. Now the program continues with brute force search.")
                use_faiss = False
        
        # if the input sentence is a string, we assume it's the path of file that stores various sentences
        if isinstance(sentences_or_file_path, str):
            sentences = []
            with open(sentences_or_file_path, "r") as f:
                logger.info("Loading sentences from %s ..." % (sentences_or_file_path))
                for line in tqdm(f):
                    sentences.append(line.rstrip())
            sentences_or_file_path = sentences
        
        logger.info("Encoding embeddings for sentences...")
        embeddings = self.encode(sentences_or_file_path, device=device, batch_size=batch_size, normalize_to_unit=True, return_numpy=True)

        logger.info("Building index...")
        self.index = {"sentences": sentences_or_file_path}
        
        if use_faiss:
            quantizer = faiss.IndexFlatIP(embeddings.shape[1])  
            if faiss_fast:
                index = faiss.IndexIVFFlat(quantizer, embeddings.shape[1], min(self.num_cells, len(sentences_or_file_path))) 
            else:
                index = quantizer

            # Only the exact string "cuda" selects GPU faiss here; indexed
            # device strings such as "cuda:1" fall through to the CPU branch.
            if (self.device == "cuda" and device != "cpu") or device == "cuda":
                if hasattr(faiss, "StandardGpuResources"):
                    logger.info("Use GPU-version faiss")
                    res = faiss.StandardGpuResources()
                    res.setTempMemory(20 * 1024 * 1024 * 1024)
                    index = faiss.index_cpu_to_gpu(res, 0, index)
                else:
                    logger.info("Use CPU-version faiss")
            else: 
                logger.info("Use CPU-version faiss")

            if faiss_fast:            
                index.train(embeddings.astype(np.float32))
            index.add(embeddings.astype(np.float32))
            index.nprobe = min(self.num_cells_in_search, len(sentences_or_file_path))
            self.is_faiss_index = True
        else:
            # Brute force: keep the raw embedding matrix and score it in `search`.
            index = embeddings
            self.is_faiss_index = False
        self.index["index"] = index
        logger.info("Finished")
    
    def search(self, queries: Union[str, List[str]], 
                device: str = None, 
                threshold: float = 0,
                top_k: int = 5) -> Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]:
        """
        Retrieve the indexed sentences most similar to the query or queries.

        `build_index` must have been called first.

        Args:
            queries: a single string or a list of strings.
            device: forwarded to `encode`.
            threshold: minimum score for a hit to be returned.
            top_k: maximum number of hits per query.

        Returns:
            For a single string, a list of `(sentence, score)` pairs sorted
            by decreasing score; for a list, one such list per query.
        """
        
        if not self.is_faiss_index:
            if isinstance(queries, list):
                # Forward every search option, so list queries obey the same
                # threshold/top_k/device as single queries.
                return [
                    self.search(query, device=device, threshold=threshold, top_k=top_k)
                    for query in queries
                ]
            
            similarities = self.similarity(queries, self.index["index"], device=device).tolist()
            id_and_score = []
            for i, s in enumerate(similarities):
                if s >= threshold:
                    id_and_score.append((i, s))
            id_and_score = sorted(id_and_score, key=lambda x: x[1], reverse=True)[:top_k]
            results = [(self.index["sentences"][idx], score) for idx, score in id_and_score]
            return results
        else:
            query_vecs = self.encode(queries, device=device, normalize_to_unit=True, keepdim=True, return_numpy=True)

            distance, idx = self.index["index"].search(query_vecs.astype(np.float32), top_k)
            
            def pack_single_result(dist, idx):
                # faiss pads missing neighbours with label -1; skip those so a
                # negative label never indexes the sentence list from the end.
                results = [(self.index["sentences"][i], s) for i, s in zip(idx, dist) if i >= 0 and s >= threshold]
                return results
            
            if isinstance(queries, list):
                combined_results = []
                for i in range(len(queries)):
                    results = pack_single_result(distance[i], idx[i])
                    combined_results.append(results)
                return combined_results
            else:
                return pack_single_result(distance[0], idx[0])

In [27]:
def evaluation(eval_dataset,model):
    """
    Correlate embedding-based similarity scores with gold labels.

    Args:
        eval_dataset: table with `sentence1`, `sentence2` and `score`
            columns; the two sentence columns must support `.tolist()`.
        model: object whose `encode(list_of_sentences)` returns one
            embedding per sentence.

    Returns:
        dict with Pearson and Spearman coefficients for four similarity
        measures: cosine similarity, negated Manhattan distance, negated
        Euclidean distance and dot product.
    """
    left = model.encode(eval_dataset['sentence1'].tolist())
    right = model.encode(eval_dataset['sentence2'].tolist())
    gold = eval_dataset['score']

    # Distances are negated so that, like the cosine score, a larger value
    # means a closer pair.
    cosine = 1 - (paired_cosine_distances(left, right))
    manhattan = -paired_manhattan_distances(left, right)
    euclidean = -paired_euclidean_distances(left, right)
    dot = [np.dot(u, v) for u, v in zip(left, right)]

    # (result key, correlation function, predicted scores), in report order.
    report_plan = (
        ('eval_pearson_cosine', pearsonr, cosine),
        ('eval_spearman_cosine', spearmanr, cosine),
        ('eval_pearson_manhattan', pearsonr, manhattan),
        ('eval_spearman_manhattan', spearmanr, manhattan),
        ('eval_pearson_euclidean', pearsonr, euclidean),
        ('eval_spearman_euclidean', spearmanr, euclidean),
        ('eval_pearson_dot', pearsonr, dot),
        ('eval_spearman_dot', spearmanr, dot),
    )

    score = {}
    for key, correlate, predicted in report_plan:
        coefficient, _ = correlate(gold, predicted)
        score[key] = coefficient
    return score

### ko-SROBERTA

In [9]:
model_name = "jhgan/ko-sroberta-multitask"
# SE_model requires `seq2seq`; False keeps the full AutoModel (no `.encoder` extraction).
sroberta = SE_model(seq2seq=False, model_name_or_path=model_name)

08/02/2022 16:15:07 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [10]:
# Pearson/Spearman correlations for ko-sroberta-multitask on the STS dev split.
evaluation(sts_dev, sroberta)

100%|██████████| 23/23 [00:01<00:00, 12.18it/s]
100%|██████████| 23/23 [00:01<00:00, 12.87it/s]


{'eval_pearson_cosine': 0.8235922346436934,
 'eval_spearman_cosine': 0.8306339848688392,
 'eval_pearson_manhattan': 0.8340475913066008,
 'eval_spearman_manhattan': 0.8308704946645719,
 'eval_pearson_euclidean': 0.8338403027148542,
 'eval_spearman_euclidean': 0.8306342868724003,
 'eval_pearson_dot': 0.8235922284981624,
 'eval_spearman_dot': 0.8306336245663968}

In [11]:
# Pearson/Spearman correlations for ko-sroberta-multitask on the STS test split.
evaluation(sts_test, sroberta)

100%|██████████| 22/22 [00:01<00:00, 14.99it/s]
100%|██████████| 22/22 [00:01<00:00, 15.23it/s]


{'eval_pearson_cosine': 0.7633258046589375,
 'eval_spearman_cosine': 0.7766879477066665,
 'eval_pearson_manhattan': 0.783168845374605,
 'eval_spearman_manhattan': 0.7757696904826148,
 'eval_pearson_euclidean': 0.7837702254917288,
 'eval_spearman_euclidean': 0.7766907576440633,
 'eval_pearson_dot': 0.7633257906623869,
 'eval_spearman_dot': 0.7766857288655976}

### XLM

In [12]:
model_name = "xlm-roberta-base"
# SE_model requires `seq2seq`; False keeps the full AutoModel (no `.encoder` extraction).
xlm = SE_model(seq2seq=False, model_name_or_path=model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=615.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




08/02/2022 16:18:40 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [13]:
# Pearson/Spearman correlations for xlm-roberta-base on the STS dev split.
evaluation(sts_dev, xlm)

100%|██████████| 23/23 [00:01<00:00, 12.11it/s]
100%|██████████| 23/23 [00:01<00:00, 11.88it/s]


{'eval_pearson_cosine': 0.15857128888690109,
 'eval_spearman_cosine': 0.35752388818577313,
 'eval_pearson_manhattan': 0.3647729118482682,
 'eval_spearman_manhattan': 0.4494821851771784,
 'eval_pearson_euclidean': 0.2525653234069045,
 'eval_spearman_euclidean': 0.3575227482478141,
 'eval_pearson_dot': 0.15857250021070793,
 'eval_spearman_dot': 0.35752701848700713}

In [14]:
# Pearson/Spearman correlations for xlm-roberta-base on the STS test split.
evaluation(sts_test, xlm)

100%|██████████| 22/22 [00:01<00:00, 13.19it/s]
100%|██████████| 22/22 [00:01<00:00, 16.02it/s]


{'eval_pearson_cosine': 0.10143291340818458,
 'eval_spearman_cosine': 0.3073667621946652,
 'eval_pearson_manhattan': 0.29167672012112594,
 'eval_spearman_manhattan': 0.38190490644713543,
 'eval_pearson_euclidean': 0.19762502245081698,
 'eval_spearman_euclidean': 0.3073634280991161,
 'eval_pearson_dot': 0.10143153065131222,
 'eval_spearman_dot': 0.3073755940861359}

### EASE

In [15]:
model_name = "sosuke/ease-bert-base-multilingual-cased"
# SE_model requires `seq2seq`; False keeps the full AutoModel (no `.encoder` extraction).
EASE_model = SE_model(seq2seq=False, model_name_or_path=model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=914.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961847.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=537.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711492337.0, style=ProgressStyle(descri…




08/02/2022 16:20:23 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [16]:
# Pearson/Spearman correlations for ease-bert-base-multilingual-cased on the STS dev split.
evaluation(sts_dev, EASE_model)

100%|██████████| 23/23 [00:01<00:00, 11.54it/s]
100%|██████████| 23/23 [00:02<00:00, 11.14it/s]


{'eval_pearson_cosine': 0.6571814839737603,
 'eval_spearman_cosine': 0.6894279583297167,
 'eval_pearson_manhattan': 0.6871026115656402,
 'eval_spearman_manhattan': 0.6894776620854263,
 'eval_pearson_euclidean': 0.6869649746372406,
 'eval_spearman_euclidean': 0.6894273581426218,
 'eval_pearson_dot': 0.6571814999732488,
 'eval_spearman_dot': 0.6894282655243558}

In [17]:
# Pearson/Spearman correlations for ease-bert-base-multilingual-cased on the STS test split.
evaluation(sts_test, EASE_model)

100%|██████████| 22/22 [00:01<00:00, 12.33it/s]
100%|██████████| 22/22 [00:01<00:00, 13.47it/s]


{'eval_pearson_cosine': 0.5710576355138167,
 'eval_spearman_cosine': 0.6125899662303511,
 'eval_pearson_manhattan': 0.6070152843357819,
 'eval_spearman_manhattan': 0.6113629797592428,
 'eval_pearson_euclidean': 0.6081834863996942,
 'eval_spearman_euclidean': 0.612591546988357,
 'eval_pearson_dot': 0.5710575169624974,
 'eval_spearman_dot': 0.6125946278397131}

### mBERT (multilingual DistilBERT: bongsoo/mdistilbertV1.1)

In [18]:
model_name = "bongsoo/mdistilbertV1.1"
# SE_model requires `seq2seq`; False keeps the full AutoModel (no `.encoder` extraction).
mbert = SE_model(seq2seq=False, model_name_or_path=model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=574.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1231748.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2440214.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=484.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624555631.0, style=ProgressStyle(descri…




08/02/2022 16:25:48 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [19]:
# Pearson/Spearman correlations for mdistilbertV1.1 on the STS dev split.
evaluation(sts_dev, mbert)

100%|██████████| 23/23 [00:00<00:00, 23.18it/s]
100%|██████████| 23/23 [00:00<00:00, 30.34it/s]


{'eval_pearson_cosine': 0.6001289439562228,
 'eval_spearman_cosine': 0.6034266435390286,
 'eval_pearson_manhattan': 0.6100265875086988,
 'eval_spearman_manhattan': 0.6031005113076003,
 'eval_pearson_euclidean': 0.6099035868420458,
 'eval_spearman_euclidean': 0.6034265882765772,
 'eval_pearson_dot': 0.6001289349133595,
 'eval_spearman_dot': 0.6034260590209415}

In [20]:
# Pearson/Spearman correlations for mdistilbertV1.1 on the STS test split.
evaluation(sts_test, mbert)

100%|██████████| 22/22 [00:00<00:00, 30.82it/s]
100%|██████████| 22/22 [00:00<00:00, 26.92it/s]


{'eval_pearson_cosine': 0.5275364053314909,
 'eval_spearman_cosine': 0.5182592279194106,
 'eval_pearson_manhattan': 0.5383089948304676,
 'eval_spearman_manhattan': 0.5188866936083439,
 'eval_pearson_euclidean': 0.5377250926082614,
 'eval_spearman_euclidean': 0.5182585571550561,
 'eval_pearson_dot': 0.5275364030857164,
 'eval_spearman_dot': 0.5182623577999687}

### mT5

In [35]:
model_name = "google/mt5-base"
# seq2seq=True: SE_model keeps only the `.encoder` of the loaded model.
mT5 = SE_model(seq2seq=True, model_name_or_path=model_name)

Some weights of the model checkpoint at google/mt5-base were not used when initializing MT5Model: ['lm_head.weight']
- This IS expected if you are initializing MT5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MT5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
08/02/2022 16:53:08 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [36]:
# Pearson/Spearman correlations for the mt5-base encoder on the STS dev split.
evaluation(sts_dev, mT5)

100%|██████████| 23/23 [00:01<00:00, 15.67it/s]
100%|██████████| 23/23 [00:01<00:00, 16.33it/s]


{'eval_pearson_cosine': 0.12282037434944885,
 'eval_spearman_cosine': 0.14623620457569753,
 'eval_pearson_manhattan': 0.13827270990010335,
 'eval_spearman_manhattan': 0.1211560521994656,
 'eval_pearson_euclidean': 0.1374188430913635,
 'eval_spearman_euclidean': 0.14623609100452273,
 'eval_pearson_dot': 0.12282035521313545,
 'eval_spearman_dot': 0.14623636483364894}

In [37]:
# Pearson/Spearman correlations for the mt5-base encoder on the STS test split.
evaluation(sts_test, mT5)

100%|██████████| 22/22 [00:02<00:00,  8.85it/s]
100%|██████████| 22/22 [00:01<00:00, 11.24it/s]


{'eval_pearson_cosine': 0.09406104296210874,
 'eval_spearman_cosine': 0.09993770245733527,
 'eval_pearson_manhattan': 0.11564662553151268,
 'eval_spearman_manhattan': 0.11670778272524364,
 'eval_pearson_euclidean': 0.10921812224695854,
 'eval_spearman_euclidean': 0.09994107875252256,
 'eval_pearson_dot': 0.09406104149777514,
 'eval_spearman_dot': 0.09994928773421191}

### KE-T5

In [38]:
model_name = "KETI-AIR/ke-t5-base"
# seq2seq=True: SE_model keeps only the `.encoder` of the loaded model.
keT5 = SE_model(seq2seq=True, model_name_or_path=model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=599.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1466734.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1964.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=990048016.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at KETI-AIR/ke-t5-base were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
08/02/2022 16:54:04 - INFO - __main__ -   Use `cls_before_pooler` for DiffCSE models. If you want to use other pooling policy, specify `pooler` argument.


In [39]:
# Pearson/Spearman correlations for the ke-t5-base encoder on the STS dev split.
evaluation(sts_dev, keT5)

100%|██████████| 23/23 [00:02<00:00,  9.48it/s]
100%|██████████| 23/23 [00:01<00:00, 12.91it/s]


{'eval_pearson_cosine': 0.25325630257295256,
 'eval_spearman_cosine': 0.2507158529857863,
 'eval_pearson_manhattan': 0.270132898490969,
 'eval_spearman_manhattan': 0.25174994286188124,
 'eval_pearson_euclidean': 0.2702215148087061,
 'eval_spearman_euclidean': 0.2507142800848121,
 'eval_pearson_dot': 0.2532563118797479,
 'eval_spearman_dot': 0.2507185852481801}

In [40]:
# Pearson/Spearman correlations for the ke-t5-base encoder on the STS test split.
evaluation(sts_test, keT5)

100%|██████████| 22/22 [00:01<00:00, 11.95it/s]
100%|██████████| 22/22 [00:01<00:00, 13.56it/s]


{'eval_pearson_cosine': 0.16178171162500965,
 'eval_spearman_cosine': 0.1748727834214834,
 'eval_pearson_manhattan': 0.1888581875427734,
 'eval_spearman_manhattan': 0.17201728900221488,
 'eval_pearson_euclidean': 0.18932171989947139,
 'eval_spearman_euclidean': 0.17487468683313417,
 'eval_pearson_dot': 0.16178171867356078,
 'eval_spearman_dot': 0.1748723039554667}