# Setup: mount Drive, install dependencies, import libraries, and load the data


In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored on Drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install third-party dependencies: sentence-transformers (upgraded to the newest release via -U) and pyvi.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases than the captured log shows.
!pip install -U sentence-transformers
!pip install pyvi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 34.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.9 MB/s 
Collecting tokenizers!=0.11.3,<

In [None]:
# Install underthesea, the package that sent_tokenize / word_tokenize are imported from in this notebook.
# NOTE(review): version is unpinned.
!pip install underthesea

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting underthesea
  Downloading underthesea-1.3.4-py3-none-any.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 10.0 MB/s 
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 54.7 MB/s 
Collecting underthesea-core==0.0.4_alpha.10
  Downloading underthesea_core-0.0.4_alpha.10-cp37-cp37m-manylinux2010_x86_64.whl (581 kB)
[K     |████████████████████████████████| 581 kB 72.6 MB/s 
Installing collected packages: unidecode, underthesea-core, underthesea
Successfully installed underthesea-1.3.4 underthesea-core-0.0.4a10 unidecode-1.3.4


In [None]:
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize
import torch

In [None]:
# Switch the working directory to the dataset folder on Drive; relative file paths in later cells resolve against it.
# NOTE(review): this is a hardcoded absolute path that only exists in the author's Drive layout.
%cd "/content/drive/MyDrive/vdt_dsai/dataset"

/content/drive/MyDrive/vdt_dsai/dataset


In [None]:
# List the working directory to check which dataset files are present.
!ls

output		      train_file.json	     val_pairs_bm25.json
test_file.json	      train_pairs_bm25.json  val_pairs_top5.json
test_pairs_bm25.json  train_pairs_top5.json
test_pairs_top5.json  val_file.json


In [None]:
import json
# Load the test / train / validation splits from JSON files in the working directory.
# The files are opened explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("test_file.json", encoding="utf-8") as f:
    test_data = json.load(f)
with open("train_file.json", encoding="utf-8") as f:
    train_data = json.load(f)
with open("val_file.json", encoding="utf-8") as f:
    val_data = json.load(f)

In [None]:
from torch.utils.data import DataLoader
import math
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import os
import gzip
import csv

# Helper functions

In [None]:
from underthesea import sent_tokenize
from underthesea import word_tokenize

def find_sentence(corpus, start_answer):
    '''
    Locate the sentence in which an answer begins.

    The sentences are laid out one after another, with exactly one
    separator character assumed between consecutive sentences.

    * corpus: list of the context's sentences, in order
    * start_answer: character offset at which the answer starts in the context

    Returns the index of the sentence whose character span contains
    start_answer. When start_answer falls exactly on the position right
    after sentence i (the separator slot), i + 1 is returned. Returns -1
    when no sentence matches.
    '''
    offset = 0  # character position where the current sentence begins
    for idx, sentence in enumerate(corpus):
        stop = offset + len(sentence)  # one past the sentence's last character
        if offset <= start_answer < stop:
            return idx
        if start_answer == stop:
            # The answer starts on the separator: attribute it to the next sentence.
            return idx + 1
        # Skip the single separator character before the next sentence.
        offset = stop + 1
    return -1

def accuracy(data, top_k):
    """
    Calculate retrieval accuracy at K.

    Given queries Q = {Q1, ..., Qm},
    a document D = {S1, ..., Sn},
    and Ai, the sentence containing the answer to the question Qi:

    Acc@K = 1/|Q| * sum(is Ai in get_top_k(Qi, D))

    * data: dict read as data['data'][*]['paragraphs'][*], where each paragraph
      has a 'context' (passed to find_sentence as the sentence list) and a
      list of 'qas'. Every qa with at least one answer must carry
      'candidate_indices', the ranked sentence indices to score against.
    * top_k: how many of the leading candidate indices count as retrieved

    Only the first answer of each qa is used. A qa without answers is ignored;
    a qa whose first answer text splits into more than one sentence is counted
    as invalid and not scored.

    Returns a dict with keys "top_k", "# valid qa", "# invalid qa", "true" and
    "accuracy" (rounded to 3 decimals). "accuracy" is 0.0 when no valid qa was
    scored, instead of raising ZeroDivisionError.
    """
    results = []
    invalid = 0  # number of invalid qa
    for topic in data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if len(qa['answers']) == 0:
                    continue

                first_answer = qa['answers'][0]

                # If the answer is from multiple sentences then discard this qa
                if len(sent_tokenize(first_answer['text'])) > 1:
                    invalid += 1
                    continue

                # Slicing already clamps to the list length, so no explicit min() is needed.
                top_candidates = qa['candidate_indices'][:top_k]

                # Find the index of the sentence that contains the answer
                ans_sent_idx = find_sentence(context, first_answer['answer_start'])
                # Check if that sentence is retrieved in the top_k
                results.append(ans_sent_idx in top_candidates)

    hits = sum(results)
    # Guard against an empty evaluation set (no scorable qa at all).
    score = round(hits / len(results), 3) if results else 0.0
    return {"top_k": top_k, "# valid qa": len(results), "# invalid qa": invalid, "true": hits, "accuracy": score}

In [None]:
def load_pairs(filepath):
    """
    Read sentence pairs from a JSON file and wrap them as InputExample objects.

    * filepath: path to a JSON file holding a list of entries, where entry[0]
      and entry[1] are the two texts and entry[2] is the label

    The file is decoded explicitly as UTF-8 so that loading does not depend on
    the platform's default locale encoding.

    Returns a list with one InputExample(texts=[a, b], label=label) per entry,
    in file order.
    """
    with open(filepath, encoding="utf-8") as f:
        pairs = json.load(f)

    return [InputExample(texts=[pair[0], pair[1]], label=pair[2]) for pair in pairs]

In [None]:
from copy import deepcopy
from pyvi.ViTokenizer import tokenize

def find_top_k(top_k, model, question, corpus, corpus_embeddings):
    """
    Rank the corpus sentences against a question by cosine similarity.

    * top_k: maximum number of sentence indices to return
    * model: object whose encode() turns text into an embedding
    * question: raw question text; it is passed through pyvi's tokenize()
      before being embedded
    * corpus: the sentences behind corpus_embeddings (only its length is used)
    * corpus_embeddings: embeddings of the corpus sentences

    Returns the indices part of torch.topk over the similarity scores, i.e.
    the positions of the min(top_k, len(corpus)) highest-scoring sentences.
    """
    # Never request more hits than there are sentences.
    n_hits = min(top_k, len(corpus))

    question_vector = model.encode(tokenize(question))

    # pytorch_cos_sim returns a (queries x corpus) matrix; there is a single query row.
    similarity_row = util.pytorch_cos_sim(question_vector, corpus_embeddings)[0]

    _, best_indices = torch.topk(similarity_row, k=n_hits)
    return best_indices

def get_topk_sentences(data, model, k):
    """
    Annotate every question with the indices of its k most similar sentences.

    * data: dict read as data['data'][*]['paragraphs'][*], where each paragraph
      has a 'context' (an iterable of sentences) and a list of 'qas', each
      with a 'question'
    * model: object whose encode() produces embeddings
    * k: number of candidate sentences to keep per question

    The input is deep-copied and left untouched. In the copy, each qa gains a
    'candidate_indices' list holding the result of find_top_k. Returns the
    annotated copy.
    """
    annotated = deepcopy(data)
    all_paragraphs = (
        paragraph
        for topic in annotated['data']
        for paragraph in topic['paragraphs']
    )
    for paragraph in all_paragraphs:
        # Word-segment and embed the paragraph's sentences once, then reuse for every question.
        sentences = [tokenize(sentence) for sentence in paragraph['context']]
        sentence_embeddings = model.encode(sentences)

        for qa in paragraph['qas']:
            # Get top k most relevant sentences
            ranked = find_top_k(k, model, qa['question'], sentences, sentence_embeddings)
            qa['candidate_indices'] = ranked.tolist()
    return annotated

In [None]:
class CustomSentenceTransformer(SentenceTransformer):
    """
    SentenceTransformer with a callback that can abort training early.

    * model_name: forwarded unchanged to SentenceTransformer.__init__
    * early_stopping_criteria: (min_delta, patience) tuple
        - min_delta: largest step-to-step score increase that still counts as
          "not improving"
        - patience: number of consecutive non-improving steps between recorded
          scores that triggers the stop

    The scores recorded by EarlyStoppingCallback accumulate in self.scores and
    are not reset between fit() calls.
    """

    def __init__(self, model_name, early_stopping_criteria=(1e-3, 3)):
        super().__init__(model_name)
        min_delta, patience = early_stopping_criteria
        self.scores = []
        self.min_delta = min_delta
        self.patience = patience

    @staticmethod
    def _is_not_improving(scores, delta):
        """Return True when no consecutive pair in `scores` rises by more than `delta`."""
        return all((later - earlier) <= delta for earlier, later in zip(scores, scores[1:]))

    def EarlyStoppingCallback(self, score, epoch, steps):
        """
        Record a score and raise StopIteration once training has stalled.

        Intended to be passed as `callback=` to fit(). Only calls with
        steps == -1 are considered; any other call is ignored.
        NOTE(review): steps == -1 is presumed to be how sentence-transformers
        marks the end-of-epoch evaluation — confirm for the installed version.

        Raises StopIteration when the last `patience + 1` recorded scores show
        no step that improves by more than `min_delta`. The caller is expected
        to catch it around fit().
        """
        if steps != -1:
            return

        self.scores.append(score)
        # best_score is not defined by this class (presumably maintained by the
        # parent's fit() — confirm); fall back to None instead of raising
        # AttributeError when the callback is used before it exists.
        print("Epoch score: ", score, "Best score:", getattr(self, "best_score", None))

        # Look at the most recent window of patience + 1 scores (= patience steps).
        monitor = self.scores[-(self.patience + 1):]
        window_is_full = len(monitor) == (self.patience + 1)
        if window_is_full and self._is_not_improving(monitor, self.min_delta):
            print("Early stop.")
            raise StopIteration()


# Training

In [None]:
# Training configuration: base model, batch size, number of epochs, and output directory.
# Alternative base model (kept for reference):
# model_name = 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base'
model_name = 'paraphrase-xlm-r-multilingual-v1'
train_batch_size = 16
num_epochs = 20
# Earlier output locations (kept for reference; previously one of these was assigned
# and immediately overwritten, so it had no effect):
# model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# model_save_path = "output/attempt-3-top5-hard-negatives"
model_save_path = "output/attempt-4-xlmr-top5-hard-negatives"

In [None]:
# Instantiate the model to fine-tune; early_stopping_criteria is (min_delta, patience) = (0, 3).
custom_model = CustomSentenceTransformer(model_name, early_stopping_criteria=(0, 3))
# NOTE(review): this log message mentions STSbenchmark, but no dataset is read in this cell.
logging.info("Read STSbenchmark train dataset")

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

In [None]:
# Load the training and validation pairs from JSON as lists of InputExample objects.
train_samples = load_pairs("train_pairs_top5.json")
val_samples = load_pairs("val_pairs_top5.json")

In [None]:
# Wrap the training samples in a shuffled DataLoader and attach CosineSimilarityLoss to the model.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=custom_model)

# Development set: Measure correlation between cosine score and gold labels
# NOTE(review): the log text and the 'sts-dev' name mention STS, but the evaluator is built from val_samples.
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev', main_similarity=SimilarityFunction.COSINE)

In [None]:
# Configure the training: number of warm-up steps = 10% of the total number of
# training batches (batches per epoch * number of epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of all training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

In [None]:
# Fine-tune the model. The evaluator is passed to fit() with evaluation_steps=1000,
# and save_best_model=True with output_path=model_save_path.
# EarlyStoppingCallback raises StopIteration to abort fit() once the score stalls;
# it is caught here so the cell finishes cleanly instead of showing a traceback.
try:
    custom_model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=1000,
            warmup_steps=warmup_steps,
            callback=custom_model.EarlyStoppingCallback, 
            save_best_model=True, 
            output_path=model_save_path)
except StopIteration: 
    print("Stop training.")



Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

Epoch score:  0.5372791381071909 Best score: 0.538194663228977


Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

Epoch score:  0.5390524667979968 Best score: 0.5481564814424128


Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

Epoch score:  0.5409431197346619 Best score: 0.5481564814424128


Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

Epoch score:  0.5336367897054501 Best score: 0.5481564814424128


Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

Epoch score:  0.5171531297392022 Best score: 0.5481564814424128


Iteration:   0%|          | 0/4700 [00:00<?, ?it/s]

# Evaluate finetuned XLM-R

In [None]:
# Directory of the fine-tuned XLM-R model (same path the training cell used as output_path).
model_save_path = "output/attempt-4-xlmr-top5-hard-negatives"

In [None]:
# Reload the saved model from disk as a plain SentenceTransformer for evaluation.
custom_model = SentenceTransformer(model_save_path)

In [None]:
# Attach up to 10 ranked candidate sentence indices to every validation question.
val_results = get_topk_sentences(val_data, custom_model, 10)

In [None]:
import pandas as pd

print("Finetuned Validate")
# Acc@K on the validation split for K = 1..10, one result record per K.
# NOTE(review): the `simcse_` prefix is kept for compatibility although this section evaluates XLM-R.
simcse_val_results = [accuracy(val_results, top_k) for top_k in range(1, 11)]
# Build the table directly from the records; round-tripping through a JSON string
# is unnecessary, and passing literal JSON to pd.read_json is deprecated in recent pandas.
simcse_val_df = pd.DataFrame(simcse_val_results)
simcse_val_df

Finetuned Validate


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,1871,4,1459,0.78
1,2,1871,4,1677,0.896
2,3,1871,4,1775,0.949
3,4,1871,4,1816,0.971
4,5,1871,4,1846,0.987
5,6,1871,4,1858,0.993
6,7,1871,4,1861,0.995
7,8,1871,4,1865,0.997
8,9,1871,4,1867,0.998
9,10,1871,4,1868,0.998


In [None]:
# Attach up to 10 ranked candidate sentence indices to every test question.
test_results = get_topk_sentences(test_data, custom_model, 10)

In [None]:
import pandas as pd

print("Finetuned Test")
# Acc@K on the test split for K = 1..10, one result record per K.
# NOTE(review): the `simcse_` prefix is kept for compatibility although this section evaluates XLM-R.
simcse_test_results = [accuracy(test_results, top_k) for top_k in range(1, 11)]
# Build the table directly from the records; round-tripping through a JSON string
# is unnecessary, and passing literal JSON to pd.read_json is deprecated in recent pandas.
simcse_test_df = pd.DataFrame(simcse_test_results)
simcse_test_df

Finetuned Test


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,1701,3,1319,0.775
1,2,1701,3,1543,0.907
2,3,1701,3,1619,0.952
3,4,1701,3,1651,0.971
4,5,1701,3,1678,0.986
5,6,1701,3,1689,0.993
6,7,1701,3,1694,0.996
7,8,1701,3,1699,0.999
8,9,1701,3,1701,1.0
9,10,1701,3,1701,1.0


# Evaluate finetuned SimCSE

In [None]:
# Load the fine-tuned SimCSE model from its own directory rather than reusing
# `model_save_path`, which still holds the XLM-R directory at this point; reusing it
# would re-evaluate XLM-R under the SimCSE heading on a fresh Restart & Run All.
# NOTE(review): the SimCSE checkpoint is presumed to be the "attempt-3" output
# directory that appears in the training configuration — confirm before relying on it.
simcse_model_save_path = "output/attempt-3-top5-hard-negatives"
custom_model = SentenceTransformer(simcse_model_save_path)

In [None]:
# Attach up to 10 ranked candidate sentence indices to every validation question,
# using whichever model `custom_model` currently refers to.
val_results = get_topk_sentences(val_data, custom_model, 10)

In [None]:
import pandas as pd

print("Finetuned Validate")
# Acc@K on the validation split for K = 1..10, one result record per K.
simcse_val_results = [accuracy(val_results, top_k) for top_k in range(1, 11)]
# Build the table directly from the records; round-tripping through a JSON string
# is unnecessary, and passing literal JSON to pd.read_json is deprecated in recent pandas.
simcse_val_df = pd.DataFrame(simcse_val_results)
simcse_val_df

Finetuned Validate


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,1871,4,1533,0.819
1,2,1871,4,1725,0.922
2,3,1871,4,1790,0.957
3,4,1871,4,1825,0.975
4,5,1871,4,1849,0.988
5,6,1871,4,1858,0.993
6,7,1871,4,1865,0.997
7,8,1871,4,1867,0.998
8,9,1871,4,1869,0.999
9,10,1871,4,1871,1.0


In [None]:
# Attach up to 10 ranked candidate sentence indices to every test question,
# using whichever model `custom_model` currently refers to.
test_results = get_topk_sentences(test_data, custom_model, 10)

In [None]:
import pandas as pd

print("Finetuned Test")
# Acc@K on the test split for K = 1..10, one result record per K.
simcse_test_results = [accuracy(test_results, top_k) for top_k in range(1, 11)]
# Build the table directly from the records; round-tripping through a JSON string
# is unnecessary, and passing literal JSON to pd.read_json is deprecated in recent pandas.
simcse_test_df = pd.DataFrame(simcse_test_results)
simcse_test_df

Finetuned Test


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,1701,3,1396,0.821
1,2,1701,3,1581,0.929
2,3,1701,3,1639,0.964
3,4,1701,3,1673,0.984
4,5,1701,3,1690,0.994
5,6,1701,3,1695,0.996
6,7,1701,3,1697,0.998
7,8,1701,3,1699,0.999
8,9,1701,3,1700,0.999
9,10,1701,3,1701,1.0
