#Import 

In [1]:
%cd /content/drive/MyDrive/vdt_dsai/squad1.1/data

/content/drive/MyDrive/vdt_dsai/squad1.1/data


In [2]:
import json 
with open("train-v1.1.json") as f:
    train_file = json.load(f)
with open("dev-v1.1.json") as f:
    dev_file = json.load(f)

In [None]:
!pip install rank_bm25
!pip install underthesea

In [4]:
from rank_bm25 import BM25Okapi
from rank_bm25 import BM25Plus
from copy import deepcopy 
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from copy import deepcopy
import numpy as np 
import pandas as pd 
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import torch

In [5]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Helper function

In [6]:
def sentence_tokenizer(data):
    """
    Segment contexts into sentences.
    """
    new_data = deepcopy(data)
    for topic in new_data['data']:
        for paragraph in topic['paragraphs']:
            paragraph['context'] = sent_tokenize(paragraph['context'])
    return new_data 

In [80]:
def remove_inconsistent_qas(data):
    """
    Remove qas that contain answers that are not in the same sentence.
    """
    inconsistent = 0
    copied_data = deepcopy(data)
    for topic in copied_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            consistent_qas = []
            for qa in paragraph['qas']:
                if len(qa['answers']) > 0:
                    flag = []
                    for answer in qa['answers']:
                        start_answer = answer['answer_start']
                        ans_sent_idx = find_sentence(context, start_answer)
                        flag.append(ans_sent_idx)
                    
                    # Check if all the answers in contained in a single sentence
                    if (len(set(flag)) == 1):
                        consistent_qas.append(qa)
            paragraph['qas'] = consistent_qas
    return copied_data

In [8]:
def find_sentence(corpus, start_answer):
    '''
    Find sentence contains answer
    * corpus: list sentences of context
    * start_answer: the begin position of answer in context
    '''
    start, end = 0, -1
    sent_idx = -1
    for i in range(len(corpus)):
        sentence = corpus[i]
        start = end + 1
        end = start+len(sentence)
        if start<=start_answer<end:
            sent_idx = i
            break
        elif start_answer == end:
            sent_idx = i + 1
            break 
    return sent_idx

def accuracy(data, top_k):
    """
    Calculate accuracy 
    Given queries Q = {Q1, ..., Qm} 
    Document D = {S1, ..., Sn}
    The sentence containing the answer to the question Qi is Ai
    
    Acc@K = 1/|Q| * sum(is Ai in get_top_k(Q, D))
    """
    results = []
    invalid = 0 # number of invalid qa
    for topic in data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if len(qa['answers']) > 0:

                    # If the answer is from multiple sentences then discard this qa
                    if len(sent_tokenize(qa['answers'][0]['text'])) > 1:
                        invalid += 1
                        continue 

                    start_answer = qa['answers'][0]['answer_start']
                    k = min(top_k, len(qa['candidate_indices']))

                    # Find the index of sentence that contains the answer 
                    ans_sent_idx = find_sentence(context, start_answer)
                    # Check if the above sentence is retrived in top_k 
                    results.append(ans_sent_idx in qa['candidate_indices'][:k])
    return {"top_k":top_k, "# valid qa":len(results), "# invalid qa": invalid,  "true":sum(results), "accuracy":round(sum(results) / len(results), 3)}

In [131]:
def evaluate(data):
    """
    Evaluate top 1 -> top 10 
    """
    results = []
    for i in range(10):
        json_i = accuracy(data, i+1)
        results.append(json_i)
    results_df = pd.read_json(json.dumps(results))
    return results_df

# Methods functions


In [133]:
def get_topk_sentences_bm25(data, top_k):
    """
    Using BM25 to get top k most relevant sentences for each question
    """
    copied_data = deepcopy(data)
    for topic in copied_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            vectorizer = TfidfVectorizer()
            tokenizer = vectorizer.build_tokenizer()
            tokenized_context = [tokenizer(sentence.lower()) for sentence in context]
            
            # learn statistic of the context sentences.
            bm25 = BM25Plus(tokenized_context)
            for qa in paragraph['qas']:
                question = qa['question']
                query = tokenizer(question)

                # compute scores of each sentence in the context.
                scores = bm25.get_scores(query)
                k = min(top_k, len(context))
                top_results = torch.topk(torch.tensor(scores.reshape(-1)), k=k)
                qa['candidate_indices'] = top_results[1].tolist()
    return copied_data

In [134]:
import sklearn 
def get_topk_sentences_sklearn(data, top_k):
    """
    Using sklearn TFIDF to get top k most relevant sentences for each question
    """
    copied_data = deepcopy(data)
    for topic in copied_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            vectorizer = TfidfVectorizer()
            tokenizer = vectorizer.build_tokenizer()

            # learn statistic of the context sentences.
            X = vectorizer.fit_transform(context)
            for qa in paragraph['qas']:
                question = qa['question']
                y = vectorizer.transform([question])
                # scores = np.dot(X, y.T).toarray()

                # compute scores using cosine similarity of each sentence in the context.
                scores = sklearn.metrics.pairwise.cosine_similarity(y, X)
                k = min(top_k, len(X.toarray()))
                top_results = torch.topk(torch.tensor(scores.reshape(-1)), k=k)
                qa['candidate_indices'] = top_results[1].tolist()
    return copied_data

In [136]:
import math 
class TFIDF:


    def __init__(self, tf_scheme="raw_count", idf_scheme="idf", k=0.5):
        self.tf_scheme = tf_scheme
        self.idf_scheme = idf_scheme
        self.k = k

    def fit(self, corpus):
        """
        Fit the various statistics that are required to calculate BM25 ranking
        score using the corpus given.

        Parameters
        ----------
        corpus : list[list[str]]
            Each element in the list represents a document, and each document
            is a list of the terms.

        Returns
        -------
        self
        """
        tf = []
        df = {}
        idf = {}
        doc_len = []
        corpus_size = 0
        for document in corpus:
            corpus_size += 1
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                term_count = frequencies.get(term, 0) + 1
                frequencies[term] = term_count

            tf.append(frequencies)

            # compute df (document frequency) per term
            for term, _ in frequencies.items():
                df_count = df.get(term, 0) + 1
                df[term] = df_count

        for term, freq in df.items():
            idf[term] = math.log(corpus_size / (freq + 1))

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        self.avg_doc_len_ = sum(doc_len) / corpus_size
        return self
        
    def search(self, query):
        scores = [self._score(query, index) for index in range(self.corpus_size_)]
        return scores

    def _score(self, query, index):
        score = 0.0

        # Choosing weighting scheme for tf
        tf_ = self.tf_
        if (self.tf_scheme == "raw_count"):
            tf_ = self.tf_
        elif (self.tf_scheme == "binary"):
            tf_ = [{key:1 for (key, value) in tf.items()} for tf in self.tf_]
        elif (self.tf_scheme == "term_frequency"):
            tf_ = [{key:value/sum(tf.values()) for (key, value) in tf.items()} for tf in self.tf_]

        # Choosing weighting scheme for idf
        idf_ = self.idf_
        if (self.idf_scheme == "unary"):
            idf_ = {key:1 for (key, value) in self.idf_.items()}
        if (self.idf_scheme == "idf"):
            idf_ = self.idf_

        frequencies = tf_[index]
        for term in query:
            if term not in frequencies:
                continue

            tf = frequencies[term]
            idf = idf_[term]
            score += tf * idf 

        return score

def get_topk_sentences_custom(data, top_k, scorer):
    """
    Using custiom TFIDF scorer to get topk relevant sentences
    """
    copied_data = deepcopy(data)
    for topic in copied_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            context = [[word.lower() for word in word_tokenize(cxt)] for cxt in context]
            scorer.fit(context)
            for qa in paragraph['qas']:
                question = [word.lower() for word in word_tokenize(qa['question'])]
                scores = torch.Tensor(scorer.search(question))

                k = min(top_k, len(context))
                top_results = torch.topk(scores, k=k)
                qa['candidate_indices'] = top_results[1].tolist()
                qa['candiate_scores'] = top_results[0].tolist()
    return copied_data

# Process data

In [140]:
import sklearn 

In [257]:
dev_file_updated = remove_inconsistent_qas(sentence_tokenizer(dev_file))

In [264]:
train_data_updated, dev_data_updated = sklearn.model_selection.train_test_split(train_file['data'], train_size=0.9, random_state=7)
train_file_updated = {"version": "squad1.1_train_sentences", "data": train_data_updated}
dev_file_updated = {"version": "squad1.1_dev_sentences", "data": dev_data_updated}

# segment to sentences 
train_file_sentences = sentence_tokenizer(train_file_updated)
dev_file_sentences = sentence_tokenizer(dev_file_updated)

In [262]:
test_file_sentences = remove_inconsistent_qas(sentence_tokenizer(dev_file))
test_file_sentences['version'] = "squad1.1_test_sentences"

In [266]:
def count_qas(data):
    count_qa = 0
    for topic in data['data']:
        for paragraph in topic['paragraphs']:
            for qa in paragraph['qas']:
                count_qa += 1
    return count_qa

# Some statistics 
print("Number of articles per set")
print("Train:", len(train_file_sentences['data']))
print("Dev:", len(dev_file_sentences['data']))
print("Test:", len(test_file_sentences['data']))
print("Number of qas per set")
print("Train:", count_qas(train_file_sentences))
print("Dev:", count_qas(dev_file_sentences))
print("Test:", count_qas(test_file_sentences))

Number of articles per set
Train: 397
Dev: 45
Test: 48
Number of qas per set
Train: 78830
Dev: 8769
Test: 9759


In [270]:
# Store newly created sets to json files

json_object = json.dumps(train_file_sentences, ensure_ascii=False)
with open("squad1.1_train_sentences.json", "w") as write_file:
    write_file.write(json_object)

json_object = json.dumps(dev_file_sentences, ensure_ascii=False)
with open("squad1.1_dev_sentences.json", "w") as write_file:
    write_file.write(json_object)

json_object = json.dumps(test_file_sentences, ensure_ascii=False)
with open("squad1.1_test_sentences.json", "w") as write_file:
    write_file.write(json_object)

# Tuning

## BM25 Tuning

In [277]:
from rank_bm25 import BM25Okapi
from rank_bm25 import BM25Plus
from rank_bm25 import BM25L
def get_topk_sentences_bm25_(data, top_k, k1=1.5, b=0.75):
    """
    Using sklearn TFIDF to get top k most relevant sentences for each question
    """
    copied_data = deepcopy(data)
    for topic in copied_data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            vectorizer = TfidfVectorizer()
            tokenizer = vectorizer.build_tokenizer()
            tokenized_context = [tokenizer(sentence.lower()) for sentence in context]
            
            # learn statistic of the context sentences.
            bm25 = BM25Plus(tokenized_context, k1=k1, b=b)
            for qa in paragraph['qas']:
                question = qa['question']
                query = tokenizer(question)

                # compute scores of each sentence in the context.
                scores = bm25.get_scores(query)
                k = min(top_k, len(context))
                top_results = torch.topk(torch.tensor(scores.reshape(-1)), k=k)
                qa['candidate_indices'] = top_results[1].tolist()
    return copied_data

bm25_data_ = []
for k1 in np.arange(0, 2.25, 0.25):
    temp = []
    for b in np.arange(0, 1.1, 0.1):
        k1_ = k1 
        b_ = b 
        temp.append(get_topk_sentences_bm25_(dev_file_sentences, 1, k1_, b_))
    bm25_data_.append(temp)

results = [] 
for bm25_k1 in bm25_data_:
    temp = []
    for bm25_b in bm25_k1:
        json_i = accuracy(bm25_b, 1).get("accuracy")
        temp.append(json_i)
    results.append(temp)
results_df = pd.read_json(json.dumps(results))
# results_df.append(pd.Series(np.arange(0, 1.1, 0.1)), ignore_index=True)
results_df['k1'] = np.arange(0, 2.25, 0.25)
results_df

  (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,k1
0,0.236,0.236,0.236,0.236,0.236,0.236,0.236,0.236,0.236,0.236,0.236,0.0
1,0.746,0.747,0.746,0.746,0.747,0.747,0.747,0.745,0.745,0.745,0.742,0.25
2,0.746,0.746,0.746,0.746,0.747,0.747,0.746,0.744,0.744,0.745,0.742,0.5
3,0.746,0.746,0.746,0.746,0.746,0.747,0.745,0.745,0.744,0.744,0.741,0.75
4,0.745,0.745,0.746,0.746,0.745,0.746,0.745,0.744,0.744,0.743,0.739,1.0
5,0.744,0.744,0.746,0.746,0.746,0.746,0.745,0.744,0.742,0.742,0.737,1.25
6,0.744,0.744,0.745,0.745,0.746,0.746,0.746,0.743,0.742,0.741,0.735,1.5
7,0.742,0.743,0.745,0.744,0.745,0.746,0.744,0.742,0.741,0.739,0.734,1.75
8,0.741,0.742,0.744,0.744,0.745,0.745,0.744,0.741,0.74,0.737,0.734,2.0


No such parameters outperform the others, so I'll just use the default

## TFIDF Tuning

In [281]:
tf_schemes = ["raw_count", "binary", "term_frequency"]
idf_schmes = ["unary", "idf"]

results = []
for tf_scheme in tf_schemes:
    temp = []
    for idf_scheme in idf_schmes:
        tfidf = TFIDF(tf_scheme, idf_scheme)
        tfidf_val = get_topk_sentences_custom(dev_file_sentences, 1, tfidf)
        temp.append(accuracy(tfidf_val, 1).get("accuracy"))
    results.append(temp)

tfidf_val_df = pd.DataFrame(results)
tfidf_val_df

Unnamed: 0,0,1
0,0.66,0.722
1,0.77,0.763
2,0.645,0.719


Best hyperparameters is [unary, binary]

# Evaluate

In [272]:
bm25_test = get_topk_sentences_bm25(test_file_sentences, 10)

In [273]:
bm25_test_results_df = evaluate(bm25_test)
bm25_test_results_df

Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7404,0.759
1,2,9749,10,8650,0.887
2,3,9749,10,9179,0.942
3,4,9749,10,9468,0.971
4,5,9749,10,9593,0.984
5,6,9749,10,9681,0.993
6,7,9749,10,9711,0.996
7,8,9749,10,9728,0.998
8,9,9749,10,9738,0.999
9,10,9749,10,9745,1.0


In [286]:
tfidf = TFIDF("unary", "binary")
tfidf_test = get_topk_sentences_custom(test_file_sentences, 10, tfidf)

In [287]:
tfidf_test_results_df = evaluate(tfidf_test)
tfidf_test_results_df

Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7251,0.744
1,2,9749,10,8815,0.904
2,3,9749,10,9348,0.959
3,4,9749,10,9580,0.983
4,5,9749,10,9671,0.992
5,6,9749,10,9719,0.997
6,7,9749,10,9736,0.999
7,8,9749,10,9743,0.999
8,9,9749,10,9745,1.0
9,10,9749,10,9747,1.0


# Testing

In [None]:
!pip install -U sentence-transformers
!pip install pyvi

In [116]:
from copy import deepcopy
from sentence_transformers import util
from sentence_transformers import SentenceTransformer

def find_top_k(top_k, model, question, corpus, corpus_embeddings):
  k = min(top_k, len(corpus))
  query_embedding = model.encode(question)

  cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
  # print("error")
  # cos_scores = cos_scores.cpu()

  top_results = torch.topk(cos_scores, k=k)

  return top_results[1]

def get_topk_sentences_pretrained(train_data, model, k):
  copied_data = deepcopy(train_data)
  for topic in copied_data['data']:
    for paragraph in topic['paragraphs']:
      # paragraph['context'] = sent_tokenize(paragraph['context'])
      context = paragraph['context']
      context_embeddings = model.encode(context)
      for qa in paragraph['qas']:
        question = qa['question']

        ## Get top k most relevant sentences
        top_k_candidates = find_top_k(k, model, question, context, context_embeddings)
        qa['candidate_indices'] = top_k_candidates.tolist()
  return copied_data

In [117]:
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [118]:
dev_pretrained = get_topk_sentences_pretrained(dev_file_updated, model, 10)

In [119]:
print("SimCSE Test")
simcse_test_results = []
for i in range(10):
    json_i = accuracy(dev_pretrained, i+1)
    simcse_test_results.append(json_i)
import pandas as pd 
simcse_test_df = pd.read_json(json.dumps(simcse_test_results))
simcse_test_df

SimCSE Test


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7657,0.785
1,2,9749,10,8905,0.913
2,3,9749,10,9365,0.961
3,4,9749,10,9580,0.983
4,5,9749,10,9685,0.993
5,6,9749,10,9717,0.997
6,7,9749,10,9731,0.998
7,8,9749,10,9743,0.999
8,9,9749,10,9745,1.0
9,10,9749,10,9747,1.0


In [125]:
model_name_2 = 'paraphrase-xlm-r-multilingual-v1'
model_2 = SentenceTransformer(model_name_2)

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

In [126]:
dev_pretrained = get_topk_sentences_pretrained(dev_file_updated, model_2, 10)

In [132]:
print("SimCSE Test")
dev_pretrained_results_df = evaluate(dev_pretrained)
dev_pretrained_results_df

SimCSE Test


Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7660,0.786
1,2,9749,10,8854,0.908
2,3,9749,10,9362,0.96
3,4,9749,10,9571,0.982
4,5,9749,10,9675,0.992
5,6,9749,10,9712,0.996
6,7,9749,10,9735,0.999
7,8,9749,10,9743,0.999
8,9,9749,10,9747,1.0
9,10,9749,10,9748,1.0
