# Import

In [1]:
# Switch the working directory to the data folder so the relative file names used when loading resolve.
# NOTE(review): hard-coded absolute path — presumably the Colab Google Drive mount, which must exist before this runs; confirm
%cd /content/drive/MyDrive/vdt_dsai/squad1.1/data

/content/drive/MyDrive/vdt_dsai/squad1.1/data


In [2]:
import json 
# Load the train / dev / test splits from the current working directory.
# The encoding is passed explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's default encoding.
with open("squad1.1_train_sentences.json", encoding="utf-8") as f:
    train_file_sentences = json.load(f)
with open("squad1.1_dev_sentences.json", encoding="utf-8") as f:
    dev_file_sentences = json.load(f)
with open("squad1.1_test_sentences.json", encoding="utf-8") as f:
    test_file_sentences = json.load(f)

In [6]:
import numpy as np
def count_stuff(data):
    """
    Count the articles, passages and question-answer pairs of a dataset
    * data: dict whose 'data' entry iterates over articles; every article has
      'paragraphs' and every paragraph has 'qas'
    Returns a numpy array [n_articles, n_passages, n_qas]
    """
    totals = [0, 0, 0]  # articles, passages, question-answer pairs
    for article in data['data']:
        totals[0] += 1
        for paragraph in article['paragraphs']:
            totals[1] += 1
            totals[2] += sum(1 for _ in paragraph['qas'])
    return np.array(totals)

In [7]:
# Element-wise totals [articles, passages, qas] across the train, dev and test splits
sum(
    count_stuff(split)
    for split in (train_file_sentences, dev_file_sentences, test_file_sentences)
)

array([  490, 20963, 97358])

In [9]:
# Inspect the first article of the training split to see the record layout
train_file_sentences['data'][0]

{'paragraphs': [{'context': ['The British Empire comprised the dominions, colonies, protectorates, mandates and other territories ruled or administered by the United Kingdom.',
    'It originated with the overseas possessions and trading posts established by England between the late 16th and early 18th centuries.',
    'At its height, it was the largest empire in history and, for over a century, was the foremost global power.',
    "By 1922 the British Empire held sway over about 458 million people, one-fifth of the world's population at the time, and covered more than 13,000,000 sq mi (33,670,000 km2), almost a quarter of the Earth's total land area.",
    'As a result, its political, legal, linguistic and cultural legacy is widespread.',
    'At the peak of its power, the phrase "the empire on which the sun never sets" was often used to describe the British Empire, because its expanse around the globe meant that the sun was always shining on at least one of its territories.'],
   'qa

In [None]:
# Install third-party packages into the runtime.
# NOTE(review): neither rank_bm25 nor underthesea is imported in the cells visible here — confirm they are still needed; versions are unpinned
!pip install rank_bm25
!pip install underthesea

In [None]:
from copy import deepcopy 
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from copy import deepcopy
import numpy as np 
import pandas as pd 
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import torch

In [None]:
import nltk 
# Download the 'punkt' package into the local NLTK data directory.
# NOTE(review): presumably required by sent_tokenize, which accuracy() calls — confirm
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Install (or upgrade) sentence-transformers, plus pyvi.
# NOTE(review): pyvi is not imported in the cells visible here — confirm it is still needed; versions are unpinned
!pip install -U sentence-transformers
!pip install pyvi

In [None]:
from copy import deepcopy
from sentence_transformers import util
from sentence_transformers import SentenceTransformer

# Helper functions

In [None]:
def find_sentence(corpus, start_answer):
    '''
    Locate the sentence in which the answer starts
    * corpus: list sentences of context
    * start_answer: the begin position of answer in context

    Offsets are accumulated as if consecutive sentences were separated by
    exactly one character. A position that falls on that separator is
    attributed to the following sentence, so for the last sentence the result
    can equal len(corpus). Returns -1 when no sentence matches.
    '''
    offset = 0
    for idx, sent in enumerate(corpus):
        stop = offset + len(sent)
        if offset <= start_answer < stop:
            return idx
        if start_answer == stop:
            # The answer starts on the separator right after this sentence
            return idx + 1
        offset = stop + 1
    return -1

def accuracy(data, top_k):
    """
    Calculate accuracy@K of the retrieved candidate sentences
    Given queries Q = {Q1, ..., Qm}
    Document D = {S1, ..., Sn}
    The sentence containing the answer to the question Qi is Ai

    Acc@K = 1/|Q| * sum(is Ai in get_top_k(Q, D))

    * data: dataset dict; each paragraph's 'context' is a list of sentences and
      each qa carries 'answers' and a ranked 'candidate_indices' list
    * top_k: number of leading candidates to check

    Only the first answer of a qa is used. A qa without answers is skipped
    without being counted; a qa whose answer text tokenizes into more than one
    sentence is counted as invalid.

    Returns a dict with the keys "top_k", "# valid qa", "# invalid qa", "true"
    and "accuracy" (rounded to 3 decimals, 0.0 when there is no valid qa).
    """
    hits = []
    invalid = 0  # number of invalid qa
    for topic in data['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                if len(qa['answers']) == 0:
                    continue

                first_answer = qa['answers'][0]

                # If the answer is from multiple sentences then discard this qa
                if len(sent_tokenize(first_answer['text'])) > 1:
                    invalid += 1
                    continue

                # Slicing already clamps to the list length, so no explicit min() is needed
                top_candidates = qa['candidate_indices'][:top_k]

                # Find the index of the sentence that contains the answer
                ans_sent_idx = find_sentence(context, first_answer['answer_start'])
                # Check if that sentence was retrieved within the top_k candidates
                hits.append(ans_sent_idx in top_candidates)

    n_valid = len(hits)
    n_true = sum(hits)
    # Guard the division: without any valid qa it would raise ZeroDivisionError
    score = round(n_true / n_valid, 3) if n_valid else 0.0
    return {"top_k": top_k, "# valid qa": n_valid, "# invalid qa": invalid, "true": n_true, "accuracy": score}

In [None]:
def evaluate(data, max_k=10):
    """
    Evaluate top 1 -> top max_k (top 10 by default)

    * data: dataset whose qas already carry 'candidate_indices', as consumed by accuracy()
    * max_k: largest K to evaluate

    Returns a DataFrame with one row per K, built from the dicts returned by accuracy()
    """
    results = [accuracy(data, k) for k in range(1, max_k + 1)]
    # Build the frame straight from the records: this avoids a JSON round-trip,
    # read_json's dtype re-inference, and passing a literal JSON string to
    # read_json (deprecated in recent pandas releases).
    return pd.DataFrame(results)

# Method functions

In [None]:
def find_top_k(top_k, model, question, corpus, corpus_embeddings):
  """
  Rank the corpus sentences against a question and return the best indices
  * top_k: maximum number of indices to return (capped at len(corpus))
  * model: encoder exposing encode(), used to embed the question
  * question: the query to embed
  * corpus: list sentences of context (only its length is used here)
  * corpus_embeddings: embeddings of corpus, compared with the question embedding
  Returns the indices part of torch.topk over the cosine-similarity scores
  """
  n_results = min(top_k, len(corpus))
  similarities = util.pytorch_cos_sim(model.encode(question), corpus_embeddings)[0]
  _, best_indices = torch.topk(similarities, k=n_results)
  return best_indices

def get_topk_sentences_pretrained(train_data, model, k):
  """
  Attach the top-k retrieved sentence indices to every question
  * train_data: dataset dict whose paragraph 'context' is a list of sentences
  * model: encoder exposing encode()
  * k: number of candidates requested per question
  Returns a deep copy of train_data in which each qa has a 'candidate_indices'
  list; the input itself is left untouched
  """
  result = deepcopy(train_data)
  paragraphs = (p for topic in result['data'] for p in topic['paragraphs'])
  for paragraph in paragraphs:
    sentences = paragraph['context']
    # Sentence embeddings are computed once per paragraph and shared by all of its questions
    sentence_embeddings = model.encode(sentences)
    for qa in paragraph['qas']:
      ranked = find_top_k(k, model, qa['question'], sentences, sentence_embeddings)
      qa['candidate_indices'] = ranked.tolist()
  return result

# Evaluate

In [None]:
# Best model for semantic search according to sbert.net
model_name_1 = "multi-qa-mpnet-base-dot-v1"
model_1 = SentenceTransformer(model_name_1)

# Second best model for semantic search; trained on all available training data, for general-purpose use
model_name_2 = "all-mpnet-base-v2"
model_2 = SentenceTransformer(model_name_2)

# The model used in Vireader
model_name_3 = "paraphrase-xlm-r-multilingual-v1"
model_3 = SentenceTransformer(model_name_3)

In [None]:
# Retrieve the 10 best candidate sentences for every test question with model_1
test_pretrained_1 = get_topk_sentences_pretrained(train_data=test_file_sentences, model=model_1, k=10)

In [None]:
# Retrieve the 10 best candidate sentences for every test question with model_2
test_pretrained_2 = get_topk_sentences_pretrained(train_data=test_file_sentences, model=model_2, k=10)

In [None]:
# Retrieve the 10 best candidate sentences for every test question with model_3
test_pretrained_3 = get_topk_sentences_pretrained(train_data=test_file_sentences, model=model_3, k=10)

In [None]:
# Accuracy@K table for the candidates retrieved with model_1 on the test split
pretrained_results_1 = evaluate(test_pretrained_1)
pretrained_results_1

Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7878,0.808
1,2,9749,10,9111,0.935
2,3,9749,10,9476,0.972
3,4,9749,10,9649,0.99
4,5,9749,10,9711,0.996
5,6,9749,10,9734,0.998
6,7,9749,10,9741,0.999
7,8,9749,10,9746,1.0
8,9,9749,10,9748,1.0
9,10,9749,10,9748,1.0


In [None]:
# Accuracy@K table for the candidates retrieved with model_2 on the test split
pretrained_results_2 = evaluate(test_pretrained_2)
pretrained_results_2

Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7842,0.804
1,2,9749,10,9031,0.926
2,3,9749,10,9422,0.966
3,4,9749,10,9612,0.986
4,5,9749,10,9695,0.994
5,6,9749,10,9728,0.998
6,7,9749,10,9738,0.999
7,8,9749,10,9744,0.999
8,9,9749,10,9747,1.0
9,10,9749,10,9748,1.0


In [None]:
# Accuracy@K table for the candidates retrieved with model_3 on the test split
pretrained_results_3 = evaluate(test_pretrained_3)
pretrained_results_3

Unnamed: 0,top_k,# valid qa,# invalid qa,true,accuracy
0,1,9749,10,7660,0.786
1,2,9749,10,8854,0.908
2,3,9749,10,9362,0.96
3,4,9749,10,9571,0.982
4,5,9749,10,9675,0.992
5,6,9749,10,9712,0.996
6,7,9749,10,9735,0.999
7,8,9749,10,9743,0.999
8,9,9749,10,9747,1.0
9,10,9749,10,9748,1.0
