In [1]:
# Load model directly
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [2]:
from datasets import load_dataset

sciq_dataset = load_dataset("allenai/sciq")
sciq_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

### Sequential Model Distractors Analysis 

Load model from https://huggingface.co/rizkiduwinanto/final-bart-question-generation.

In [3]:
model_qa = BartForConditionalGeneration.from_pretrained('rizkiduwinanto/final-bart-question-generation').to(device)

In [4]:
text = "In the first law, an object will not change its motion unless a force acts on it. In the second law, the force on an object is equal to its mass times its acceleration. In the third law, when two objects interact, they apply forces to each other of equal magnitude and opposite direction."

Code for inference of question generation model.

In [5]:
def qa_inference(context):
    text = "Support: {}".format(context)
    max_length = 600
    tokenized_inputs = tokenizer.encode_plus(text, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length').to(device) 

    output = model_qa.generate(input_ids=tokenized_inputs["input_ids"], max_length=1024)
    
    answer = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return answer

question_answer = qa_inference(text)
question_answer

['Question: In the first law, an object will not change its motion unless a force acts on it. in the second law, the force on an object is equal to its mass times what? Answer: its acceleration']

Load model from https://huggingface.co/rizkiduwinanto/final-bart-distractor-generation.

In [6]:
model_da = BartForConditionalGeneration.from_pretrained('rizkiduwinanto/final-bart-distractor-generation').to(device)

Code for inference of distractor generation model.

In [7]:
def da_inference(context, result):
    text = "Support: {} {}".format(context, result[0])
    max_length = 600
    tokenized_inputs = tokenizer.encode_plus(text, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length').to(device) 

    output = model_da.generate(input_ids=tokenized_inputs["input_ids"], max_length=1024)
    
    answer = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return answer

distractors = da_inference(text, question_answer)
distractors

['Distractor1: its velocity Distractor2: its weight Distractor3: its density']

The pipeline for sequential generation, we do question answer inference first and then distractor generation. We extract the question answer and distractors from the labels e.g "Distractors" using Regex.

In [8]:
import re
def pipeline(context):
    question_answer = qa_inference(context)
    distractors = da_inference(context, question_answer)
    
    question_pattern = r"Question: (.+?\?)"
    answer_pattern = r"Answer: (.+)"
    dis1_pattern = r"Distractor1: (.+)"
    dis2_pattern = r"Distractor2: (.+)"
    dis3_pattern = r"Distractor3: (.+)"

    question = ""
    answer = ""
    
    question_match = re.search(question_pattern, question_answer[0])
    answer_match = re.search(answer_pattern, question_answer[0])

    if question_match:
        question = question_match.group(1)
    if answer_match:
        answer = answer_match.group(1)

    distractor_pattern = r"Distractor\d+: (.+?)(?= Distractor|$)"
    distractors = re.findall(distractor_pattern, distractors[0])

    return question, answer, distractors

pipeline(text)

('In the first law, an object will not change its motion unless a force acts on it. in the second law, the force on an object is equal to its mass times what?',
 'its acceleration',
 ['its velocity', 'its weight', 'its density'])

In [9]:
filtered_sciq = sciq_dataset.filter(lambda example: example["support"] != '')
filtered_dataset = sciq_dataset.filter(lambda example: example['support'] is not None and example['support'] != "")
# And remove any datapoints which contain questions that have a 'fill-in-the-blank' type answer
filtered_sciq = filtered_dataset.filter(lambda example: '_______' not in example['question'] and '______' not in example['question'] and '_____' not in example['question'] and '____' not in example['question'] and '___' not in example['question'])
filtered_sciq 

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 10423
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 881
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 880
    })
})

In [10]:
test_data = filtered_sciq['validation']
test_data

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 881
})

Load the Word2Vec Model for distractor analysis. We use https://huggingface.co/fse/glove-wiki-gigaword-50 that encompasses the dictionary needed in science content.

In [11]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

glove_vectors = api.load("glove-wiki-gigaword-50")

Testing the vectors from gensim library https://radimrehurek.com/gensim/models/word2vec.html.

In [12]:
glove_vectors.most_similar('velocity')

[('velocities', 0.8433403372764587),
 ('angle', 0.8210359215736389),
 ('angular', 0.8034953474998474),
 ('gravity', 0.7958716154098511),
 ('amplitude', 0.7937971353530884),
 ('wavelength', 0.7915680408477783),
 ('gradient', 0.7870423793792725),
 ('probability', 0.7667955756187439),
 ('frequency', 0.7655791640281677),
 ('measurement', 0.7501229047775269)]

In [13]:
glove_vectors.similarity('angry', 'james')

0.22834066

In [14]:
glove_vectors.similarity('velocity', 'acceleration')

0.67458355

In [15]:
glove_vectors.similarity('weight', 'acceleration')

0.5167572

In [16]:
glove_vectors.similarity('density', 'acceleration')

0.5337082

We can see that very different words yields low score and nearly the same yields higher.

In [17]:
pipeline(test_data[469]['support'])

('The smallest particle of an element that still has the properties of that element is called?',
 'the atom',
 ['the electron', 'the neutron', 'the nucleus'])

Use map function to traverse the whole dataset.

In [18]:
def infer_all(examples):
    question, answer, distractors = pipeline(examples['support'])
    
    return {
        "pred_answer": answer if answer != None else "",
        "pred_question": question,
        "pred_distractors1": distractors[0] if len(distractors) > 0 else "",
        "pred_distractors2": distractors[1] if len(distractors) > 1 else "",
        "pred_distractors3": distractors[2] if len(distractors) > 2 else "",
    }

# test_data_sequential = test_data.map(infer_all)

To save inference time, data is already saved in the 'data.zip'

In [19]:
from datasets import load_from_disk
test_data_sequential = load_from_disk("data/res-sequential")

In [20]:
test_data_sequential[170]

{'question': 'The transfer of energy by electromagnetic waves is called what?',
 'distractor3': 'magnetic radiation',
 'distractor1': 'particulate radiation',
 'distractor2': 'mechanical radiation',
 'correct_answer': 'electromagnetic radiation',
 'support': 'Electromagnetic waves are waves that consist of vibrating electric and magnetic fields. Like other waves, electromagnetic waves transfer energy from one place to another. The transfer of energy by electromagnetic waves is called electromagnetic radiation . Electromagnetic waves can transfer energy through matter or across empty space. For an excellent video introduction to electromagnetic waves, go to this URL: http://www. youtube. com/watch?v=cfXzwh3KadE.',
 'pred_answer': 'magnetic fields',
 'pred_question': 'Electromagnetic waves are waves that consist of vibrating electric and what else?',
 'pred_distractors1': 'gravitational fields',
 'pred_distractors2': 'electromagnetic currents',
 'pred_distractors3': 'magnetic currents'}

Use phrase vector function to average the word vector in two vectors, assume empty and not found as 0.

In [21]:
import numpy as np
def phrase_vector(phrase):
    ## empty assume zero
    if phrase == "":
        wv = np.zeros(glove_vectors.vector_size)
    else:
        words = phrase.split()
        word_vectors = [glove_vectors[word] for word in words if word in glove_vectors]
        if len(word_vectors) == 0:
            wv = np.zeros(glove_vectors.vector_size)
        else:
            wv = np.mean(word_vectors, axis=0)
    return wv 
vec = phrase_vector("iron-def")
vec

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Load the cosine similarity library.

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([phrase_vector("happy")], [phrase_vector("sad")])[0][0]

0.68906325

Function for ADS or answer distractor similarity that calculates the distance between each distractor to answer.

In [23]:
def calc_vector_answer_dis(examples):
    answer_vec = phrase_vector(examples['correct_answer'])
    dis1_vec = phrase_vector(examples['distractor1'])
    dis2_vec = phrase_vector(examples['distractor2'])
    dis3_vec = phrase_vector(examples['distractor3'])

    #distance between answers and distractors
    sim_answer_dis1 = cosine_similarity([answer_vec], [dis1_vec])
    sim_answer_dis2 = cosine_similarity([answer_vec], [dis2_vec])
    sim_answer_dis3 = cosine_similarity([answer_vec], [dis3_vec])

    sim_answer_dis = np.average([sim_answer_dis1, sim_answer_dis2, sim_answer_dis3])
    return sim_answer_dis

Function for IDS or answer distractor similarity that calculates the distance between each distractors.

In [24]:
def calc_vector_inter_dis(examples):
    dis1_vec = phrase_vector(examples['distractor1'])
    dis2_vec = phrase_vector(examples['distractor2'])
    dis3_vec = phrase_vector(examples['distractor3'])

    #distance between each distractors
    sim_dis1_dis2 = cosine_similarity([dis1_vec], [dis2_vec])
    sim_dis2_dis3 = cosine_similarity([dis2_vec], [dis3_vec])
    sim_dis3_dis1 = cosine_similarity([dis3_vec], [dis1_vec])

    sim_dis = np.average([sim_dis1_dis2, sim_dis2_dis3, sim_dis3_dis1])
    return sim_dis

ADS for prediction...

In [25]:
def calc_vector_answer_dis_pred(examples):
    answer_vec = phrase_vector(examples['pred_answer'])
    dis1_vec = phrase_vector(examples['pred_distractors1'])
    dis2_vec = phrase_vector(examples['pred_distractors2'])
    dis3_vec = phrase_vector(examples['pred_distractors3'])

     #distance between answers and distractors
    sim_answer_dis1 = cosine_similarity([answer_vec], [dis1_vec])
    sim_answer_dis2 = cosine_similarity([answer_vec], [dis2_vec])
    sim_answer_dis3 = cosine_similarity([answer_vec], [dis3_vec])

    sim_answer_dis = np.average([sim_answer_dis1, sim_answer_dis2, sim_answer_dis3])

    return sim_answer_dis

IDS for prediction...

In [26]:
def calc_vector_inter_dis_pred(examples):
    dis1_vec = phrase_vector(examples['pred_distractors1'])
    dis2_vec = phrase_vector(examples['pred_distractors2'])
    dis3_vec = phrase_vector(examples['pred_distractors3'])

    #distance between each distractors
    sim_dis1_dis2 = cosine_similarity([dis1_vec], [dis2_vec])
    sim_dis2_dis3 = cosine_similarity([dis2_vec], [dis3_vec])
    sim_dis3_dis1 = cosine_similarity([dis3_vec], [dis1_vec])

    sim_dis = np.average([sim_dis1_dis2, sim_dis2_dis3, sim_dis3_dis1])

    return sim_dis

In [27]:
def calc_distractor_analysis(test_data):
    answer_dis = []
    inter_dis = []
    answer_pred_dis = []
    inter_pred_dis = []
    for data in test_data:
        answer_dis.append(calc_vector_answer_dis(data))
        inter_dis.append(calc_vector_inter_dis(data))
        answer_pred_dis.append(calc_vector_answer_dis_pred(data))
        inter_pred_dis.append(calc_vector_inter_dis_pred(data))

    return answer_dis, inter_dis, answer_pred_dis, inter_pred_dis

answer_dis_seq, inter_dis_seq, answer_pred_dis_seq, inter_pred_dis_seq = calc_distractor_analysis(test_data_sequential)

Ground Truth Answer and Distractors Similarity

In [28]:
np.average(answer_dis_seq)

0.5800197711281374

Ground Truth Inter Distractors Similarity

In [29]:
np.average(inter_dis_seq)

0.5230395514299552

Predicted Answer and Distractors Similarity

In [30]:
np.average(answer_pred_dis_seq)

0.6643107100007571

Predicted Inter Distractors Similarity

In [31]:
np.average(inter_pred_dis_seq)

0.6502474400676812

The model produces more synonymous words than expected

Sequential Model Question Analysis with BLEU

In [32]:
from nltk.translate.bleu_score import sentence_bleu

def calc_bleu(test_data):
    bleus1 = []
    bleus2 = []
    bleus3 = []
    bleus4 = []
    for data in test_data:
        predicted = "Question: {} Answer {}".format(data['pred_question'], data['pred_answer']).split()
        references = "Question: {} Answer {}".format(data['question'], data['correct_answer']).split()
        final_score1 = sentence_bleu(references, predicted, weights=(1, 0, 0, 0))
        final_score2 = sentence_bleu(references, predicted, weights=(0, 1, 0, 0))
        final_score3 = sentence_bleu(references, predicted, weights=(0, 0, 1, 0))
        final_score4 = sentence_bleu(references, predicted, weights=(0, 0, 0, 1))
        bleus1.append(final_score1)
        bleus2.append(final_score2)
        bleus3.append(final_score3)
        bleus4.append(final_score4)
    
    # Calculate the average BLEU score
    average_bleu_score1 = np.mean(bleus1)
    average_bleu_score2 = np.mean(bleus2)
    average_bleu_score3 = np.mean(bleus3)
    average_bleu_score4 = np.mean(bleus4)
    
    return round(average_bleu_score1, 4), round(average_bleu_score2, 4), round(average_bleu_score3, 4), round(average_bleu_score4, 4)

calc_bleu(test_data_sequential)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


(0.0203, 0.0, 0.0, 0.0)

### Single Fine Tuned Model Distractors Analysis 

#### Base Model

In [33]:
model_single_base = BartForConditionalGeneration.from_pretrained("b-b-brouwer/CL_base").to(device)

In [34]:
def inference_sft(context, model):
    text = "Support: {}".format(context)
    max_length = 600
    tokenized_inputs = tokenizer.encode_plus(text, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length').to(device) 

    output = model.generate(input_ids=tokenized_inputs["input_ids"], max_length=1024)
    
    res = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return res
    
inference_sft(text, model_single_base)

['Question:Question: Which law states that an object will not change its motion unless a force acts on it? Answer: first law Distractor1:  third law and second law of inertia Distractor2: fourth law and third law of relativity Distractor3: law and law']

In [35]:
def infer_all_sft(examples, model):
    result = inference_sft(examples['support'], model)

    question_pattern = r"Question: (.+?\?)"
    answer_pattern = r"Answer: (.+?)(?= Distractor\d+: |$)"
    dis1_pattern = r"Distractor1: (.+)"
    dis2_pattern = r"Distractor2: (.+)"
    dis3_pattern = r"Distractor3: (.+)"

    question = ""
    answer = ""
    
    question_match = re.search(question_pattern, result[0])
    answer_match = re.search(answer_pattern, result[0])

    if question_match:
        question = question_match.group(1)
    if answer_match:
        answer = answer_match.group(1)

    distractor_pattern = r"Distractor\d+: (.+?)(?= Distractor|$)"
    distractors = re.findall(distractor_pattern, result[0])
    
    return {
        "pred_answer": answer if answer != None else "",
        "pred_question": question,
        "pred_distractors1": distractors[0] if len(distractors) > 0 else "",
        "pred_distractors2": distractors[1] if len(distractors) > 1 else "",
        "pred_distractors3": distractors[2] if len(distractors) > 2 else "",
    }

# test_data_single_base = test_data.map(infer_all_sft, fn_kwargs={"model": model_single_base})

To save inference time, can use the data from data.zip file.

In [36]:
test_data_single_base = load_from_disk("data/res-base-single")

Do a distractor analysis on single base model

In [37]:
answer_dis_sb, inter_dis_sb, answer_pred_dis_sb, inter_pred_dis_sb = calc_distractor_analysis(test_data_single_base)

In [38]:
np.average(answer_dis_sb)

0.5800197711281374

In [39]:
np.average(inter_dis_sb)

0.5230395514299552

In [40]:
np.average(answer_pred_dis_sb)

0.5562485193243836

In [41]:
np.average(inter_pred_dis_sb)

0.5263792485850741

The model produces a comparable results to the ground truth

Do a BLEU analysis

In [42]:
calc_bleu(test_data_single_base)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


(0.0187, 0.0, 0.0, 0.0)

#### Large Model

In [43]:
model_single_large = BartForConditionalGeneration.from_pretrained("b-b-brouwer/CL_large").to(device)

In [44]:
inference_sft(text, model_single_large)

['Question: In which law is the force on an object equal to its mass times its acceleration? Answer:  second law Distractor1:  third law of inertia Distractor2:  fourth law of gravity Distractor3:']

In [45]:
# test_data_single_large = test_data.map(infer_all_sft, fn_kwargs={"model": model_single_large})

In [46]:
test_data_single_large = load_from_disk("data/res-large-single")

In [53]:
answer_dis_sl, inter_dis_sl, answer_pred_dis_sl, inter_pred_dis_sl = calc_distractor_analysis(test_data_single_large)

Do a distractor analysis on single large model

In [54]:
np.average(answer_dis_sl)

0.5800197711281374

In [55]:
np.average(inter_dis_sl)

0.5230395514299552

In [56]:
np.average(answer_pred_dis_sl)

0.5562485193243836

In [57]:
np.average(inter_pred_dis_sl)

0.5263792485850741

Do a BLEU analysis

In [58]:
calc_bleu(test_data_single_large)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


(0.0187, 0.0, 0.0, 0.0)