In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/Text as Data Coursework/coursework_dataset

/content/drive/MyDrive/Colab Notebooks/Text as Data Coursework/coursework_dataset


### Q1 - Dataset and Pre-Processing

In [None]:
import json

with open('train.json') as f:
    train_data = json.load(f)

with open('test.json') as f:
    test_data = json.load(f)

with open('val.json') as f:
    val_data = json.load(f)

len(train_data)

741

In [None]:
train_data[0]

{'question': 'how are glacier caves formed?',
 'options': ['The ice facade is approximately 60 m high',
  'A partly submerged glacier cave on Perito Moreno Glacier .',
  'Ice formations in the Titlis glacier cave',
  'A glacier cave is a cave formed within the ice of a glacier .'],
 'correct_index': 3}

###Data Preprocessing

Loading a slimmed down version of spaCy with a few things (e.g. tagger, parser, NER) turned off.

In [None]:
import spacy

# Load the small english model.
# Disable the advanced NLP features in the pipeline for efficiency.
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nlp.remove_pipe('lemmatizer')

('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7c0ffe4de740>)

Modified version of the spaCy pipeline function that doesn't throw away stopwords and give the original text, not lemmas.

In [None]:
def text_pipeline_spacy_special(text):
  tokens = []
  doc = nlp(text)
  for t in doc:
    if not t.is_punct and not t.is_space: # what we removed: "not t.is_stop and"
      tokens.append(t.text.lower()) # what we changed: t.text instead of t.lemma_
  return tokens

Tokenization

In [None]:
from tqdm import tqdm # This provides a nice progress bar

for question in tqdm(train_data):
  question['tokens'] = text_pipeline_spacy_special(question['question'] + '\n' + (' '.join(question['options'])))

for question in tqdm(test_data):
  question['tokens'] = text_pipeline_spacy_special(question['question'] + '\n' + (' '.join(question['options'])))

for question in tqdm(val_data):
  question['tokens'] = text_pipeline_spacy_special(question['question'] + '\n' + (' '.join(question['options'])))


100%|██████████| 741/741 [00:08<00:00, 86.33it/s]
100%|██████████| 202/202 [00:01<00:00, 109.30it/s]
100%|██████████| 103/103 [00:00<00:00, 108.37it/s]


(1.1) How many questions and options are there in each split?

In [None]:
train_ques_count = len(train_data)
train_options_count = 0

test_ques_count = len(test_data)
test_options_count = 0

val_ques_count = len(val_data)
val_options_count = 0

for question in train_data:
  train_options_count += len(question['options'])

for question in test_data:
  test_options_count += len(question['options'])

for question in val_data:
  val_options_count += len(question['options'])

print("The number of questions in training data is : ",train_ques_count)
print("The number of options in training data is : ",train_options_count)
print("The number of questions in test data is : ",test_ques_count)
print("The number of options in test data is : ",test_options_count)
print("The number of questions in validation data is : ",val_ques_count)
print("The number of options in validation data is : ",val_options_count)

The number of questions in training data is :  741
The number of options in training data is :  2964
The number of questions in test data is :  202
The number of options in test data is :  808
The number of questions in validation data is :  103
The number of options in validation data is :  412


(1.2) What is the average number of tokens per question in the training set?

In [1]:
tokens = 0
for question in train_data:
  tokens += len(question['tokens'])

avg_tokens_per_ques = tokens / len(train_data)

print("The average number of tokens per question in the training set is:", avg_tokens_per_ques)

The average number of tokens per question in the training set is: 6.27483130904184


(1.3) What is the average number of tokens per choice in the training set?

In [None]:
tokens_per_choice = 0
choices = 0

for question in train_data:
    for option in question['options']:
        tokens = text_pipeline_spacy_special(option)
        tokens_per_choice += len(tokens)
        choices += 1

avg_tokens_per_choice = tokens_per_choice / choices

print("The average number of tokens per choice in the training set is:", avg_tokens_per_choice)

The average number of tokens per choice in the training set is: 22.338056680161944


(1.4) What is the average number of tokens per correct choice in the training set?

In [None]:
tokens_correct_choice_sum = 0
correct_choices_sum = 0

for question in train_data:
    correct_index = question['correct_index']
    correct_option = question['options'][correct_index]
    tokens = text_pipeline_spacy_special(correct_option)
    tokens_correct_choice_sum += len(tokens)
    correct_choices_sum += 1

avg_tokens_per_corr_choice = tokens_correct_choice_sum / correct_choices_sum

print("The average number of tokens per correct choice in the training set is:", avg_tokens_per_corr_choice)


The average number of tokens per correct choice in the training set is: 26.032388663967613


(1.5) Perform any additional exploration of the data that you feel would be helpful for this multiple-choice
question-answering task. Briefly describe what you found.

In [None]:
from collections import Counter

# Load data
with open('train.json') as f:
    train_data_1 = json.load(f)

# Load SpaCy model
nlp1 = spacy.load('en_core_web_sm')

# Distribution of Question Lengths
total_ques_lengths = [len(nlp1(question['question'])) for question in train_data_1]
print("Average Question Length:", sum(total_ques_lengths) / len(total_ques_lengths))

# Distribution of Option Lengths
total_option_lengths = []
for question in train_data_1:
    for option in question['options']:
        total_option_lengths.append(len(nlp1(option)))
print("Average Option Length:", sum(total_option_lengths) / len(total_option_lengths))

# Word Frequency Analysis to find words that are most common in the corpora
tokens = []
for question in train_data_1:
    tokens.extend([token.text.lower() for token in nlp1(question['question'])])
    for option in question['options']:
        tokens.extend([token.text.lower() for token in nlp1(option)])
word_freq = Counter(tokens)
print("Most Common Words based on frequency:", word_freq.most_common(10))

# Analysis of Correct vs. Incorrect Options
length_corr_option = []
length_incorr_option = []
for question in train_data_1:
    correct_index = question['correct_index']
    correct_option = question['options'][correct_index]
    length_corr_option.append(len(nlp1(correct_option)))
    for i, option in enumerate(question['options']):
        if i != correct_index:
            length_incorr_option.append(len(nlp1(option)))
avg_corr_option_length = sum(length_corr_option) / len(length_corr_option)
avg_incorr_option_length = sum(length_incorr_option) / len(length_incorr_option)
print("Average Length of Correct Options:", avg_corr_option_length)
print("Average Length of Incorrect Options:", avg_incorr_option_length)

# Correlation Between Correctness and Token Length
correct_lengths = [len(nlp1(question['options'][question['correct_index']])) for question in train_data]
incorrect_lengths = [len(nlp1(option)) for question in train_data for i, option in enumerate(question['options']) if i != question['correct_index']]
avg_correct_length = sum(correct_lengths) / len(correct_lengths)
avg_incorrect_length = sum(incorrect_lengths) / len(incorrect_lengths)
print("Average Length of Correct Options:", avg_correct_length)
print("Average Length of Incorrect Options:", avg_incorrect_length)

# Similarity Analysis
def compute_similarity(question, option):
    question_tokens = nlp(question)
    option_tokens = nlp(option)
    return question_tokens.similarity(option_tokens)

similarities = []
for question in train_data_1:
    for option in question['options']:
        similarity = compute_similarity(question['question'], option)
        similarities.append(similarity)
print("Average Similarity Score:", sum(similarities) / len(similarities))

Average Question Length: 6.479082321187584
Average Option Length: 26.044871794871796
Most Common Words based on frequency: [('the', 5274), (',', 4409), ('.', 2795), ('of', 2671), ('and', 2097), ('in', 1884), ('a', 1796), ('is', 1635), ('to', 1113), (')', 936)]
Average Length of Correct Options: 30.657219973009447
Average Length of Incorrect Options: 24.507422402159243
Average Length of Correct Options: 30.657219973009447
Average Length of Incorrect Options: 24.507422402159243


  return question_tokens.similarity(option_tokens)


Average Similarity Score: 0.2905928575476672


### Q2 - Set Similarity Measures

Use set similarity measures to calculate the similarity scores for each question against its four corresponding
answers. You should use the tokenizer from Q1. For each question, pick the answer with the highest similarity score.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preparing text data
questions = [question['question'] for question in train_data]
options = [option for question in train_data for option in question['options']]
all_text = questions + options

# Vectorizing text using TF-IDF
vectorizer = TfidfVectorizer(tokenizer=text_pipeline_spacy_special)
tfidf_matrix = vectorizer.fit_transform(all_text)

# Calculating similarity scores
similarity_scores = []
for i in range(0, len(train_data)*4, 4):
    question_vector = tfidf_matrix[i]
    option_vectors = tfidf_matrix[i+1:i+5]
    similarity_scores.append(cosine_similarity(question_vector, option_vectors).flatten())

# Finding the index of the option with the highest similarity score for each question
best_option_indices = [scores.argmax() for scores in similarity_scores]

# Printing the indices of the best options for each question
for i, index in enumerate(best_option_indices):
    print("Question:", questions[i])
    print("Best Option:", train_data[i]['options'][index])
    print("Similarity Score:", similarity_scores[i][index])
    print()



Question: how are glacier caves formed?
Best Option: A partly submerged glacier cave on Perito Moreno Glacier .
Similarity Score: 0.13999813264130245

Question: how much is 1 tablespoon of water
Best Option: In the US and parts of Canada, a tablespoon is the largest type of spoon used for eating from a bowl.
Similarity Score: 0.10960649211577622

Question: how much are the harry potter movies worth
Best Option: The main story arc concerns Harry's quest to overcome the Dark wizard Lord Voldemort , whose aims are to become immortal, conquer the wizarding world , subjugate non-magical people, and destroy all those who stand in his way, especially Harry Potter.
Similarity Score: 0.09516139401436899

Question: how a rocket engine works
Best Option: The nearly transparent exhaust is due to this engine's exhaust being mostly superheated steam (water vapor from its propellants, hydrogen and oxygen)
Similarity Score: 0.23186214077758732

Question: how are cholera and typhus transmitted and prev

(2.1) Report the performance of each similarity measure (overlap coefficient, Sorensen-Dice & Jaccard) on the
training and validation sets by measuring accuracy.

In [None]:
from sklearn.metrics import accuracy_score

def calculate_similarity(question, options, similarity_measure):
    # Function to calculate similarity scores using different measures - Overlap, Sorensen-Dice and Jaccard
    question_tokens = set(text_pipeline_spacy_special(question))
    option_scores = []
    for option in options:
        option_tokens = set(text_pipeline_spacy_special(option))
        if similarity_measure == 'overlap':
            score = len(question_tokens.intersection(option_tokens)) / min(len(question_tokens), len(option_tokens))
        elif similarity_measure == 'sorensen-dice':
            score = 2 * len(question_tokens.intersection(option_tokens)) / (len(question_tokens) + len(option_tokens))
        elif similarity_measure == 'jaccard':
            score = len(question_tokens.intersection(option_tokens)) / len(question_tokens.union(option_tokens))
        option_scores.append(score)
    return option_scores

def predict_best_option(scores):
    # Function to predict the best option index based on similarity scores
    return scores.index(max(scores))

def evaluate_accuracy(data, similarity_measure):
    # Function to evaluate accuracy
    correct_predictions = 0
    total_questions = len(data)
    for question in data:
        question_text = question['question']
        options = question['options']
        correct_index = question['correct_index']
        similarity_scores = calculate_similarity(question_text, options, similarity_measure)
        predicted_index = predict_best_option(similarity_scores)
        if predicted_index == correct_index:
            correct_predictions += 1
    accuracy = correct_predictions / total_questions
    return accuracy

# Evaluating accuracy on training and validation sets using the different similarity measures
similarity_measures = ['overlap', 'sorensen-dice', 'jaccard']
for measure in similarity_measures:
    train_accuracy = evaluate_accuracy(train_data, measure)
    val_accuracy = evaluate_accuracy(val_data, measure)
    print(f"Accuracy on Training Set (using {measure.capitalize()} similarity): {train_accuracy:.4f}")
    print(f"Accuracy on Validation Set (using {measure.capitalize()} similarity): {val_accuracy:.4f}")
    print()

Accuracy on Training Set (using Overlap similarity): 0.5236
Accuracy on Validation Set (using Overlap similarity): 0.4660

Accuracy on Training Set (using Sorensen-dice similarity): 0.4291
Accuracy on Validation Set (using Sorensen-dice similarity): 0.3592

Accuracy on Training Set (using Jaccard similarity): 0.4291
Accuracy on Validation Set (using Jaccard similarity): 0.3592



(2.2) For each similarity measure, how many times was the score of the most similar answer tied with another
answer? When there was a tied score among the top answers, how did you choose which to select? Why?

In [None]:
def evaluate_accuracy_with_ties(data, similarity_measure):
    # Function to evaluate accuracy and count tied scores
    correct_predictions = 0
    total_questions = len(data)
    tied_scores_count = 0
    resolved_ties_count = 0
    for question in data:
        question_text = question['question']
        options = question['options']
        correct_index = question['correct_index']
        similarity_scores = calculate_similarity(question_text, options, similarity_measure)
        max_score = max(similarity_scores)
        if similarity_scores.count(max_score) > 1:
            tied_scores_count += 1
            predicted_index = predict_best_option(similarity_scores)
            if predicted_index == correct_index:
                correct_predictions += 1
                resolved_ties_count += 1
        else:
            predicted_index = similarity_scores.index(max_score)
            if predicted_index == correct_index:
                correct_predictions += 1
    accuracy = correct_predictions / total_questions
    return accuracy, tied_scores_count, resolved_ties_count

# Evaluate accuracy on training and validation sets with information about ties
for measure in similarity_measures:
    train_accuracy, train_tied_count, train_resolved_ties = evaluate_accuracy_with_ties(train_data, measure)
    val_accuracy, val_tied_count, val_resolved_ties = evaluate_accuracy_with_ties(val_data, measure)
    print(f"Similarity Measure: {measure.capitalize()}")
    print(f"Accuracy on Training Set: {train_accuracy:.4f}")
    print(f"Accuracy on Validation Set: {val_accuracy:.4f}")
    print(f"Tied Scores Count on Training Set: {train_tied_count}")
    print(f"Resolved Ties Count on Training Set: {train_resolved_ties}")
    print(f"Tied Scores Count on Validation Set: {val_tied_count}")
    print(f"Resolved Ties Count on Validation Set: {val_resolved_ties}")
    print()


Similarity Measure: Overlap
Accuracy on Training Set: 0.5236
Accuracy on Validation Set: 0.4660
Tied Scores Count on Training Set: 246
Resolved Ties Count on Training Set: 91
Tied Scores Count on Validation Set: 29
Resolved Ties Count on Validation Set: 8

Similarity Measure: Sorensen-dice
Accuracy on Training Set: 0.4291
Accuracy on Validation Set: 0.3592
Tied Scores Count on Training Set: 20
Resolved Ties Count on Training Set: 5
Tied Scores Count on Validation Set: 4
Resolved Ties Count on Validation Set: 0

Similarity Measure: Jaccard
Accuracy on Training Set: 0.4291
Accuracy on Validation Set: 0.3592
Tied Scores Count on Training Set: 20
Resolved Ties Count on Training Set: 5
Tied Scores Count on Validation Set: 4
Resolved Ties Count on Validation Set: 0



When there was a tied score among the top answers, we can choose to select the option with the tied highest similarity score. This choice is made for simplicity and consistency. Here's why:

Simplicity: Selecting the option with the tied highest similarity score is a straightforward approach that requires minimal additional computation or decision-making logic.

Consistency: By consistently selecting the option with the tied highest similarity score, we maintain a uniform approach across all instances of tied scores. This helps ensure reproducibility and comparability of results.

Fairness: Choosing the option with the tied highest similarity score can be considered fair because it treats all tied options equally. Randomly selecting among tied options might introduce unnecessary variability and could potentially bias the results.

However, it's important to note that the choice of how to handle tied scores among the top answers can depend on the specific requirements and constraints of the task. Different tie-breaking strategies, such as random selection or prioritization based on option index, could also be considered depending on the context and objectives of the task.

### Q3 - Cosine similarity of TF vectors

Generate term frequency (TF) vectors of each question as well as the four possible answers. You should use the
CountVectorizer with default settings (but use the same tokenizer as in Q1 and Q2). For each question, pick the
answer with the highest cosine similarity between its TF vector and the question's TF vector.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_tf_vectors(data):
    # Function to generate TF vectors for questions and options
    questions = [question['question'] for question in data]
    options = [option for question in data for option in question['options']]
    all_text = questions + options

    vectorizer = CountVectorizer(tokenizer=text_pipeline_spacy_special)
    tf_matrix = vectorizer.fit_transform(all_text)

    question_tf_vectors = tf_matrix[:len(questions)]
    option_tf_vectors = [tf_matrix[len(questions) + i : len(questions) + i + 4] for i in range(0, len(data)*4, 4)]

    return question_tf_vectors, option_tf_vectors

def predict_best_option_using_cosine_similarity(question_vector, option_vectors):
    # Function to predict the best option index based on cosine similarity
    similarity_scores = [cosine_similarity(question_vector, option_vector)[0][0] for option_vector in option_vectors]
    return similarity_scores.index(max(similarity_scores))

# Generating TF vectors for training and validation sets
train_question_tf_vectors, train_option_tf_vectors = generate_tf_vectors(train_data)
val_question_tf_vectors, val_option_tf_vectors = generate_tf_vectors(val_data)

# Predicting the best option for each question in the training set
train_predictions = []
for i in range(len(train_data)):
    question_vector = train_question_tf_vectors[i]
    option_vectors = train_option_tf_vectors[i]
    best_option_index = predict_best_option_using_cosine_similarity(question_vector, option_vectors)
    train_predictions.append(best_option_index)

# Predicting the best option for each question in the validation set
val_predictions = []
for i in range(len(val_data)):
    question_vector = val_question_tf_vectors[i]
    option_vectors = val_option_tf_vectors[i]
    best_option_index = predict_best_option_using_cosine_similarity(question_vector, option_vectors)
    val_predictions.append(best_option_index)

# Calculating accuracy on the training and validation sets
train_accuracy = accuracy_score([question['correct_index'] for question in train_data], train_predictions)
val_accuracy = accuracy_score([question['correct_index'] for question in val_data], val_predictions)

print(f"Accuracy on Training Set: {train_accuracy:.4f}")
print(f"Accuracy on Validation Set: {val_accuracy:.4f}")


Accuracy on Training Set: 0.4467
Accuracy on Validation Set: 0.4563


(3.1) Report the performance of the training and validation sets by measuring accuracy. Discuss how they compare
with the set similarity measures from Q2.

Answer:

The accuracy obtained using TF vectors and cosine similarity for both the training and validation sets is as follows:

Accuracy on Training Set: 0.4467
Accuracy on Validation Set: 0.4563
Now, let's compare these results with the accuracy obtained using set similarity measures (overlap coefficient, Sorensen-Dice, and Jaccard) calculated previously:

Accuracy on Training Set (Overlap similarity): 0.5236

Accuracy on Validation Set (Overlap similarity): 0.4660

Accuracy on Training Set (Sorensen-Dice similarity): 0.4291

Accuracy on Validation Set (Sorensen-Dice similarity): 0.3592

Accuracy on Training Set (Jaccard similarity): 0.4291

Accuracy on Validation Set (Jaccard similarity): 0.3592

Comparing the results:

Accuracy: The accuracy obtained using TF vectors and cosine similarity is lower than the accuracy obtained using set similarity measures on both the training and validation sets. This suggests that the set similarity measures might be capturing more nuanced relationships between questions and options compared to TF vectors.

Training vs. Validation Set: The accuracy on the validation set is slightly higher than that on the training set when using TF vectors and cosine similarity. This could indicate a potential overfitting issue, where the model is performing better on unseen data (validation set) compared to the data it was trained on.

Choice of Similarity Measure: The choice of similarity measure can significantly impact the performance of the model. While TF vectors and cosine similarity provide a straightforward approach, set similarity measures might be more suitable for capturing semantic similarity between textual data in this specific task of multiple-choice question answering.

In summary, while TF vectors and cosine similarity offer a simple and intuitive approach, the set similarity measures calculated previously seem to outperform them in terms of accuracy for this particular task. Further experimentation and tuning of parameters may be necessary to improve the performance of the TF vectors and cosine similarity approach.

(3.2) Propose, motivate, and evaluate one modification to this process to improve this method. Report the
performance on the training and development sets and compare them with the unmodified version.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

def generate_tfidf_vectors(data, tokenizer=None, max_df=1.0, min_df=1, ngram_range=(1, 1)):
    # Function to generate TF-IDF vectors for questions and options with different modifications
    questions = [question['question'] for question in data]
    options = [option for question in data for option in question['options']]
    all_text = questions + options

    vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_df=max_df, min_df=min_df, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(all_text)

    question_tfidf_vectors = tfidf_matrix[:len(questions)]
    option_tfidf_vectors = [tfidf_matrix[len(questions) + i : len(questions) + i + 4] for i in range(0, len(data)*4, 4)]

    return question_tfidf_vectors, option_tfidf_vectors, vectorizer

# Generating TF-IDF vectors for training and validation sets with modifications
train_question_tfidf_vectors, train_option_tfidf_vectors, _ = generate_tfidf_vectors(train_data)
val_question_tfidf_vectors, val_option_tfidf_vectors, _ = generate_tfidf_vectors(val_data)

# Predicting the best option for each question in the training set using cosine similarity
train_predictions_tfidf = []
for i in range(len(train_data)):
    question_vector = train_question_tfidf_vectors[i]
    option_vectors = train_option_tfidf_vectors[i]
    best_option_index = predict_best_option_using_cosine_similarity(question_vector, option_vectors)
    train_predictions_tfidf.append(best_option_index)

# Predicting the best option for each question in the validation set using cosine similarity
val_predictions_tfidf = []
for i in range(len(val_data)):
    question_vector = val_question_tfidf_vectors[i]
    option_vectors = val_option_tfidf_vectors[i]
    best_option_index = predict_best_option_using_cosine_similarity(question_vector, option_vectors)
    val_predictions_tfidf.append(best_option_index)

# Calculating accuracy on the training and validation sets with TF-IDF modifications
train_accuracy_tfidf = accuracy_score([question['correct_index'] for question in train_data], train_predictions_tfidf)
val_accuracy_tfidf = accuracy_score([question['correct_index'] for question in val_data], val_predictions_tfidf)

print(f"Accuracy on Training Set with TF-IDF modifications: {train_accuracy_tfidf:.4f}")
print(f"Accuracy on Validation Set with TF-IDF modifications: {val_accuracy_tfidf:.4f}")


Accuracy on Training Set with TF-IDF modifications: 0.4305
Accuracy on Validation Set with TF-IDF modifications: 0.3495


Performance on Training Set:

* Unmodified Version (Without TF-IDF): Accuracy = 0.4467
* TF-IDF Modifications: Accuracy = 0.4305

Performance on Validation Set:

* Unmodified Version (Without TF-IDF): Accuracy = 0.4563
* TF-IDF Modifications: Accuracy = 0.3495

Comparing the performance:

Training Set: The unmodified version achieved slightly higher accuracy (0.4467) compared to the TF-IDF modifications (0.4305).

Validation Set: The unmodified version also outperformed the TF-IDF modifications on the validation set, with an accuracy of 0.4563 compared to 0.3495.

Based on these results, it appears that the TF-IDF modifications did not improve the accuracy compared to the unmodified version. In fact, the TF-IDF modifications resulted in lower accuracy on both the training and validation sets.



### Q4 - Cosine similarity of vectors from bert-base-uncased

In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/Text as Data Coursework/bert-base-uncased/bert-base-uncased

/content/drive/MyDrive/Colab Notebooks/Text as Data Coursework/bert-base-uncased/bert-base-uncased


Use the feature-extraction pipeline with a bert-based-uncased model to create context vectors from the
bert-based-uncased model for the text of each question and its four answers separately. You should use the context
vector that represents the [CLS] token, which will be the first vector. For each question, pick the answer with the
highest cosine similarity between its vector and the question’s vector.

(4.1) Report the performance of the training and validation sets by measuring accuracy.

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Loading pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract context vectors for text
def get_context_vector(text):
    # Tokenizing the input text and convert to token IDs
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Forward pass through BERT model to get hidden states
    with torch.no_grad():
        outputs = model(**inputs)
    # Extracting the context vector representing [CLS] token (first vector)
    cls_vector = outputs.last_hidden_state[:, 0, :]
    return cls_vector

# Function to predict the best option for each question based on cosine similarity
def predict_best_option_bert(question_text, options):
    # Getting context vector for question
    question_vector = get_context_vector(question_text)

    # Getting context vectors for each option
    option_vectors = [get_context_vector(option) for option in options]

    # Computing cosine similarity between question vector and option vectors
    similarity_scores = [cosine_similarity(question_vector.numpy(), option_vector.numpy())[0][0] for option_vector in option_vectors]

    # Selecting the index of the option with the highest similarity score
    best_option_index = similarity_scores.index(max(similarity_scores))
    return best_option_index

# Predicting the best option for each question in the training set
train_predictions_bert = []
for question in train_data:
    question_text = question['question']
    options = question['options']
    best_option_index = predict_best_option_bert(question_text, options)
    train_predictions_bert.append(best_option_index)

# Predicting the best option for each question in the validation set
val_predictions_bert = []
for question in val_data:
    question_text = question['question']
    options = question['options']
    best_option_index = predict_best_option_bert(question_text, options)
    val_predictions_bert.append(best_option_index)

# Calculating accuracy on the training and validation sets with BERT-based approach
train_accuracy_bert = accuracy_score([question['correct_index'] for question in train_data], train_predictions_bert)
val_accuracy_bert = accuracy_score([question['correct_index'] for question in val_data], val_predictions_bert)

print(f"Accuracy on Training Set with BERT-based approach: {train_accuracy_bert:.4f}")
print(f"Accuracy on Validation Set with BERT-based approach: {val_accuracy_bert:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Accuracy on Training Set with BERT-based approach: 0.1430
Accuracy on Validation Set with BERT-based approach: 0.2039


(4.2) What are the limitations of the set similarity and cosine similarity methods used in Q2, Q3 and Q4?

Both set similarity and cosine similarity methods have their limitations, which can affect their performance in certain scenarios:

Set Similarity:

Sensitivity to Tokenization: Set similarity methods rely heavily on tokenization, where each word/token is treated as an independent entity. This can lead to issues with ambiguity, especially in cases where the same word can have multiple meanings.
Limited Semantic Understanding: Set similarity methods do not consider the semantic relationships between words or tokens. They treat each token independently and do not capture the contextual meaning of the text.
Difficulty Handling Synonyms and Paraphrases: Set similarity methods may struggle to detect similarity between sentences containing synonyms or paraphrases. Since they rely solely on token overlap, sentences with similar meanings but different wordings may not be recognized as similar.
Cosine Similarity:

Dependency on Vector Representations: Cosine similarity requires vector representations of text, such as TF-IDF vectors or word embeddings. These representations may not capture all aspects of semantic similarity, leading to inaccuracies.
Difficulty with Rare Words and Out-of-Vocabulary Terms: Cosine similarity is based on the vector space model, which represents words as vectors. Rare words or out-of-vocabulary terms may not have accurate vector representations, leading to difficulties in measuring similarity.
Ignorance of Word Order: Cosine similarity treats documents as bags of words, ignoring the order of words. While this can be beneficial in some cases (e.g., document classification), it may not be suitable for tasks where word order is essential, such as understanding the context of a sentence.
Overall, while set similarity and cosine similarity methods are simple and computationally efficient, they may not capture the nuances of semantic similarity effectively. In tasks requiring a deeper understanding of language semantics and context, more advanced techniques such as neural embeddings or contextual embeddings (e.g., BERT) may be more appropriate.

### Q5 - Fine-tuning a transformer model

Train an AutoModelForSequenceClassification with a bert-based-uncased model on this dataset. This will
involve transformation of the data as described below. You should train only on the training questions and use the
validation set for evaluation.

Transform the dataset into a table of rows with each containing a question, an option and a label (1 or 0) if it is the
correct answer. The table (referred to as the question-option pairs representation) should have four times the
number of rows as questions in the original question dataset. Concatenate each question and option together with
“[SEP]” text in between them. For example, the question “where is osaka japan” and the incorrect option “Osaka
castle” would become “where is osaka japan[SEP]Osaka castle” with a label of 0.

In an ideal world, you would do hyperparameter tuning to identify the optimal settings. Due to computational cost,
use these settings which should provide reasonable performance:
* learning_rate = 1e-5
* batch_size = 8
* epochs = 4
* weight_decay = 0

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

# Defining the hyperparameters
learning_rate = 1e-5
batch_size = 8
epochs = 4
weight_decay = 0

# Loading the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Defining the optimizer (Adam optimizer)
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Defining the dataset class
class QuestionOptionDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item['question']
        options = item['options']
        correct_index = item['correct_index']

        # Transform each question-option pair into the required format
        question_option_pairs = [(f"{question} [SEP] {options[i]}", 1 if i == correct_index else 0) for i in range(len(options))]
        return question_option_pairs

def collate_fn(batch):
    # Defining a function to collate the batches
    inputs = []
    labels = []
    for question_options in batch:
        for question_option in question_options:
            inputs.append(question_option[0])
            labels.append(question_option[1])
    inputs = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True)
    labels = torch.tensor(labels)
    return inputs, labels

# Creating datasets and data loaders
train_dataset = QuestionOptionDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

val_dataset = QuestionOptionDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader)

    # Evaluation on validation set
    model.eval()
    val_preds = []
    val_labels = []
    for batch_inputs, batch_labels in val_loader:
        with torch.no_grad():
            outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(batch_labels.cpu().numpy())
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{epochs}:")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")

# Saving the final model
output_dir = "./bert_model/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4:
  Train Loss: 0.5389
  Validation Accuracy: 0.7937
Epoch 2/4:
  Train Loss: 0.4853
  Validation Accuracy: 0.7913
Epoch 3/4:
  Train Loss: 0.4344
  Validation Accuracy: 0.7646
Epoch 4/4:
  Train Loss: 0.3657
  Validation Accuracy: 0.7816


('./bert_model/tokenizer_config.json',
 './bert_model/special_tokens_map.json',
 './bert_model/vocab.txt',
 './bert_model/added_tokens.json')

On fine-tuning the BERT model, this is what we get :-

* Epoch 1/4:
  * Train Loss: 0.5404
  * Validation Accuracy: 0.8131
* Epoch 2/4:
  * Train Loss: 0.4726
  * Validation Accuracy: 0.8155
* Epoch 3/4:
  * Train Loss: 0.4156
  * Validation Accuracy: 0.8131
* Epoch 4/4:
  * Train Loss: 0.3510
  * Validation Accuracy: 0.7985

(5.1) Report the accuracy, precision, recall and F1 score of the predictions on the question-option pairs
representation of the training and validation sets

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

output_dir = "./bert_model/"
model = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

def get_predictions(model, data_loader):
    # Function to get predictions from the model
    model.eval()
    predictions = []
    labels = []
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            outputs = model(**batch_inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    return predictions, labels

# Getting the predictions for training set
train_predictions, train_labels = get_predictions(model, train_loader)

# Getting the predictions for validation set
val_predictions, val_labels = get_predictions(model, val_loader)

# Calculating metrics for training set
train_accuracy = accuracy_score(train_labels, train_predictions)
train_precision = precision_score(train_labels, train_predictions)
train_recall = recall_score(train_labels, train_predictions)
train_f1 = f1_score(train_labels, train_predictions)

# Calculating metrics for validation set
val_accuracy = accuracy_score(val_labels, val_predictions)
val_precision = precision_score(val_labels, val_predictions)
val_recall = recall_score(val_labels, val_predictions)
val_f1 = f1_score(val_labels, val_predictions)

print("Metrics for Training Set:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")
print("\nMetrics for Validation Set:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")


Metrics for Training Set:
Accuracy: 0.9052
Precision: 0.8203
Recall: 0.7949
F1 Score: 0.8074

Metrics for Validation Set:
Accuracy: 0.7816
Precision: 0.5823
Recall: 0.4466
F1 Score: 0.5055


(5.2) Report the accuracy for this method for selecting the correct answer on the training and validation sets of this
model. Note this is different from the value in part (a). To enable this, select the option for each question with the
highest output logit value for the positive class of the model.

In [None]:
import torch
from sklearn.metrics import accuracy_score

# Function to get predictions from the model based on highest logit value
def get_predictions_highest_logit(model, data_loader):
    model.eval()
    predictions = []
    labels = []
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            outputs = model(**batch_inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    return predictions, labels

# Getting the predictions for training set based on highest logit value
train_predictions_highest_logit, train_labels_highest_logit = get_predictions_highest_logit(model, train_loader)

# Getting the predictions for validation set based on highest logit value
val_predictions_highest_logit, val_labels_highest_logit = get_predictions_highest_logit(model, val_loader)

# Calculating the accuracy for training set based on highest logit value
train_accuracy_highest_logit = accuracy_score(train_labels_highest_logit, train_predictions_highest_logit)

# Calculating the accuracy for validation set based on highest logit value
val_accuracy_highest_logit = accuracy_score(val_labels_highest_logit, val_predictions_highest_logit)

print("Accuracy for Training Set (Based on Highest Logit Value):", train_accuracy_highest_logit)
print("Accuracy for Validation Set (Based on Highest Logit Value):", val_accuracy_highest_logit)

Accuracy for Training Set (Based on Highest Logit Value): 0.924271171511835
Accuracy for Validation Set (Based on Highest Logit Value): 0.817394931204825


(5.3) Why would you expect this approach to outperform the use of [CLS] vectors described in Q4?

Utilizing the logits directly for prediction, as outlined in the provided code, is anticipated to surpass the utilization of [CLS] vectors for several reasons:

* Direct Utilization of Model Output: Logits signify the unprocessed output of the model, encapsulating confidence scores for each class (in this instance, the binary classification of correct vs. incorrect answers). By employing these logits outright for prediction, we harness the full potential of the model's output without additional manipulation.

* Fine-Tuned for Classification: The BERT model has undergone fine-tuning for sequence classification tasks, precisely matching the task at hand (predicting the correct answer among options). Consequently, the model's output logits are optimized for this purpose, potentially yielding superior performance compared to employing the [CLS] vector, which may lack specific optimization for this task.

* Learned Representations: Throughout training, the model assimilates representations of the input text, subsequently utilized for predictions. These learned representations are apt to encapsulate more nuanced information concerning the relationship between the question and each answer option, resulting in more precise predictions compared to relying on a single representation (e.g., the [CLS] vector) for the entire sequence.

* Enhanced Handling of Multiple Options: In the logits-based approach, we directly evaluate the confidence scores for each answer option, facilitating the model's ability to discern them more effectively. Conversely, utilizing a single representation (like the [CLS] vector) may fail to adequately capture the subtle distinctions between multiple options.

In summary, although both methods leverage BERT's capabilities, employing logits directly for prediction is anticipated to yield superior performance owing to the fine-tuned nature of the model's output tailored for the specific task of sequence classification.

### Q6 - Test set performance

(6.1) Report the accuracy using your best method on the test set. Use the performance on the validation set to select
the best method.

In [None]:
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

test_dataset = QuestionOptionDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Selecting and applying the best method (logits-based) to make predictions on the test set
test_predictions, test_labels = get_predictions_highest_logit(model, test_loader)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Accuracy on Test Set using logits-based approach: {test_accuracy:.4f}")


Accuracy on Test Set using logits-based approach: 0.7797


(6.2) Discuss whether the achieved accuracy would be sufficient for deployment

In a general sense, an accuracy of 77.97% on a test set could be considered reasonably good for deployment in many real-world applications, especially for tasks where a high level of precision is not critical and where the consequences of misclassifications are relatively low.

However, whether this accuracy is sufficient for deployment depends on various factors such as the specific requirements of the application, the potential impact of misclassifications, and the expectations of stakeholders.

For some applications, such as sentiment analysis or recommendation systems, an accuracy around 78% might be acceptable. In contrast, for tasks requiring high precision, such as medical diagnosis or financial fraud detection, higher accuracy thresholds may be necessary.

In summary, while an accuracy of 77.97% may be adequate for deployment in many scenarios, it's essential to assess the model's performance in the context of the application's requirements, potential risks, and stakeholder expectations before making a decision.





