In [63]:
# Import libraries 
from datasets import load_dataset
import pandas as pd 
import numpy as np 
import transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# EDA

In [3]:
# Fetch SQuAD from the Hugging Face Hub.
# Note: despite the `_df` suffix this is a `DatasetDict` keyed by split
# name, not a pandas DataFrame.
squad_df = load_dataset(path="squad")

In [4]:
# Show the DatasetDict summary: split names, feature columns and row counts.
squad_df

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

The SQuAD dataset contains a train split and a validation split with 87,599 and 10,570 rows, respectively.

In [5]:
# Inspect the first training example (a dict with one entry per feature).
squad_df["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [6]:
# Inspect the first validation example for comparison with the train split.
squad_df["validation"][0]

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


We will use the context and the question as inputs to our function later.

In [10]:
# Slice the first five validation examples; slicing a split yields a
# dict mapping each column name to a list of values.
top5 = squad_df["validation"][:5]

# Build a DataFrame from that column dict so it renders as a table.
top = pd.DataFrame(data=top5)

# Leave the frame as the last expression so the notebook displays it.
top


Unnamed: 0,id,title,context,question,answers
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth..."
2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S..."
3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
4,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,"{'text': ['gold', 'gold', 'gold'], 'answer_sta..."


In [11]:
# Slice the last five validation examples; slicing a split yields a
# dict mapping each column name to a list of values.
tail5 = squad_df["validation"][-5:]

# Build a DataFrame from that column dict so it renders as a table.
tail = pd.DataFrame(data=tail5)

# Leave the frame as the last expression so the notebook displays it.
tail

Unnamed: 0,id,title,context,question,answers
0,5737aafd1c456719005744fb,Force,"The pound-force has a metric counterpart, less...",What is the metric term less used than the New...,"{'text': ['kilogram-force', 'pound-force', 'ki..."
1,5737aafd1c456719005744fc,Force,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,"{'text': ['kilopond', 'kilopond', 'kilopond', ..."
2,5737aafd1c456719005744fd,Force,"The pound-force has a metric counterpart, less...",What is a very seldom used unit of mass in the...,"{'text': ['slug', 'metric slug', 'metric slug'..."
3,5737aafd1c456719005744fe,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ..."
4,5737aafd1c456719005744ff,Force,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"{'text': ['sthène', 'sthène', 'sthène', 'sthèn..."


# Modeling 

In [141]:
# Use the first validation example as the running test case:
# `c` is the context passage, `q` is the question asked about it.
c = top.loc[0, "context"]
q = top.loc[0, "question"]

In [142]:
# Display the context string stored in `c`.
c

'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'

In [143]:
# Display the question string stored in `q`.
q

'Which NFL team represented the AFC at Super Bowl 50?'

In [164]:
# Reference answers for the same example, used to judge the predictions below.
top.loc[0, "answers"]

{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
 'answer_start': [177, 177, 177]}

## BERT base model

In [153]:
# Load the pretrained BERT encoder and its tokenizer.
model_checkpoint = "bert-base-uncased"

# This checkpoint has no trained question-answering head: transformers
# reports `qa_outputs.weight` / `qa_outputs.bias` as newly initialized on
# load. Seed torch's global RNG first so that random initialisation, and
# therefore this baseline's prediction, is repeatable when the notebook is
# re-run on the same setup. The prediction is still not meaningful.
torch.manual_seed(42)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [154]:
# Encode the question/context pair as PyTorch tensors.
inputs = tokenizer(q, c, add_special_tokens=True, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass so no
# autograd graph is built or kept alive by `outputs`.
with torch.no_grad():
    outputs = model(**inputs)

# Pick the highest-scoring start and end token positions independently.
# The end index gets +1 so it can be used directly as an exclusive slice bound.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits) + 1

In [155]:
# Cut the predicted span out of row 0 of the encoded batch and turn the
# token ids back into text.
# NOTE(review): start and end are chosen independently, so the end index can
# land at or before the start index; the slice is then empty, which is the
# most likely reason for an empty-string result here.
predict_answer_tokens = inputs["input_ids"][0][answer_start_index:answer_end_index]
tokenizer.decode(predict_answer_tokens)

''

## distilbert-base-uncased

In [156]:
# Load the pretrained DistilBERT encoder and its tokenizer.
model_checkpoint1 = "distilbert-base-uncased"

# This checkpoint has no trained question-answering head: transformers
# reports `qa_outputs.weight` / `qa_outputs.bias` as newly initialized on
# load. Seed torch's global RNG first so that random initialisation, and
# therefore this baseline's prediction, is repeatable when the notebook is
# re-run on the same setup. The prediction is still not meaningful.
torch.manual_seed(42)
model1 = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint1)
tokenizer1 = AutoTokenizer.from_pretrained(model_checkpoint1)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [157]:
# Encode the question/context pair as PyTorch tensors.
inputs1 = tokenizer1(q, c, add_special_tokens=True, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass so no
# autograd graph is built or kept alive by `outputs1`.
with torch.no_grad():
    outputs1 = model1(**inputs1)

# Pick the highest-scoring start and end token positions independently.
# The end index gets +1 so it can be used directly as an exclusive slice bound.
answer_start_index1 = torch.argmax(outputs1.start_logits)
answer_end_index1 = torch.argmax(outputs1.end_logits) + 1

In [159]:
# Cut the predicted span out of row 0 of the encoded batch and turn the
# token ids back into text. With an untrained QA head the span boundaries
# are arbitrary, so the decoded text is not expected to match the reference.
predict_answer_tokens1 = inputs1["input_ids"][0][answer_start_index1:answer_end_index1]
tokenizer1.decode(predict_answer_tokens1)

'the san francisco bay area at santa clara, california. as this was the 50th super bowl, the league emphasized the " golden anniversary " with various gold - themed initiatives, as well as temporarily suspending the tradition of naming each super bowl game with roman numerals ( under which the game would have been known as " super bowl l "'

## Fine-tuned model using SQuAD

In [160]:
# Checkpoint name of the DistilBERT model used for this section.
model_checkpoint2 = "distilbert-base-cased-distilled-squad"

# Load the tokenizer that matches the checkpoint (cased vocabulary).
tokenizer2 = AutoTokenizer.from_pretrained(model_checkpoint2)
# Load the model weights, including the question-answering head.
model2 = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint2)

In [161]:
# Encode the question/context pair as PyTorch tensors.
inputs2 = tokenizer2(q, c, add_special_tokens=True, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass so no
# autograd graph is built or kept alive by `outputs2`.
with torch.no_grad():
    outputs2 = model2(**inputs2)

# Pick the highest-scoring start and end token positions independently.
# The end index gets +1 so it can be used directly as an exclusive slice bound.
answer_start_index2 = torch.argmax(outputs2.start_logits)
answer_end_index2 = torch.argmax(outputs2.end_logits) + 1

In [169]:
# Token position (0-dim tensor) where the predicted answer starts.
answer_start_index2

tensor(46)

In [170]:
# Exclusive end position of the predicted answer (argmax of end logits + 1).
answer_end_index2

tensor(48)

In [162]:
# Cut the predicted span [start, end) out of row 0 of the encoded batch.
predict_answer_tokens2 = inputs2["input_ids"][0][answer_start_index2:answer_end_index2]
# Map the token ids back to a text answer.
tokenizer2.decode(predict_answer_tokens2)

'Denver Broncos'

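I am going to use <b>'distilbert-base-cased-distilled-squad'</b>: it is the only one of the three checkpoints already fine-tuned on SQuAD, and it returned the correct answer ('Denver Broncos'). The two base checkpoints have randomly initialized question-answering heads, so their predictions (an empty string and an unrelated long span) are not meaningful without further training.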

# References

1. <a href= "https://huggingface.co/datasets/squad"> squad </a>
2. <a href= "https://huggingface.co/docs/transformers/tasks/question_answering"> Hugging face QA </a>