# Extractive Question Answering using BERT

## Load the libraries

In [None]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizerFast
from scipy.special import softmax
import plotly.express as px
import pandas as pd
import numpy as np

## Define the context and question

In [None]:
# Passage the model will search for an answer (a paragraph about the hippopotamus).
context = "The hippopotamus pl: hippopotamuses or hippopotami), also shortened to hippo (pl: hippos; Hippopotamus amphibius), further qualified as the common hippopotamus, Nile hippopotamus, or river hippopotamus, is a large semiaquatic mammal native to sub-Saharan Africa. It is one of only two extant species in the family Hippopotamidae, the other being the pygmy hippopotamus (Choeropsis liberiensis or Hexaprotodon liberiensis). Its name comes from the ancient Greek for 'river horse' (ἱπποπόταμος). After elephants and rhinoceros, the hippopotamus is the next largest land mammal. It is also the largest extant land artiodactyl. Despite their physical resemblance to pigs and other terrestrial even-toed ungulates, the closest living relatives of the hippopotamids are cetaceans (whales, dolphins, porpoises, etc.), from which they diverged about 55 million years ago. Hippos are recognisable for their barrel-shaped torsos, wide-opening mouths with large canine tusks, nearly hairless bodies, pillar-like legs, and large size: adults average 1,500 kg (3,300 lb) for bulls (males) and 1,300 kg (2,900 lb) for cows (females). Despite its stocky shape and short legs, it is capable of running 30 km/h (19 mph) over short distances. Hippos inhabit rivers, lakes, and mangrove swamps. Territorial bulls each preside over a stretch of water and a group of five to thirty cows and calves. Mating and birth both occur in the water. During the day, hippos remain cool by staying in water or mud, emerging at dusk to graze on grasses. While hippos rest near each other in the water, grazing is a solitary activity and hippos typically do not display territorial behaviour on land. Hippos are among the most dangerous animals in the world due to their aggressive and unpredictable nature. They are threatened by habitat loss and poaching for their meat and ivory (canine teeth)."
# Question to be answered from the passage above.
question = "How fast can hippos run?"
# The model's input limit is severe: only 512 tokens, and the question and the
# context are encoded together, so both count towards that budget.

## Define the model and tokenizer

In [None]:
# Checkpoint to load: a BERT model trained on the Stanford Question Answering
# Dataset (SQuAD), a reading-comprehension dataset made of questions that
# crowdworkers posed about a set of Wikipedia articles.
model_name = "deepset/bert-base-cased-squad2"

# The model and its tokenizer are loaded from the same checkpoint name so the
# vocabulary matches the weights.
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

## Tokenizing the inputs

In [None]:
# Encode the question and the context together as one sequence pair.
# The tokenizer is called directly (the same way predict_answer does it) rather
# than through the deprecated `encode_plus`, and truncation keeps the pair
# within the 512-token limit instead of letting an over-long input fail later.
inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

# Show how the context on its own is broken into tokens.
print(tokenizer.tokenize(context))

## Running the model to get the start and end scores

In [None]:
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Softmax turns the raw logits into scores that say, for every token position,
# how likely that token is to be the start / the end of the answer.
start_scores = softmax(outputs.start_logits)[0]
end_scores = softmax(outputs.end_logits)[0]

# Long-format table (one row per token position and score type) so that both
# score series can be drawn side by side in one grouped bar chart.
token_positions = list(range(len(start_scores)))
scores_df = pd.DataFrame(
    {
        "Token Position": token_positions + token_positions,
        "Score": [*start_scores, *end_scores],
        "Score Type": ["Start"] * len(start_scores) + ["End"] * len(end_scores),
    }
)
px.bar(
    scores_df,
    x="Token Position",
    y="Score",
    color="Score Type",
    barmode="group",
    title="Start and End Scores for Tokens",
)

## Getting the answer from the model

In [None]:
# The highest-scoring start position and the highest-scoring end position are
# chosen independently of each other.
start_idx = start_scores.argmax()
end_idx = end_scores.argmax()

# Cut the token ids of the predicted span (end position inclusive) out of the
# encoded input, map them back to tokens, and merge the tokens into text.
answer_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
answer = tokenizer.convert_tokens_to_string(answer_tokens)
print(answer)

## Convert into a function

In [None]:
def predict_answer(context, question, max_length=512):
    """Extract the most likely answer to ``question`` from ``context``.

    The question and context are encoded together, run through the
    question-answering model, and the highest-scoring start and end token
    positions delimit the answer span.

    Args:
        context: Text to search for the answer.
        question: Question to answer from ``context``.
        max_length: Maximum number of tokens for the encoded question/context
            pair; longer inputs are truncated. Defaults to 512, the model's
            input limit.

    Returns:
        A tuple ``(answer, confidence_score)``. ``answer`` is the extracted
        text, or ``None`` when no answer was found: the predicted span is the
        ``[CLS]`` token alone, the predicted end lies before the predicted
        start, or the span decodes to an empty string. ``confidence_score``
        is the mean of the best start score and the best end score.
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # Normalise along the token axis explicitly, then take the single sequence
    # in the batch (index 0).
    start_scores = softmax(outputs.start_logits, axis=-1)[0]
    end_scores = softmax(outputs.end_logits, axis=-1)[0]
    start_idx = int(np.argmax(start_scores))
    end_idx = int(np.argmax(end_scores))

    # The confidence score is the average of the best start and end scores.
    confidence_score = (start_scores[start_idx] + end_scores[end_idx]) / 2

    # Start and end are chosen independently, so the end can land before the
    # start. That is not a valid span; report "no answer" rather than an empty
    # string.
    if end_idx < start_idx:
        return None, confidence_score

    answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    # The function has to return something when the answer is not in the
    # context: cls_token is the special token the model points at in that case.
    if not answer or answer == tokenizer.cls_token:
        return None, confidence_score
    return answer, confidence_score

## Get a new context and question

In [None]:
# A second passage, this time about Cajun cuisine, to try the function on.
context = "Cajun cuisine (French: cuisine cadienne [kɥi.zin ka.dʒɛn], Spanish: cocina acadiense) is a style of cooking developed by the Cajun–Acadians who were deported from Acadia to Louisiana during the 18th century and who incorporated West African, French and Spanish cooking techniques into their original cuisine. Cajun cuisine is sometimes referred to as a 'rustic cuisine', meaning that it is based on locally available ingredients and that preparation is relatively simple. An authentic Cajun meal is usually a three-pot affair, with one pot dedicated to the main dish, one dedicated to steamed rice, specially made sausages, or some seafood dish, and the third containing whatever vegetable is plentiful or available. Crawfish, shrimp, and andouille sausage are staple meats used in a variety of dishes. The aromatic vegetables green bell pepper (piment doux), onion, and celery are called 'the trinity' by chefs in Cajun and Louisiana Creole cuisines. Roughly diced and combined in cooking, the method is similar to the use of the mirepoix in traditional French cuisine which blends roughly diced carrot, onion, and celery. Additional characteristic aromatics for both the Creole and Cajun versions may include parsley, bay leaf, thyme, green onions, ground cayenne pepper, and ground black pepper. Cayenne and Louisiana-style hot sauce are the primary sources of spice in Cajun cuisine, which usually tends towards a moderate, well-balanced heat, despite the national 'Cajun hot' craze of the 1980s and 1990s.[1]"

demo_questions = (
    # Easy to answer from the passage, so a high confidence is expected.
    "what is Cajun cuisine?",
    # Contains a typo ('tranity'): still answerable, but with lower confidence.
    "What is one of the 'tranity'? of Cajun cuisine?",
    # The passage holds no answer to this one.
    "what is Indian cuisine?",
)
for demo_question in demo_questions:
    print(predict_answer(context, demo_question))

## Splitting the context into sentences (one per line) to overcome the 512-token context limit

In [None]:
# Split the current context on newline characters; each line is treated as one
# "sentence".
# NOTE(review): the context assigned just before this point is a single
# paragraph without any newline, so this produces a one-element list. A context
# assigned later has to be split again before it is chunked.
sentences = context.split("\n")

def chunk_sentences(sentences, chunk_size, stride):
    """Group ``sentences`` into overlapping chunks.

    Consecutive chunks start ``chunk_size - stride`` items apart, so each chunk
    repeats the last ``stride`` items of the previous one. The overlap ensures
    that context is not lost at chunk boundaries.

    Args:
        sentences: Sequence of sentences to group.
        chunk_size: Maximum number of sentences per chunk; must be at least 1.
        stride: Number of sentences shared by two consecutive chunks; must be
            smaller than ``chunk_size``. A negative value leaves gaps between
            chunks instead of an overlap.

    Returns:
        A list of chunks, each a slice of ``sentences``. Chunks near the end
        may hold fewer than ``chunk_size`` sentences. Empty input gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is below 1, or if ``stride`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    if stride >= chunk_size:
        raise ValueError(
            f"stride ({stride}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - stride
    return [
        sentences[start: start + chunk_size]
        for start in range(0, len(sentences), step)
    ]

## Inputting a new, longer context

In [None]:
# A longer, multi-line passage about Cajun cuisine (an encyclopedia article
# excerpt, including its page-navigation text). Each line is separated by a
# newline character, which is what the context is later split on.
context = """From Wikipedia, the free encyclopedia
Part of a series on
American cuisine

Regional cuisines
History
Ingredients and foods
Styles
Ethnic and cultural
Holidays and festivals
flag United States portalicon Food portal
vte

Cornbread is a staple Cajun starch.
Cajun cuisine (French: cuisine cadienne [kɥi.zin ka.dʒɛn], Spanish: cocina acadiense) is a style of cooking developed by the Cajun–Acadians who were deported from Acadia to Louisiana during the 18th century and who incorporated West African, French and Spanish cooking techniques into their original cuisine.

Cajun cuisine is sometimes referred to as a 'rustic cuisine', meaning that it is based on locally available ingredients and that preparation is relatively simple.

An authentic Cajun meal is usually a three-pot affair, with one pot dedicated to the main dish, one dedicated to steamed rice, specially made sausages, or some seafood dish, and the third containing whatever vegetable is plentiful or available. Crawfish, shrimp, and andouille sausage are staple meats used in a variety of dishes.

The aromatic vegetables green bell pepper (piment doux), onion, and celery are called "the trinity" by chefs in Cajun and Louisiana Creole cuisines. Roughly diced and combined in cooking, the method is similar to the use of the mirepoix in traditional French cuisine which blends roughly diced carrot, onion, and celery. Additional characteristic aromatics for both the Creole and Cajun versions may include parsley, bay leaf, thyme, green onions, ground cayenne pepper, and ground black pepper. Cayenne and Louisiana-style hot sauce are the primary sources of spice in Cajun cuisine, which usually tends towards a moderate, well-balanced heat, despite the national "Cajun hot" craze of the 1980s and 1990s.[1]

History
The Acadians were a group of French colonists who lived in Acadia, what is today Eastern Canada. In the mid-18th century, they were deported from Acadia by the British during the French and Indian War in what they termed le Grand Dérangement, and many of them ended up settling in Southern Louisiana.[2]: 6 

Due to the extreme change in climate, Acadians were unable to cook their original dishes.[3]: 20  Soon, their former culinary traditions were adapted and, in time, incorporated not only Indigenous American traditions, but also African-American traditions—as is exemplified in the classic Cajun dish "Gumbo", which is named for its principal ingredient (Okra) using the West African name for that very ingredient: "Gumbo," in West Africa, means "Okra".

Many other meals developed along these lines, adapted in no small part from Haiti, to become what is now considered classic Cajun cuisine traditions [3]: 19–20  (not to be confused with the more modern concept associated with Prudhomme's style).[4]

Up through the 20th century, the meals were not elaborate but instead, rather basic.[3]: 23  The public's false perception of "Cajun" cuisine was based on Prudhomme's style of Cajun cooking, which was spicy, flavorful, and not true to the classic form of the cuisine.[4]

Cajun and Creole cuisine have been mistaken to be the same, but the origins of Creole cooking began in New Orleans, and Cajun cooking came 40 years after the establishment of New Orleans.[5] Today, most restaurants serve dishes that consist of Cajun styles, which Paul Prudhomme dubbed "Louisiana cooking".[6] In home-cooking, these individual styles are still kept separate.[6] However, there are fewer and fewer people cooking the classic Cajun dishes that would have been eaten by the original settlers.[3]: 30 

Cajun cooking methods
Barbecuing—similar to "low and slow" Southern barbecue traditions, but with Creole/Cajun seasoning. A classic example is Johnson's Boucaniere ("smokehouse") in Lafayette, which was named best barbecue in Louisiana by the Food Network in July 2022.[7] In the Ville Platte area, a unique sauce is made from dried onions reconstituted in water and vegetable oil thick with ketchup, mustard, Worcestershire sauce, and seasonings.[8] The flavorful oil that rises to the top is applied directly to meats being cooked as a baste. Two popular brands are Jack Miller's and Pig Stand, which are available online and in grocery stores across the state.[9][10] This sauce is also commonly used on hamburgers, hot dogs, pork chops, chicken, and other grilled items.
Baking—direct and indirect dry heat in a furnace or oven, faster than smoking but slower than grilling
Grilling—direct heat on a shallow surface, fastest of all variants; sub-variants include:
Charbroiling—direct dry heat on a solid surface with wide raised ridges
Gridironing—direct dry heat on a solid or hollow surface with narrow raised ridges
Griddling—direct dry or moist heat along with the use of oils and butter on a flat surface
Braising—combining a direct dry heat charbroil-grill or gridiron-grill with a pot filled with broth for direct moist heat, faster than smoking but slower than regular grilling and baking; time starts fast, slows down, then speeds up again to finish
Boiling—as in boiling of crabs, crawfish, or shrimp, in seasoned liquid, often with side items like corn on the cob, whole new potatoes, and mushrooms cooked in the same boiling pot. A seafood boil is often a large outdoor social event.
Deep frying—lightly breaded and fried seafood including various fish, shrimp, oysters, and soft-shell crab is universally popular in Cajun cuisine, often on French bread po-boys in the New Orleans style, along with traditional Southern favorites like fried chicken, fried okra, and pork chops.
Smothering—cooking a vegetable or meat over low heat with the sauteed "trinity," plus small amounts of water or stock, similar to braising. This forms a pan sauce or gravy, and the finished product is served over rice. Étouffée is a popular variant done with crawfish or shrimp. A meatless version might feature mushrooms and eggplant.[11] Two commonly smothered meats are pork chops and round steak; these heartier meats may sometimes have a bit of roux added to the gravy.[12][13]
Pan-broiling or pan-frying
Injecting—using a large syringe-type setup to place seasoning deep inside large cuts of meat; this technique is much newer than the others on this list, but very common in Cajun cuisine
Stewing, also known as fricassée; a whole chicken cut into pieces is a popular choice for this method, particularly an older hen.[14]
Deep-frying of turkeys or oven-roasted turduckens entered southern Louisiana cuisine more recently. Also, blackening of fish or chicken and barbecuing of shrimp in the shell are excluded because they were not prepared in traditional Cajun cuisine. Blackening was actually an invention by chef Paul Prudhomme in the 1970s, becoming associated with Cajun cooking, and presented as such by him, but is not a true historical or traditional Cajun cooking process.[15]"""

## Running the function and then asking each chunk the question

In [None]:
# Split the long context here, right before chunking. The `sentences` list
# built in the earlier cell came from the previous, newline-free context, so
# reusing it would never look at the long passage. Blank lines are dropped so
# that they do not take up slots in a chunk.
sentences = [line for line in context.split("\n") if line.strip()]
chunked_sentences = chunk_sentences(sentences, chunk_size=3, stride=1)


questions = ["What is Cajun cooking?", "What are the most common Cajun foods?", "How can I make jambalaya?", "where did Cajun food come from?"]

# Best (answer, score) found so far for each question, across all chunks.
answers = {}

for chunk in chunked_sentences:
    # Separate names are used inside the loop so that the module-level
    # `context` and `question` are not overwritten; this keeps the cell safe to
    # re-run.
    chunk_context = "\n".join(chunk)
    for candidate_question in questions:
        answer, score = predict_answer(chunk_context, candidate_question)

        # Skip chunks in which the model found no answer.
        if not answer:
            continue

        # Keep only the highest-scoring answer for each question.
        best = answers.get(candidate_question)
        if best is None or score > best[1]:
            answers[candidate_question] = (answer, score)

print(answers)