In [1]:
# Load the pretrained BERT tokenizer and question-answering model that the
# cells below rely on (this cell can be skipped if both are already in memory).
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
# Checkpoint name: the tokenizer and the question-answering model are both
# loaded from this same identifier, so their vocabularies match.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

# from_pretrained() fetches the files on first use (see the progress bars in
# this cell's output). The tokenizer is loaded first, then the model.
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertForQuestionAnswering.from_pretrained(model_name),
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Reference passage that FAQ_bot searches for answers; every answer is a span
# copied out of this text. The first paragraph (about DVDs) is unrelated to
# Sunset Motors -- NOTE(review): presumably left in as a distractor to show the
# model ignores irrelevant text; confirm before removing.
Sunset_Motors_context = """
The first DVD (Digital Versatile Disc) was released on March 24, 1997. It was a movie titled 'Twister' and was released in Japan. DVDs quickly gained popularity as a replacement for VHS tapes and became a common format for storing and distributing digital video and data.

Sunset Motors is a renowned automobile dealership that has been a cornerstone of the automotive industry since its establishment in 1978. Located in the picturesque town of Crestwood, nestled in the heart of California's scenic Central Valley, Sunset Motors has built a reputation for excellence, reliability, and customer satisfaction over the past four decades. Founded by visionary entrepreneur Robert Anderson, Sunset Motors began as a humble, family-owned business with a small lot of used cars. However, under Anderson's leadership and commitment to quality, it quickly evolved into a thriving dealership offering a wide range of vehicles from various manufacturers. Today, the dealership spans over 10 acres, showcasing a vast inventory of new and pre-owned cars, trucks, SUVs, and luxury vehicles. One of Sunset Motors' standout features is its dedication to sustainability. In 2010, the dealership made a landmark decision to incorporate environmentally friendly practices, including solar panels to power the facility, energy-efficient lighting, and a comprehensive recycling program. This commitment to eco-consciousness has earned Sunset Motors recognition as an industry leader in sustainable automotive retail. Sunset Motors proudly offers a diverse range of vehicles, including popular brands like Ford, Toyota, Honda, Chevrolet, and BMW, catering to a wide spectrum of tastes and preferences. In addition to its outstanding vehicle selection, Sunset Motors offers flexible financing options, allowing customers to secure affordable loans and leases with competitive interest rates.
"""

In [3]:

def FAQ_bot(question):
    """Answer *question* with a text span extracted from the dealership passage.

    The question and ``Sunset_Motors_context`` are encoded together as one
    BERT sentence pair. The module-level question-answering ``model`` scores
    every token as a possible answer start and answer end, and the
    highest-scoring start/end pair is converted back into text.

    Args:
        question: The user's question as a plain string.

    Returns:
        The answer text rebuilt from the predicted tokens: tokens are joined
        with single spaces and WordPiece ``##`` continuation pieces are glued
        back onto the preceding piece (so punctuation stays space-separated,
        e.g. ``"ford , toyota"``). If the predicted end precedes the predicted
        start, or the span starts outside the context passage, a fixed apology
        message is returned instead.
    """
    fallback = "I'm unable to find the answer to this question. Can you please ask me another question?"

    # encode() returns only the input ids for the (question, context) pair.
    input_ids = tokenizer.encode(question, Sunset_Motors_context)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first [SEP] is segment A (the
    # question); the remainder is segment B (the context).
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    segment_ids = [0] * num_seg_a + [1] * (len(input_ids) - num_seg_a)

    # Inference only: skip autograd bookkeeping. The model expects a batch
    # dimension, hence the extra list around each sequence.
    with torch.no_grad():
        output = model(
            torch.tensor([input_ids]),
            token_type_ids=torch.tensor([segment_ids]),
        )

    # Convert the 0-dim tensors to plain ints so they can be compared and
    # used as slice bounds without relying on tensor semantics.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    # Reject inverted spans, and spans that begin in the question segment
    # (including the leading special token): an extractive answer has to come
    # from the context passage.
    if answer_end < answer_start or answer_start <= sep_index:
        return fallback

    # Join with spaces, then merge WordPiece continuations. The second
    # replace covers a span whose first token is itself a "##" piece.
    answer_span = " ".join(tokens[answer_start:answer_end + 1])
    return answer_span.replace(" ##", "").replace("##", "")

In [4]:
# Test questions
# Demo query 1: ask for the dealership's location and print the Q/A pair.
question1 = "Where is the dealership Located?"
response1 = FAQ_bot(question1)
print(f"Q: {question1}", f"A: {response1}", sep="\n")


Q: Where is the dealership Located?
A: crestwood


In [5]:
# Demo query 2: ask which car makes are sold and print the Q/A pair.
question2 = "What make of cars are available?"
response2 = FAQ_bot(question2)
print(f"Q: {question2}", f"A: {response2}", sep="\n")


Q: What make of cars are available?
A: ford , toyota , honda , chevrolet , and bmw


In [6]:
# Demo query 3: ask for the size of the dealership and print the Q/A pair.
question3 = "How large is the dealership?"
response3 = FAQ_bot(question3)
print(f"Q: {question3}", f"A: {response3}", sep="\n")


Q: How large is the dealership?
A: 10 acres
