In [2]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch
import re
from nltk.corpus import stopwords

In [3]:
# Both the tokenizer and the question-answering model are restored from the
# same pre-trained checkpoint so that their vocabularies match.
model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

In [4]:
# Load the whole source document into memory as a single UTF-8 string.
context_file_path = './../data/english/archive/ramayan.txt'
with open(context_file_path, encoding='utf-8') as context_file:
    context = context_file.read()

In [5]:
# NOTE: no cleaning or preprocessing is actually applied here; the raw text is
# passed through unchanged under the name `cleaned_context`, which the
# chunking cell consumes. Add any real normalisation at this point.
cleaned_context = context

In [6]:
# Function to split the context into chunks
def chunk_context(context, max_length):
    """Split ``context`` into text chunks of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer`` and cut into
    consecutive, non-overlapping windows. Each window is converted back to a
    string with ``tokenizer.convert_tokens_to_string``.

    A window boundary is never placed directly before a ``##`` continuation
    piece when that can be avoided: cutting there would leave the next chunk
    starting with a literal ``##piece``, which corrupts the word and yields
    extra tokens when the chunk string is tokenized again.

    Args:
        context: The full text to split.
        max_length: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of chunk strings in document order (empty if ``context``
        produces no tokens).

    Raises:
        ValueError: If ``max_length`` is not a positive integer.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    tokens = tokenizer.tokenize(context)
    total = len(tokens)
    chunks = []
    start = 0
    while start < total:
        hard_end = min(start + max_length, total)
        end = hard_end
        # Move the cut left until it falls on the first piece of a word.
        while end < total and end > start + 1 and tokens[end].startswith('##'):
            end -= 1
        if end < total and tokens[end].startswith('##'):
            # A single word spans the whole window; fall back to a hard cut
            # so the loop always makes progress.
            end = hard_end
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        start = end
    return chunks

In [7]:
# Size each chunk below the 512-token input limit, holding back a fixed
# budget of positions so the question can be encoded alongside the chunk.
question_token_budget = 50
max_length = 512 - question_token_budget
context_chunks = chunk_context(cleaned_context, max_length)

In [8]:
# Tell the user how to interact with (and leave) the question prompt.
print("Type your question or type 'quit' to exit.")

Type your question or type 'quit' to exit.


In [None]:
# Interactive question-answering loop.
# The whole loop must live in ONE cell: the per-chunk inference and the final
# print are part of the `while` body, so splitting them across cells makes
# each cell an IndentationError and leaves the prompt cell doing nothing.
while True:
    # Get question from the user; surrounding whitespace is ignored so that
    # inputs such as "quit " still exit.
    question = input("Question: ").strip()

    if question.lower() == 'quit':
        break
    if not question:
        # Nothing to ask: skip the (expensive) pass over every chunk.
        continue

    answers = []

    for chunk in context_chunks:
        # Encode the (question, chunk) pair. Only the chunk is truncated, so
        # a long question cannot push the input past the 512-token limit.
        inputs = tokenizer.encode_plus(
            question,
            chunk,
            return_tensors='pt',
            truncation='only_second',
            max_length=512,
        )

        # Pass every tensor the tokenizer produced (including token_type_ids,
        # which mark question vs. context segments). Gradients are not needed
        # for inference, so disable autograd to save memory and time.
        with torch.no_grad():
            outputs = model(**inputs)

        # Most likely start and (exclusive) end positions of the answer span.
        start_index = int(torch.argmax(outputs.start_logits))
        end_index = int(torch.argmax(outputs.end_logits)) + 1
        if end_index <= start_index:
            # Predicted end precedes the start: no usable span in this chunk.
            continue

        # Decode the answer tokens back into text.
        answer_ids = inputs['input_ids'][0][start_index:end_index].tolist()
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(answer_ids)
        )
        if answer:
            answers.append(answer)

    # Combine the per-chunk answers into a single response line.
    final_answer = ' '.join(answers)
    print(f"Bert-FineTuned-Model [LUM]: {final_answer}\n")