# Question Answering

In [1]:
from transformers import pipeline

# Step 1: Build an extractive question-answering pipeline around a
# DistilBERT checkpoint that was already distilled on SQuAD.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-uncased-distilled-squad",
)

# Step 2: The passage to search and the question to ask about it.
# The passage is spelled out line by line so that the leading newline and the
# trailing spaces of the original text stay visible.
context = (
    "\n"
    "Machine learning is a field of artificial intelligence (AI) that uses statistical techniques to give computer systems \n"
    "the ability to learn from data, without being explicitly programmed. The term was coined in 1959 by Arthur Samuel, \n"
    "an American IBMer and pioneer in the field of computer gaming and artificial intelligence.\n"
)
question = "Who coined the term machine learning?"

# Step 3: Run inference; the result is indexed below by 'answer' and 'score'.
result = qa_pipeline(question=question, context=context)

# Step 4: Report the question, the extracted span and its score.
print(
    f"Question: {question}",
    f"Answer: {result['answer']}",
    f"Score: {result['score']:.2f}",
    sep="\n",
)


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: Who coined the term machine learning?
Answer: Arthur Samuel
Score: 0.99


In [None]:
# Fine-tune the model with the Hugging Face Trainer and save the result.
#
# NOTE(review): this cell has never been executed (no execution count) and it
# defines none of the names it uses. `TrainingArguments`, `Trainer`, `model`,
# `tokenizer` and `tokenized_dataset` are only imported/created by cells that
# appear further down, so it fails on a fresh kernel run from top to bottom.
# NOTE(review): the cells below build `tokenized_dataset` with `dataset.map(...)`
# on a single `Dataset` and pass it to the Trainer unsplit; presumably it has no
# "train" / "validation" splits to index here - confirm which dataset this cell
# was written against.
training_args = TrainingArguments(
    output_dir="./fictional-qa-model",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()

# Save the fine-tuned model
# The weights and the tokenizer are written to the same directory that is
# passed as `output_dir` above.
model.save_pretrained("./fictional-qa-model")
tokenizer.save_pretrained("./fictional-qa-model")

In [38]:
# SQuAD-style nested dataset: data -> paragraphs -> qas -> answers.
# Each `answer_start` is the 0-based character offset of `text` inside the
# paragraph's `context`, i.e. context[answer_start:answer_start + len(text)]
# must equal `text`. The original offsets (36 and 4) did not satisfy this.
data = {
  "version": "1.0",
  "data": [
    {
      "title": "Zorbian History",
      "paragraphs": [
        {
          "context": "Zorbian dynamics was first developed by Dr. Leena Torvak in 2154. Torvak's contributions revolutionized the study of particle energy on Zorbion-5.",
          "qas": [
            {
              "id": "1",
              "question": "Who developed Zorbian dynamics?",
              "answers": [
                {
                  "text": "Dr. Leena Torvak",
                  # "Dr." begins right after "Zorbian dynamics was first developed by ".
                  "answer_start": 40
                }
              ]
            }
          ]
        },
        {
          "context": "The Glimstone Treaty of 3098 ended the 400-year conflict between the Xendrians and Ploraxians.",
          "qas": [
            {
              "id": "2",
              "question": "What ended the conflict between the Xendrians and Ploraxians?",
              "answers": [
                {
                  "text": "The Glimstone Treaty",
                  # The answer includes the leading "The", so it starts at offset 0.
                  "answer_start": 0
                }
              ]
            }
          ]
        }
      ]
    }
  ]
}


In [None]:
import pandas as pd
import torch
from transformers import (
    DistilBertTokenizerFast, 
    DistilBertForQuestionAnswering, 
    Trainer, 
    TrainingArguments
)
from datasets import Dataset, Features, Value, Sequence
from transformers import default_data_collator

def _make_qa_example(context, question, answer_text):
    """Build one QA training record with `answer_start` derived from the text.

    The offset is computed instead of typed by hand: the hand-written offsets
    in this cell (186 and 146) did not point at the answers they were meant
    to label.

    Args:
        context: Passage that contains the answer.
        question: Question asked about the passage.
        answer_text: Exact answer span; must occur verbatim in `context`.

    Returns:
        dict with keys 'context', 'question', 'answer_start', 'answer_text'
        (in that order), where 'answer_start' is the 0-based character offset
        of the first occurrence of `answer_text` in `context`.

    Raises:
        ValueError: if `answer_text` does not occur in `context`.
    """
    return {
        'context': context,
        'question': question,
        # str.index (not str.find) so that a typo in the answer fails loudly.
        'answer_start': context.index(answer_text),
        'answer_text': answer_text,
    }


# Custom QA Dataset
qa_data = [
    _make_qa_example(
        context="Machine learning is a field of artificial intelligence (AI) that uses statistical techniques to give computer systems the ability to learn from data, without being explicitly programmed. The term was coined in 1959 by Arthur Samuel, an American IBMer and pioneer in the field of computer gaming and artificial intelligence.",
        question="Who coined the term machine learning?",
        answer_text="Arthur Samuel",
    ),
    _make_qa_example(
        context="Deep learning is a subset of machine learning based on artificial neural networks with representation learning. It can be supervised, semi-supervised or unsupervised. Deep learning architectures such as deep neural networks, deep belief networks, and recurrent neural networks have been applied to fields including computer vision, speech recognition, and natural language processing.",
        question="What is deep learning?",
        answer_text="Deep learning is a subset of machine learning based on artificial neural networks with representation learning",
    ),
    _make_qa_example(
        context="Artificial Intelligence (AI) is intelligence demonstrated by machines, unlike natural intelligence displayed by humans and animals. AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals.",
        question="How is AI research defined?",
        answer_text="the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals",
    ),
]

In [46]:


# Turn the in-memory examples into a Hugging Face Dataset with an explicit
# schema, going through a pandas DataFrame.
df = pd.DataFrame(qa_data)
features = Features(dict(
    context=Value('string'),
    question=Value('string'),
    answer_start=Value('int32'),
    answer_text=Value('string'),
))
dataset = Dataset.from_pandas(df, features=features)

# Base (not yet QA-fine-tuned) DistilBERT checkpoint: the fast tokenizer and
# the question-answering model are both loaded from the same name.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Tokenization function
def prepare_train_features(examples):
    """Tokenize question/context pairs and label every window with answer token positions.

    Long contexts are split into overlapping windows (max_length=384,
    stride=128); each window becomes its own training feature.

    Fixes over the previous version:
      * The end-token scan also visited [SEP] and padding tokens, whose
        offsets are (0, 0). These always satisfied the "ends before the
        answer end" test, so `end_positions` was always the last position of
        the padded window. Only context tokens are inspected now.
      * The start token is the token that *contains* the first answer
        character, not the first token starting at or after it.
      * A window that does not contain the whole answer is labelled with the
        [CLS] position (the usual "no answer in this window" convention)
        instead of the first context token.
      * If `answer_start` does not actually point at `answer_text`, the
        answer is re-located in the context with `str.find`.

    Args:
        examples: Batch as a dict of lists with the columns 'question',
            'context', 'answer_start' (character offset) and 'answer_text'.

    Returns:
        The tokenizer output with 'overflow_to_sample_mapping' and
        'offset_mapping' removed and 'start_positions' / 'end_positions'
        added (one entry per window).
    """
    tokenized = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # One entry per window: which input example it came from, and the
    # (char_start, char_end) span of every token in that window.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    tokenized["start_positions"] = []
    tokenized["end_positions"] = []

    for i, (sample_idx, offsets) in enumerate(zip(sample_mapping, offset_mapping)):
        context = examples['context'][sample_idx]
        answer_text = examples['answer_text'][sample_idx]
        start_char = examples['answer_start'][sample_idx]
        end_char = start_char + len(answer_text)

        # Guard against a mislabelled offset: trust the text over the number.
        if context[start_char:end_char] != answer_text:
            located = context.find(answer_text)
            if located != -1:
                start_char = located
                end_char = start_char + len(answer_text)

        # sequence_ids is 0 for question tokens, 1 for context tokens and
        # None for special/padding tokens; find the context span in the window.
        sequence_ids = tokenized.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # The answer is not fully inside this window: label it with [CLS].
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            cls_index = tokenized["input_ids"][i].index(tokenizer.cls_token_id)
            tokenized["start_positions"].append(cls_index)
            tokenized["end_positions"].append(cls_index)
            continue

        # Last context token that starts at or before the first answer character.
        start_token = context_start
        while start_token < context_end and offsets[start_token + 1][0] <= start_char:
            start_token += 1

        # First context token that ends at or after the last answer character.
        end_token = context_end
        while end_token > start_token and offsets[end_token - 1][1] >= end_char:
            end_token -= 1

        tokenized["start_positions"].append(start_token)
        tokenized["end_positions"].append(end_token)

    return tokenized

# Tokenize the whole dataset in batches; the raw text columns are dropped so
# that only the columns returned by prepare_train_features remain.
tokenized_dataset = dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset.column_names,
)

# Hyperparameters and checkpointing policy for the run.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Train on the tokenized examples (no evaluation set is passed).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=default_data_collator,
)
trainer.train()

# Persist the weights and the tokenizer side by side so the directory can be
# loaded back as a single checkpoint.
fine_tuned_model_dir = './fine_tuned_qa_model'
model.save_pretrained(fine_tuned_model_dir)
tokenizer.save_pretrained(fine_tuned_model_dir)

# Reload the saved checkpoint through the high-level pipeline API.
from transformers import pipeline

fine_tuned_pipeline = pipeline(
    "question-answering",
    model=fine_tuned_model_dir,
    tokenizer=fine_tuned_model_dir,
)

# Smoke-test the fine-tuned model on a passage that is not in the training data.
test_context = "Deep learning is revolutionizing artificial intelligence by enabling machines to learn from vast amounts of data."
test_question = "What is deep learning doing?"
result = fine_tuned_pipeline(question=test_question, context=test_context)
print(
    f"Question: {test_question}",
    f"Answer: {result['answer']}",
    f"Score: {result['score']:.2f}",
    sep="\n",
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: What is deep learning doing?
Answer: by enabling machines to learn from vast amounts of data.
Score: 0.01


In [47]:
import pandas as pd
import torch
from transformers import (
    DistilBertTokenizerFast, 
    DistilBertForQuestionAnswering, 
    Trainer, 
    TrainingArguments
)
from datasets import Dataset, Features, Value, Sequence
from transformers import default_data_collator
from transformers import pipeline

def _make_qa_example(context, question, answer_text):
    """Build one QA training record with `answer_start` derived from the text.

    The offset is computed instead of typed by hand: none of the hand-written
    offsets in this cell (138, 91, 125, 126, 146) pointed at the answer they
    were meant to label.

    Args:
        context: Passage that contains the answer.
        question: Question asked about the passage.
        answer_text: Exact answer span; must occur verbatim in `context`.

    Returns:
        dict with keys 'context', 'question', 'answer_start', 'answer_text'
        (in that order), where 'answer_start' is the 0-based character offset
        of the first occurrence of `answer_text` in `context`.

    Raises:
        ValueError: if `answer_text` does not occur in `context`.
    """
    return {
        'context': context,
        'question': question,
        # str.index (not str.find) so that a typo in the answer fails loudly.
        'answer_start': context.index(answer_text),
        'answer_text': answer_text,
    }


# Completely Original Fictional Characters Dataset
qa_data = [
    _make_qa_example(
        context="Zara Elowen is a quantum mechanics researcher from the remote mountain city of Aerovia. Born to a family of theoretical physicists, she developed a revolutionary method of quantum entanglement communication that allows instantaneous data transfer across vast distances. Her breakthrough came after years of studying quantum anomalies in her family's hidden laboratory.",
        question="What is Zara Elowen's scientific breakthrough?",
        answer_text="quantum entanglement communication",
    ),
    _make_qa_example(
        context="Kai Rourke is an ex-military strategist turned environmental architect who designs self-sustaining cities in extreme climates. Raised in the arctic regions of New Terra, he developed adaptive building techniques that can withstand temperatures ranging from -50 to 50 degrees Celsius. His modular city designs have been praised for their resilience and minimal environmental impact.",
        question="Where was Kai Rourke raised?",
        answer_text="arctic regions of New Terra",
    ),
    _make_qa_example(
        context="Lyra Voss is a neurotechnology innovator who created a neural interface that allows direct brain-to-computer communication. Her groundbreaking device, called the Synapse Link, enables people with severe motor disabilities to control advanced prosthetics and communicate through thought patterns. She founded her research institute in the island nation of Neuralis after losing her brother in a debilitating accident.",
        question="What is the name of Lyra Voss's neural interface?",
        answer_text="Synapse Link",
    ),
    _make_qa_example(
        context="Ren Kazama is a climate restoration engineer who developed a series of atmospheric manipulation technologies to reverse global warming effects. Originally from the submerged coastal regions of Pacifica, he witnessed firsthand the devastating impacts of rising sea levels. His carbon sequestration algorithms and artificial cloud generation systems have been implemented in multiple global environmental recovery projects.",
        question="What technologies did Ren Kazama develop?",
        answer_text="atmospheric manipulation technologies",
    ),
    _make_qa_example(
        context="Elena Cortez is a quantum computing prodigy who invented a revolutionary algorithm that dramatically reduces computational complexity for machine learning processes. Her work, conducted in the hidden research facilities of the Quantum Collective, allows artificial intelligence systems to learn and adapt at unprecedented speeds. She comes from a long line of mathematical innovators in her family.",
        question="Where did Elena Cortez conduct her research?",
        answer_text="hidden research facilities of the Quantum Collective",
    ),
]

# Create DataFrame and then Hugging Face Dataset
# Turn the in-memory examples into a Hugging Face Dataset with an explicit
# schema, going through a pandas DataFrame.
df = pd.DataFrame(qa_data)
features = Features(dict(
    context=Value('string'),
    question=Value('string'),
    answer_start=Value('int32'),
    answer_text=Value('string'),
))
dataset = Dataset.from_pandas(df, features=features)

# Base (not yet QA-fine-tuned) DistilBERT checkpoint: the fast tokenizer and
# the question-answering model are both loaded from the same name.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Tokenization function (same as previous script)
def prepare_train_features(examples):
    """Tokenize question/context pairs and label every window with answer token positions.

    Long contexts are split into overlapping windows (max_length=384,
    stride=128); each window becomes its own training feature.

    Fixes over the previous version:
      * The end-token scan also visited [SEP] and padding tokens, whose
        offsets are (0, 0). These always satisfied the "ends before the
        answer end" test, so `end_positions` was always the last position of
        the padded window. Only context tokens are inspected now.
      * The start token is the token that *contains* the first answer
        character, not the first token starting at or after it.
      * A window that does not contain the whole answer is labelled with the
        [CLS] position (the usual "no answer in this window" convention)
        instead of the first context token.
      * If `answer_start` does not actually point at `answer_text`, the
        answer is re-located in the context with `str.find`.

    Args:
        examples: Batch as a dict of lists with the columns 'question',
            'context', 'answer_start' (character offset) and 'answer_text'.

    Returns:
        The tokenizer output with 'overflow_to_sample_mapping' and
        'offset_mapping' removed and 'start_positions' / 'end_positions'
        added (one entry per window).
    """
    tokenized = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # One entry per window: which input example it came from, and the
    # (char_start, char_end) span of every token in that window.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    tokenized["start_positions"] = []
    tokenized["end_positions"] = []

    for i, (sample_idx, offsets) in enumerate(zip(sample_mapping, offset_mapping)):
        context = examples['context'][sample_idx]
        answer_text = examples['answer_text'][sample_idx]
        start_char = examples['answer_start'][sample_idx]
        end_char = start_char + len(answer_text)

        # Guard against a mislabelled offset: trust the text over the number.
        if context[start_char:end_char] != answer_text:
            located = context.find(answer_text)
            if located != -1:
                start_char = located
                end_char = start_char + len(answer_text)

        # sequence_ids is 0 for question tokens, 1 for context tokens and
        # None for special/padding tokens; find the context span in the window.
        sequence_ids = tokenized.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # The answer is not fully inside this window: label it with [CLS].
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            cls_index = tokenized["input_ids"][i].index(tokenizer.cls_token_id)
            tokenized["start_positions"].append(cls_index)
            tokenized["end_positions"].append(cls_index)
            continue

        # Last context token that starts at or before the first answer character.
        start_token = context_start
        while start_token < context_end and offsets[start_token + 1][0] <= start_char:
            start_token += 1

        # First context token that ends at or after the last answer character.
        end_token = context_end
        while end_token > start_token and offsets[end_token - 1][1] >= end_char:
            end_token -= 1

        tokenized["start_positions"].append(start_token)
        tokenized["end_positions"].append(end_token)

    return tokenized

# Prepare dataset
# Tokenize the whole dataset in batches; the raw text columns are dropped so
# that only the columns returned by prepare_train_features remain.
tokenized_dataset = dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset.column_names,
)

# Hyperparameters and checkpointing policy for the run.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Train on the tokenized examples (no evaluation set is passed).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=default_data_collator,
)
trainer.train()

# Persist the weights and the tokenizer side by side so the directory can be
# loaded back as a single checkpoint.
fine_tuned_model_dir = './fine_tuned_fictional_characters_model'
model.save_pretrained(fine_tuned_model_dir)
tokenizer.save_pretrained(fine_tuned_model_dir)

# Reload the saved checkpoint through the high-level pipeline API.
fine_tuned_pipeline = pipeline(
    "question-answering",
    model=fine_tuned_model_dir,
    tokenizer=fine_tuned_model_dir,
)



Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Step,Training Loss


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: Who is Zara Elowen?
Answer: Elowen's research team recently published groundbreaking results in quantum
Score: 0.01


In [58]:
# Probe the fine-tuned model with a new passage and question.
# The passage does not say where Zara Elowen was born, so this checks how the
# model behaves when the context holds no real answer.
test_context = (
    "Zara Elowen's research team recently published groundbreaking results in quantum\n"
    "communication technologies. "
)
# To try a passage that is unrelated to the question, swap in for example:
#   test_context = 'something that is not related'
test_question = "Where was Zara Elowen born?"
result = fine_tuned_pipeline(question=test_question, context=test_context)
print(
    f"Question: {test_question}",
    f"Answer: {result['answer']}",
    f"Score: {result['score']:.2f}",
    sep="\n",
)

Question: Where was Zara Elowen born?
Answer: Elowen's research team recently published groundbreaking results in quantum
Score: 0.01


This model is useful when, for example, I supply an entire contract as the context and want to ask questions about it.