# Question answering (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

!apt install git-lfs

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.

You will need to setup git, adapt your email and name in the following cell.

In [None]:
!git config --global user.email "skanda.vivek@gmail.com"
!git config --global user.name "skandavivek2"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [2]:
!pip install datasets
from datasets import load_dataset
import datasets



In [3]:
from transformers import AutoTokenizer

model_checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



In [4]:
tokenizer.is_fast

True

In [5]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
import pandas as pd
df_train=pd.read_csv('/content/train.csv',skip_blank_lines=True)
df_test=pd.read_csv('/content/test.csv')

In [7]:
df_train.columns

Index(['item_id', 'domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp',
       'q_review_id', 'q_reviews_id', 'question', 'question_subj_level',
       'ques_subj_score', 'is_ques_subjective', 'review_id', 'review',
       'human_ans_spans', 'human_ans_indices', 'answer_subj_level',
       'ans_subj_score', 'is_ans_subjective'],
      dtype='object')

In [8]:
len(df_train)

2522

In [9]:
df_train.iloc[2501].review

'For company employees and contractors, The telephone contact numbers are as follows: QTC EHS (DB): 626-123-4567, Corp EHS (JK): 858-123-4567 '

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2525 entries, 0 to 2524
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   item_id              2501 non-null   object 
 1   domain               2501 non-null   object 
 2   nn_mod               2501 non-null   object 
 3   nn_asp               2501 non-null   object 
 4   query_mod            2501 non-null   object 
 5   query_asp            2501 non-null   object 
 6   q_review_id          2501 non-null   object 
 7   q_reviews_id         2501 non-null   object 
 8   question             2525 non-null   object 
 9   question_subj_level  2501 non-null   float64
 10  ques_subj_score      2501 non-null   float64
 11  is_ques_subjective   2501 non-null   object 
 12  review_id            2501 non-null   object 
 13  review               2524 non-null   object 
 14  human_ans_spans      2524 non-null   object 
 15  human_ans_indices    2501 non-null   o

In [None]:
df_train.describe()

Unnamed: 0,question_subj_level,ques_subj_score,answer_subj_level,ans_subj_score
count,2501.0,2501.0,2501.0,2501.0
mean,1.823671,0.166232,1.822071,0.200875
std,1.361652,0.294166,1.358797,0.333595
min,1.0,0.0,1.0,0.0
25%,1.0,0.0,1.0,0.0
50%,1.0,0.0,1.0,0.0
75%,2.0,0.233333,2.0,0.427083
max,5.0,1.0,5.0,1.0


In [None]:
df_train.iloc[2504].human_ans_indices

'(90, 260)'

In [10]:
df_train.iloc[0].review[251:265]

'ANSWERNOTFOUND'

In [21]:
df_train=df_train[['question','human_ans_indices','review','human_ans_spans']]
df_test=df_test[['question','human_ans_indices','review','human_ans_spans']]

In [11]:
df_train.shape[0]

2522

In [22]:
import numpy as np
df_train['id']=np.linspace(0,len(df_train)-1,len(df_train))
df_test['id']=np.linspace(0,len(df_test)-1,len(df_test))

df_train['id']=df_train['id'].astype(str)
df_test['id']=df_test['id'].astype(str)

In [13]:
int(df_train.iloc[2503].human_ans_indices.split('(')[1].split(',')[0])

90

In [None]:
float(df_train.iloc[0].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])

265.0

In [23]:
df_train['answers']=df_train['human_ans_spans']
df_test['answers']=df_test['human_ans_spans']

In [24]:
for i in range(0,len(df_train)):
  answer1={}
  print(df_train.iloc[i])
  si=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[0])
  print(si)
  ei=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])
  print(ei)
  answer1['text']=[df_train.iloc[i].review[si:ei]]
  answer1['answer_start']=[si]
  df_train.at[i, 'answers']=answer1
  #print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
id                                                              1966.0
answers                                                 ANSWERNOTFOUND
Name: 1966, dtype: object
2079
2093
question                                        How is the production?
human_ans_indices                                           (651, 665)
review               I watched North and South solely because of th...
human_ans_spans                                         ANSWERNOTFOUND
id                                                              1967.0
answers                                                 ANSWERNOTFOUND
Name: 1967, dtype: object
651
665
question                           Does the video gives entertainment?
human_ans_indices                                           (349, 372)
review               If you've traveled to Scotland and you love au...
human_ans_spans                                Plus it 's entertaining
id           

In [25]:
for i in range(0,len(df_test)):
  answer1={}
  si=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[0])
  ei=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])
  answer1['text']=[df_test.iloc[i].review[si:ei]]
  answer1['answer_start']=[si]
  df_test.at[i, 'answers']=answer1
  #print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)

In [26]:
df_train.columns

Index(['question', 'human_ans_indices', 'review', 'human_ans_spans', 'id',
       'answers'],
      dtype='object')

In [27]:
df_train.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans', 'id',
       'answers']

df_test.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans','id',
       'answers']

In [28]:
val_dataset2 = datasets.Dataset.from_pandas(df_test)
train_dataset2 = datasets.Dataset.from_pandas(df_train)


In [29]:
train_dataset = train_dataset2.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_dataset2.column_names,
)
len(train_dataset2), len(train_dataset)

Map:   0%|          | 0/2522 [00:00<?, ? examples/s]

(2522, 4883)

In [30]:
train_dataset2.shape

(2522, 6)

In [31]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [32]:
validation_dataset = val_dataset2.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=val_dataset2.column_names,
)
len(validation_dataset)

Map:   0%|          | 0/582 [00:00<?, ? examples/s]

1104

In [33]:
len(validation_dataset)

1104

In [34]:
validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 1104
})

In [35]:
len(val_dataset2)

582

In [36]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [37]:
import torch
from transformers import AutoModelForQuestionAnswering


In [38]:
import collections



In [39]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [40]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [41]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [42]:
from transformers import TrainingArguments

args = TrainingArguments(
    "roberta-finetuned-subjqa-movies_2",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
)



In [43]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

In [44]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [45]:
import numpy as np
n_best=20
max_answer_length = 30

In [46]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, val_dataset2)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  0%|          | 0/582 [00:00<?, ?it/s]

{'exact_match': 2.7491408934707904, 'f1': 9.852373992728378}

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,1.2858,No log,0.0039
2,1.0351,No log,0.0039
3,0.8722,No log,0.0039
4,0.7395,No log,0.0039
5,0.6292,No log,0.0039


TrainOutput(global_step=3055, training_loss=0.9123564325260843, metrics={'train_runtime': 822.2134, 'train_samples_per_second': 29.694, 'train_steps_per_second': 3.716, 'total_flos': 4784670236782080.0, 'train_loss': 0.9123564325260843, 'epoch': 5.0})

In [48]:
trainer.push_to_hub(commit_message="Training complete")

events.out.tfevents.1730974519.6f6b1a0c4bd2.1120.0:   0%|          | 0.00/7.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dcrowleymunster/roberta-finetuned-subjqa-movies_2/commit/4707e4b581ca3a1c0539a067b19658b9e6bf447b', commit_message='Training complete', commit_description='', oid='4707e4b581ca3a1c0539a067b19658b9e6bf447b', pr_url=None, pr_revision=None, pr_num=None)

In [49]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, val_dataset2)

  0%|          | 0/582 [00:00<?, ?it/s]

{'exact_match': 64.60481099656357, 'f1': 66.57532529206921}

In [None]:
#Inference!

In [51]:
from transformers import pipeline

In [52]:
# Replace this with your own checkpoint
model_checkpoint2 = "dcrowleymunster/roberta-finetuned-subjqa-movies_2"
question_answerer = pipeline("question-answering", model=model_checkpoint2)

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [53]:
import pandas as pd
df_train1=pd.read_csv('/content/train.csv')
df_test1=pd.read_csv('/content/test.csv')

In [74]:
df_train1.iloc[2518].question

'What should I do in the event of no power '

In [75]:
context = df_train1.iloc[2518].review
question = df_train1.iloc[2518].question
question_answerer(question=question, context=context)

{'score': 0.013593778945505619,
 'start': 145,
 'end': 222,
 'answer': 'activate the flashlight and follow your subway exit route color to the stairs'}

In [76]:
# Changed for the newly trained model
model_checkpoint_o = "dcrowleymunster/roberta-finetuned-subjqa-movies_2"
question_answerer_old = pipeline("question-answering", model=model_checkpoint_o)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [77]:
context = df_train1.iloc[2518].review
question = df_train1.iloc[2518].question
question_answerer_old(question=question, context=context)

{'score': 0.013593778945505619,
 'start': 145,
 'end': 222,
 'answer': 'activate the flashlight and follow your subway exit route color to the stairs'}

In [72]:
df_train.iloc[2517].question

'There is so much smoke, I cant breath '

In [73]:
df_train.iloc[2517].answers

{'text': ['ry to cover your mouth with something so you dont inhale the smoke and in the text box below send your location including floor number and trisection e.g. Help I cant breath, I am on floor 22 in trisection 3, by the window '],
 'answer_start': [44]}

In [62]:
df_train[['id','question','context','answers']].tail()

Unnamed: 0,id,question,context,answers
2517,2517.0,"There is so much smoke, I cant breath",If you find that smoke is overwhelming you try...,{'text': ['ry to cover your mouth with somethi...
2518,2518.0,What should I do in the event of no power,If the office or building loses electrical po...,{'text': [' On your cellphone activate the fla...
2519,2519.0,What should I do if there is an earthquake,In the event of an Earthquake move away from a...,{'text': ['ove away from all windows and look ...
2520,2520.0,What should I do if there is a hurricane or To...,"During inclement weather conditions, move away...",{'text': ['ove away from all windows and try t...
2521,2521.0,"What should I do if there is a chemical, biolo...",For disaster events that occur outside the bui...,{'text': ['o not leave the building until secu...


In [63]:
len(df_train)

2522