In [1]:
import evaluate
from pprint import pprint
import pandas as pd
from datasets import load_dataset
# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("rony/climate-change-MRC")

In [3]:
# using code from here: https://medium.com/@ajazturki10/simplifying-language-understanding-a-beginners-guide-to-question-answering-with-t5-and-pytorch-253e0d6aac54
def prepare_data(data):
  articles = []

  for paragraph in data:
    context = paragraph['context']
    for qa in paragraph['qas']:
      question = qa['question']
      id = qa['id']
      for ans in qa['answers']:
        answer = ans['text']
        answer_start = ans['answer_start']
        articles.append({'context': context, 'question': question, 'id': id, 'answer': answer, 'answer_start': answer_start})

  return articles

In [4]:
# score T5 on question answering on validation & test datsets
train_ds = ds["train"] 
valid_ds = ds["validation"]
test_ds = ds["test"]

# each is a 1-item list, so take first index
train_ds = train_ds[0]
valid_ds = valid_ds[0]
test_ds = test_ds[0]

# take the 'data' key of the dict, ignoring 'version' (there's just one)
train_ds = train_ds['data'][0]['paragraphs']
valid_ds = valid_ds['data'][0]['paragraphs']
test_ds = test_ds['data'][0]['paragraphs']
# each dataset is a list of dicts, where each list item is a context paragraph ('context' key) with qas ('qas' key) which contain questions, id, and answer

train_df = pd.DataFrame(prepare_data(train_ds))
print(f"{train_df.shape=}")

valid_df = pd.DataFrame(prepare_data(valid_ds))
print(f"{valid_df.shape=}")

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


In [5]:
# Q-A on validation dataset, based on lession 7 notebook
# load pre-train T5 model and tokenizer
t5_pretrained_checkpoint_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_pretrained_checkpoint_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_pretrained_checkpoint_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# iterate through df to generate answers
for i in tqdm(valid_df.index):
  t5_context_text = valid_df['context'][i]
  t5_question_text = valid_df['question'][i]
  t5_input_text = f"question: {t5_question_text}  context: {t5_context_text}"
  t5_inputs = t5_tokenizer(t5_input_text, return_tensors="pt")
  t5_output_ids = t5_model.generate(t5_inputs['input_ids'])
  t5_answer = t5_tokenizer.decode(t5_output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  valid_df.at[i, 't5_answer'] = t5_answer

valid_df.to_parquet('valid_t5_scored.parquet')
valid_df.head()

100%|██████████| 4229/4229 [33:05<00:00,  2.13it/s]  


Unnamed: 0,context,question,id,answer,answer_start,t5_answer
0,climate change has serious implications for fo...,State one of the climatic changes that can cau...,14850,changes in the ranges of agricultural pests an...,600,agricultural pests and diseases
1,climate change has serious implications for fo...,Mention some implication of climatic changes o...,14851,"serious implications for food production, proc...",19,climate change has serious implications for fo...
2,the data for the research was obtained from a ...,How were the research data obtained? They were...,7397,the data for the research was obtained from a ...,0,a structured questionnaire was used to intervi...
3,the data for the research was obtained from a ...,The survey includes which districts include Ya...,7398,the data for the research was obtained from a ...,0,"yaya gullele, hidha abote and derr"
4,the data for the research was obtained from a ...,How were the specific study sites selected? Th...,7399,the specific study sites within the districts ...,180,a structured questionnaire was used to intervi...


In [27]:
# read in T5 answers
valid_df = pd.read_parquet('valid_t5_scored.parquet')
# evaluate T5 answers
rouge = evaluate.load('rouge')
predictions = valid_df['t5_answer'].tolist()
references = valid_df['answer'].tolist()
results = rouge.compute(predictions=predictions,
                       references=references)
print('validation \n',results)

validation 
 {'rouge1': np.float64(0.4084927674774943), 'rouge2': np.float64(0.34976700873214095), 'rougeL': np.float64(0.4072946749416937), 'rougeLsum': np.float64(0.40720359686847407)}


In [28]:
valid_df[['question', 'answer', 't5_answer']].iloc[:10].values

array([['State one of the climatic changes that can cause loss of crop.',
        'changes in the ranges of agricultural pests and diseases with warming winters, and infestations',
        'agricultural pests and diseases'],
       ['Mention some implication of climatic changes on agriculture',
        'serious implications for food production, processing, and distribution',
        'climate change has serious implications for food production, processing, and distribution'],
       ['How were the research data obtained? They were obtained from a survey of 452 agricultural households in three districts of the Zone in 2011/2012',
        'the data for the research was obtained from a survey of 452 farm households in three districts of the zone in 2011/2012',
        'a structured questionnaire was used to interview the farmers'],
       ['The survey includes which districts include Yaya Gullele, Hidha Abote and Derra',
        'the data for the research was obtained from a survey of 452 

In [10]:
# iterate through df to generate answers
for i in tqdm(test_df.index):
  t5_context_text = test_df['context'][i]
  t5_question_text = test_df['question'][i]
  t5_input_text = f"question: {t5_question_text}  context: {t5_context_text}"
  t5_inputs = t5_tokenizer(t5_input_text, return_tensors="pt")
  t5_output_ids = t5_model.generate(t5_inputs['input_ids'])
  t5_answer = t5_tokenizer.decode(t5_output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  test_df.at[i, 't5_answer'] = t5_answer

test_df.to_parquet('test_t5_scored.parquet')
test_df.head()

100%|██████████| 2096/2096 [16:26<00:00,  2.13it/s]


Unnamed: 0,context,question,id,answer,answer_start,t5_answer
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,flooding damage and climate variability
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,a dynamic economywide model
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,a modelling approach
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,north
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,by spectrally removing the tidal energy from t...


In [11]:
test_df = pd.read_parquet('test_t5_scored.parquet')
rouge = evaluate.load('rouge')
predictions = test_df['t5_answer'].tolist()
references = test_df['answer'].tolist()
results = rouge.compute(predictions=predictions,
                        references=references)
print('test \n', results)

test 
 {'rouge1': np.float64(0.39672176718702445), 'rouge2': np.float64(0.33876990052166034), 'rougeL': np.float64(0.39510367531089474), 'rougeLsum': np.float64(0.3951709002434744)}


In [16]:
# Try flan-t5-base
t5_pretrained_checkpoint_name = 'google/flan-t5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_pretrained_checkpoint_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_pretrained_checkpoint_name)

In [17]:
# iterate through df to generate answers
for i in tqdm(valid_df.index):
  t5_context_text = valid_df['context'][i]
  t5_question_text = valid_df['question'][i]
  t5_input_text = f"question: {t5_question_text}  context: {t5_context_text}"
  t5_inputs = t5_tokenizer(t5_input_text, return_tensors="pt")
  t5_output_ids = t5_model.generate(t5_inputs['input_ids'])
  t5_answer = t5_tokenizer.decode(t5_output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  valid_df.at[i, 't5_answer'] = t5_answer

valid_df.to_parquet('valid_flan_t5_scored.parquet')
valid_df.head()

  0%|          | 17/4229 [00:08<28:43,  2.44it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4229/4229 [43:50<00:00,  1.61it/s]    


Unnamed: 0,context,question,id,answer,answer_start,t5_answer
0,climate change has serious implications for fo...,State one of the climatic changes that can cau...,14850,changes in the ranges of agricultural pests an...,600,changes in the ranges of agricultural pests an...
1,climate change has serious implications for fo...,Mention some implication of climatic changes o...,14851,"serious implications for food production, proc...",19,climate change has serious implications for fo...
2,the data for the research was obtained from a ...,How were the research data obtained? They were...,7397,the data for the research was obtained from a ...,0,survey of 452 farm households in three distric...
3,the data for the research was obtained from a ...,The survey includes which districts include Ya...,7398,the data for the research was obtained from a ...,0,Context: The survey was conducted in three dis...
4,the data for the research was obtained from a ...,How were the specific study sites selected? Th...,7399,the specific study sites within the districts ...,180,multi stage random sampling procedure


In [23]:
# read in T5 answers
valid_df = pd.read_parquet('valid_flan_t5_scored.parquet')
# evaluate T5 answers
rouge = evaluate.load('rouge')
predictions = valid_df['t5_answer'].tolist()
references = valid_df['answer'].tolist()
results = rouge.compute(predictions=predictions,
                       references=references)
print('validation \n',results)

validation 
 {'rouge1': np.float64(0.36320783091195874), 'rouge2': np.float64(0.30025145884949067), 'rougeL': np.float64(0.36059780296581617), 'rougeLsum': np.float64(0.3606392869181443)}


In [22]:
valid_df[['answer', 't5_answer']].iloc[:10].values

array([['changes in the ranges of agricultural pests and diseases with warming winters, and infestations',
        'changes in the ranges of agricultural pests and diseases with warming winters'],
       ['serious implications for food production, processing, and distribution',
        'climate change has serious implications for food production, processing, and distribution'],
       ['the data for the research was obtained from a survey of 452 farm households in three districts of the zone in 2011/2012',
        'survey of 452 farm households in three districts of the zone in 2011/2012'],
       ['the data for the research was obtained from a survey of 452 farm households in three districts of the zone in 2011/2012. the districts include yaya gullele, hidha abote and derra',
        'Context: The survey was conducted in three districts of the zone in 2011/2012. The survey'],
       ['the specific study sites within the districts were selected based on a multi stage random sampling pr