In [11]:
!pip install -q transformers
!pip install -q evaluate
!pip install -q datasets
!pip install -q pandas
!pip install -q torch
!pip install -q rouge_score

In [12]:
import evaluate
from pprint import pprint
import pandas as pd
from datasets import load_dataset
# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

In [13]:
# Mount Google Drive (Colab only): the fine-tuned checkpoints are read from,
# and the prediction CSVs are written to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# define functions
# using code from here: https://medium.com/@ajazturki10/simplifying-language-understanding-a-beginners-guide-to-question-answering-with-t5-and-pytorch-253e0d6aac54
def prepare_data(data):
  """Flatten nested paragraph/question/answer dicts into one record per answer.

  Args:
    data: iterable of paragraph dicts. Each has a 'context' value and a 'qas'
      list; every item of 'qas' has 'question', 'id' and an 'answers' list
      whose items carry 'text' and 'answer_start'.

  Returns:
    A list of dicts with the keys 'context', 'question', 'id', 'answer' and
    'answer_start', ordered paragraph -> question -> answer. A question with
    several answers yields several records; one with no answers yields none.
  """
  records = []

  for para in data:
    passage = para['context']
    for qa_item in para['qas']:
      # Fields shared by every answer to this question.
      shared = {'context': passage, 'question': qa_item['question'], 'id': qa_item['id']}
      records.extend(
          {**shared, 'answer': answer_item['text'], 'answer_start': answer_item['answer_start']}
          for answer_item in qa_item['answers']
      )

  return records

def evaluate_abstractive(result_df,
                         pred_col,
                         ref_col='answer',
                         encoder_model='sentence-transformers/all-MiniLM-L12-v2'):
    """Print and return ROUGE and embedding-similarity scores for predictions.

    Args:
        result_df: DataFrame with one row per example.
        pred_col: name of the column holding the predicted answers.
        ref_col: name of the column holding the reference answers.
        encoder_model: model name (or path) passed to SentenceTransformer to
            embed both columns.

    Returns:
        dict with the ROUGE results under 'rouge' and the mean pairwise
        cosine similarity, as a Python float, under 'semantic_similarity'.
        The same values are printed, as before.
    """
    predictions = result_df[pred_col].tolist()
    references = result_df[ref_col].tolist()

    rouge = evaluate.load('rouge')
    rouge_res = rouge.compute(predictions=predictions,
                              references=references)

    # Keep the model-name argument and the loaded model under separate names
    # instead of rebinding the parameter.
    encoder = SentenceTransformer(encoder_model)
    candidate_embeddings = encoder.encode(predictions)
    reference_embeddings = encoder.encode(references)
    # Scores prediction i against reference i (not all pairs).
    similarity = util.pairwise_cos_sim(candidate_embeddings, reference_embeddings)
    mean_similarity = torch.mean(similarity)

    print('rouge scores:')
    pprint(rouge_res)
    print()
    print('average semantic similarity:')
    print(mean_similarity)

    # Hand the metrics back so different checkpoints can be compared in code.
    return {'rouge': rouge_res, 'semantic_similarity': mean_similarity.item()}

In [15]:
# Load the "rony/climate-change-MRC" dataset with datasets.load_dataset; the
# returned object is indexed by split name below.
ds = load_dataset("rony/climate-change-MRC")

In [16]:
# Build the test-set DataFrame that the fine-tuned T5 checkpoints are scored on.
# The split is a 1-item list, so take its first record, then the first entry
# of that record's 'data' list (the 'version' key is ignored) and its
# 'paragraphs'. The result is a list of dicts: each has a 'context' paragraph
# and 'qas' entries holding the question, id and answers.
# The train and validation splits can be unpacked the same way; only the test
# split is used in this notebook.
test_ds = ds["test"][0]['data'][0]['paragraphs']

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

test_df.shape=(2096, 5)


In [17]:
# Drop the name bound to the raw dataset; the cells below only use test_df.
del ds

In [18]:
# run trained t5-small on test in batches

def predict_test(test_df, batch_size=20, **generate_kwargs):
  """Generate an answer for every row of `test_df`, one batch at a time.

  Uses the notebook-level globals `t5_tokenizer` and `trained_t5small_model`,
  so it runs whichever checkpoint was loaded most recently.

  Args:
    test_df: DataFrame with 'question' and 'context' columns.
    batch_size: number of rows sent to each generate() call; must be positive.
    **generate_kwargs: optional extra keyword arguments forwarded unchanged to
      `trained_t5small_model.generate` (for example `max_new_tokens`). With
      none given, generation settings are the model's defaults, as before.

  Returns:
    A list containing one list of decoded prediction strings per batch, in
    row order. An empty DataFrame gives an empty list.

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  if batch_size <= 0:
    raise ValueError(f"batch_size must be positive, got {batch_size}")

  all_predictions = []

  # Step over row offsets rather than running `len // batch_size + 1`
  # iterations: that count adds an empty trailing batch whenever the row count
  # is an exact multiple of batch_size (or the frame is empty).
  for start in range(0, test_df.shape[0], batch_size):
    test_batch = test_df.iloc[start:start + batch_size]

    test_input_texts = [
        f"question: {question}  context: {context}"
        for question, context in zip(test_batch['question'], test_batch['context'])
    ]
    test_inputs = t5_tokenizer(test_input_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # The batch is padded, so pass the tokenizer's attention mask explicitly
    # instead of relying on generate() to infer one from the pad tokens.
    predictions = trained_t5small_model.generate(
        input_ids=test_inputs['input_ids'],
        attention_mask=test_inputs['attention_mask'],
        **generate_kwargs)
    decoded_predictions = t5_tokenizer.batch_decode(predictions, skip_special_tokens=True)

    all_predictions.append(decoded_predictions)

  return all_predictions

In [None]:
# Load the saved t5-small checkpoint fine-tuned for 1 epoch. The tokenizer
# and the model weights are read from the same Drive directory, and both are
# bound to the global names that predict_test reads.
from transformers import T5ForConditionalGeneration, T5Tokenizer

t5small_1epoch_dir = '/content/drive/MyDrive/mids-w266/final_project/t5small'

t5_tokenizer = T5Tokenizer.from_pretrained(t5small_1epoch_dir)
trained_t5small_model = T5ForConditionalGeneration.from_pretrained(t5small_1epoch_dir)

In [17]:
# Predict on the test set with the fine-tuned t5-small trained for 1 epoch.
# predict_test returns one list of decoded strings per batch (default batch
# size 20), so the result is nested and gets flattened in the next cell.
all_predictions = predict_test(test_df)

In [21]:
# Flatten the per-batch lists from predict_test into a single list, keeping
# batch order and the order within each batch.
predictions = [pred for batch in all_predictions for pred in batch]

# Displayed as the cell output; it should equal the number of test rows.
len(predictions)

2096

In [28]:
# Score the 1-epoch model with the evaluate_abstractive helper defined earlier
# in this notebook instead of repeating its body inline. The helper prints the
# ROUGE scores and the average semantic similarity against test_df['answer'].
assert len(predictions) == len(test_df), "expected exactly one prediction per test row"

# Attach the predictions as a temporary column on a copy; test_df itself is
# left unchanged. The trailing ';' keeps the cell from echoing a return value.
evaluate_abstractive(test_df.assign(prediction=predictions), pred_col='prediction');

rouge scores:
{'rouge1': np.float64(0.5704078875477627),
 'rouge2': np.float64(0.5268776428029627),
 'rougeL': np.float64(0.5609993603484609),
 'rougeLsum': np.float64(0.5612683578070725)}

average semantic similarity:
tensor(0.7301)


In [31]:
# Copy of the test rows with the model's predictions added as a new last
# column; test_df itself is left untouched. The column name contains a
# hyphen, so it is passed to assign() through a dict.
results_df = test_df.assign(**{'finetuned_t5-small_prediction': predictions})
results_df.head(10)

Unnamed: 0,context,question,id,answer,answer_start,finetuned_t5-small_prediction
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,flooding damage and climate variability (thurl...
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,a dynamic economywide model of bangladesh has ...
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,they found that
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,extreme sea level height fluctuations are also...
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,the non-tide sea levels are obtained by spectr...
5,extreme sea level height fluctuations are also...,Where the co-occurrences of extreme waves and ...,2845,the probabilities of the potentially important...,445,peak hs's at noaa buoys near san franci
6,migration rates in response to warming at the ...,"Plant fossils from several sites in Wyoming, N...",15792,"10,000-year interval at the paleocene-eocene b...",275,"plant fossils from several sites in wyoming, n..."
7,migration rates in response to warming at the ...,Persistence therefore appears to have been the...,15793,plant fossil record for this warm interval in ...,703,persistence therefore appears to have been the...
8,migration rates in response to warming at the ...,what are the three adaptive features can be su...,15794,"first, many species had a much wider ecologica...",925,"first, many species had a much wider ecologica..."
9,adaptation examples include constructing fuel ...,What is constructing fuel breaks around a vuln...,15740,rescuing a highly valued and climate-vulnerabl...,166,adaptation examples include constructing fuel ...


In [36]:
# Print the first ten examples in full (the table view truncates long text):
# question, reference answer and model prediction, separated by a dashed rule.
preview_df = results_df.head(10)
for row_label, example in preview_df.iterrows():
  print(
      f"Example {row_label+1}:",
      f"Question: {example['question']}",
      f"Answer: {example['answer']}",
      f"Finetuned T5-small Prediction: {example['finetuned_t5-small_prediction']}",
      "-" * 20,
      sep="\n",
  )


Example 1:
Question: The 9 percent reduction of rice in Bangladesh is attributed to what two variables?
Answer: flooding damage and climate variability
Finetuned T5-small Prediction: flooding damage and climate variability (thurlow et al., 2011
--------------------
Example 2:
Question: What kind of model of Bangladesh was had been used to estimate economic damages from historical climate variability and future anthropogenic climate change?
Answer: a dynamic economywide model
Finetuned T5-small Prediction: a dynamic economywide model of bangladesh has been used to estimate economic damages from historical
--------------------
Example 3:
Question: What approach did Ahmed use to estimate how changes in climate variability might affect crop yields and poverty rates in Tanzania to the early 2030s
Answer: a modelling approach
Finetuned T5-small Prediction: they found that
--------------------
Example 4:
Question: Where  height fluctuations are large?
Answer: extreme sea level height fluctuat

In [30]:
# Save the test rows plus predictions to Drive. to_csv is called with its
# defaults, so the DataFrame index is written as an unnamed first column.
results_df.to_csv('/content/drive/MyDrive/mids-w266/final_project/finetuned_t5small_predictions.csv')

In [19]:
# Load the saved t5-small checkpoint fine-tuned for 3 epochs. This rebinds the
# global tokenizer/model names that predict_test reads, replacing whichever
# checkpoint was loaded before.
from transformers import T5ForConditionalGeneration, T5Tokenizer

t5small_3epoch_dir = '/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch'

t5_tokenizer = T5Tokenizer.from_pretrained(t5small_3epoch_dir)
trained_t5small_model = T5ForConditionalGeneration.from_pretrained(t5small_3epoch_dir)

In [20]:
# Predict on the test set with the fine-tuned t5-small trained for 3 epochs.
# This overwrites all_predictions from the earlier run; the result is again
# one list of decoded strings per batch.
all_predictions = predict_test(test_df)

In [21]:
# Flatten the per-batch lists from predict_test into a single list, keeping
# batch order and the order within each batch.
predictions = [pred for batch in all_predictions for pred in batch]

# Displayed as the cell output; it should equal the number of test rows.
len(predictions)

2096

In [23]:
# Score the 3-epoch model with the evaluate_abstractive helper defined earlier
# in this notebook instead of repeating its body inline. The helper prints the
# ROUGE scores and the average semantic similarity against test_df['answer'].
assert len(predictions) == len(test_df), "expected exactly one prediction per test row"

# Attach the predictions as a temporary column on a copy; test_df itself is
# left unchanged. The trailing ';' keeps the cell from echoing a return value.
evaluate_abstractive(test_df.assign(prediction=predictions), pred_col='prediction');

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

rouge scores:
{'rouge1': np.float64(0.578125985550149),
 'rouge2': np.float64(0.5370773630480148),
 'rougeL': np.float64(0.5706550484828921),
 'rougeLsum': np.float64(0.5705363843637158)}

average semantic similarity:
tensor(0.7341)


In [24]:
# Copy of the test rows with the model's predictions added as a new last
# column; test_df itself is left untouched. The column name contains a
# hyphen, so it is passed to assign() through a dict.
results_df = test_df.assign(**{'finetuned_t5-small_prediction': predictions})
results_df.head(10)

Unnamed: 0,context,question,id,answer,answer_start,finetuned_t5-small_prediction
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,flooding damage and climate variability
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,a dynamic economywide model of bangladesh has ...
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,a modelling approach
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,extreme sea level height fluctuations are also...
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,the non-tide sea levels are obtained by spectr...
5,extreme sea level height fluctuations are also...,Where the co-occurrences of extreme waves and ...,2845,the probabilities of the potentially important...,445,peak hs's at noaa buoys near san franci
6,migration rates in response to warming at the ...,"Plant fossils from several sites in Wyoming, N...",15792,"10,000-year interval at the paleocene-eocene b...",275,"plant fossils from several sites in wyoming, n..."
7,migration rates in response to warming at the ...,Persistence therefore appears to have been the...,15793,plant fossil record for this warm interval in ...,703,persistence therefore appears to have been the...
8,migration rates in response to warming at the ...,what are the three adaptive features can be su...,15794,"first, many species had a much wider ecologica...",925,"first, many species had a much wider ecologica..."
9,adaptation examples include constructing fuel ...,What is constructing fuel breaks around a vuln...,15740,rescuing a highly valued and climate-vulnerabl...,166,adaptation examples include constructing fuel ...


In [25]:
# Print the first ten examples in full (the table view truncates long text):
# question, reference answer and model prediction, separated by a dashed rule.
preview_df = results_df.head(10)
for row_label, example in preview_df.iterrows():
  print(
      f"Example {row_label+1}:",
      f"Question: {example['question']}",
      f"Answer: {example['answer']}",
      f"Finetuned T5-small Prediction: {example['finetuned_t5-small_prediction']}",
      "-" * 20,
      sep="\n",
  )

Example 1:
Question: The 9 percent reduction of rice in Bangladesh is attributed to what two variables?
Answer: flooding damage and climate variability
Finetuned T5-small Prediction: flooding damage and climate variability
--------------------
Example 2:
Question: What kind of model of Bangladesh was had been used to estimate economic damages from historical climate variability and future anthropogenic climate change?
Answer: a dynamic economywide model
Finetuned T5-small Prediction: a dynamic economywide model of bangladesh has been used to estimate economic damages from historical
--------------------
Example 3:
Question: What approach did Ahmed use to estimate how changes in climate variability might affect crop yields and poverty rates in Tanzania to the early 2030s
Answer: a modelling approach
Finetuned T5-small Prediction: a modelling approach
--------------------
Example 4:
Question: Where  height fluctuations are large?
Answer: extreme sea level height fluctuations are also lar

In [26]:
# Save the test rows plus predictions to Drive under a separate "_3epoch"
# file name. to_csv is called with its defaults, so the DataFrame index is
# written as an unnamed first column.
results_df.to_csv('/content/drive/MyDrive/mids-w266/final_project/finetuned_t5small_predictions_3epoch.csv')