In [1]:
import evaluate
from pprint import pprint
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BartForConditionalGeneration, BartForQuestionAnswering,TrainingArguments, Trainer
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load data
def prepare_data(data):
    articles = []

    for paragraph in data:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            id = qa['id']
            for ans in qa['answers']:
                answer = ans['text']
                answer_start = ans['answer_start']
                articles.append({'context': context, 'question': question, 'id': id, 'answer': answer, 'answer_start': answer_start})

    return articles

In [3]:
ds = load_dataset("rony/climate-change-MRC")
train_ds = ds["train"]
valid_ds = ds["validation"]
test_ds = ds["test"]

# each is a 1-item list, so take first index
train_ds = train_ds[0]
valid_ds = valid_ds[0]
test_ds = test_ds[0]

# take the 'data' key of the dict, ignoring 'version' (there's just one)
train_ds = train_ds['data'][0]['paragraphs']
valid_ds = valid_ds['data'][0]['paragraphs']
test_ds = test_ds['data'][0]['paragraphs']
# each dataset is a list of dicts, where each list item is a context paragraph ('context' key) with qas ('qas' key) which contain questions, id, and answer

train_df = pd.DataFrame(prepare_data(train_ds))
print(f"{train_df.shape=}")

valid_df = pd.DataFrame(prepare_data(valid_ds))
print(f"{valid_df.shape=}")

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


In [4]:
# fine tune BART for climate for extractive QA task (model is BART fine tuned for summarization)
model_checkpoint = 'valhalla/bart-large-finetuned-squadv1'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BartForQuestionAnswering.from_pretrained(model_checkpoint) # vs. conditional generation for abstractive tasks

lr = 1e-5
BATCH_SIZE = 4
NUM_EPOCHS = 2

# LORA hyperparameters
r = 8
lora_alpha = 32
lora_dropout = 0.1

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [5]:
# preprocess data for training
def preprocess_function(examples):
    tokenized_inputs = tokenizer(
    list(zip(examples['question'], examples['context'])),
    padding='max_length',
    max_length=1024, #BART max len=1024
    truncation=True,
    return_tensors='pt'
    )

    start_positions = []
    end_positions = []

    for i in range(len(examples["answer"])):
        context = examples["context"][i]
        answer = examples["answer"][i]
        answer_start = examples["answer_start"][i]

        answer_end = answer_start + len(answer) - 1

        start_token = tokenizer.encode(context[:answer_start], add_special_tokens=False)
        end_token = tokenizer.encode(context[:answer_end + 1], add_special_tokens=False)

        start_positions.append(len(start_token))
        end_positions.append(len(end_token) - 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions

    return tokenized_inputs

# apply preprocess function to datasets for tokenization
tokenized_train = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)
tokenized_valid = Dataset.from_pandas(valid_df).map(preprocess_function, batched=True)
tokenized_test = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

Map:  61%|██████    | 9000/14756 [00:11<00:07, 755.63 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1096 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 14756/14756 [00:19<00:00, 743.30 examples/s]
Map: 100%|██████████| 4229/4229 [00:05<00:00, 775.43 examples/s]
Map: 100%|██████████| 2096/2096 [00:02<00:00, 754.25 examples/s]


In [6]:
for name, param in model.named_parameters():
    param.requires_grad = False

peft_config = LoraConfig(
task_type=TaskType.QUESTION_ANS, inference_mode=False, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

_= model.to("cuda")

trainable params: 1,181,698 || all params: 407,475,204 || trainable%: 0.2900


In [7]:
training_args = TrainingArguments(
output_dir="bart_squad_ft_climate",
learning_rate=lr,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=NUM_EPOCHS,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
logging_steps=10,
load_best_model_at_end=True,
report_to='none',
label_names=['start_positions', 'end_positions'],
)

trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_train,
eval_dataset=tokenized_valid
)



In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.1018,3.975826
2,3.9465,3.89559


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


TrainOutput(global_step=7378, training_loss=4.276153972395134, metrics={'train_runtime': 19475.3256, 'train_samples_per_second': 1.515, 'train_steps_per_second': 0.379, 'total_flos': 6.417023025040589e+16, 'train_loss': 4.276153972395134, 'epoch': 2.0})

In [9]:
dir_path = "bart_squad_ft_climate/model"
model.save_pretrained(dir_path, from_pt=True)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): BartForQuestionAnswering(
      (model): BartModel(
        (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
        (encoder): BartEncoder(
          (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
          (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
          (layers): ModuleList(
            (0-11): 12 x BartEncoderLayer(
              (self_attn): BartSdpaAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=8, bias=False)
                  )
                  (lora_B)

In [11]:
for i in tqdm(test_df.index):
    question = test_df['question'][i]
    text = test_df['context'][i]

    inputs = tokenizer(question, text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0][answer_start_index:answer_end_index + 1]
    answer_decoded = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    test_df.at[i, 'bart_answer'] = answer_decoded

100%|██████████| 2096/2096 [02:31<00:00, 13.83it/s]


In [12]:
rouge = evaluate.load('rouge')

predictions = test_df['bart_answer']
references = test_df['answer']

rouge_res = rouge.compute(predictions=predictions, references=references)
print(f"rouge scores:\n{rouge_res}")

encoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
candidate_embeddings = encoder_model.encode(predictions)
reference_embeddings = encoder_model.encode(references)
similarity = util.pairwise_cos_sim(candidate_embeddings, reference_embeddings)
print(f"average semantic similarity:\n{torch.mean(similarity)}")

rouge scores:
{'rouge1': np.float64(0.44284512084597943), 'rouge2': np.float64(0.36824771949556834), 'rougeL': np.float64(0.40077112765096323), 'rougeLsum': np.float64(0.40091362499492345)}
average semantic similarity:
0.6182442903518677


In [13]:
for i in test_df.index[:5]:
    print(f"question: {test_df['question'][i]}")
    print(f"predicted answer: {test_df['bart_answer'][i]}")
    print(f"true answer: {test_df['answer'][i]}")
    print()

question: The 9 percent reduction of rice in Bangladesh is attributed to what two variables?
predicted answer:  anthropogenic climate change. using a combination of historical yield variability and ten climate projections, future anthropogenic climate change damages are estimated to reduce national rice production in bangladesh by about 9 percent to mid-century, and most of these losses are attributed in the analysis to flooding
true answer: flooding damage and climate variability

question: What kind of model of Bangladesh was had been used to estimate economic damages from historical climate variability and future anthropogenic climate change?
predicted answer:  national level. for example, a dynamic economywide
true answer: a dynamic economywide model

question: What approach did Ahmed use to estimate how changes in climate variability might affect crop yields and poverty rates in Tanzania to the early 2030s
predicted answer:  al., 2011). another example is the work of ahmed et al. 