In [2]:
import evaluate
from pprint import pprint
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BartForQuestionAnswering, TrainingArguments, Trainer
import project_utility as utils

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_df, val_df, test_df = utils.load_data()

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


In [14]:
# based on example here: https://huggingface.co/docs/transformers/en/model_doc/bart#transformers.BartForQuestionAnswering.forward.example
tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-finetuned-squadv1")
model = BartForQuestionAnswering.from_pretrained("valhalla/bart-large-finetuned-squadv1")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [15]:
for name, param in model.named_parameters():
    print(name)

model.shared.weight
model.encoder.embed_positions.weight
model.encoder.layers.0.self_attn.k_proj.weight
model.encoder.layers.0.self_attn.k_proj.bias
model.encoder.layers.0.self_attn.v_proj.weight
model.encoder.layers.0.self_attn.v_proj.bias
model.encoder.layers.0.self_attn.q_proj.weight
model.encoder.layers.0.self_attn.q_proj.bias
model.encoder.layers.0.self_attn.out_proj.weight
model.encoder.layers.0.self_attn.out_proj.bias
model.encoder.layers.0.self_attn_layer_norm.weight
model.encoder.layers.0.self_attn_layer_norm.bias
model.encoder.layers.0.fc1.weight
model.encoder.layers.0.fc1.bias
model.encoder.layers.0.fc2.weight
model.encoder.layers.0.fc2.bias
model.encoder.layers.0.final_layer_norm.weight
model.encoder.layers.0.final_layer_norm.bias
model.encoder.layers.1.self_attn.k_proj.weight
model.encoder.layers.1.self_attn.k_proj.bias
model.encoder.layers.1.self_attn.v_proj.weight
model.encoder.layers.1.self_attn.v_proj.bias
model.encoder.layers.1.self_attn.q_proj.weight
model.encoder.la

In [7]:
# iterate through test to get answer spans
for i in tqdm(test_df.index):
    question = test_df['question'][i]
    text = test_df['context'][i]
    
    inputs = tokenizer(question, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    answer_decoded = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    test_df.at[i, 'bart_answer'] = answer_decoded
test_df.head()

100%|██████████| 2096/2096 [17:02<00:00,  2.05it/s]


Unnamed: 0,context,question,id,answer,answer_start,bart_answer
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,flooding damage and climate variability
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,dynamic economywide
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,modelling approach
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,to the north
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,by spectrally removing the tidal energy from ...


In [14]:
for i in test_df.index[:10]:
    pprint(f"question: {test_df['question'][i]}")
    pprint(f"label answer: {test_df['answer'][i]}")
    pprint(f"BART answer: {test_df['bart_answer'][i]}")
    print()

('question: The 9 percent reduction of rice in Bangladesh is attributed to '
 'what two variables?')
'label answer: flooding damage and climate variability'
'BART answer:  flooding damage and climate variability'

('question: What kind of model of Bangladesh was had been used to estimate '
 'economic damages from historical climate variability and future '
 'anthropogenic climate change?')
'label answer: a dynamic economywide model'
'BART answer:  dynamic economywide'

('question: What approach did Ahmed use to estimate how changes in climate '
 'variability might affect crop yields and poverty rates in Tanzania to the '
 'early 2030s')
'label answer: a modelling approach'
'BART answer:  modelling approach'

'question: Where  height fluctuations are large?'
('label answer: extreme sea level height fluctuations are also larger to the '
 'north, as a result of increasing storm intensities at the more northerly '
 'coastal locations')
'BART answer:  to the north'

'question: How  non-tide

In [8]:
test_df.to_parquet('test_bart_qa_scored.parquet')

In [9]:
utils.evaluate_abstractive(test_df, 'bart_answer',)

rouge scores:
{'rouge1': np.float64(0.4341106284336489),
 'rouge2': np.float64(0.3796218872329144),
 'rougeL': np.float64(0.4321849585581291),
 'rougeLsum': np.float64(0.4325789489633455)}

average semantic similarity:
tensor(0.5995)


In [4]:
# fine tune BART for climate for extractive QA task (model is BART fine tuned for summarization)
model_checkpoint = 'z-dickson/bart-large-cnn-climate-change-summarization'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BartForQuestionAnswering.from_pretrained(model_checkpoint) # vs. conditional generation for abstractive tasks

Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at z-dickson/bart-large-cnn-climate-change-summarization and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# preprocess data for training
def preprocess_function(examples):
    tokenized_inputs = tokenizer(
        list(zip(examples['question'], examples['context'])),
        padding='max_length',
        max_length=1024, #BART max len=1024
        truncation=True,
        return_tensors='pt'
    )
    
    start_positions = []
    end_positions = []

    for i in range(len(examples["answer"])):
        context = examples["context"][i]
        answer = examples["answer"][i]
        answer_start = examples["answer_start"][i]

        answer_end = answer_start + len(answer) - 1

        start_token = tokenizer.encode(context[:answer_start], add_special_tokens=False)
        end_token = tokenizer.encode(context[:answer_end + 1], add_special_tokens=False)

        start_positions.append(len(start_token))
        end_positions.append(len(end_token) - 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions

    return tokenized_inputs    

In [6]:
# apply preprocess function to datasets for tokenization
tokenized_train = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)
tokenized_valid = Dataset.from_pandas(val_df).map(preprocess_function, batched=True)
tokenized_test = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

Map:  61%|██████    | 9000/14756 [00:06<00:04, 1320.32 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1096 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 14756/14756 [00:11<00:00, 1337.96 examples/s]
Map: 100%|██████████| 4229/4229 [00:03<00:00, 1360.58 examples/s]
Map: 100%|██████████| 2096/2096 [00:01<00:00, 1253.93 examples/s]


In [19]:
len(tokenized_train[9000]['input_ids'])

1024

In [20]:
# pprint([i for i in train_df.iloc[9000]])

In [21]:
BATCH_SIZE = 32
NUM_EPOCHS = 2
training_args = TrainingArguments(
    output_dir='./results/bart_qa',
    evaluation_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

In [22]:
for name, param in model.named_parameters():
    print(name)

model.shared.weight
model.encoder.embed_positions.weight
model.encoder.layers.0.self_attn.k_proj.weight
model.encoder.layers.0.self_attn.k_proj.bias
model.encoder.layers.0.self_attn.v_proj.weight
model.encoder.layers.0.self_attn.v_proj.bias
model.encoder.layers.0.self_attn.q_proj.weight
model.encoder.layers.0.self_attn.q_proj.bias
model.encoder.layers.0.self_attn.out_proj.weight
model.encoder.layers.0.self_attn.out_proj.bias
model.encoder.layers.0.self_attn_layer_norm.weight
model.encoder.layers.0.self_attn_layer_norm.bias
model.encoder.layers.0.fc1.weight
model.encoder.layers.0.fc1.bias
model.encoder.layers.0.fc2.weight
model.encoder.layers.0.fc2.bias
model.encoder.layers.0.final_layer_norm.weight
model.encoder.layers.0.final_layer_norm.bias
model.encoder.layers.1.self_attn.k_proj.weight
model.encoder.layers.1.self_attn.k_proj.bias
model.encoder.layers.1.self_attn.v_proj.weight
model.encoder.layers.1.self_attn.v_proj.bias
model.encoder.layers.1.self_attn.q_proj.weight
model.encoder.la

In [23]:
layers_to_train = ['qa_outputs']
for name, param in model.named_parameters():
        if not any(layer in name for layer in layers_to_train):
            param.requires_grad = False
        if any(layer in name for layer in layers_to_train):
            param.requires_grad = True

In [None]:
# train all layers
# batch size = 2 , 2 epochs
## max tokens = 1024 --> 120+ hours
## max tokens = 512 --> 80 hours

# train just qa outputs
## batch size 4, 2 epochs
## 1024 tokens --> ~5 hours
## batch size 8, 2 epochs
## 1024 tokens --> 4:50
## batch size 32, 2 epochs, 1024 tokens --> 5 hours

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,5.626285
2,5.898200,5.578084


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=924, training_loss=5.767886207217262, metrics={'train_runtime': 55383.1316, 'train_samples_per_second': 0.533, 'train_steps_per_second': 0.017, 'total_flos': 6.395596272707174e+16, 'train_loss': 5.767886207217262, 'epoch': 2.0})

In [25]:
model_checkpoint_filepath = './checkpoints/climate_bart'
model.save_pretrained(model_checkpoint_filepath, from_pt=True)

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


In [7]:
model_checkpoint_filepath = './checkpoints/climate_bart'
model = BartForQuestionAnswering.from_pretrained(model_checkpoint_filepath)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [11]:
for i in tqdm(test_df.index):
    question = test_df['question'][i]
    text = test_df['context'][i]
    
    inputs = tokenizer(question, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    answer_decoded = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    test_df.at[i, 'climate_bart_answer'] = answer_decoded
test_df.head()

100%|██████████| 2096/2096 [31:06<00:00,  1.12it/s]    


Unnamed: 0,context,question,id,answer,answer_start,climate_bart_answer
0,some more detailed work has been done at natio...,The 9 percent reduction of rice in Bangladesh ...,14095,flooding damage and climate variability,514,The 9 percent reduction of rice in Bangladesh ...
1,some more detailed work has been done at natio...,What kind of model of Bangladesh was had been ...,14096,a dynamic economywide model,70,What kind of model of Bangladesh was had been ...
2,some more detailed work has been done at natio...,What approach did Ahmed use to estimate how ch...,14097,a modelling approach,639,What approach did Ahmed use to estimate how ch...
3,extreme sea level height fluctuations are also...,Where height fluctuations are large?,2843,extreme sea level height fluctuations are also...,0,Where height fluctuations are large?extreme s...
4,extreme sea level height fluctuations are also...,How non-tide sea levels are obtained?,2844,the non-tide sea levels are obtained by spectr...,167,How non-tide sea levels are obtained?extreme ...


In [12]:
utils.evaluate_abstractive(test_df,'climate_bart_answer')

rouge scores:
{'rouge1': np.float64(0.3248086876030988),
 'rouge2': np.float64(0.2323292845317699),
 'rougeL': np.float64(0.28354570895180087),
 'rougeLsum': np.float64(0.283143466634203)}

average semantic similarity:
tensor(0.6485)


In [15]:
# for i in test_df.index[:5]:
#     print(test_df.iloc[i]['question'])
#     print(test_df.iloc[i]['answer'])
#     print(test_df.iloc[i]['climate_bart_answer'])
#     print()

# output is restating question and giving a varying span of the context - maybe train for a more epochs

In [16]:
# try training for another epoch to see if that improves the output
model_checkpoint_filepath = './checkpoints/climate_bart'
model = BartForQuestionAnswering.from_pretrained(model_checkpoint_filepath)

layers_to_train = ['qa_outputs']
for name, param in model.named_parameters():
        if not any(layer in name for layer in layers_to_train):
            param.requires_grad = False
        if any(layer in name for layer in layers_to_train):
            param.requires_grad = True

BATCH_SIZE = 32
NUM_EPOCHS = 1
training_args = TrainingArguments(
    output_dir='./results/bart_qa',
    evaluation_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

trainer.train()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Epoch,Training Loss,Validation Loss
1,No log,5.525818


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=462, training_loss=5.5678594680059526, metrics={'train_runtime': 22797.7494, 'train_samples_per_second': 0.647, 'train_steps_per_second': 0.02, 'total_flos': 3.197798136353587e+16, 'train_loss': 5.5678594680059526, 'epoch': 1.0})

In [17]:
BATCH_SIZE = 32
NUM_EPOCHS = 2
training_args = TrainingArguments(
    output_dir='./results/bart_qa',
    evaluation_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,5.482283
2,5.523600,5.472097


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=924, training_loss=5.506216073965097, metrics={'train_runtime': 65078.4976, 'train_samples_per_second': 0.453, 'train_steps_per_second': 0.014, 'total_flos': 6.395596272707174e+16, 'train_loss': 5.506216073965097, 'epoch': 2.0})

In [18]:
model_checkpoint_filepath = './checkpoints/climate_bart/epoch5/'
model.save_pretrained(model_checkpoint_filepath, from_pt=True)

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


In [22]:
if torch.backends.mps.is_built():
    device = torch.device("mps")

for i in tqdm(test_df.index):
    question = test_df['question'][i]
    text = test_df['context'][i]
    
    inputs = tokenizer(question, text, return_tensors="pt").to('mps')
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    answer_decoded = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    test_df.at[i, 'climate_bart_answer'] = answer_decoded

utils.evaluate_abstractive(test_df,'climate_bart_answer')

100%|██████████| 2096/2096 [17:51<00:00,  1.96it/s]   


rouge scores:
{'rouge1': np.float64(0.3263647001615847),
 'rouge2': np.float64(0.2345149501537028),
 'rougeL': np.float64(0.28361609940254884),
 'rougeLsum': np.float64(0.2838430615051699)}

average semantic similarity:
tensor(0.6534)


In [23]:
for i in test_df.index[:5]:
    print(test_df.iloc[i]['question'])
    print(test_df.iloc[i]['answer'])
    print(test_df.iloc[i]['climate_bart_answer'])
    print()

The 9 percent reduction of rice in Bangladesh is attributed to what two variables?
flooding damage and climate variability
The 9 percent reduction of rice in Bangladesh is attributed to what two variables?some more detailed work has been done at national level. for example, a dynamic economywide model of bangladesh has been used to estimate economic damages from historical climate variability and future anthropogenic climate change. using a combination of historical yield variability and ten climate projections, future anthropogenic climate change damages are estimated to reduce national rice production in bangladesh by about 9 percent to mid

What kind of model of Bangladesh was had been used to estimate economic damages from historical climate variability and future anthropogenic climate change?
a dynamic economywide model
What kind of model of Bangladesh was had been used to estimate economic damages from historical climate variability and future anthropogenic climate change?some mo

In [None]:
# BART is a seq2seq model so we could maybe use it out of the box for abstractive?