In [1]:
!pip install -q transformers
!pip install -q evaluate
!pip install -q datasets
!pip install -q pandas
!pip install -q torch
!pip install -q rouge_score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m30.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import evaluate
from pprint import pprint
import pandas as pd
from datasets import Dataset, load_dataset
# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# define functions
# using code from here: https://medium.com/@ajazturki10/simplifying-language-understanding-a-beginners-guide-to-question-answering-with-t5-and-pytorch-253e0d6aac54
def prepare_data(data):
  articles = []

  for paragraph in data:
    context = paragraph['context']
    for qa in paragraph['qas']:
      question = qa['question']
      id = qa['id']
      for ans in qa['answers']:
        answer = ans['text']
        answer_start = ans['answer_start']
        articles.append({'context': context, 'question': question, 'id': id, 'answer': answer, 'answer_start': answer_start})

  return articles

def evaluate_abstractive(result_df,
                         pred_col,
                         ref_col='answer',
                         encoder_model='sentence-transformers/all-MiniLM-L12-v2'):
    predictions = result_df[pred_col].tolist()
    references = result_df[ref_col].tolist()

    rouge = evaluate.load('rouge')
    rouge_res = rouge.compute(predictions=predictions,
                              references=references)

    encoder_model = SentenceTransformer(encoder_model)
    candidate_embeddings = encoder_model.encode(predictions)
    reference_embeddings = encoder_model.encode(references)
    similarity = util.pairwise_cos_sim(candidate_embeddings, reference_embeddings)

    print('rouge scores:')
    pprint(rouge_res)
    print()
    print('average semantic similarity:')
    print(torch.mean(similarity))

In [7]:
ds = load_dataset("rony/climate-change-MRC")

README.md:   0%|          | 0.00/633 [00:00<?, ?B/s]

CCMRC_train.json:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

CCMRC_validation.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

CCMRC_test.json:   0%|          | 0.00/1.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
# score T5 on question answering on validation & test datsets
train_ds = ds["train"]
valid_ds = ds["validation"]
test_ds = ds["test"]

# each is a 1-item list, so take first index
train_ds = train_ds[0]
valid_ds = valid_ds[0]
test_ds = test_ds[0]

# take the 'data' key of the dict, ignoring 'version' (there's just one)
train_ds = train_ds['data'][0]['paragraphs']
valid_ds = valid_ds['data'][0]['paragraphs']
test_ds = test_ds['data'][0]['paragraphs']
# each dataset is a list of dicts, where each list item is a context paragraph ('context' key) with qas ('qas' key) which contain questions, id, and answer

train_df = pd.DataFrame(prepare_data(train_ds))
print(f"{train_df.shape=}")

valid_df = pd.DataFrame(prepare_data(valid_ds))
print(f"{valid_df.shape=}")

test_df = pd.DataFrame(prepare_data(test_ds))
print(f"{test_df.shape=}")

train_df.shape=(14756, 5)
valid_df.shape=(4229, 5)
test_df.shape=(2096, 5)


In [9]:
# preprocess function for training model

def preprocess_data(df, tokenizer, max_input_length=512, max_output_length=128):
  input_texts = []
  output_texts = []

  for _, row in df.iterrows():
    input_text = f"question: {row['question']}  context: {row['context']}"
    output_text = row['answer']

    input_texts.append(input_text)
    output_texts.append(output_text)

  inputs = tokenizer(input_texts, padding=True, truncation=True, max_length=max_input_length, return_tensors="pt")
  labels = tokenizer(output_texts, padding=True, truncation=True, max_length=max_output_length, return_tensors="pt")

  return inputs, labels

In [7]:
# Fine tune T5 base

t5_pretrained_checkpoint_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_pretrained_checkpoint_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_pretrained_checkpoint_name)

train_inputs, train_outputs = preprocess_data(train_df, t5_tokenizer)
valid_inputs, valid_outputs = preprocess_data(valid_df, t5_tokenizer)

train_dataset = Dataset.from_dict({
    'input_ids': train_inputs['input_ids'],
    'attention_mask': train_inputs['attention_mask'],
    'labels': train_outputs['input_ids']
})

valid_dataset = Dataset.from_dict({
    'input_ids': valid_inputs['input_ids'],
    'attention_mask': valid_inputs['attention_mask'],
    'labels': valid_outputs['input_ids']
})

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
training_args = TrainingArguments(
    output_dir='./finetunedt5-base_results',
    evaluation_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)



In [9]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmjlenci[0m ([33mmjlenci-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.038,0.045937


TrainOutput(global_step=1845, training_loss=0.14771121197599707, metrics={'train_runtime': 2727.9835, 'train_samples_per_second': 5.409, 'train_steps_per_second': 0.676, 'total_flos': 8985782633103360.0, 'train_loss': 0.14771121197599707, 'epoch': 1.0})

In [12]:
# save t5 base trained model & tokenizer for later

trained_t5base_model = trainer.model

trainer.save_model('/content/drive/MyDrive/mids-w266/final_project')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/mids-w266/final_project')

('/content/drive/MyDrive/mids-w266/final_project/tokenizer_config.json',
 '/content/drive/MyDrive/mids-w266/final_project/special_tokens_map.json',
 '/content/drive/MyDrive/mids-w266/final_project/spiece.model',
 '/content/drive/MyDrive/mids-w266/final_project/added_tokens.json')

In [13]:
# load saved t5 base trained model

trained_t5base_model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/mids-w266/final_project')
t5_tokenizer = T5Tokenizer.from_pretrained('/content/drive/MyDrive/mids-w266/final_project')

In [14]:
# tokenize test_df inputs for inference
test_input_texts = []
for _, row in test_df.iterrows():
  input_text = f"question: {row['question']}  context: {row['context']}"
  test_input_texts.append(input_text)
test_inputs = t5_tokenizer(test_input_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

In [20]:
torch.cuda.empty_cache()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
trained_t5base_model = trained_t5base_model.to(device)
test_inputs = {key: value.to(device) for key, value in test_inputs.items()}

predicted_outputs = trained_t5base_model.generate(
    test_inputs['input_ids'],
    num_beams=2, early_stopping=True)

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.53 GiB is free. Process 2454 has 12.21 GiB memory in use. Of the allocated memory 11.85 GiB is allocated by PyTorch, and 233.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
# Fine tune T5 small

t5_pretrained_checkpoint_name = 't5-small'
t5_tokenizer = T5Tokenizer.from_pretrained(t5_pretrained_checkpoint_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_pretrained_checkpoint_name)

train_inputs, train_outputs = preprocess_data(train_df, t5_tokenizer)
valid_inputs, valid_outputs = preprocess_data(valid_df, t5_tokenizer)

train_dataset = Dataset.from_dict({
    'input_ids': train_inputs['input_ids'],
    'attention_mask': train_inputs['attention_mask'],
    'labels': train_outputs['input_ids']
})

valid_dataset = Dataset.from_dict({
    'input_ids': valid_inputs['input_ids'],
    'attention_mask': valid_inputs['attention_mask'],
    'labels': valid_outputs['input_ids']
})

In [11]:
training_args_small = TrainingArguments(
    output_dir='./finetunedt5-base_results',
    evaluation_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer_t5_small = Trainer(
    model=t5_model,
    args=training_args_small,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)



In [None]:
trainer_t5_small.train()

In [28]:
# save t5-small model & tokenizer for later

trained_t5small_model = trainer_t5_small.model

trainer_t5_small.save_model('/content/drive/MyDrive/mids-w266/final_project/t5small')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small')

('/content/drive/MyDrive/mids-w266/final_project/t5small/tokenizer_config.json',
 '/content/drive/MyDrive/mids-w266/final_project/t5small/special_tokens_map.json',
 '/content/drive/MyDrive/mids-w266/final_project/t5small/spiece.model',
 '/content/drive/MyDrive/mids-w266/final_project/t5small/added_tokens.json')

In [8]:
# load saved t5-small model
from transformers import T5ForConditionalGeneration, T5Tokenizer

trained_t5small_model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small')
t5_tokenizer = T5Tokenizer.from_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small')

In [12]:
# increasing to 3 epochs for training

training_args_small = TrainingArguments(
    output_dir='./finetunedt5-base_results',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer_t5_small = Trainer(
    model=t5_model,
    args=training_args_small,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)



In [13]:
trainer_t5_small.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmjlenci[0m ([33mmjlenci-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.056,0.054063
2,0.0668,0.052003
3,0.0578,0.051325


TrainOutput(global_step=5535, training_loss=0.12085166287077583, metrics={'train_runtime': 2100.0614, 'train_samples_per_second': 21.079, 'train_steps_per_second': 2.636, 'total_flos': 5991310867562496.0, 'train_loss': 0.12085166287077583, 'epoch': 3.0})

In [14]:
# save trained t5-small model with 3 epochs & tokenizer for later

trained_t5small_model = trainer_t5_small.model

trainer_t5_small.save_model('/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch')

('/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch/tokenizer_config.json',
 '/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch/special_tokens_map.json',
 '/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch/spiece.model',
 '/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch/added_tokens.json')

In [None]:
# load saved trained t5-small model with 3 epochs
from transformers import T5ForConditionalGeneration, T5Tokenizer

trained_t5small_model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small_3epoch')
t5_tokenizer = T5Tokenizer.from_pretrained('/content/drive/MyDrive/mids-w266/final_project/t5small')