If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets as well as other dependencies. Run the following cell to install them; it also downloads the NLTK `punkt` tokenizer data and loads the ROUGE metric.

In [None]:
! pip install datasets transformers rouge-score nltk
import nltk
nltk.download('punkt')
from datasets import load_metric

metric = load_metric("rouge")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 6.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 61.9 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 89.5 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

# Fine-tuning a model on a summarization task

In [None]:
# Name of the pretrained checkpoint; it is passed to `from_pretrained` for both the
# tokenizer and the model, and it decides whether a task prefix is added to the inputs.
model_checkpoint = "t5-small"

## Loading the dataset

In [None]:
# Toy question/answer examples in a column-oriented layout (one list per field,
# entries at the same index belong to the same example).
mydataset = {
    'document': [
        "Which animal is known as the 'Ship of the Desert'?",
        'How many days are there in a week?',
        'How many hours are there in a day?',
    ],
    'summary': ['Camel', '7 days', '24 hours'],
    'id': [1, 2, 3],
}

In [None]:
# Mount Google Drive at /content/gdrive/; the pickle files this notebook reads and
# writes are addressed through paths under that mount point.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
import pickle
# Load the train/val/test examples that are stored together in a single pickle file.
# NOTE(review): the next cell assigns the same three names again from per-split files,
# so this load looks superseded — confirm whether it is still needed.
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted files.
with open("/content/gdrive/MyDrive/clarifying/bart_inputs.pkl", "rb") as f_bart_inputs:
  train_examples, val_examples, test_examples = pickle.load(f_bart_inputs)

In [None]:
import pickle

# Available input modes; the selected `mode` is part of every input/output file name.
modes = ["only query", "only actual", "pair", "choices"]
mode = modes[2]
inputs_prefix = "/content/gdrive/MyDrive/clarifying/turk_bart_inputs_"


def load_split(split_name):
    """Load the pickled examples of one split for the currently selected `mode`.

    Args:
        split_name: 'train', 'val' or 'test'; combined with `inputs_prefix` and
            `mode` to build the file name `<inputs_prefix><split_name>_<mode>.pkl`.

    Returns:
        Whatever object was pickled into that file.

    NOTE: pickle.load can execute arbitrary code from the file; only load trusted files.
    """
    with open(inputs_prefix + split_name + '_' + mode + '.pkl', "rb") as split_file:
        return pickle.load(split_file)


train_examples = load_split('train')
val_examples = load_split('val')
test_examples = load_split('test')

In [None]:
# Print the first entry of every field in the training examples to inspect the data layout.
for k,v in train_examples.items():
  print(k,v[0])

src_e_list ['urinari', 'infect']
tgt_e_set {'caus', 'complic'}
tgt_fq What puts me at risk for recurrent urinary tract infections?
query urinary infections
clarification Do you want to know about causes and complications of urinary infections?
id_cands 1056 1057
cands ['My healthcare practitioner said I had symptoms of a urinary tract infection and prescribed antibiotics without performing a urine culture. Why?', 'What puts me at risk for recurrent urinary tract infections?']
dialogue user: urinary infections;


tgt_question Do you want to know about causes and complications of urinary infections?


## Preprocessing the data

In [None]:
# T5 checkpoints get a textual task prefix prepended to every model input;
# any other checkpoint is used without a prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "my task: "
else:
    prefix = ""

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task

In [None]:
import torch
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a column-oriented mapping.

    `contents` maps each field name to an indexable sequence with one entry per
    example; in this notebook it is the output of `preprocess_function`.
    """

    def __init__(self, contents):
        self.contents = contents

    def __getitem__(self, idx):
        """Return example `idx` as a dict holding that example's value for every field."""
        return {key: val[idx] for key, val in self.contents.items()}

    def __len__(self):
        """Return the number of examples: the length of the first field, or 0 without fields."""
        # Look at the first column only; no need to materialise the whole key list,
        # and an empty mapping is reported as an empty dataset instead of raising.
        for column in self.contents.values():
            return len(column)
        return 0
# Truncation limits (in tokens) for the model inputs and for the target questions.
# NOTE(review): the T5 config printed in this notebook lists "n_positions": 512 —
# confirm that an input limit of 1024 is intended.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize the dialogues (with the task prefix) and their target questions.

    Reads `examples["dialogue"]` and `examples["tgt_question"]` and returns the
    tokenizer output for the prefixed dialogues, with the token ids of the target
    questions stored under the "labels" key.
    """
    prompts = []
    for dialogue in examples["dialogue"]:
        prompts.append(prefix + dialogue)
    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)

    # The targets are encoded while the tokenizer is switched to target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["tgt_question"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Tokenize each split and wrap the result in MyDataset.
# `preprocessed_mydataset` is only a temporary and is overwritten for every split.
preprocessed_mydataset = preprocess_function(train_examples)
my_dataset_train = MyDataset(preprocessed_mydataset)
preprocessed_mydataset = preprocess_function(val_examples)
my_dataset_val = MyDataset(preprocessed_mydataset)
preprocessed_mydataset = preprocess_function(test_examples)
my_dataset_test = MyDataset(preprocessed_mydataset)

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penal

In [None]:
import torch

# Training configuration.
batch_size = 16
# Last path component of the checkpoint name (drops an "organisation/" part if present).
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",  # output directory for checkpoints
    evaluation_strategy = "epoch",   # evaluate on the validation set after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=50,
    predict_with_generate=True,      # metrics are computed on generated sequences
    # Mixed precision is only supported on CUDA devices; requesting it on a CPU-only
    # runtime makes the arguments object fail to construct, so enable it conditionally.
    fp16=torch.cuda.is_available(),
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import torch

In [None]:
# Sanity check: decode the first test example back to text (model input first, then target).
print( tokenizer.decode( my_dataset_test[0]['input_ids'] ) )
print( tokenizer.decode( my_dataset_test[0]["labels"] ) )

my task: user: type genes; what the user actually means: Is my blood type (ABO) related to my HLA genes and antigens?;</s>
Do you want to know about HLA genes and antigens in relation to blood type ABO?</s>


In [None]:
import torch

# Generate one question per test example with the model's current weights.
# NOTE(review): the recorded outputs of this cell look like those of a fine-tuned model,
# so it was presumably executed after `trainer.train()`; in a fresh top-to-bottom run it
# would use the pretrained checkpoint instead — confirm the intended cell order.
model.to("cpu")
model.eval()  # make sure dropout is disabled while generating
sequences = []
for i in range(len(my_dataset_test)):
    example_ids = my_dataset_test[i]['input_ids']
    # Feed the stored token ids directly. Decoding them to text and tokenizing that text
    # again is not guaranteed to reproduce the same ids (the decoded text already ends
    # with the "</s>" marker, and the tokenizer appends an end-of-sequence token itself).
    input_ids = torch.tensor([example_ids])
    with torch.no_grad():
        sequence_ids = model.generate(input_ids)
    sequence = tokenizer.batch_decode(sequence_ids)
    sequences.append(sequence)  # each entry stays a one-element list of decoded text
    input_text = tokenizer.decode(example_ids)
    print(input_text)
    print(sequence)

my task: user: type genes; what the user actually means: Is my blood type (ABO) related to my HLA genes and antigens?;</s>
['<pad> Do you want to know about type genes and antigens?</s>']
my task: user: cystic fibrosis; what the user actually means: Does this test detect all genetic causes of cystic fibrosis?;</s>
['<pad> Do you want to know about cystic fibrosis?</s>']
my task: user: worms stool; what the user actually means: Are those parasitic worms I'm seeing in my stool sample?;</s>
['<pad> Do you want to know about parasitic worms in stool sample?</s>']
my task: user: gene repair; what the user actually means: How do you know a mismatch repair gene variant is harmful (pathogenic)?;</s>
['<pad> Do you want to know about a mismatch repair gene variant</s>']
my task: user: allergy symptoms; what the user actually means: My allergy symptoms are generally mild. How serious is this really?;</s>
['<pad> Do you want to know about allergy symptoms?</s>']
my task: user: thyroglobulin level

In [None]:
# Attach the generated questions to the test examples and pickle the result to Drive;
# the output file name contains the selected input mode.
test_examples['pred_q'] = sequences
outputs_prefix = "/content/gdrive/MyDrive/clarifying/turk_bart_outputs_"
with open(outputs_prefix +mode+'.pkl', "wb") as f_bart_output:
  pickle.dump(test_examples, f_bart_output)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays
            (presumably one row per example — verify against the Trainer).

    Returns:
        dict with one entry per ROUGE variant (mid F-measure scaled by 100) plus
        "gen_len", the mean number of non-padding tokens per prediction; every
        value is rounded to 4 decimals.
    """
    generated_ids, label_ids = eval_pred
    pad_id = tokenizer.pad_token_id

    def _one_sentence_per_line(texts):
        # ROUGE expects the sentences of a text to be separated by newlines.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    # -100 marks label positions that cannot be decoded; swap them for the pad token.
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    pred_texts = _one_sentence_per_line(
        tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    )
    ref_texts = _one_sentence_per_line(
        tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    )

    rouge = metric.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)
    scores = {name: aggregate.mid.fmeasure * 100 for name, aggregate in rouge.items()}

    # Mean generated length, counted in non-padding tokens.
    scores["gen_len"] = np.mean([np.count_nonzero(row != pad_id) for row in generated_ids])

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
# Smoke-test the preprocessing on the first training example only.
# Only the two fields that preprocess_function reads are passed along.
my_tokenized_dataset = preprocess_function(
    {field: train_examples[field][:1] for field in ("dialogue", "tgt_question")}
)

In [None]:

# Build the trainer: it trains on the training split, evaluates on the validation split
# and scores the generated outputs with compute_metrics.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=my_dataset_train,
    eval_dataset=my_dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


We can now finetune our model by just calling the `train` method:

In [None]:
# Put the model on the device selected by the training arguments instead of a
# hard-coded 'cuda:0', so the cell also runs on a runtime without a GPU.
# The trailing semicolon keeps the notebook from printing the whole model.
model.to(args.device);

In [None]:
# Run fine-tuning; with evaluation_strategy="epoch" the validation metrics are
# computed after every epoch.
trainer.train()

***** Running training *****
  Num examples = 451
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1450


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.046565,11.3901,2.7354,9.514,9.4283,18.2078
2,No log,1.637487,54.726,37.6374,52.3916,52.2401,14.9286
3,No log,1.430913,74.3305,57.9891,72.3143,72.2511,14.0455
4,No log,1.338426,74.9402,58.4769,72.735,72.7273,14.8247
5,No log,1.290381,75.002,58.2865,72.5857,72.5837,15.013
6,No log,1.262884,75.7839,59.239,73.5498,73.5069,14.7597
7,No log,1.241023,75.468,59.0869,73.4835,73.3765,15.1364
8,No log,1.226495,76.2272,59.9979,74.06,74.0097,14.7662
9,No log,1.213506,75.9311,59.7857,73.797,73.7717,14.7597
10,No log,1.205111,76.0406,59.7698,73.7387,73.6833,15.0779


***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evaluation *****
  Num examples = 154
  Batch size = 16
***** Running Evalua

TrainOutput(global_step=1450, training_loss=1.265770158438847, metrics={'train_runtime': 213.885, 'train_samples_per_second': 105.43, 'train_steps_per_second': 6.779, 'total_flos': 306403529293824.0, 'train_loss': 1.265770158438847, 'epoch': 50.0})

You can now upload the result of the training to the Hub by executing `trainer.push_to_hub()` (this requires being logged in to the Hugging Face Hub).

You can now share this model with all your friends, family, favorite pets: they can all load it with the identifier `"your-username/the-name-you-picked"` so for instance:

```python
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("sgugger/my-awesome-model")
```