In [None]:
! pip install datasets transformers rouge_score wandb

In [None]:
# Authenticate with Weights & Biases; the command prompts for an API key
# interactively and stores it in the local netrc file.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import random
import numpy as np
import pandas as pd
import wandb
import torch
import transformers
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Log in to the Hugging Face Hub; the stored token is needed later because the
# training arguments enable push_to_hub.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Pretrained checkpoint used for both the tokenizer and the seq2seq model.
model_checkpoint = "t5-small"

## Loading the dataset

In [None]:
from datasets import Dataset, load_metric

In [None]:
# Read the sentence pairs; the file is expected to provide an "input" column
# (source sentence) and an "output" column (corrected sentence).
df = pd.read_csv("data_combine.csv")
# Drop incomplete rows before measuring length: a missing cell is read as NaN
# (a float), which has no .split() and would otherwise raise.
df = df.dropna()
# Keep only inputs of at most 64 whitespace-separated words.
# NOTE(review): this is a word count, whereas the tokenizer limit used later is
# measured in subword tokens, so some kept sentences may still be truncated.
word_counts = df["input"].str.split().str.len()
df = df[word_counts <= 64]
# load Dataset from Pandas DataFrame
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
# Hold out the last 20% of the rows as the evaluation split.
# NOTE(review): shuffle=False keeps the file order, so the split is
# deterministic but the test set consists only of the final rows - confirm the
# CSV is not ordered by source or difficulty.
split_datasets = dataset.train_test_split(test_size=0.2, shuffle=False)

In [None]:
# Show the splits with their columns and row counts.
split_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 20744
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 5187
    })
})

In [None]:
# Peek at the first training pair (source sentence and its correction).
split_datasets["train"][0]

{'input': 'That government should spend more money supporting the arts than in supporting athletics such as state-sponsored Olympic teams.',
 'output': 'The government should spend more money supporting the arts than in supporting athletics such as state-sponsored Olympic teams.'}

## Preprocessing the training data

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the checkpoint; model_max_length caps the
# tokenizer's default maximum sequence length at 64 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=64)

In [None]:
# Token limits for the source text and the target text; both equal the
# tokenizer's model_max_length of 64.
max_input_length = 64
max_target_length = 64

In [None]:
def tokenize_function(examples):
  """Tokenize source sentences and their corrections for seq2seq training.

  Args:
    examples: mapping with "input" and "output" entries, holding either a
      single example (two strings) or a batch (two lists of strings).

  Returns:
    The tokenizer encoding of the inputs ("input_ids", "attention_mask") with
    an added "labels" entry containing the token ids of the targets.
  """
  # Sources are truncated to max_input_length tokens.
  model_inputs = tokenizer(examples["input"], max_length=max_input_length, truncation=True)
  # Targets are encoded separately so they honour their own limit
  # (max_target_length) instead of silently reusing the input limit.
  targets = tokenizer(text_target=examples["output"], max_length=max_target_length, truncation=True)
  model_inputs["labels"] = targets["input_ids"]
  return model_inputs

In [None]:
# Display the first training pair again; it is the example fed to
# tokenize_function in the next cell.
split_datasets['train'][0]

{'input': 'That government should spend more money supporting the arts than in supporting athletics such as state-sponsored Olympic teams.',
 'output': 'The government should spend more money supporting the arts than in supporting athletics such as state-sponsored Olympic teams.'}

In [None]:
# Sanity check: tokenize a single example and inspect the resulting
# input_ids / attention_mask / labels.
features = tokenize_function(split_datasets['train'][0])
features  

{'input_ids': [466, 789, 225, 1492, 72, 540, 3956, 8, 5138, 145, 16, 3956, 12217, 7, 224, 38, 538, 18, 27959, 11548, 2323, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [37, 789, 225, 1492, 72, 540, 3956, 8, 5138, 145, 16, 3956, 12217, 7, 224, 38, 538, 18, 27959, 11548, 2323, 5, 1]}

In [None]:
# Tokenize both splits batch-wise and drop the raw text columns so that only
# the tokenizer's output fields remain.
tokenized_datasets = split_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
# Confirm that only the tokenized features are left in each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20744
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5187
    })
})

## Train the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
# Load the pretrained seq2seq model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# The collator assembles batches from the variable-length tokenized examples
# (no padding was applied during tokenization).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# ROUGE scorer used by compute_metrics during evaluation.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` library - confirm the installed version
# still provides it.
rouge_metric = load_metric("rouge")

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
  """Compute ROUGE F-measures (in percent) and the mean generated length.

  Args:
    eval_pred: pair ``(predictions, labels)`` of token-id arrays, as passed by
      the trainer during evaluation. ``predictions`` may also be a tuple whose
      first element holds the generated ids.

  Returns:
    dict mapping every ROUGE key to its mid F-measure * 100, plus "gen_len"
    (mean number of non-padding generated tokens); all values are rounded to
    4 decimals.
  """
  predictions, labels = eval_pred
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  pad_id = tokenizer.pad_token_id
  # -100 marks ignored positions and is not a valid vocabulary id, so it cannot
  # be decoded. It can appear in the generated ids as well as in the labels
  # when batches of different lengths are gathered, so replace it in both.
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Rouge expects a newline after each sentence
  decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

  result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  # Extract a few results
  result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

  # Add mean generated length (padding, including replaced -100, is not counted)
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)
  return {k: round(v, 4) for k, v in result.items()}

In [None]:
# The output directory (and Hub repository) is named after the last path
# component of the base checkpoint, e.g. "t5-small" -> "t5-small-gec".
model_name = model_checkpoint.rsplit("/", 1)[-1]
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-gec",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # Evaluate and checkpoint on the same 500-step schedule so that the best
    # checkpoint can be restored when training finishes.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation runs generate() so compute_metrics receives generated ids.
    predict_with_generate=True,
    # Publishing and experiment tracking
    push_to_hub=True,
    report_to="wandb",
)

PyTorch: setting up devices


In [None]:
# Wire the model, data, batching and metric computation into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/Luffyt/t5-small-gec into local empty directory.
Using cuda_amp half precision backend


In [None]:
# Fine-tune the model; evaluation and checkpointing run every 500 steps as
# configured in `args`.
trainer.train()

***** Running training *****
  Num examples = 20744
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3891
  Number of trainable parameters = 60506624
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluffyt[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,1.0189,0.893732,70.128,58.5593,69.3199,69.3571,17.4066
1000,0.9057,0.851863,70.6893,59.3433,69.931,69.9625,17.3933
1500,0.8644,0.834377,70.7898,59.5163,70.0337,70.0682,17.3962
2000,0.8535,0.823567,70.8738,59.6249,70.1189,70.1467,17.3906
2500,0.8467,0.816147,70.9394,59.7503,70.1747,70.2126,17.3923
3000,0.8355,0.81077,70.9503,59.7829,70.1843,70.2227,17.3906
3500,0.8221,0.810892,70.9451,59.787,70.1898,70.2269,17.3875


***** Running Evaluation *****
  Num examples = 5187
  Batch size = 16
Saving model checkpoint to t5-small-gec/checkpoint-500
Configuration saved in t5-small-gec/checkpoint-500/config.json
Model weights saved in t5-small-gec/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-small-gec/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-small-gec/checkpoint-500/special_tokens_map.json
tokenizer config file saved in t5-small-gec/tokenizer_config.json
Special tokens file saved in t5-small-gec/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5187
  Batch size = 16
Saving model checkpoint to t5-small-gec/checkpoint-1000
Configuration saved in t5-small-gec/checkpoint-1000/config.json
Model weights saved in t5-small-gec/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-small-gec/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-small-gec/checkpoint-1000/special_tokens_map.json
***** Running Evaluatio

TrainOutput(global_step=3891, training_loss=0.8740706922952205, metrics={'train_runtime': 968.1932, 'train_samples_per_second': 64.276, 'train_steps_per_second': 4.019, 'total_flos': 999717019582464.0, 'train_loss': 0.8740706922952205, 'epoch': 3.0})

In [None]:
# Save the final model and upload it to the Hugging Face Hub repository.
trainer.push_to_hub()

Saving model checkpoint to t5-small-gec
Configuration saved in t5-small-gec/config.json
Model weights saved in t5-small-gec/pytorch_model.bin
tokenizer config file saved in t5-small-gec/tokenizer_config.json
Special tokens file saved in t5-small-gec/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/231M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Luffyt/t5-small-gec
   f34fd2f..c33468a  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Luffyt/t5-small-gec
   f34fd2f..c33468a  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 70.9451}]}
To https://huggingface.co/Luffyt/t5-small-gec
   c33468a..5a208a0  main -> main

   c33468a..5a208a0  main -> main



'https://huggingface.co/Luffyt/t5-small-gec/commit/c33468a936de8b0558bd2cc96bde9abdb3c76c34'

In [None]:
# Reload the fine-tuned model and its tokenizer from the Hub repository the
# trainer pushed to. This rebinds model_checkpoint, tokenizer and model, so the
# cells below no longer use the in-memory training objects.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_checkpoint = "Luffyt/t5-small-gec"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Correct one sentence with beam search (4 beams) and print the two
# highest-scoring candidates.
text = "I eated an apple"
# Despite its name, `input_ids` holds the whole tokenizer output, which is
# unpacked into generate() as keyword arguments.
input_ids = tokenizer([text], return_tensors="pt", truncation=True, max_length=64)
outputs = model.generate(**input_ids, num_beams=4, num_return_sequences=2)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['I ate an apple.', 'I eated an apple.']


In [None]:
# Second example, this time with 5 beams; again prints the two
# highest-scoring candidates.
text = "He went shopping with he friends."
# Despite its name, `input_ids` holds the whole tokenizer output, which is
# unpacked into generate() as keyword arguments.
input_ids = tokenizer([text], return_tensors="pt", truncation=True, max_length=64)
outputs = model.generate(**input_ids, num_beams=5, num_return_sequences=2)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['He went shopping with his friends.', 'He went shopping with friends.']
