In [None]:
! pip install datasets transformers rouge_score wandb openai

In [None]:
# Authenticate with Weights & Biases; the command prompts for an API key.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Standard library
import random

# Third-party
import numpy as np
import pandas as pd
import wandb
import torch
import evaluate  # fixed: was misspelled `evalutate`, which raises ModuleNotFoundError
import transformers
import nltk

# sent_tokenize needs the "punkt" tokenizer data, so fetch it before importing.
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the training
# arguments below set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Pretrained checkpoint to fine-tune; also used to load the tokenizer and to name the output directory.
model_checkpoint = "t5-base"

In [None]:
# Load the pretrained seq2seq model for the chosen checkpoint.
# NOTE(review): the same checkpoint is loaded again in the "Train the model" section,
# which rebinds `model`; this first load only serves the display cell that follows.
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Display the loaded model object.
model

## Loading the dataset

In [None]:
from datasets import Dataset, load_metric

In [None]:
# Read the parallel corpus; the "input" column holds the source sentences.
df = pd.read_csv("data_combine.csv")
# Drop incomplete rows first so the word count below never sees a NaN (float) value,
# which would raise AttributeError on `.split()`.
df = df.dropna()
# Keep only examples whose source sentence has at most 64 whitespace-separated words.
# (Named mask instead of `filter`, which shadowed the Python builtin.)
is_short_enough = df["input"].map(lambda x: len(x.split())) <= 64
df = df[is_short_enough]
# load Dataset from Pandas DataFrame
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
# Hold out 20% of the examples as the "test" split; the fixed seed keeps the shuffle reproducible.
split_datasets = dataset.train_test_split(test_size=0.2, shuffle=True, seed=43)

In [None]:
# Show the resulting splits with their columns and row counts.
split_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 17721
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 4431
    })
})

In [None]:
# Inspect one training example (its "input" and "output" fields).
split_datasets["train"][1]

{'input': 'The Limerick Post is part of a group of news sites operated by Joe Zlomek in southeastern Pennsylvania octhers on the group are The Sanatoga Post and The Pottstown Post for more information, see the The profile of The Sanatoga Post.',
 'output': 'The Limerick Post is part of a group of news sites operated by Joe Zlomek in southeastern Pennsylvania. Others in the group are The Sanatoga Post and The Pottstown Post. For more information, see the profile of The Sanatoga Post.'}

## Preprocessing the training data

In [None]:
# Load the tokenizer that matches the checkpoint, with model_max_length set to 64.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=64)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Token-length limits intended for the source (input) and target (output) sequences.
max_input_length = 64
max_target_length = 64

In [None]:
def tokenize_function(examples):
  """Tokenize source sentences and their corrected targets for seq2seq training.

  Args:
    examples: a single example or a batch (as passed by ``Dataset.map``) with
      "input" (source text) and "output" (target text) entries.

  Returns:
    The tokenizer output for the inputs with an added "labels" entry holding
    the target token ids.
  """
  # Sources are truncated to max_input_length.
  model_inputs = tokenizer(examples["input"], max_length=max_input_length, truncation=True)
  # Targets get their own limit; previously max_target_length was defined but
  # never applied because a single max_length covered both texts.
  labels = tokenizer(text_target=examples["output"], max_length=max_target_length, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Show the raw example that is tokenized in the next cell.
split_datasets['train'][1]

{'input': 'The Limerick Post is part of a group of news sites operated by Joe Zlomek in southeastern Pennsylvania octhers on the group are The Sanatoga Post and The Pottstown Post for more information, see the The profile of The Sanatoga Post.',
 'output': 'The Limerick Post is part of a group of news sites operated by Joe Zlomek in southeastern Pennsylvania. Others in the group are The Sanatoga Post and The Pottstown Post. For more information, see the profile of The Sanatoga Post.'}

In [None]:
# Tokenize a single example to check which fields tokenize_function produces.
features = tokenize_function(split_datasets['train'][1])
features  

{'input_ids': [37, 10908, 15, 5206, 1844, 19, 294, 13, 3, 9, 563, 13, 1506, 1471, 7747, 57, 4967, 1027, 40, 7159, 157, 16, 3, 7, 28478, 8913, 3, 32, 75, 189, 277, 30, 8, 563, 33, 37, 1051, 144, 19914, 1844, 11, 37, 7995, 17, 7, 3540, 1844, 21, 72, 251, 6, 217, 8, 37, 3278, 13, 37, 1051, 144, 19914, 1844, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [37, 10908, 15, 5206, 1844, 19, 294, 13, 3, 9, 563, 13, 1506, 1471, 7747, 57, 4967, 1027, 40, 7159, 157, 16, 3, 7, 28478, 8913, 5, 14818, 16, 8, 563, 33, 37, 1051, 144, 19914, 1844, 11, 37, 7995, 17, 7, 3540, 1844, 5, 242, 72, 251, 6, 217, 8, 3278, 13, 37, 1051, 144, 19914, 1844, 5, 1]}

In [None]:
# Tokenize both splits in batches and drop the original text columns.
tokenized_datasets = split_datasets.map(tokenize_function, batched=True, remove_columns=split_datasets["train"].column_names)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Show the tokenized splits and their columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 17721
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4431
    })
})

## Train the model

In [None]:
# Load a fresh copy of the pretrained checkpoint for fine-tuning.
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Collator that batches the tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# ROUGE metric used by compute_metrics during evaluation.
# NOTE(review): load_metric appears to be deprecated in favour of evaluate.load — confirm against the installed datasets version.
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
  """Compute ROUGE scores (as percentages) and the mean generated length.

  Args:
    eval_pred: pair ``(predictions, labels)`` of token-id arrays handed over by
      the trainer when ``predict_with_generate`` is enabled.

  Returns:
    dict mapping each ROUGE key to its F-measure * 100, plus "gen_len" (mean
    number of non-padding tokens per prediction), all rounded to 4 decimals.
  """
  predictions, labels = eval_pred
  # NOTE(review): predictions may arrive as a tuple with the generated ids first
  # for some model/trainer combinations — unwrap defensively.
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  # Replace -100 (ignored positions) with the pad token, as -100 can't be decoded.
  # Predictions get the same treatment as labels, since padded batches can
  # contain -100 on the prediction side too.
  predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Rouge expects a newline after each sentence
  decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

  result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  # Accept both result shapes: aggregate scores exposing `.mid.fmeasure`
  # and plain float scores.
  result = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in result.items()
  }

  # Add mean generated length
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)
  return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Use the last path component of the checkpoint id as the base of the run name.
model_name = model_checkpoint.split("/")[-1]
batch_size = 16
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints.
    f"{model_name}-gec-combine_data",
    # Evaluate, save and log on the same 500-step cadence.
    evaluation_strategy = "steps",
    eval_steps = 500,
    save_steps = 500,
    logging_steps = 500,
    load_best_model_at_end=True,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    num_train_epochs=5,
    # Run generation during evaluation; compute_metrics decodes the predictions it receives.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to="wandb"
)

In [None]:
# Wire the model, arguments, tokenized splits, collator and metric function into the trainer.
# The "test" split doubles as the evaluation set during training.
trainer = Seq2SeqTrainer(model=model, 
             args=args, 
             train_dataset=tokenized_datasets["train"],
             eval_dataset=tokenized_datasets["test"],
             tokenizer=tokenizer,
             data_collator=data_collator,
             compute_metrics=compute_metrics)

Cloning https://huggingface.co/Luffyt/t5-base-gec-combine_data into local empty directory.
Using cuda_amp half precision backend


In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

***** Running training *****
  Num examples = 17721
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5540
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluffyt[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,0.8413,0.654929,74.1413,62.083,73.4159,73.4206,16.9847
1000,0.6945,0.614742,75.0476,63.5645,74.3777,74.3726,16.926
1500,0.6551,0.59718,75.3742,64.1326,74.7249,74.7271,16.9048
2000,0.6429,0.585312,75.6,64.4944,74.9325,74.9335,16.9061
2500,0.6142,0.578289,75.7553,64.7554,75.0816,75.09,16.8953
3000,0.6116,0.570397,75.8622,64.9522,75.1886,75.1969,16.8863
3500,0.5991,0.568101,75.9079,65.0892,75.2619,75.2666,16.8887
4000,0.5933,0.566385,76.0166,65.2437,75.3563,75.3664,16.8842
4500,0.5904,0.564546,76.0398,65.2934,75.3759,75.3875,16.8849
5000,0.5962,0.562363,76.0458,65.2794,75.3762,75.3876,16.882


***** Running Evaluation *****
  Num examples = 4431
  Batch size = 16
Saving model checkpoint to t5-base-gec-combine_data/checkpoint-500
Configuration saved in t5-base-gec-combine_data/checkpoint-500/config.json
Model weights saved in t5-base-gec-combine_data/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-base-gec-combine_data/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-base-gec-combine_data/checkpoint-500/special_tokens_map.json
tokenizer config file saved in t5-base-gec-combine_data/tokenizer_config.json
Special tokens file saved in t5-base-gec-combine_data/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4431
  Batch size = 16
Saving model checkpoint to t5-base-gec-combine_data/checkpoint-1000
Configuration saved in t5-base-gec-combine_data/checkpoint-1000/config.json
Model weights saved in t5-base-gec-combine_data/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-base-gec-combine_data/checkpoint-

TrainOutput(global_step=5540, training_loss=0.6372545421338683, metrics={'train_runtime': 2993.8468, 'train_samples_per_second': 29.596, 'train_steps_per_second': 1.85, 'total_flos': 6012041439191040.0, 'train_loss': 0.6372545421338683, 'epoch': 5.0})

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

Saving model checkpoint to t5-base-gec-combine_data
Configuration saved in t5-base-gec-combine_data/config.json
Model weights saved in t5-base-gec-combine_data/pytorch_model.bin
tokenizer config file saved in t5-base-gec-combine_data/tokenizer_config.json
Special tokens file saved in t5-base-gec-combine_data/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/850M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Luffyt/t5-base-gec-combine_data
   c115f7e..4381b6c  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Luffyt/t5-base-gec-combine_data
   c115f7e..4381b6c  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 76.0801}]}
To https://huggingface.co/Luffyt/t5-base-gec-combine_data
   4381b6c..8e6a637  main -> main

   4381b6c..8e6a637  main -> main



'https://huggingface.co/Luffyt/t5-base-gec-combine_data/commit/4381b6cf738608f533286c5e8dde68a54549117a'

## Model Implementation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
import pandas as pd
import evaluate

In [None]:
import getpass
import os

import openai

# Never store API keys in the notebook: read the key from the environment and
# fall back to an interactive prompt, so the secret is never saved with the file.
# NOTE(review): the key previously hardcoded here must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [None]:
# Evaluation sets. In the result tables below, test_new is reported as the
# "created data" set and test_combine as the "combined data" set.
test_new = pd.read_csv("test.csv")
test_combine = pd.read_csv("test_combine.csv")

In [None]:
def grammar_correct(model, tokenizer, text, num_sentence=2, max_length=64):
  """Generate corrected candidates for one sentence using beam search.

  Args:
    model: seq2seq model exposing ``generate``.
    tokenizer: tokenizer used to encode ``text`` and decode the outputs.
    text: the sentence to correct.
    num_sentence: number of beams and of returned candidates.
    max_length: token limit applied to both the encoded input and the
      generated output (defaults to the previously hardcoded 64).

  Returns:
    list of ``num_sentence`` decoded candidate strings.
  """
  encoded = tokenizer([text], return_tensors="pt", truncation=True, max_length=max_length)
  # Move the inputs to the model's device when both sides support it, so the
  # function also works with a model placed on an accelerator.
  device = getattr(model, "device", None)
  if device is not None and hasattr(encoded, "to"):
    encoded = encoded.to(device)
  outputs = model.generate(**encoded, num_beams=num_sentence, num_return_sequences=num_sentence, max_length=max_length)
  return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
def calulate_acceptance_rate(model, tokenizer, text, num_sentence=2):
  """Fraction of generated corrections that an OpenAI model judges grammatical.

  Args:
    model: seq2seq model passed through to ``grammar_correct``.
    tokenizer: tokenizer passed through to ``grammar_correct``.
    text: table-like object whose "input" entry iterates over source sentences.
    num_sentence: number of candidates generated per source sentence.

  Returns:
    accepted candidates / (number of inputs * num_sentence), or 0.0 when there
    is nothing to evaluate.
  """
  # Generate every candidate first: one list of suggestions per input sentence.
  results = [
    grammar_correct(model, tokenizer, sent, num_sentence=num_sentence)
    for sent in text["input"]
  ]
  total = len(results) * num_sentence
  if total == 0:
    # Empty input would otherwise raise ZeroDivisionError below.
    return 0.0
  count = 0
  for sent_list in results:
    for sent in sent_list:
      # One request per candidate; temperature=0 keeps the judgement as
      # repeatable as the API allows.
      response = openai.Completion.create(
        model="text-davinci-003",
        prompt="Print \"True\" if this is grammatically correct otherwise \"False\":\n\n{}".format(sent),
        temperature=0,
        max_tokens=60,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0)
      # Only an exact "True" answer (ignoring surrounding whitespace) counts as accepted.
      if response["choices"][0]["text"].strip() == "True":
        count += 1
  return count / total

In [None]:
def calulate_rouge(model, tokenizer, text, num_sentence=2):
  """Score the model's corrections against the references with ROUGE.

  Args:
    model: seq2seq model passed through to ``grammar_correct``.
    tokenizer: tokenizer passed through to ``grammar_correct``.
    text: table-like object with "input" (source) and "output" (reference) entries.
    num_sentence: number of candidates generated per source sentence.

  Returns:
    The result of the ``evaluate`` ROUGE metric computed over all candidates,
    each paired with the reference of its source sentence.
  """
  predictions = []
  references = []
  for sent, ref in zip(text["input"], text["output"]):
    candidates = grammar_correct(model, tokenizer, sent, num_sentence=num_sentence)
    # Score every returned candidate. The previous code indexed [0] and [1]
    # directly, which failed for num_sentence=1 and ignored extras beyond two.
    predictions.extend(candidates)
    references.extend([ref] * len(candidates))
  rouge = evaluate.load('rouge')
  rougescore = rouge.compute(predictions=predictions, references=references)
  return rougescore

In [None]:
# Evaluate the t5-small-gec-new_data checkpoint: load it from the Hub and
# compute its acceptance rate on both test sets (each call queries the OpenAI API).
model_checkpoint = "Luffyt/t5-small-gec-new_data"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

acc_small_new_test = calulate_acceptance_rate(model, tokenizer, test_new)
acc_small_new_test_combine = calulate_acceptance_rate(model, tokenizer, test_combine)

In [None]:
# ROUGE on both test sets for the model and tokenizer currently bound in the notebook
# (t5-small-gec-new_data when cells run in order).
rouge_small_new_test = calulate_rouge(model, tokenizer, test_new, num_sentence=2)
rouge_small_new_test_combine = calulate_rouge(model, tokenizer, test_combine, num_sentence=2)

In [None]:
# Evaluate the t5-small-gec-combine_data checkpoint: load it from the Hub and
# compute its acceptance rate on both test sets.
model_checkpoint = "Luffyt/t5-small-gec-combine_data"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

acc_small_test = calulate_acceptance_rate(model, tokenizer, test_new)
acc_small_test_combine = calulate_acceptance_rate(model, tokenizer, test_combine)

In [None]:
# ROUGE on both test sets for the model and tokenizer currently bound in the notebook
# (t5-small-gec-combine_data when cells run in order).
rouge_small_test = calulate_rouge(model, tokenizer, test_new, num_sentence=2)
rouge_small_test_combine = calulate_rouge(model, tokenizer, test_combine, num_sentence=2)

In [None]:
# Evaluate the t5-base-gec-new_data checkpoint: load it from the Hub and
# compute its acceptance rate on both test sets.
model_checkpoint = "Luffyt/t5-base-gec-new_data"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

acc_base_new_test = calulate_acceptance_rate(model, tokenizer, test_new)
acc_base_new_test_combine = calulate_acceptance_rate(model, tokenizer, test_combine)

Downloading:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# ROUGE on both test sets for the model and tokenizer currently bound in the notebook
# (t5-base-gec-new_data when cells run in order).
rouge_base_new_test = calulate_rouge(model, tokenizer, test_new, num_sentence=2)
rouge_base_new_test_combine = calulate_rouge(model, tokenizer, test_combine, num_sentence=2)

In [None]:
# Evaluate the t5-base-gec-combine_data checkpoint: load it from the Hub and
# compute its acceptance rate on both test sets.
model_checkpoint = "Luffyt/t5-base-gec-combine_data"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

acc_base_test = calulate_acceptance_rate(model, tokenizer, test_new)
acc_base_test_combine = calulate_acceptance_rate(model, tokenizer, test_combine)

Downloading:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# ROUGE on both test sets for the model and tokenizer currently bound in the notebook
# (t5-base-gec-combine_data when cells run in order).
rouge_base_test = calulate_rouge(model, tokenizer, test_new, num_sentence=2)
rouge_base_test_combine = calulate_rouge(model, tokenizer, test_combine, num_sentence=2)

In [None]:
# Acceptance-rate summary: one column per model, one row per test set
# (first value from test_new, second from test_combine), rounded to 2 decimals.
model_results = {"t5-small-new_data": [round(acc_small_new_test, 2), round(acc_small_new_test_combine, 2)], 
          "t5-small-combine_data": [round(acc_small_test, 2), round(acc_small_test_combine, 2)], 
          "t5-base-new_data": [round(acc_base_new_test, 2), round(acc_base_new_test_combine, 2)], 
          "t5-base-combine_data":[round(acc_base_test, 2), round(acc_base_test_combine, 2)]}
df_result = pd.DataFrame(data=model_results, index=["acceptance rate on created data", "acceptance rate on combined data"])
df_result

Unnamed: 0,t5-small-new_data,t5-small-combine_data,t5-base-new_data,t5-base-combine_data
acceptance rate on created data,0.55,0.59,0.88,0.89
acceptance rate on combined data,0.54,0.58,0.78,0.82


In [None]:
# Persist the acceptance-rate table.
df_result.to_csv("result.csv")

In [None]:
# ROUGE-1 summary as percentages: one column per model, one row per test set
# (first value from test_new, second from test_combine), rounded to 2 decimals.
model_rouge = {"t5-small-new_data": [round(rouge_small_new_test["rouge1"]*100, 2), round(rouge_small_new_test_combine["rouge1"]*100, 2)], 
          "t5-small-combine_data": [round(rouge_small_test["rouge1"]*100, 2), round(rouge_small_test_combine["rouge1"]*100, 2)], 
          "t5-base-new_data": [round(rouge_base_new_test["rouge1"]*100, 2), round(rouge_base_new_test_combine["rouge1"]*100, 2)], 
          "t5-base-combine_data":[round(rouge_base_test["rouge1"]*100, 2), round(rouge_base_test_combine["rouge1"]*100, 2)]}
df_rouge = pd.DataFrame(data=model_rouge, index=["rouge score on created data", "rouge score on combined data"])
df_rouge

Unnamed: 0,t5-small-new_data,t5-small-combine_data,t5-base-new_data,t5-base-combine_data
rouge score on created data,81.44,81.87,87.54,87.17
rouge score on combined data,81.66,82.72,86.04,86.66


In [None]:
# Persist the ROUGE table.
df_rouge.to_csv("result_rouge.csv")

In [None]:
def show_result(model, tokenizer, text, num_sentence=2):
  """Build a side-by-side table of sources, references and model suggestions.

  Args:
    model: seq2seq model passed through to ``grammar_correct``.
    tokenizer: tokenizer passed through to ``grammar_correct``.
    text: DataFrame with "input" and "output" columns; it is not modified.
    num_sentence: number of suggestions generated per source sentence.

  Returns:
    A new DataFrame with the columns renamed to "Original Sentence" and
    "Reference", plus a "Model Suggestions" column holding each row's list
    of suggestions.
  """
  suggestions = [
    grammar_correct(model, tokenizer, sentence, num_sentence=num_sentence)
    for sentence in text["input"]
  ]
  # rename() without inplace returns a new frame, leaving the caller's data untouched.
  table = text.rename(columns={"input": "Original Sentence", "output": "Reference"})
  table["Model Suggestions"] = suggestions
  return table

In [None]:
# Load the t5-base-gec-combine_data checkpoint for the qualitative examples below.
model_checkpoint = "Luffyt/t5-base-gec-combine_data"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Generate two suggestions per sentence of test_combine and collect them in a table.
df_show = show_result(model, tokenizer, test_combine, num_sentence=2)

In [None]:
# Preview the first five rows.
df_show.head(5)

Unnamed: 0,Original Sentence,Reference,Model Suggestions
0,New factories often bring many cood things to ...,New factories often bring many good things to ...,[New factories often bring many good things to...
1,"However, the benefits of have the factory are ...","However, the benefits of having a factory are ...","[However, the benefits of having a factory are..."
2,A oppose what plan to build a factory in my co...,I oppose the plan to build a factory near my c...,[A oppose what plan to build a factory in my c...
3,This city 'm figure harmed by a large factory.,This city would be harmed by a large factory.,"[This city was harmed by a large factory., Thi..."
4,"In particular, no factory would broke the qual...","In particular, a factory would destroy the qua...","[In particular, no factory would break the qua..."


In [None]:
# Persist the suggestions table.
df_show.to_csv("model_result.csv")