In [1]:
!pip install transformers --quiet
!pip install pandas --quiet
!pip install torch --quiet

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import argparse
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import random 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [2]:
#CONFIG
config = {
    "seed" : 3030,
    "output_path" : "iT5SpellChecker",
    "max_source_text_length" : 64,
    "max_target_text_length" : 64,
    "train_batch_size" : 16,
    "valid_batch_size" : 16,
    "output_dir" : "./training_it5",
    "train_path": "./train.csv",
    "valid_path": "./test.csv",
    "lr": 0.0001,
    "epochs": 1
}


In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, target_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source = dataframe.mispelled
        self.target = dataframe.sentence
        self.source_len = source_len
        self.summ_len = target_len


    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):     
        source_text = str(self.source[index])
        target_text = str(self.target[index])
        
        # cleaning data so as to ensure data is in string type
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.summ_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

In [5]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for n, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if n % 1000 == 0:
            print(f"Epoch number {epoch} with loss {loss}")
        
        loss.backward()
        optimizer.step()
        model.zero_grad()

In [6]:
def validate(tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for n, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=128, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            source = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in ids]

            predictions.extend(preds)
            actuals.extend(source)
    return predictions, actuals

In [7]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [8]:
from pandas._libs.tslibs.conversion import precision_from_unit

def iT5Trainer(output_dir="./training_it5"):

    """
    T5 trainer
    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config["seed"])  # pytorch random seed
    np.random.seed(config["seed"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    tokenizer = AutoTokenizer.from_pretrained("gsarti/it5-base")

    model = AutoModelForSeq2SeqLM.from_pretrained(config['output_dir'])

    model = model.to(device)

    train_dataset = pd.read_csv(config["train_path"])
    valid_dataset = pd.read_csv(config["valid_path"])

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(
        train_dataset,
        tokenizer,
        config["max_source_text_length"],
        config["max_target_text_length"]
    )
    val_set = CustomDataset(
        valid_dataset,
        tokenizer,
        config["max_source_text_length"],
        config["max_target_text_length"]
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": config["train_batch_size"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size":  config["valid_batch_size"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Training loop
    
    if True: 
        # Defining the optimizer that will be used to tune the weights of the network in the training session.
        optimizer = torch.optim.Adam(
            params=model.parameters(), lr=config["lr"]
        )
        print(f"[Initiating Fine Tuning]...\n")
        for epoch in range(config["epochs"]):
            train(epoch, tokenizer, model, device, training_loader, optimizer)

        print(f"[Saving Model]...\n")
        # Saving the model after training
        #path = os.path.join(output_dir, "model_files")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"""[Model] Model saved @ {output_dir}\n""")
    # evaluating test dataset
    if True:
        print(f"""[Model]: Loading t5 base model from {output_dir} for validation\n""")
        model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
        model.to(device)
        print(f"[Initiating Validation]...\n")
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"))
        print(f"[Validation Completed.]\n")
        precision_from_unit(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

set_seed(config["seed"])

iT5Trainer(output_dir=config["output_dir"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[Initiating Fine Tuning]...

Epoch number 0 with loss 0.12556450068950653
Epoch number 0 with loss 0.03724024444818497
Epoch number 0 with loss 0.05260341987013817
Epoch number 0 with loss 0.0026303878985345364
Epoch number 0 with loss 0.05820932239294052
Epoch number 0 with loss 0.0843982920050621
Epoch number 0 with loss 0.046536412090063095
Epoch number 0 with loss 0.03057420812547207
Epoch number 0 with loss 0.047324955463409424
Epoch number 0 with loss 0.06974609941244125
Epoch number 0 with loss 0.08250714838504791
Epoch number 0 with loss 0.06443087011575699
Epoch number 0 with loss 0.1363958716392517
Epoch number 0 with loss 0.06427076458930969
Epoch number 0 with loss 0.03337850794196129
[Saving Model]...

[Model] Model saved @ ./training_it5

[Model]: Loading t5 base model from ./training_it5 for validation

[Initiating Validation]...

[Validation Completed.]



ValueError: cannot cast unit [Validation] Generation on Validation data saved @ ./training_it5/predictions.csv


Exception ignored in: 'pandas._libs.tslibs.conversion.precision_from_unit'
Traceback (most recent call last):
  File "/tmp/ipykernel_134/796937273.py", line 81, in iT5Trainer
ValueError: cannot cast unit [Validation] Generation on Validation data saved @ ./training_it5/predictions.csv



In [16]:
def spellchecker(tokenizer,model,wrong_phrase):
    model.eval()
    with torch.no_grad():
        phrase = pd.DataFrame([(wrong_phrase,"ciao:)")], columns=['mispelled','sentence'])
        phrase = CustomDataset(
            phrase,
            tokenizer,
            config["max_source_text_length"],
            config["max_target_text_length"]
        )
        val_params = {
        "batch_size":  1,
        "shuffle": False,
        "num_workers": 0,
        }
        loader = DataLoader(phrase, **val_params)
        for n, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=128, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            
    return preds

tokenizer = AutoTokenizer.from_pretrained("gsarti/it5-base")
model = AutoModelForSeq2SeqLM.from_pretrained(config['output_dir'])
#model.to(device)
print(spellchecker(tokenizer,model,input()))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 seq:non posso andare avanti cosp.


['andare avanti così.']


In [7]:
device = 'cpu'