In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import BartTokenizer,BartForConditionalGeneration, Trainer, TrainingArguments,DataCollatorWithPadding,AdamW
from datasets import Dataset

# Important Step:
please change the filename below to the file you want to use for training (This should not include the .csv)

## Load the Data

In [2]:
# Specify the training file to take. Change the hashes, filename = '###'
filename = 'MDN_60K_v2'

# Load the CSV file
csv_file = './../3. Cleaned Data/'+filename+'.csv'
df = pd.read_csv(csv_file)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,original,corrected
0,10784,He is taller than boys at his age.,He is taller than most boys his age.
1,12111,"All in all ,media help develop society and in...","All in all, media helps develop society and in..."
2,54521,I always tell myself to do hardwork ang try my...,I always tell myself to do hardwork and try my...
3,51688,Waiting for a sunny dateso that we can meet in...,Waiting for a sunny dateso that we can meet on...
4,39393,Taday is the third day .,Today is the third day .


In [4]:
# Change the column names in the dataframe
df.rename(columns = {'corrected_fs':'corrected'}, inplace = True)
df=df[['original','corrected']]

In [5]:
# Split the dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

## Tokenization

In [6]:
# Instantiate the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

In [7]:
# Create tokenization and encoding for training and test sets
train_encodings = tokenizer(list(train_df['original']), truncation=True, padding=True,return_tensors='pt')
val_encodings = tokenizer(list(val_df['original']), truncation=True, padding=True,return_tensors='pt')

train_labels = tokenizer(list(train_df['corrected']), truncation=True, padding=True,return_tensors='pt')
val_labels = tokenizer(list(val_df['corrected']), truncation=True, padding=True,return_tensors='pt')

In [8]:
tokenizer.decode(train_encodings['input_ids'][1])

'<s>The doctor said two or there weeks later, I can go to hospital again and she will take away my another wisdom tooth.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [9]:
tokenizer.decode(train_labels['input_ids'][1])

'<s>The doctor said two or three weeks later, I can go to the hospital again and she will take away my other wisdom tooth.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [10]:
# Assign cuda to the device to use for training
if torch.cuda.is_available(): 
 dev = "cuda:0" 
 print("This model will run on CUDA")
elif  torch.backends.mps.is_available(): 
 dev = "mps:0"
 print("This model will run on MPS")
else:
 dev = "cpu" 
 print("This model will run on CPU")
device = torch.device(dev) 

print(device)

This model will run on CUDA
cuda:0


## Fine-tune the BART Model

In [11]:
#Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels['input_ids'][idx])
    return item

  def __len__(self):
    return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [12]:
# Identify the model name from Hugging Face
checkpoint = "facebook/bart-large"

In [13]:
# Instantiate the model
model = BartForConditionalGeneration.from_pretrained(checkpoint).to(device)
#model.to(device)

In [14]:
# Instantiate the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5,no_deprecation_warning=True)

In [15]:
# Train the BART model

from torch.utils.data import DataLoader
from tqdm.auto import tqdm

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch, use_cache=False)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.update(1)
    print("Epoch {} train loss: {}".format(epoch, train_loss / len(train_loader)))

  0%|          | 0/5064 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


Epoch 0 train loss: 1.0224396828959268
Epoch 1 train loss: 0.0945041714571593
Epoch 2 train loss: 0.0857118626270814


In [16]:
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

model.eval()
val_loss = 0
reference_corpus = []
predicted_corpus = []
original_corpus = []

num_validation_steps = len(val_loader)
progress_bar = tqdm(range(num_validation_steps))

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # Store the labels in a separate variable and remove labels from the batch
    labels = batch['labels']
    input_ids = batch['input_ids']
    batch.pop('labels')

    with torch.no_grad():
        outputs = model.generate(**batch, use_cache=False)
        for i in range(len(outputs)):
            predicted_sentence = tokenizer.decode(outputs[i], skip_special_tokens=True)
            reference_sentence = tokenizer.decode(labels[i], skip_special_tokens=True)
            original_sentence = tokenizer.decode(input_ids[i], skip_special_tokens=True)
            reference_corpus.append([reference_sentence.split()])
            predicted_corpus.append(predicted_sentence.split())
            original_corpus.append(original_sentence.split())
            val_loss += model(**batch, use_cache=False, labels=labels).loss.item()
            progress_bar.update(1)
        #print("Validation loss: {}".format(val_loss / len(val_loader)))

  0%|          | 0/188 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


In [17]:
batch

{'input_ids': tensor([[    0,  1708,    38,  ...,     1,     1,     1],
         [    0,  1121,  6427,  ...,     1,     1,     1],
         [    0, 15952,   441,  ...,     1,     1,     1],
         ...,
         [    0,   243,    16,  ...,     1,     1,     1],
         [    0,   243,   362,  ...,     1,     1,     1],
         [    0,   243,    16,  ...,     1,     1,     1]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [18]:
#print("Validation loss: {}".format(val_loss / len(val_loader)))
# this is of the 20K, paragraphed rows (no single sentences)
# Validation loss: 4.455552131410629

In [25]:
print("Validation loss: {}".format(val_loss / len(val_loader)))

Validation loss: 4.965415848062394


In [26]:
reference_corpus

[[['Have',
   'you',
   'ever',
   'thought',
   'about',
   'why',
   'you',
   'are',
   'here',
   'and',
   'who',
   'are',
   'you?']],
 [['Its',
   'double',
   'helix',
   'structure',
   'provides',
   'a',
   'variety',
   'of',
   'choices',
   'to',
   'you,',
   'and',
   'its',
   'smart',
   'central',
   'controller',
   'provides',
   'healthy',
   'diet',
   'suggestions',
   'and',
   'fast',
   'booking',
   'services',
   'according',
   'to',
   'your',
   'personal',
   'diet',
   'data.']],
 [['However,',
   'I',
   'still',
   "don't",
   'how',
   'to',
   'improve',
   'my',
   'spoken',
   'English.']],
 [['Be',
   'that',
   'as',
   'it',
   'may,capability',
   'is',
   'the',
   'key',
   'to',
   'keeping',
   'a',
   'job',
   'in',
   'the',
   'long',
   'run.']],
 [['People',
   'always',
   'say',
   'that',
   'cats',
   'are',
   'not',
   'loyal',
   'because',
   'that',
   'if',
   'you',
   'feed',
   'them,',
   "they'll",
   'follow',
   'a

In [27]:
predicted_corpus

[['Have',
  'you',
  'ever',
  'thought',
  'why',
  'you',
  'are',
  'here',
  'and',
  'who',
  'are',
  'you?'],
 ['Its',
  'double',
  'helix',
  'structure',
  'provides',
  'more',
  'choices',
  'to',
  'you',
  'and',
  'its',
  'smart',
  'central',
  'controller',
  'provides',
  'healthy'],
 ['s',
  'However,',
  'I',
  'still',
  "don't",
  'know',
  'how',
  'to',
  'improve',
  'my',
  'speaking',
  'English.'],
 ['Be',
  'that',
  'as',
  'it',
  'may,capability',
  'is',
  'the',
  'key',
  'to',
  'your',
  'job',
  'in'],
 ['People',
  'always',
  'say',
  'that',
  'cats',
  'are',
  'not',
  'loyal',
  'because',
  'if',
  'you',
  'feed',
  'them',
  'often,',
  "they'll"],
 ['Well,', 'where', 'do', 'you', 'guys', 'live?'],
 ['Q',
  'If',
  'you',
  'are',
  'the',
  'boss,',
  'and',
  'the',
  'hotel',
  'customers',
  'as',
  'some',
  'argue,',
  'what'],
 ['I', 'almost', 'lost', 'my', 'aim', 'and', 'passion', 'for', 'life.'],
 ['None', 'of', 'commodities', 'a

In [21]:
# Save the trained model and tokenizer
output_dir = "../7. Models/"+filename+"_"+checkpoint+"/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('../7. Models/MDN_60K_v2_facebook/bart-large/tokenizer_config.json',
 '../7. Models/MDN_60K_v2_facebook/bart-large/special_tokens_map.json',
 '../7. Models/MDN_60K_v2_facebook/bart-large/vocab.json',
 '../7. Models/MDN_60K_v2_facebook/bart-large/merges.txt',
 '../7. Models/MDN_60K_v2_facebook/bart-large/added_tokens.json')

In [22]:
# Check the BLEU (Bilingual Evaluation Understudy) score
from nltk.translate.bleu_score import sentence_bleu

# Calculate BLEU score for each sentence pair
bleu_scores = []
for i in range(len(predicted_corpus)):
    predicted_sentence = predicted_corpus[i]
    reference_sentences = reference_corpus[i]  # Each element in reference_corpus should be a list of sentences
    bleu_score = sentence_bleu(reference_sentences, predicted_sentence)
    bleu_scores.append(bleu_score)

average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU score:", average_bleu_score)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score: 0.480221149206072


In [23]:
for i in range(10, 20):
    print(reference_corpus[i])


[['Of', 'course,', 'the', 'minority', 'is', 'subordinate', 'to', 'the', 'majority.']]
[['My', 'heart', 'sank,', 'thinking', 'I', 'had', 'failed', 'both', 'interviews.']]
[['She', 'loves', 'this', 'job', 'and', 'cares', 'forall', 'her', 'patients', 'in', 'every', 'possible', 'way.']]
[['After', 'that,I', 'still', 'had', 'to', 'prepare', 'for', 'my', 'final', 'exam', 'of', 'this', 'term.']]
[['We', 'talk', 'only', 'a', 'couple', 'of', 'times', 'a', 'day.']]
[['Consequently,', 'these', 'people', 'find', 'it', 'easier', 'to', 'succeed', 'than', 'others.']]
[['I', 'cant', 'wait', 'for', 'it', 'to', 'come!']]
[['Most', 'of', 'the', 'knowledge', 'we', 'have', 'learned', 'from', 'exams', 'are', 'impracticable.']]
[['Book', 'help', 'to', 'keep', 'people', 'away', 'from', 'error', 'at', 'the', 'same', 'time,', 'experience', 'adds', 'a', 'type', 'of', 'learning', 'that', 'is', 'more', 'value', 'that', 'just', 'a', 'book', 'itself.']]
[['I', 'knocked', 'over', 'a', 'glass', 'onto', 'my', 'notebook

In [24]:
for i in range(10, 20):
    print(predicted_corpus[i])

['Of', 'course', 'the', 'minority', 'is', 'subordinate', 'to', 'the', 'majority.']
['My', 'heart', 'sank,', 'thinking', 'I', 'failed', 'both', 'interviews.']
['She', 'loves', 'this', 'job', 'and', 'caring', 'towards', 'all', 'her', 'patients', 'in', 'every', 'possible', 'way.']
['After', 'that,I', 'had', 'to', 'prepare', 'for', 'my', 'final', 'exam', 'of', 'this', 'term', 'soon.']
['We', 'just', 'have', 'one', 'or', 'two', 'calls', 'everyday,', 'and', 'besides', 'that', 'there', 'is', 'nothing.']
['Consequently,', 'these', 'people', 'take', 'it', 'easier', 'to', 'succeed', 'than', 'others.']
['And', 'I', 'cant', 'wait', 'for', 'it', 'to', 'come!']
['Most', 'of', 'the', 'knowledge', 'we', 'had', 'learnt', 'in', 'exam', 'is', 'impracticable.']
['Book', 'help', 'people', 'out', 'of', 'error', 'zones,', 'at', 'the', 'same', 'time,', 'experience', 'helps', 'gain', 'more']
['I', 'shoved', 'a', 'glass', 'at', 'my', 'notebook,', 'and', 'it', 'broke.']
