In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import BartTokenizer,BartForConditionalGeneration, Trainer, TrainingArguments,DataCollatorWithPadding,AdamW
from datasets import Dataset

# Important Step:
please change the filename below to the file you want to use for training (This should not include the .csv)

## Load the Data

In [2]:
# Specify the training file to take. Change the hashes, filename = '###'
filename = 'MDN_20K_v2'

# Load the CSV file
csv_file = './../3. Cleaned Data/'+filename+'.csv'
df = pd.read_csv(csv_file)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,original,corrected,original_par,corrected_par,word_count_original,word_count_corrected,word_count_diff,word_count_ratio
0,206980,I went to the small coffee shop for reading.,I went to a small coffee shop to read,,,9,9,0,1.0
1,256794,"I think that site has ,especially, many animat...","I think that site has, primarily, many animate...",,,9,9,0,1.0
2,576798,It was first time to eat frogs and it's unusua...,It was the first time I ate frogs and it's an ...,,,22,25,3,1.136364
3,247267,"I know SNS is very convenient,actually I use t...","SNS are very convenient, and I especially like...",,,9,10,1,1.111111
4,556643,"However, more than that, I feel uneasy as if I...","However, more than that, I feel uneasy as if I...",,,18,18,0,1.0


In [4]:
# Change the column names in the dataframe
df.rename(columns = {'corrected_fs':'corrected'}, inplace = True)
df=df[['original','corrected']]

In [5]:
# Split the dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

## Tokenization

In [6]:
# Instantiate the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

In [7]:
# Create tokenization and encoding for training and test sets
train_encodings = tokenizer(list(train_df['original']), truncation=True, padding=True,return_tensors='pt')
val_encodings = tokenizer(list(val_df['original']), truncation=True, padding=True,return_tensors='pt')

train_labels = tokenizer(list(train_df['corrected']), truncation=True, padding=True,return_tensors='pt')
val_labels = tokenizer(list(val_df['corrected']), truncation=True, padding=True,return_tensors='pt')

In [8]:
tokenizer.decode(train_encodings['input_ids'][1])

"<s>You would think I am less romantic or lacks ability of enjoying holidays, but I can't be helped being such a person.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"

In [9]:
tokenizer.decode(train_labels['input_ids'][1])

"<s>You would think I am not romantic or lack the ability to enjoy holidays, but I can't help being such a person.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"

In [10]:
# Assign cuda to the device to use for training
if torch.cuda.is_available(): 
 dev = "cuda:0" 
 print("This model will run on CUDA")
elif  torch.backends.mps.is_available(): 
 dev = "mps:0"
 print("This model will run on MPS")
else:
 dev = "cpu" 
 print("This model will run on CPU")
device = torch.device(dev) 

print(device)

This model will run on CUDA
cuda:0


## Fine-tune the BART Model

In [11]:
#Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels['input_ids'][idx])
    return item

  def __len__(self):
    return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [12]:
# Identify the model name from Hugging Face
checkpoint = "facebook/bart-large"

In [13]:
# Instantiate the model
model = BartForConditionalGeneration.from_pretrained(checkpoint).to(device)
#model.to(device)

In [14]:
# Instantiate the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5,no_deprecation_warning=True)

In [15]:
# Train the BART model

from torch.utils.data import DataLoader
from tqdm.auto import tqdm

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch, use_cache=False)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.update(1)
    print("Epoch {} train loss: {}".format(epoch, train_loss / len(train_loader)))

  0%|          | 0/6750 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


Epoch 0 train loss: 0.8063765593171119
Epoch 1 train loss: 0.11537599353988966
Epoch 2 train loss: 0.10519462283286783


In [16]:
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

model.eval()
val_loss = 0
reference_corpus = []
predicted_corpus = []
original_corpus = []

num_validation_steps = len(val_loader)
progress_bar = tqdm(range(num_validation_steps))

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # Store the labels in a separate variable and remove labels from the batch
    labels = batch['labels']
    input_ids = batch['input_ids']
    batch.pop('labels')

    with torch.no_grad():
        outputs = model.generate(**batch, use_cache=False)
        for i in range(len(outputs)):
            predicted_sentence = tokenizer.decode(outputs[i], skip_special_tokens=True)
            reference_sentence = tokenizer.decode(labels[i], skip_special_tokens=True)
            original_sentence = tokenizer.decode(input_ids[i], skip_special_tokens=True)
            reference_corpus.append([reference_sentence.split()])
            predicted_corpus.append(predicted_sentence.split())
            original_corpus.append(original_sentence.split())
            val_loss += model(**batch, use_cache=False, labels=labels).loss.item()
            progress_bar.update(1)
        #print("Validation loss: {}".format(val_loss / len(val_loader)))

  0%|          | 0/250 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


In [36]:
batch

{'input_ids': tensor([[   0,  100,   33,  ...,    1,    1,    1],
         [   0, 1708,   51,  ...,    1,    1,    1],
         [   0,  243,   18,  ...,    1,    1,    1],
         ...,
         [   0,  713,  515,  ...,    1,    1,    1],
         [   0,  100, 1346,  ...,    1,    1,    1],
         [   0,  133, 2170,  ...,    1,    1,    1]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [17]:
print("Validation loss: {}".format(val_loss / len(val_loader)))

Validation loss: 5.22094459438324


In [18]:
reference_corpus

[[['The', 'director', 'of', 'this', 'movie', 'is', 'Gus', 'Van', 'Sant.']],
 [['I', 'will', 'go', 'tonight!']],
 [['A',
   'couple',
   'of',
   'years',
   'ago,',
   'I',
   'used',
   'to',
   'play',
   'a',
   'dancing',
   'game',
   'developed',
   'in',
   'Korea,',
   'which',
   'is',
   'the',
   'pioneer',
   'in',
   'online',
   'gaming.']],
 [['Someday,',
   'I',
   'want',
   'to',
   'sing',
   'songs',
   'in',
   'other',
   'languages',
   'too.']],
 [['My', 'roomie', 'became', 'a', 'Macer!']],
 [["I'm",
   'going',
   'to',
   'join',
   'an',
   'English',
   'lesson',
   'for',
   'the',
   'first',
   'time',
   'tomorrow.']],
 [['The',
   'government',
   'warned',
   'that',
   'it',
   'was',
   'possible',
   'that',
   'a',
   'big',
   'aftershock',
   'like',
   'a',
   'level',
   '5',
   'earthquake']],
 [['I',
   'was',
   'also',
   'surprised',
   'that',
   'there',
   'were',
   'more',
   'people',
   'on',
   'the',
   'streets',
   'at',
   'nig

In [19]:
predicted_corpus

[['Freddy,',
  'Brian,',
  'John',
  'and',
  'May',
  'thank',
  'you',
  'very',
  'much',
  'for',
  'your',
  'beautiful',
  'songs.'],
 ['So',
  'I',
  'have',
  'a',
  'plan',
  'to',
  'travel',
  'to',
  'a',
  'foreign',
  'country.'],
 ['to',
  'tell',
  'the',
  'truth,',
  'I',
  'would',
  'like',
  'to',
  'inquire',
  'about',
  'my',
  'mileage',
  'status',
  'and'],
 ['This', 'movie', 'was', 'very', 'interesting', 'and', 'exciting!'],
 ['I', 'tried', 'taking', 'a', 'photo', 'many', 'times.'],
 ['He', 'received', 'a', 'present', 'from', 'his', 'friend.'],
 ['Sometimes',
  'there',
  'is',
  'a',
  'flower',
  'blooming',
  'in',
  'my',
  'garden,',
  'sometimes',
  'a',
  'fallen',
  'leaf',
  'that'],
 ['This',
  'text',
  'is',
  'for',
  'university',
  'students,',
  'and',
  'includes',
  'econometrics.'],
 ['But', 'Ill', 'try', 'to', 'do', 'more', 'positively.'],
 ['We',
  'have',
  'a',
  'tea',
  'break',
  'before',
  'lunch',
  'time',
  'and',
  "it's",
  '

In [19]:
# Save the trained model and tokenizer
output_dir = "../7. Models/"+filename+"_"+checkpoint+"/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('../7. Models/80K_v2_facebook/bart-large/tokenizer_config.json',
 '../7. Models/80K_v2_facebook/bart-large/special_tokens_map.json',
 '../7. Models/80K_v2_facebook/bart-large/vocab.json',
 '../7. Models/80K_v2_facebook/bart-large/merges.txt',
 '../7. Models/80K_v2_facebook/bart-large/added_tokens.json')

In [20]:
# Check the BLEU (Bilingual Evaluation Understudy) score
from nltk.translate.bleu_score import sentence_bleu

# Calculate BLEU score for each sentence pair
bleu_scores = []
for i in range(len(predicted_corpus)):
    predicted_sentence = predicted_corpus[i]
    reference_sentences = reference_corpus[i]  # Each element in reference_corpus should be a list of sentences
    bleu_score = sentence_bleu(reference_sentences, predicted_sentence)
    bleu_scores.append(bleu_score)

average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU score:", average_bleu_score)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score: 0.4950602977582288


In [21]:
for i in range(10, 20):
    print(reference_corpus[i])


[['She', 'causes', 'a', 'lot', 'of', 'trouble', 'for', 'herself', 'and', 'tries', 'to', 'overcome', 'it.']]
[['My', 'fiancee', 'is', 'busy', 'working.']]
[['Today', 'we', 'had', 'a', 'big', 'earthquake', 'in', 'Japan.']]
[['Actually,', 'I', 'do', 'not', 'like', 'goya', 'because', 'it', 'is', 'too', 'biter', 'but', 'my', 'mother-in-law', 'gave', 'us', 'five', 'goyas', 'so', 'I', 'had', 'to', 'make', 'some', 'delicious', 'goya', 'dishes.']]
[['Regret,', 'by', 'the', 'artist', 'New', 'Order,', 'is', 'a', 'good', 'tune.']]
[['Finally,', 'pack', 'it', 'into', 'another', 'tub,', 'cover', 'it', 'with', 'plastic', 'wrap', 'and', 'close', 'the', 'lid.']]
[['A', 'woman', 'named', 'Kiwako', 'became', 'pregnant', 'with', 'a', 'child,', 'but', 'the', "child's", 'father', 'has', 'another', 'family.']]
[['We', 'can', 'watch', 'sports', 'and', 'enjoy', 'playing', 'them', 'anytime,', 'and', 'anywhere.']]
[['I', 'searched', 'some', 'community', 'college', 'websites,', 'but', 'they', "weren't", 'clear.']

In [22]:
for i in range(10, 20):
    print(predicted_corpus[i])

['She', 'causes', 'a', 'lot', 'of', 'trouble', 'herself', 'and', 'tries', 'to', 'overcome', 'it.']
['My', 'fiancee', 'is', 'busy', 'at', 'work.']
['Today', 'we', 'had', 'a', 'big', 'earthquake', 'in', 'Japan.']
['Actually,', 'I', 'do', 'not', 'like', 'goya', 'because', 'it', 'is', 'too', 'biter', 'but', 'my', 'mother']
['Regret', 'of', 'New', 'order', 'who', 'is', 'the', 'artist', 'is', 'a', 'good', 'tune.']
['Finally,', 'pack', 'it', 'into', 'another', 'tub,', 'cover', 'it', 'with', 'plastic', 'wrap', 'and', 'close', 'the', 'lid']
['A', 'woman', 'named', 'Kiwako', 'became', 'pregnant', 'with', 'a', 'child,', 'but', 'the', "child's", 'father', 'has']
['We', 'can', 'watch', 'sports', 'and', 'enjoy', 'playing', 'anytime,', 'anywhere.']
['I', 'searched', 'some', 'websites', 'of', 'community', 'colleges,', 'but', 'couldnt', 'make', 'the', 'clear.']
['I', 'stand', 'on', 'the', 'start', 'line.']
