In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import BartTokenizer,BartForConditionalGeneration, Trainer, TrainingArguments,DataCollatorWithPadding,AdamW
from datasets import Dataset

# Important Step:
please change the filename below to the file you want to use for training (This should not include the .csv)

## Load the Data

In [2]:
# Specify the training file to take. Change the hashes, filename = '###'
filename = '40K'

# Load the CSV file
csv_file = './../3. Cleaned Data/'+filename+'.csv'
df = pd.read_csv(csv_file)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,original,word_count_original,corrected_fs,word_count_corrected_fs,word_count_diff,word_count_ratio,o_pos_tags,o_num_verbs,o_num_nouns,...,c_num_modal,num_verbs_diff,num_nouns_diff,num_adjs_diff,num_adv_diff,num_pronoun_diff,num_preposition_diff,num_conjunction_diff,num_article_diff,num_modal_diff
0,539741,"Since I have begun to live in London, I have b...",16,"Since I have begun living in London, I have be...",15,-1,0.9375,"[('Since', 'IN'), ('I', 'PRP'), ('have', 'VBP'...",7,2,...,0,-1,1,0,0,0,0,0,0,0
1,805208,"If she heard my description of her, she would ...",12,"If she had heard my description of her, she wo...",13,1,1.083333,"[('If', 'IN'), ('she', 'PRP'), ('heard', 'VBD'...",2,2,...,1,1,0,0,0,0,0,0,0,0
2,553823,I never can do.,4,I could never do it.,5,1,1.25,"[('I', 'PRP'), ('never', 'RB'), ('can', 'MD'),...",1,0,...,1,0,0,0,0,1,0,0,0,0
3,792625,In the class I learned how to write paragraph ...,10,In the class I learned how to write paragraphs.,9,-1,0.9,"[('In', 'IN'), ('the', 'DT'), ('class', 'NN'),...",2,2,...,0,0,0,-1,0,0,0,0,0,0
4,686799,I think imitating is important thing to learn ...,9,I think imitating is important to learning Eng...,8,-1,0.888889,"[('I', 'PRP'), ('think', 'VBP'), ('imitating',...",4,2,...,0,0,-1,0,0,0,0,0,0,0


In [4]:
# Change the column names in the dataframe
df.rename(columns = {'corrected_fs':'corrected'}, inplace = True)
df=df[['original','corrected']]

In [5]:
# Split the dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

## Tokenization

In [6]:
# Instantiate the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

In [7]:
# Create tokenization and encoding for training and test sets
train_encodings = tokenizer(list(train_df['original']), truncation=True, padding=True,return_tensors='pt')
val_encodings = tokenizer(list(val_df['original']), truncation=True, padding=True,return_tensors='pt')

train_labels = tokenizer(list(train_df['corrected']), truncation=True, padding=True,return_tensors='pt')
val_labels = tokenizer(list(val_df['corrected']), truncation=True, padding=True,return_tensors='pt')

In [8]:
tokenizer.decode(train_encodings['input_ids'][1])

'<s>She is one of my close friend but it looks like younger sister or family so I have been feeling happy for it.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [9]:
tokenizer.decode(train_labels['input_ids'][1])

'<s>She is one of my close friends but she is like a like younger sister or family member so I have been feeling happy about it.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [10]:
# Assign cuda to the device to use for training
if torch.cuda.is_available(): 
 dev = "cuda:0" 
 print("This model will run on CUDA")
elif  torch.backends.mps.is_available(): 
 dev = "mps:0"
 print("This model will run on MPS")
else:
 dev = "cpu" 
 print("This model will run on CPU")
device = torch.device(dev) 

print(device)

This model will run on CUDA
cuda:0


## Fine-tune the BART Model

In [11]:
#Create a PyTorch dataset
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels['input_ids'][idx])
    return item

  def __len__(self):
    return len(self.encodings['input_ids'])

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [12]:
# Identify the model name from Hugging Face
checkpoint = "facebook/bart-large"

In [13]:
# Instantiate the model
model = BartForConditionalGeneration.from_pretrained(checkpoint).to(device)
#model.to(device)

In [14]:
# Instantiate the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5,no_deprecation_warning=True)

In [15]:
# Train the BART model

from torch.utils.data import DataLoader
from tqdm.auto import tqdm

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch, use_cache=False)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        progress_bar.update(1)
    print("Epoch {} train loss: {}".format(epoch, train_loss / len(train_loader)))

  0%|          | 0/3375 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


Epoch 0 train loss: 1.4526453834176063
Epoch 1 train loss: 0.15145534869035085
Epoch 2 train loss: 0.1351730041437679


In [1]:
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

model.eval()
val_loss = 0
reference_corpus = []
predicted_corpus = []

num_validation_steps = len(val_loader)
progress_bar = tqdm(range(num_validation_steps))

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # Store the labels in a separate variable and remove labels from the batch
    labels = batch['labels']
    batch.pop('labels')

    with torch.no_grad():
        outputs = model.generate(**batch, use_cache=False)
        for i in range(len(outputs)):
            predicted_sentence = tokenizer.decode(outputs[i], skip_special_tokens=True)
            reference_sentence = tokenizer.decode(labels[i], skip_special_tokens=True)
            reference_corpus.append([reference_sentence.split()])
            predicted_corpus.append(predicted_sentence.split())
            val_loss += model(**batch, use_cache=False, labels=labels).loss.item()
            progress_bar.update
        print("Validation loss: {}".format(val_loss / len(val_loader)))

NameError: name 'DataLoader' is not defined

In [17]:
print("Validation loss: {}".format(val_loss / len(val_loader)))

Validation loss: 6.594473804473877


In [18]:
reference_corpus

[[['I', 'will', 'write', 'about', 'Adobe', 'Creative', 'Suite', '5.']],
 [['But',
   'I',
   'also',
   'felt',
   'like',
   'I',
   'was',
   'set',
   'free',
   'from',
   'the',
   'time,',
   'then.']],
 [['And', 'I', 'have', 'some', 'information!']],
 [['I', 'played', 'at', 'an', 'amateur', 'baseball', 'game', 'at', '1100.']],
 [['In',
   'this',
   'case,',
   'it',
   'would',
   'mean',
   'Hello,',
   'or',
   'something',
   'like',
   'that.']],
 [['I', "don't", 'like', 'it.']],
 [['I',
   'was',
   'looking',
   'forward',
   'to',
   'the',
   'blue',
   'sky,',
   'blue',
   'ocean',
   'and',
   'shining',
   'sun!']],
 [['This',
   'article',
   'may',
   'seem',
   'boring',
   'to',
   'you',
   'but',
   "I'm",
   'really',
   'struggling',
   'to',
   'post',
   'this.']],
 [['I',
   'think',
   'both',
   'Japanese',
   'men',
   'and',
   'women',
   'are',
   'really',
   'shy',
   'about',
   'romance.']],
 [['It',
   'means',
   'if',
   'you',
   'wake',
   

In [19]:
predicted_corpus

[['I', 'will', 'write', 'about', 'Adobe', 'Creative', 'Suite', '5.'],
 ['But', 'I', 'also', 'felt', 'free', 'at', 'the', 'time,', 'then.'],
 ['And', 'I', 'have', 'information!'],
 ['I', 'played', 'an', 'amateur', 'baseball', 'game', 'at', '1100.'],
 ['In',
  'this',
  'case,',
  'it',
  'means',
  'Hello',
  'or',
  'something',
  'like',
  'that.'],
 ['I', 'dont', 'like', 'it.'],
 ['I',
  'was',
  'looking',
  'forward',
  'to',
  'blue',
  'sky,',
  'blue',
  'ocean',
  'and',
  'shining',
  'sun!'],
 ['This',
  'article',
  'seems',
  'plain',
  'to',
  'you',
  'but',
  "I'm",
  'really',
  'struggling',
  'to',
  'post',
  'this.'],
 ['I',
  'think',
  'both',
  'Japanese',
  'men',
  'and',
  'women',
  'are',
  'really',
  'shy',
  'about',
  'their',
  'romance.'],
 ['It',
  'means',
  'if',
  'you',
  'wake',
  'up',
  'early,',
  'you',
  'can',
  'get',
  'some',
  'kind',
  'of',
  'benefit.'],
 ['If',
  'I',
  'had',
  'just',
  'a',
  'little',
  'fee',
  'time,',
  'I',


In [20]:
# Save the trained model and tokenizer
output_dir = "../7. Models/"+filename+"_"+checkpoint+"/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('../7. Models/40K_facebook/bart-large/tokenizer_config.json',
 '../7. Models/40K_facebook/bart-large/special_tokens_map.json',
 '../7. Models/40K_facebook/bart-large/vocab.json',
 '../7. Models/40K_facebook/bart-large/merges.txt',
 '../7. Models/40K_facebook/bart-large/added_tokens.json')

In [21]:
# Check the BLEU (Bilingual Evaluation Understudy) score
from nltk.translate.bleu_score import sentence_bleu

# Calculate BLEU score for each sentence pair
bleu_scores = []
for i in range(len(predicted_corpus)):
    predicted_sentence = predicted_corpus[i]
    reference_sentences = reference_corpus[i]  # Each element in reference_corpus should be a list of sentences
    bleu_score = sentence_bleu(reference_sentences, predicted_sentence)
    bleu_scores.append(bleu_score)

average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU score:", average_bleu_score)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU score: 0.46720221575447385
