# Import Libraries

In [1]:
import datasets
import transformers
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from pathlib import Path



#Tokenizer from scratch on vocabulary of corpus
from tokenizers import ByteLevelBPETokenizer

# Decoder
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM # RobertaLM for learning
from transformers import RobertaTokenizerFast # After training tokenizern we will wrap it so it can be used by Roberta model


#Training
# When using previous version of the library you need the following two lines
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import Trainer, TrainingArguments

# Parameters for Training

In [123]:
# Hyper-parameters for the notebook, each defined exactly once.

# Batch sizes (passed to the trainer as per-device batch sizes)
TRAIN_BATCH_SIZE = 20   # training batch size
VALID_BATCH_SIZE = 5    # evaluation batch size

# Training schedule / optimiser
TRAIN_EPOCHS = 2        # number of epochs to train
VAL_EPOCHS = 1          # not referenced by the cells visible in this notebook
LEARNING_RATE = 1e-4    # learning rate
WEIGHT_DECAY = 0.01     # weight decay

# Reproducibility
SEED = 42               # random seed

# Sequence lengths
MAX_LEN = 128           # max token length used when encoding a caption
SUMMARY_LEN = 20        # maximum length of caption generated by the model

# Preparing the Dataset

In [3]:
import json
import os

# Project folder holding the dataset; every relative path used later in the
# notebook (text_split/, Byte_tokenizer/, RobertaMLM/) resolves against it.
# The original machine-specific location stays the default; set the
# CAPTION_PROJECT_DIR environment variable to run the notebook elsewhere.
PROJECT_DIR = os.environ.get(
    'CAPTION_PROJECT_DIR',
    r'C:\Users\kalpe\Dropbox\PC\Documents\Great lakes\ML project',
)
os.chdir(PROJECT_DIR)

# Read the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding (which is locale-dependent, e.g. on Windows).
with open('anticipated_dataset.json', 'r', encoding='utf-8') as openfile:
    json_object = json.load(openfile)

# Mapping whose keys are used as image identifiers and whose values are
# iterated as the captions of that image in the next cell.
images_caption_dict = dict(json_object)


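# The dataset keys have already been preprocessed to point at the correct image directory,
# so the remapping below is kept commented out for reference only.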

# images_path = 'C:/Users/kalpe/Documents/Great lakes/ML project/Flickr8k_Dataset/Flicker8k_Dataset/'
# images = list(images_caption_dict.keys())
# for image_path in images:
#     if image_path.endswith('jpg'):
#         new = images_path + image_path.split('/')[-1]
#         images_caption_dict[new] = images_caption_dict.pop(image_path)
#     else:
#         images_caption_dict.pop(image_path)

In [7]:
import pandas as pd

# Flatten the image -> captions mapping into two parallel lists:
# one (image, caption) pair per caption.
images = []
captions = []
for image_key, caption_group in images_caption_dict.items():
    for raw_caption in caption_group:
        # Drop the '<s> ' start marker and the '  <e>' end marker, then trim whitespace
        without_start = raw_caption.replace('<s> ','')
        without_markers = without_start.replace('  <e>','')
        captions.append(without_markers.strip())
        images.append(image_key)

# One row per caption; the image key is repeated for each of its captions
df = pd.DataFrame({'images': images, 'captions': captions})

# ROBERTA
### Training the Decoder Model for Language Understanding and Building the Vocabulary

### Tokenizer
#### Converting captions into .txt files for training the tokenizer

In [232]:
# Store values in a dataframe column (Series object) to files, one file per record.
# The output directory is created first; exist_ok=True lets this cell be re-run
# without raising FileExistsError when the directory is already there.
os.makedirs("./text_split", exist_ok=True)
def column_to_files(column, prefix, txt_files_dir = "./text_split"):
    """Write every value of ``column`` to its own UTF-8 encoded text file.

    Files are named ``<id>.txt`` inside ``txt_files_dir``. The id starts at
    ``prefix`` and grows by one per row, including rows that fail to write,
    so ids never collide between successive calls.

    Args:
        column: object exposing ``to_list()`` (e.g. a pandas Series) whose
            values are expected to be strings.
        prefix: integer id used for the first file name.
        txt_files_dir: directory receiving the files; created when missing.

    Returns:
        The next unused id, i.e. ``prefix`` plus the number of rows.
    """
    # Make sure the target directory exists so a fresh run does not fail on
    # every row (an empty string means "current directory": nothing to create).
    if txt_files_dir:
        os.makedirs(txt_files_dir, exist_ok=True)

    rows = column.to_list()
    for file_id, row in enumerate(rows, start=prefix):
        # The id makes each file name unique so no file is overwritten
        file_name = os.path.join(txt_files_dir, str(file_id) + '.txt')
        try:
            # The context manager closes the handle even when encode/write
            # raises (e.g. for a missing, non-string row).
            with open(file_name, 'wb') as f:
                f.write(row.encode('utf-8'))
        except Exception as e:  # best effort: report the bad row and carry on
            print(row, e)
    # Return the next free id
    return prefix + len(rows)

data = df["captions"]
# Remove embedded end-of-line characters from every caption.
# Series.replace("\n", " ") would only swap values that are exactly "\n";
# the string accessor performs the intended substring replacement.
data = data.str.replace("\n", " ", regex=False)
# Start numbering the files at 0
prefix = 0
# Create one file per caption; afterwards `prefix` holds the next unused ID
prefix = column_to_files(data, prefix)

#### Training tokenizer

In [234]:
%%time 
# Every caption file written to text_split/, as plain string paths
paths = list(map(str, Path(".").glob("text_split/*.txt")))

# Special tokens handed to the BPE trainer.
# NOTE(review): RoBERTa tokenizers normally use `</s>` as the end/separator
# token; this list uses `<e>` instead — confirm the wrapped RobertaTokenizerFast
# does not add `</s>` with an id outside the model's vocab_size.
special_tokens = ["<s>", "<pad>", "<e>", "<unk>", "<mask>"]

# Byte-level BPE tokenizer, created with lowercase=True
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Learn the vocabulary and merges from the caption files
tokenizer.train(
    files=paths,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
)

Wall time: 2min 3s


#### Save Tokenizer

In [237]:

# Write the trained tokenizer's vocab.json and merges.txt to Byte_tokenizer/.
# exist_ok=True keeps the cell re-runnable once the folder already exists
# (os.mkdir would raise FileExistsError on the second run).
os.makedirs('Byte_tokenizer', exist_ok=True)
tokenizer.save_model('Byte_tokenizer')

['Byte_tokenizer\\vocab.json', 'Byte_tokenizer\\merges.txt']

## Decoder
#### Initialization & Training

In [238]:
# Configuration of the RoBERTa masked language model trained in this notebook
config = RobertaConfig(
    vocab_size=10000,  # same value that was passed to tokenizer.train(vocab_size=...)
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Model built from the config alone (no pretrained checkpoint is loaded here)
model = RobertaForMaskedLM(config=config)

print('Num parameters: ',model.num_parameters())

# Create the tokenizer from a trained one.
# `model_max_length` is the documented keyword for the maximum input length;
# the previously used `max_len` is only a deprecated alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained('Byte_tokenizer', model_max_length=MAX_LEN)

In [223]:
class CustomDataset(Dataset):
    """Dataset of token-id sequences, one per text in the given column.

    All texts are tokenized eagerly in the constructor; items are returned as
    1-D tensors of token ids.
    """

    def __init__(self, df, tokenizer):
        """Tokenize every value of ``df`` (anything exposing ``.values``).

        Each text is truncated to ``MAX_LEN`` tokens.
        NOTE(review): texts are encoded one at a time, so ``padding=True``
        presumably adds no padding here — confirm against the tokenizer docs.
        """
        self.examples = [
            tokenizer.encode_plus(
                text, max_length = MAX_LEN, truncation=True, padding=True
            ).input_ids
            for text in df.values
        ]

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a tensor.

        Sequences keep their individual lengths; padding is left to the batch
        level (the data collator).
        """
        token_ids = self.examples[i]
        return torch.tensor(token_ids)

In [225]:
# Split the captions by position: the first 38000 rows train the model,
# every remaining row is held out for evaluation.
caption_column = df['captions']
train_dataset = CustomDataset(caption_column[:38000], tokenizer)
eval_dataset = CustomDataset(caption_column[38000:], tokenizer)

#### Batching Data

In [245]:
from transformers import DataCollatorForLanguageModeling

# Collator used by the Trainer to assemble batches for masked language
# modelling (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

## Training the Decoder

In [246]:
# Folder receiving checkpoints and, later, the final saved model
model_folder = "RobertaMLM"

# Define the training arguments
training_args = TrainingArguments(
    output_dir=model_folder,
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',      # run evaluation at the end of every epoch
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    seed=SEED,                          # wire the notebook's SEED into the run (it was defined but unused)
    save_steps=8192,                    # checkpoint every 8192 optimisation steps
    save_total_limit=1,                 # older checkpoints are deleted once a new one is saved
)

# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [247]:
# Train the model
# NOTE(review): the saved log below reports 15 epochs and a per-device batch
# size of 16, while the parameter cell sets TRAIN_EPOCHS = 2 and
# TRAIN_BATCH_SIZE = 20 — the captured output appears to come from a run with
# different settings; re-run to refresh it.
trainer.train()

***** Running training *****
  Num examples = 38000
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 35625


Epoch,Training Loss,Validation Loss
1,4.1514,3.852298
2,3.4353,3.333788
3,3.1711,3.140997
4,2.919,2.93147
5,2.8268,2.734056
6,2.6262,2.689468
7,2.5111,2.565856
8,2.4199,2.635178
9,2.3687,2.596702
10,2.3247,2.430679


***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
Saving model checkpoint to RobertaMLM\checkpoint-8192
Configuration saved in RobertaMLM\checkpoint-8192\config.json
Model weights saved in RobertaMLM\checkpoint-8192\pytorch_model.bin
Deleting older checkpoint [RobertaMLM\checkpoint-6500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2455
  Batch size = 4
Saving model checkpoint to RobertaMLM\checkpoint-16384
Configuration saved in RobertaMLM\checkpoint-16384\config.json
Model weights saved in RobertaMLM\checkpoint-16384\pytorch_model.bin
Deleting older checkpoint [RobertaMLM\checkpoint-8192] due to args.save_total_limit
***** Running Evaluation *****
  Num exam

TrainOutput(global_step=35625, training_loss=2.685712823807566, metrics={'train_runtime': 48820.4074, 'train_samples_per_second': 11.675, 'train_steps_per_second': 0.73, 'total_flos': 3127241154338304.0, 'train_loss': 2.685712823807566, 'epoch': 15.0})

#### Check Perplexity score of the model

In [140]:
import math

# Evaluate on the held-out captions and report exp(eval loss) as perplexity
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])

print(f"Perplexity: {perplexity:.2f}")

### Saving tokenizer & Model to use in Encoder Decoder architecture

In [249]:
# Save the wrapped RoBERTa tokenizer into the same folder as the raw BPE files,
# so the folder can later be passed as `tokenizer=` when loading a pipeline.
tokenizer.save_pretrained('Byte_tokenizer')

tokenizer config file saved in Byte_tokenizer\tokenizer_config.json
Special tokens file saved in Byte_tokenizer\special_tokens_map.json


('Byte_tokenizer\\tokenizer_config.json',
 'Byte_tokenizer\\special_tokens_map.json',
 'Byte_tokenizer\\vocab.json',
 'Byte_tokenizer\\merges.txt',
 'Byte_tokenizer\\added_tokens.json',
 'Byte_tokenizer\\tokenizer.json')

In [250]:
# Write the trained model (configuration and weights) to `model_folder`
trainer.save_model(model_folder)

Saving model checkpoint to RobertaMLM
Configuration saved in RobertaMLM\config.json
Model weights saved in RobertaMLM\pytorch_model.bin


# Evaluating Decoder(ROBERTA)

In [4]:
from transformers import pipeline

# Build a fill-mask pipeline from the model and tokenizer folders saved above
fill_mask = pipeline("fill-mask", model='RobertaMLM', tokenizer='Byte_tokenizer')

In [5]:
# Sanity check: ask the trained model for candidate words at the <mask> position
fill_mask("a girl going into a <mask> building")

[{'sequence': 'a girl going into a brick building',
  'score': 0.2611638009548187,
  'token': 1043,
  'token_str': ' brick'},
 {'sequence': 'a girl going into a white building',
  'score': 0.10348810255527496,
  'token': 340,
  'token_str': ' white'},
 {'sequence': 'a girl going into a red building',
  'score': 0.0433187335729599,
  'token': 377,
  'token_str': ' red'},
 {'sequence': 'a girl going into a large building',
  'score': 0.03820646554231644,
  'token': 491,
  'token_str': ' large'},
 {'sequence': 'a girl going into a blue building',
  'score': 0.03518558293581009,
  'token': 402,
  'token_str': ' blue'}]

## This RoBERTa model will be used as the decoder in our image-captioning model and will be connected to the ViT encoder through cross-attention heads.