In [None]:
import os
import gc
import copy
import time
import random
import joblib

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# For Transformer Models
from transformers import AutoProcessor, AdamW
from transformers import BlipForConditionalGeneration




# Suppress warnings
# NOTE(review): a blanket "ignore" also hides deprecation warnings from the
# libraries used below — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid that is generally
# understood to make CUDA calls synchronous and slow training down — confirm
# it is still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Disable parallelism inside the tokenizers library (presumably to avoid its
# fork-related warning — confirm).
os.environ['TOKENIZERS_PARALLELISM'] = "False"

In [None]:
# Central settings table read by the cells below.
Config = {
    "seed": 2023,  # value handed to set_seed() for reproducible runs
    "epochs": 3,  # NOTE: the training cell currently passes num_epochs=1 explicitly
    "model_name": "Salesforce/blip-image-captioning-base",
    "training_batch_size": 4,
    "valid_batch_size": 8,
    "learning_rate": 1e-4,
    "scheduler": "CosineAnnealingLR",  # scheduler name resolved by fetch_scheduler()
    "min_lr": 1e-6,  # passed as eta_min to the scheduler
    "T_max": 500,  # passed as T_max to CosineAnnealingLR
    "weight_decay": 1e-6,
    "n_accumulate": 1,  # the optimizer steps once every `n_accumulate` batches
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}


In [None]:
# Load the processor that belongs to the pretrained checkpoint named in Config and
# store it in Config so the dataset wrappers created later can reuse it.
Config["processor"] = AutoProcessor.from_pretrained(Config['model_name'])

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's `random` module, NumPy, and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic mode, so repeated runs with
    the same seed draw the same random numbers.

    Args:
        seed: Integer seed applied to all generators.
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE(review): this only influences Python processes started afterwards;
    # the hash seed of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the notebook-wide seed. Any other library or framework that keeps its own
# random state may need to be seeded separately for full reproducibility.
set_seed(Config['seed'])

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
from datasets import load_dataset

# Load the `2m_first_5k` subset of the `poloclub/diffusiondb` dataset.
# The returned object is indexed by split name; the next cell selects 'train'.
dataset = load_dataset('poloclub/diffusiondb', '2m_first_5k')

In [None]:
# Keep only the 'train' split, then only the rows whose `step` column equals 50,
# and display how many rows remain.
# NOTE(review): `dataset` is rebound in place, so this cell cannot simply be
# re-run on its own — re-run the load cell above first.
dataset = dataset['train']
dataset = dataset.filter(lambda example: example["step"] == 50)
len(dataset)

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

4984

In [None]:
# Display the filtered dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['image', 'prompt', 'seed', 'step', 'cfg', 'sampler', 'width', 'height', 'user_name', 'timestamp', 'image_nsfw', 'prompt_nsfw'],
    num_rows: 4984
})

In [None]:
# Hold out 20% of the rows for validation. The shuffle is seeded from Config so the
# same train/validation partition is produced on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=Config['seed'])

In [None]:
class ImageCaptionDataset(Dataset):
    """Map-style dataset that turns (image, prompt) rows into model inputs.

    Args:
        dataset: Indexable collection whose items expose an "image" and a
            "prompt" entry.
        processor: Callable accepting `images`, `text`, `padding`,
            `truncation` and `return_tensors` keyword arguments and returning
            a mapping of tensors with a leading batch axis of size 1.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed tensors for row `idx`, without the batch axis."""
        item = self.dataset[idx]
        encoding = self.processor(
            images=item["image"],
            text=item["prompt"],
            padding="max_length",
            # Without truncation an over-long prompt yields a longer tensor than
            # the padded ones, and such samples cannot be stacked into a batch.
            truncation=True,
            return_tensors="pt",
        )
        # Remove only the leading batch axis; a bare squeeze() would also drop
        # any other axis that happens to have size 1.
        return {key: value.squeeze(0) for key, value in encoding.items()}


In [None]:
# Wrap both halves of the train/test split so each row is run through the shared
# processor on access; the 'test' half serves as the validation set.
train_dataset = ImageCaptionDataset(dataset['train'], Config['processor'])
valid_dataset = ImageCaptionDataset(dataset['test'], Config['processor'])

In [None]:
# Load the pretrained weights for the checkpoint named in Config; this is the model
# that the training cells below fine-tune.
model = BlipForConditionalGeneration.from_pretrained(Config['model_name'])

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training pass over `dataloader` and return the mean loss per sample.

    Gradients are accumulated over `Config['n_accumulate']` batches before each
    optimizer step. The scheduler, when given, is stepped once per optimizer
    step (not once per epoch).

    Args:
        model: Model called with `input_ids`, `pixel_values`, `attention_mask`
            and `labels`; its output must expose `.loss`.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, or None to keep the rate fixed.
        dataloader: Iterable of batches keyed by 'input_ids', 'pixel_values'
            and, optionally, 'attention_mask'.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        The running average loss weighted by batch size, or 0.0 when the
        dataloader yields no batches.
    """
    model.train()

    accumulate = Config['n_accumulate']
    num_batches = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # stays defined even when the dataloader is empty

    # Start from clean gradients so nothing left over from earlier work leaks in.
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=num_batches)
    for step, data in bar:
        input_ids = data['input_ids'].to(device)
        pixel_values = data['pixel_values'].to(device)
        # Forward the padding mask when the batch carries one, so padded
        # positions are not attended to.
        attention_mask = data['attention_mask'].to(device) if 'attention_mask' in data else None

        batch_size = input_ids.size(0)

        # NOTE(review): the labels are the padded input_ids; whether padding
        # positions are excluded from the loss depends on the model — confirm.
        outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        attention_mask=attention_mask,
                        labels=input_ids)

        loss = outputs.loss
        # Scale only the gradient contribution; `loss` itself stays unscaled so
        # the reported value does not shrink with the accumulation factor.
        (loss / accumulate).backward()

        # Step on every full accumulation window, and also on the final batch so
        # a trailing partial window is applied rather than carried over.
        if (step + 1) % accumulate == 0 or (step + 1) == num_batches:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    return epoch_loss


In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate `model` on `dataloader` and return the mean loss per sample.

    Runs with gradient tracking disabled and the model in eval mode.

    Args:
        model: Model called with `input_ids`, `pixel_values`, `attention_mask`
            and `labels`; its output must expose `.loss`.
        dataloader: Iterable of batches keyed by 'input_ids', 'pixel_values'
            and, optionally, 'attention_mask'.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.
        optimizer: Optional optimizer whose current learning rate is shown in
            the progress bar. The function no longer reaches for a global
            `optimizer`, so it also works when none exists.

    Returns:
        The running average loss weighted by batch size, or 0.0 when the
        dataloader yields no batches.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # stays defined even when the dataloader is empty

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        input_ids = data['input_ids'].to(device)
        pixel_values = data['pixel_values'].to(device)
        # Forward the padding mask when the batch carries one.
        attention_mask = data['attention_mask'].to(device) if 'attention_mask' in data else None

        batch_size = input_ids.size(0)

        outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        attention_mask=attention_mask,
                        labels=input_ids)

        loss = outputs.loss

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {'Epoch': epoch, 'Valid_Loss': epoch_loss}
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    return epoch_loss

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for `num_epochs` epochs and restore the best-validation weights.

    After every epoch the model is validated; whenever the validation loss is
    at least as good as the best seen so far, the weights are snapshotted in
    memory and written to "BestLoss.bin".

    NOTE: the batches come from the notebook-level `train_loader` and
    `valid_loader`, which must exist before this function is called.

    Args:
        model: Model to train.
        optimizer: Optimizer passed through to `train_one_epoch`.
        scheduler: Learning-rate scheduler (or None) passed through to
            `train_one_epoch`.
        device: Device the batches are moved to for training and validation.
        num_epochs: Number of epochs to run.

    Returns:
        A `(model, history)` tuple: the model loaded with the best weights, and
        a dict with the per-epoch 'Train Loss' and 'Valid Loss' lists.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        # Use the `device` argument rather than reading Config, so the caller's
        # choice is actually honoured.
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Keep a copy of the weights whenever validation does not get worse.
        if val_epoch_loss <= best_epoch_loss:
            best_epoch_loss = val_epoch_loss

            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "BestLoss.bin"
            torch.save(best_model_wts, PATH)

        print()

    end = time.time()
    time_elapsed = end - start
    # Split whole seconds with divmod so the seconds field can never round to 60.
    hours, remainder = divmod(int(time_elapsed), 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(hours, minutes, seconds))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [None]:
def fetch_scheduler(optimizer):
    """Build the learning-rate scheduler selected by `Config['scheduler']`.

    Args:
        optimizer: Optimizer whose learning rate the scheduler will drive.

    Returns:
        A `CosineAnnealingLR` or `CosineAnnealingWarmRestarts` instance, or
        None when `Config['scheduler']` is None.

    Raises:
        KeyError: If 'CosineAnnealingWarmRestarts' is selected but
            `Config['T_0']` is not set.
        ValueError: If `Config['scheduler']` names an unsupported scheduler.
    """
    name = Config['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=Config['T_max'],
                                              eta_min=Config['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        if 'T_0' not in Config:
            raise KeyError("Config['T_0'] must be set to use CosineAnnealingWarmRestarts")
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=Config['T_0'],
                                                        eta_min=Config['min_lr'])

    # Previously an unknown name fell through to an UnboundLocalError.
    raise ValueError(f"Unsupported scheduler: {name!r}")

In [None]:

# Batch the two datasets; only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=Config['training_batch_size'])
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=Config['valid_batch_size'])

model.to(Config['device'])

# Define Optimizer and Scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps=1e-6 keeps the
# epsilon that the transformers implementation used by default, so the update
# rule is unchanged.
optimizer = optim.AdamW(model.parameters(), lr=Config['learning_rate'],
                        weight_decay=Config['weight_decay'], eps=1e-6)
scheduler = fetch_scheduler(optimizer)

# NOTE(review): this trains for a single epoch; Config['epochs'] is not used here.
model, history = run_training(model, optimizer, scheduler,
                              device=Config['device'],
                              num_epochs=1)
torch.save(model.state_dict(), 'model.pt')


[INFO] Using GPU: Tesla T4



  0%|          | 0/997 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 997/997 [23:55<00:00,  1.44s/it, Epoch=1, LR=0.0001, Train_Loss=0.526]
100%|██████████| 125/125 [02:09<00:00,  1.04s/it, Epoch=1, LR=0.0001, Valid_Loss=0.214]



Training complete in 0h 26m 14s
Best Loss: 0.2141


In [None]:
# NOTE(review): `x` is not assigned in any cell visible in this notebook, so this
# cell depends on leftover kernel state and fails on a fresh kernel
# (Restart & Run All). Define `x` explicitly or remove the cell.
print(x)

0.314619779586792


In [None]:
# Export the fine-tuned model to the "model_blip_ver3" directory.
# NOTE(review): only the model is saved here; the processor held in
# Config['processor'] is not written alongside it.
model.save_pretrained("model_blip_ver3")