In [6]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# can be copied to and from Drive by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! git clone https://github.com/gyuboone/VLM.git

Cloning into 'VLM'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 79 (delta 17), reused 70 (delta 11), pack-reused 0[K
Receiving objects: 100% (79/79), 1.73 MiB | 43.19 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [None]:
cd /content/VLM/CLIP

In [None]:
! pwd

/content/VLM/CLIP


In [2]:
! pip install ftfy

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0


## Download data

In [None]:
! mkdir data
! mkdir data/mscoco
! wget http://images.cocodataset.org/zips/train2017.zip -O data/mscoco/train2017.zip
! unzip data/mscoco/train2017.zip -d data/mscoco

! wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O data/mscoco/annotations_trainval2017.zip
! unzip data/mscoco/annotations_trainval2017.zip -d data/mscoco

In [7]:
!ls ./data/mscoco/train2017 -l | grep ^- | wc -l

118287


Colab 무료 환경에서는 118,287개의 이미지 전체를 학습하는 것이 불가능하여, 파일명이 `0000001`–`0000006`으로 시작하는 이미지를 삭제하고 data의 일부(20,368개)만 남김

In [8]:
!find ./data/mscoco/train2017 -name "0000001*" -type f -delete

In [9]:
!ls ./data/mscoco/train2017 -l | grep ^- | wc -l

98063


In [10]:
!find ./data/mscoco/train2017 -name "0000002*" -type f -delete
!find ./data/mscoco/train2017 -name "0000003*" -type f -delete
!find ./data/mscoco/train2017 -name "0000004*" -type f -delete
!find ./data/mscoco/train2017 -name "0000005*" -type f -delete
!find ./data/mscoco/train2017 -name "0000006*" -type f -delete

In [11]:
!ls ./data/mscoco/train2017 -l | grep ^- | wc -l

20368


## Training

In [2]:
import torch
import torch.nn.functional as F
import numpy as np
import random
import os

from dataloader.dataset import CLIP_COCO_dataset
from dataloader.data_loaders import get_dataloader

from CLIP import CLIP
from utils.simple_tokenizer import SimpleTokenizer
from utils import set_seed, mkdir

import time

from torch.optim import Adam, AdamW # both are same but AdamW has a default weight decay
import gc
# Run a garbage-collection pass and release PyTorch's cached CUDA memory
# before (re)building the model in this cell.
gc.collect()
torch.cuda.empty_cache()

# Enable autoreload in mode 2 so edited project modules are re-imported.
# When the cell is re-run in the same kernel, %load_ext prints an
# "already loaded" notice (visible in this cell's recorded output).
%load_ext autoreload
%autoreload 2


################## setting start ##################
# Hyperparameters
lr = 1e-4
batch_size = 96
epochs = 6


# Seed every random number generator used here: Python, NumPy, PyTorch (CPU and CUDA)
seed = 7
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Text tokenizer (handed to the dataset below)
tokenizer = SimpleTokenizer()


# Model configuration, passed to CLIP as keyword arguments
model_params = dict(
    embed_dim=1024,
    image_resolution=224,
    vision_layers=(3, 4, 6, 3),   # given as a tuple
    vision_width=64,
    vision_patch_size=None,       # not used with this configuration
    context_length=77,
    vocab_size=49408,
    transformer_width=512,
    transformer_heads=8,
    transformer_layers=6,         # 12 in CLIP
)
model = CLIP(**model_params).to(device)

# Training can be resumed by loading a saved .pth file:
# model.load_state_dict(torch.load('model_weights.pth', map_location=device))

# Dataset and dataloader
# NOTE(review): earlier cells delete part of train2017 while the caption file
# is left untouched - confirm CLIP_COCO_dataset copes with captions whose
# image file no longer exists.
train_img_dir = 'data/mscoco/train2017'
train_annotation_file = 'data/mscoco/annotations/captions_train2017.json'

train_dataset = CLIP_COCO_dataset(train_annotation_file, train_img_dir, tokenizer)
train_dataloader = get_dataloader(train_dataset, batch_size, is_train=True)

# Optimizer over all model parameters
optimizer = AdamW(params=model.parameters(), lr=lr)


# setting loss function
def loss_function(logits_img, logits_txt):
    """Return the mean of the image-side and text-side cross-entropy losses.

    Row i of each logits tensor is scored against target class i, i.e. the
    targets are 0..N-1 where N is the size of the first dimension of
    ``logits_img``.

    Args:
        logits_img: logits tensor for the image side.
            Assumes shape (N, N) pairwise scores - TODO confirm against the model.
        logits_txt: logits tensor for the text side, with the same number of rows.

    Returns:
        Scalar tensor: (cross_entropy(logits_img) + cross_entropy(logits_txt)) / 2.
    """
    # Build the targets on the same device as the logits instead of relying on
    # a notebook-level `device` variable, so the function works on its own.
    labels = torch.arange(logits_img.shape[0], device=logits_img.device)

    loss_i = F.cross_entropy(logits_img, labels)
    loss_t = F.cross_entropy(logits_txt, labels)
    return (loss_i + loss_t) / 2

################## setting end ##################




################ training epoch start #################
start = time.time()

for epoch in range(epochs):
    print(f"{epoch}th epoch starting.")
    model.train()
    running_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        # A batch unpacks into an (image, text) pair; move both to the compute device.
        img, txt = (item.to(device) for item in batch)

        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        logits_img, logits_txt = model(img, txt)
        loss = loss_function(logits_img, logits_txt)
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss to report a per-epoch average.
        running_loss += loss.item()

    print(f"[Epoch {epoch}th] loss: {running_loss/len(train_dataloader):.4f}")
end = time.time()
################ training epoch end #################
print(f"Time ellapsed in training is: {end-start}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
0th epoch starting.
[Epoch 0th] loss: 4.2644
1th epoch starting.
[Epoch 1th] loss: 4.2445
2th epoch starting.
[Epoch 2th] loss: 4.2242
3th epoch starting.
[Epoch 3th] loss: 4.2158
4th epoch starting.
[Epoch 4th] loss: 4.2056
5th epoch starting.
[Epoch 5th] loss: 4.2006
Time ellapsed in training is: 1827.7767841815948


In [3]:
# Write the model's state_dict to model_weights_2.pth
# (relative path, so it lands in the current working directory).
torch.save(model.state_dict(), 'model_weights_2.pth')

In [4]:
# pth 저장

! cp model_weights_2.pth ./drive/MyDrive/CLIP_MSCOCO.pth