<a href="https://colab.research.google.com/github/chakkarinsan2/flickr30k/blob/main/clip_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
# Check which Python interpreter is first on PATH and report its version.
# The %%bash cell magic has to be the very first line of the cell, so this
# comment lives below it rather than above.
which python
python --version

In [None]:
%%bash

# Clear PYTHONPATH for this shell before installing conda.
# NOTE(review): presumably this keeps the runtime's own Python packages from
# interfering with the conda install — confirm.
export PYTHONPATH=""

# Download the pinned Miniconda installer and run it against /usr/local
# (installer flags: -b = batch/no prompts, -f = do not fail if the prefix
# already exists, -p = install prefix).
conda_version='Miniconda3-py37_4.9.2-Linux-x86_64.sh'
wget https://repo.anaconda.com/miniconda/${conda_version}
chmod +x ${conda_version}
./${conda_version} -b -f -p /usr/local

# Pin conda's Python to 3.7, then update every installed package from the
# defaults channel.
conda install --channel defaults conda python=3.7 --yes
conda update --channel defaults --all --yes

In [None]:
%%bash
# Report which interpreter is first on PATH and its version.
which python
python --version

In [None]:
%%bash

# Install CLIP's dependencies: PyTorch 1.7.1 with torchvision and CUDA 11.0
# from the pytorch channel, then the pure-Python helpers through pip.
conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
pip install ftfy regex tqdm wget

# Install CLIP itself straight from the OpenAI GitHub repository.
pip install git+https://github.com/openai/CLIP.git

In [None]:
import sys

# Make the packages installed under the conda prefix importable from this kernel.
# The append is guarded so re-running the cell does not pile up duplicate entries,
# and the result (always None) is no longer bound to `_`, which IPython uses for
# the most recent cell output.
CONDA_SITE_PACKAGES = "/usr/local/lib/python3.7/site-packages"
if CONDA_SITE_PACKAGES not in sys.path:
    sys.path.append(CONDA_SITE_PACKAGES)

In [None]:
import torch
import clip

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

In [None]:
from pathlib import Path

# สร้างโฟลเดอร์ชื่อ xxx เพื่อ mount drive จาก google drive ให้ไปทำงานใน google colab
!mkdir flickr30k

# โหลด zip file รูปภาพ
if not Path('flickr30k/flickr30k_images01.zip').exists():
  !wget https://github.com/chakkarinsan2/flickr30k/releases/download/training/flickr30k_images01.zip -O flickr30k/flickr30k_images01.zip

if not Path('flickr30k/flickr30k_images02.zip').exists():
  !wget https://github.com/chakkarinsan2/flickr30k/releases/download/training/flickr30k_images02.zip -O flickr30k/flickr30k_images02.zip

if not Path('flickr30k/flickr30k_images03.zip').exists():
  !wget https://github.com/chakkarinsan2/flickr30k/releases/download/training/flickr30k_images03.zip -O flickr30k/flickr30k_images03.zip

# โหลดไฟล์ caption แบบ csv
if not Path('flickr30k/results_revised.csv').exists():
  !wget https://github.com/chakkarinsan2/flickr30k/releases/download/training/results_revised.csv -O flickr30k/results_revised.csv

# โหลดไฟล์คุณลักษณะรูปภาพของ image_feature.npy (haltakov, 2021)
if not Path('flickr30k/image_features.npy').exists():
  !wget https://github.com/chakkarinsan2/flickr30k/releases/download/training/image_features.npy -O flickr30k/image_features.npy

In [None]:
# แตก zip รูปภาพลงในโฟลเดอร์  flickr30k_images

!unzip "/content/flickr30k/flickr30k_images01.zip" -d "/content/flickr30k/flickr30k_images"

!unzip "/content/flickr30k/flickr30k_images02.zip" -d "/content/flickr30k/flickr30k_images"

!unzip "/content/flickr30k/flickr30k_images03.zip" -d "/content/flickr30k/flickr30k_images"


In [None]:
# Path of the extracted Flickr30k image folder, reused by the cells below.
path = '/content/flickr30k/flickr30k_images'

In [None]:
# Count the entries in the image folder (expected: 30000).
import os
len(os.listdir(path))

In [None]:
import pandas as pd

# Load the caption file results_revised.csv. Every line is first read into a
# single column and then split on commas into separate columns.
# NOTE(review): presumably '|' never occurs in the file, so sep='|' keeps each
# line whole — confirm against the data.
df = pd.read_csv('/content/flickr30k/results_revised.csv', header=None, sep='|')
df = df[0].str.split(',', expand=True)

# Number of caption rows and columns.
print(df.shape)

# Preview the first rows only; the full size is already reported by df.shape,
# so there is no need to request every row with a hard-coded count.
df.head()

In [None]:
# Collect the names of the jpg files found in the image folder.
data = [filename for filename in os.listdir(path) if filename.endswith("jpg")]

# Report the total and a short sample instead of printing one line per image,
# which would bury the rest of the notebook under the output.
print(len(data), "jpg files found")
print("\n".join(data[:10]))

**Training Model**

In [None]:
# Install CLIP's Python dependencies and CLIP itself from GitHub.
# NOTE(review): an earlier %%bash cell already installs the same packages —
# confirm this repeat is still needed.
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

In [None]:
import numpy as np
import torch
# NOTE(review): `packaging` is not referenced by any cell visible in this
# notebook — confirm before relying on or removing this import.
from pkg_resources import packaging

# Report the installed PyTorch version.
print("Torch version:", torch.__version__)

In [None]:
import clip

# Show the model names that clip.load() accepts.
clip.available_models()

In [None]:
# Load the ViT-B/32 checkpoint together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32")

# Architecture constants reported below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# The parameter count is the number of scalar entries summed over every tensor.
print("Model parameters:", f"{sum(param.numel() for param in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

In [None]:
# get full dataset (image+text)
import os.path
from typing import Dict, Tuple

import pandas as pd
import tensorflow as tf


def get_full_dataset(
    batch_size: int = 32,
    image_size: Tuple[int, int] = (256, 256),
    data_dir: str = "/content/flickr30k",
) -> tf.data.Dataset:
    """Build a batched tf.data pipeline of (resized image, integer label) pairs.

    Reads ``results_revised.csv`` from ``data_dir``. The file must provide an
    ``image`` column (file name without the ``.jpg`` extension) and a ``label``
    column; images are looked up in ``<data_dir>/flickr30k_images/``.
    NOTE(review): other cells read this CSV with ``header=None`` — confirm the
    file really carries ``image`` / ``label`` headers before using this function.

    Args:
        batch_size: Number of examples per batch.
        image_size: (height, width) every decoded image is resized to.
        data_dir: Folder containing the CSV and the image folder. The default
            resolves to the same locations the function used before; it
            replaces a reference to an undefined ``DATA_ABS_PATH`` global.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches, with
        labels stored as ``tf.uint8`` class ids.

    Raises:
        ValueError: If there are more than 256 distinct labels, which cannot be
            represented as ``tf.uint8`` ids.
    """
    data = pd.read_csv(os.path.join(data_dir, "results_revised.csv"))
    images_path = os.path.join(data_dir, "flickr30k_images/")

    image_files = data["image"].map(lambda x: os.path.join(images_path, f"{x}.jpg"))
    filenames: tf.Tensor = tf.constant(image_files.tolist(), dtype=tf.string)

    label_names = data["label"].str.lower()
    # Number the classes in order of first appearance. Iterating a set of
    # strings instead would give ids that change between interpreter sessions.
    class_name_to_label: Dict[str, int] = {
        label: i for i, label in enumerate(label_names.unique())
    }
    if len(class_name_to_label) > 256:
        raise ValueError(
            f"{len(class_name_to_label)} distinct labels do not fit in uint8 class ids"
        )
    labels: tf.Tensor = tf.constant(
        label_names.map(class_name_to_label.__getitem__).tolist(), dtype=tf.uint8
    )
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))

    def _parse_function(filename, label):
        # Force three channels so grayscale and colour files decode to the same
        # shape; otherwise batching fails on mixed channel counts.
        jpg_image: tf.Tensor = tf.io.decode_jpeg(tf.io.read_file(filename), channels=3)
        return tf.image.resize(jpg_image, size=image_size), label

    dataset = dataset.map(_parse_function)
    return dataset.batch(batch_size)


**Image Preprocessing**

The second value returned by `clip.load()` is a torchvision `Transform` that performs the image preprocessing the model expects; the next cell displays it.

In [None]:
# Display the image preprocessing transform returned by clip.load().
preprocess

**Text Preprocessing**

Text is converted to token IDs with `clip.tokenize()`. By default the outputs are padded to 77 tokens, which is the length the CLIP models expect.

In [None]:
import pandas as pd

# Read the caption CSV without a header row, so columns are addressed by position.
data = pd.read_csv("/content/flickr30k/results_revised.csv", delimiter=",", header=None)

# Column 2 holds the caption text.
print(data[2])

# Encode the captions as token ids. clip.tokenize expects a str or a list of
# str, so rows with a missing caption are dropped and the column is passed as
# a plain list. truncate=True cuts captions that exceed the context length
# instead of raising an error on the first long one.
clip.tokenize(data[2].dropna().astype(str).tolist(), truncate=True)

**Setting up input images and texts**

We feed images and their text descriptions to the model and compare the similarity between the corresponding features.

In [None]:
import os
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# NOTE(review): this prints data[2] and binds `description` to a set whose only
# element is print()'s return value (None) — confirm what was intended here.
description = {print(data[2])}
# Maps an image name (file name without its extension) to a text description.
descriptions = {"page": "a page of text about segmentation"}

In [None]:
# แสดงรูปภาพทั้งหมด
import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

images = []
for img_path in glob.glob('/content/flickr30k/flickr30k_images/*.jpg'):
    images.append(mpimg.imread(img_path))

plt.figure(figsize=(20,10))
columns = 5
for i, image in enumerate(images):
    plt.subplot(len(images) / columns + 1, columns, i + 1)
    plt.imshow(image)

original_images = []  # PIL images as loaded, kept for display
images = []           # the same images after CLIP preprocessing
texts = []            # description matching each image

# Pick the sample images bundled with scikit-image that have an entry in
# `descriptions`. The listing is sorted so the image/text order is reproducible.
for filename in sorted(os.listdir(skimage.data_dir)):
    if not filename.endswith((".png", ".jpg")):
        continue
    name = os.path.splitext(filename)[0]
    if name not in descriptions:
        continue

    image = Image.open(os.path.join(skimage.data_dir, filename)).convert("RGB")

    original_images.append(image)
    images.append(preprocess(image))
    texts.append(descriptions[name])

# Draw each selected image with its description as the title. Previously a
# figure was created and laid out but nothing was ever drawn into it.
if original_images:
    fig, axes = plt.subplots(1, len(original_images), figsize=(16, 5), squeeze=False)
    for ax, picture, caption in zip(axes[0], original_images, texts):
        ax.imshow(picture)
        ax.set_title(caption)
        ax.set_xticks([])
        ax.set_yticks([])
    plt.tight_layout()

In [None]:
# Batch the preprocessed images and tokenize the prompts, then move both to
# `device`. clip.load() places the model on CUDA when it is available, so
# inputs left on the CPU would fail inside encode_image / encode_text.
image_input = torch.stack(images).to(device)
text_tokens = clip.tokenize(["This is " + desc for desc in texts]).to(device)

In [None]:
# Encode both modalities with gradient tracking disabled; .float() casts the
# resulting features to float32.
with torch.no_grad():
    image_features = model.encode_image(image_input).float()
    text_features = model.encode_text(text_tokens).float()

**Calculating cosine similarity**

We normalize the image and text features and then calculate the dot product of each text–image pair.

In [None]:
# Scale every feature vector to unit length in place, so that the dot products
# below are cosine similarities.
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

# One row per text and one column per image.
similarity = np.matmul(text_features.cpu().numpy(), image_features.cpu().numpy().T)

**Testing**

In [None]:
from PIL import Image

# Load one sample image from the dataset folder (the path is fixed here).
image = Image.open(r"/content/flickr30k/flickr30k_images/1243756.jpg")
print("Image to be processed")
display(image)

# Preprocess the image, add a leading batch dimension, and move it to `device`.
# Note that `image` is rebound from the PIL image to the resulting tensor.
image = preprocess(image).unsqueeze(0).to(device)
print("\n\nTensor shape:")
print(image.shape)

In [None]:
# Encode the preprocessed image with gradient tracking disabled and show the
# shape of the resulting feature tensor.
with torch.no_grad():
    image_features = model.encode_image(image)
print(image_features.shape)

In [None]:
# Captions taken from the CSV, used to check how well the model matches them to
# the image (5 captions per image).
text_snippets = [
    "A man is sitting in a chair in front of a Ben and Jerry 's machine",
    "A very unusually dressed man sitting beside an ice cream cooler",
    "One person wearing a coat and hat sitting in a chair",
    "A man in strange outfit sits in a lawn chair near a Ben and Jerry 's stand",
    "Man sitting in a chair wearing a hat and scarf",
]

# Turn the captions into token ids on the same device as the image tensor.
text = clip.tokenize(text_snippets).to(device)
print(text.shape)

In [None]:
# Encode the tokenized captions with gradient tracking disabled and show the
# shape of the resulting feature tensor.
with torch.no_grad():
    text_features = model.encode_text(text)
print(text_features.shape)

In [None]:
# Run the full model on the image/text pair with gradient tracking disabled.
# The softmax over the last axis turns the image-to-text logits into one
# probability per caption; logits_per_text is not used afterwards.
with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# The printed label is Thai for "prediction probabilities".
print("ค่าความน่าจะเป็นในการพยากรณ์:", probs)