In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import json

import os
import openai
from tqdm.notebook import tqdm

from image_utils import crop_bbox

In [None]:
df = pd.read_csv("clothes_final.csv")

In [None]:
df.head(3)

In [None]:
df['bbox'][0]

In [None]:
def listify(string, encap_type="()"):
    """Parse a stringified sequence of integers into a list of ints.

    Args:
        string: Text such as "(1, 2, 3)" or "[1,2,3]".
        encap_type: Characters to strip from both ends of ``string``
            (the enclosing brackets), e.g. "()" or "[]".

    Returns:
        list[int]: The parsed integers; an empty list for an empty sequence
        such as "()".

    Raises:
        ValueError: If a non-empty element is not a valid integer literal.
    """
    # Drop outer whitespace first so that " (1, 2) " still loses its brackets.
    inner = string.strip().strip(encap_type)
    # Split on the bare comma (int() tolerates surrounding spaces) and skip
    # empty tokens so "()" and trailing commas do not raise.
    return [int(token) for token in inner.split(",") if token.strip()]

In [None]:
# Right after reading the CSV, pandas holds the bbox columns as strings rather
# than lists, so each value is parsed with listify.
for bbox_col in ('bbox', 'bbox_big'):
    df[bbox_col] = df[bbox_col].map(listify)

In [None]:
df.head(2)

# 목차 - CLIP embeddings

- Local에 각 상품별 cropped image 저장
- CLIP embeddings 생성

## 1. Bounding box EDA

## 2. Bounding box를 기준으로 각 entity를 crop

## 3. Cropping 된 이미지를 각 항목에 따라 resize 후 로컬에 저장

## 4. CLIP을 활용한 embedding

- Fine-tuned CLIP
- 하나의 embedding space에 표현된 Text & image

---

## 1. Bounding box EDA
- 각 이미지에 들어있는 상품들의 '크기'는?
- 유사도는 이미지의 크기에도 영향을 받기 때문에 중요한 요인 중 하나
- 따라서 한 카테고리 내에 속하는 이미지들은 모두 동일한 크기로 표현되는 것이 중요

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/007e66e7c2864eb3c1ef95cd3ab52687.jpg")

In [None]:
img

In [None]:
df.loc[218]

In [None]:
cropped = crop_bbox(img, df['bbox'][218])
cropped

In [None]:
df.loc[223]

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/007e66e7c2864eb3c1ef95cd3ab52687.jpg")
cropped = crop_bbox(img, df['bbox'][223])
cropped

In [None]:
# For every supercategory: print its name, the distinct product names it
# contains, and the median area / width / height of its rows.
separator = "-"*10
for cat in df['supercategory'].unique():
    in_category = df[df['supercategory'] == cat]
    median_area, median_width, median_height = (
        np.median(in_category[col]) for col in ('area', 'width', 'height')
    )
    print(cat)
    print(in_category['name'].unique())
    print("Area : {}, width : {}, height : {}".format(median_area, median_width, median_height))
    print(separator)

카테고리별로 이미지 크기에 고유한 특징이 있음 (수치는 위 셀에서 계산한 중앙값 기준)
- lower body는 가로 약 410, 세로 약 540
- upper body는 lower body보다 세로 비율이 더 길다
- wholebody는 그보다 세로 비율이 더 길다
- waist는 가로가 세로보다 더 길다
- arms and hands는 가로 세로 비율이 비슷하며, 전체적으로 작음

## 2. Bounding box를 기준으로 각 entity를 crop

In [None]:
# Target size for each supercategory, keyed by the values of the
# `supercategory` column. resize_img looks an entry up by category and
# unpacks it as (new_width, new_height).
size = {"lowerbody":[420, 540],
        "upperbody":[500, 700],
        "wholebody":[480, 880],
        "legs and feet":[100, 150],
        "head":[150, 100],
        "others":[200, 350],
        "waist":[200, 100],
        "arms and hands":[75, 75],
        "neck":[120, 200]}

In [None]:
df.head(3)

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/00000663ed1ff0c4e0132b9b9ac53f6e.jpg")
cropped = crop_bbox(img, df['bbox_big'][0])
cropped

#### 이미지 resize

In [None]:
from PIL import Image, ImageFilter

def resize_img(image, standard_size, category):
    """Resize ``image`` to the standard size registered for ``category``.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``filter``.
        standard_size: Mapping from category to a (width, height) pair.
        category: Key into ``standard_size``.

    Returns:
        The image resized with LANCZOS resampling. When the source has a
        smaller pixel area than the target (upsizing), an unsharp mask is
        applied to the resized result as well.
    """
    src_w, src_h = image.size
    target_w, target_h = standard_size[category]

    # Both directions use the same resampling call; only the post-processing differs.
    resized = image.resize((target_w, target_h), Image.Resampling.LANCZOS)

    if src_w * src_h >= target_w * target_h:
        # Downsizing (or equal area): return the resized image as is.
        return resized

    # Upsizing: sharpen the enlarged image before returning it.
    return resized.filter(ImageFilter.UnsharpMask(radius=2, percent=150, threshold=3))

In [None]:
resize_img(cropped, size, df['supercategory'][0])

## 3. Cropping 된 이미지를 각 항목에 따라 resize 후 로컬에 저장

약 40분 소요

In [None]:
base_path = "imaterialist-fashion-2020-fgvc7/train"
cropped_path = "imaterialist-fashion-2020-fgvc7/cropped_images"
# Saving does not create missing directories, so make sure the target exists.
os.makedirs(cropped_path, exist_ok=True)

# Per-image frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of images.
entity_frames = []

for image_name in tqdm(df['ImageId'].unique()):
    # Rows describing the entities that belong to this one source image.
    tmp = df.loc[df['ImageId']==image_name]
    # Keep the original row index as `entity_id`; it becomes part of the file name.
    tmp = tmp.reset_index().rename(columns={"index":"entity_id"})
    # The context manager closes the source file once all crops are written,
    # so file handles do not pile up over the whole run.
    with Image.open(os.path.join(base_path, image_name+".jpg")) as image:
        # Crop each entity, resize it to its category's standard size, save locally.
        for _, row in tmp.iterrows():
            cropped_img = crop_bbox(image, row['bbox_big'])
            resized_img = resize_img(cropped_img, size, row['supercategory'])
            resized_img.save(os.path.join(cropped_path, image_name + "_" + str(row['entity_id']) + ".jpg"))

    entity_frames.append(tmp)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
new_df = pd.concat(entity_frames, axis=0) if entity_frames else pd.DataFrame()

In [None]:
# new_df.to_csv("clothes_final2.csv", index=False)

In [None]:
new_df = pd.read_csv("clothes_final2.csv")

# The bbox columns come back from the CSV as strings; here they are parsed
# with "[]" as the enclosing characters.
for bbox_col in ('bbox', 'bbox_big'):
    new_df[bbox_col] = new_df[bbox_col].map(lambda raw: listify(raw, "[]"))

In [None]:
df.head(2)

In [None]:
new_df.head()

## 4. CLIP을 활용한 embedding

- fashion 데이터셋을 활용하여 fine-tune된 CLIP 모델 (FashionCLIP)
- CLIP 모델은 <이미지>-<caption> pair를 input data로 사용, 두 개를 하나의 동일한 embedding space에 구현
- 따라서 <패션 이미지>-<패션 caption> pair를 활용하여 fine-tuned된 모델이 현재 프로젝트 목적에 적합
- dot product를 사용하여 embedding ranking을 측정할 예정
```text
"FashionCLIP performs the dot product between the input caption embedding and each image vector embedding"

"The text used is a concatenation of the highlight (e.g., “stripes”, “long sleeves”, “Armani”) and short description (“80s styled t-shirt”) available in the Farfetch dataset."
```

![Fine-tune 훈련 데이터](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fs41598-022-23052-9/MediaObjects/41598_2022_23052_Fig3_HTML.png?as=webp "Fine-tune 훈련 데이터")

(출처 : Contrastive language and vision learning of general fashion concepts)

- hugging face : https://huggingface.co/patrickjohncyh/fashion-clip
- paper : https://www.nature.com/articles/s41598-022-23052-9

#### F-CLIP VS CLIP 성능 차이

https://www.nature.com/articles/s41598-022-23052-9/tables/1

- HIT@5 = (서치 결과 top5에 있는 연관 상품의 개수) / (총 연관 상품의 개수)

In [None]:
new_df.head()

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Hugging Face model id of the fashion CLIP checkpoint used in this notebook.
model_name = "patrickjohncyh/fashion-clip"
# Load the CLIP model and its processor from that same checkpoint id.
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# Collect the file names of the cropped images.
cropped_path = "imaterialist-fashion-2020-fgvc7/cropped_images"

# Only the top-level listing is needed, so take the first os.walk entry instead
# of materialising the walk of the whole tree. When the directory does not
# exist os.walk yields nothing, and the default gives an empty file list
# rather than an IndexError.
images = next(os.walk(cropped_path), (cropped_path, [], []))[2]

In [None]:
images[:3]

image embeddings from CLIP

In [None]:
from image_utils import extract_img_features

# Embed the image currently bound to `img` with the loaded processor/model pair.
# NOTE(review): the return type is defined in image_utils; later cells call
# `.shape` and `.cpu().detach().numpy()` on it.
img_emb = extract_img_features(img, processor, model)

In [None]:
img_emb.shape

- `01.Create_image_embeddings.py` 참고

text embeddings from CLIP

In [None]:
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

model_name = "patrickjohncyh/fashion-clip"

# `model` and `processor` were already loaded from this same checkpoint id in
# the image-embedding part of the notebook; loading them a second time only
# costs time and memory. Only the tokenizer is new in this section.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def get_single_text_embedding(text, model, tokenizer):
    """Compute text embeddings with ``model`` and return them as nested lists.

    Args:
        text: A string or a list of strings; it is passed to ``tokenizer`` as is.
        model: Object exposing ``get_text_features(**inputs)``.
        tokenizer: Callable that turns ``text`` into the keyword inputs of
            ``model.get_text_features``.

    Returns:
        list: The embeddings converted with ``ndarray.tolist()``. The notebook
        pairs the entries with the input texts, i.e. one inner list per text.
    """
    import torch  # local import: this notebook has no top-level torch import

    # truncation=True clips over-long texts to the tokenizer's configured
    # maximum length instead of handing the model a sequence it cannot take.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)

    # Move to CPU and convert to plain Python lists.
    embedding_as_np = text_embeddings.cpu().detach().numpy()
    return embedding_as_np.tolist()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
img = Image.open("imaterialist-fashion-2020-fgvc7/train/3bccf2e618d8f5f51442037ad3c8d4fb.jpg")
img

fashion fine-tuned model

```text
"The text used is a concatenation of the highlight (e.g., “stripes”, “long sleeves”, “Armani”) and short description (“80s styled t-shirt”) available in the Farfetch dataset."
```

In [None]:
img_emb = extract_img_features(img, processor, model)

# Candidate captions to compare against the image.
sample_texts = [
    'tshirt',
    "formal suit and tie",
    'a woman',
    "a lion in a cage",
    "black top short sleeves",
    'black shirt with check patterns, topwear',
    'iphone',
]
sample_texts_emb = get_single_text_embedding(sample_texts, model, tokenizer)

# Convert the image embedding to a NumPy array before handing it to scikit-learn.
img_vec = img_emb.cpu().detach().numpy()
sims = cosine_similarity(img_vec, sample_texts_emb)

# The dot product is the metric planned from here on; this cell reports cosine similarity.
print("이미지와의 유사도")
for t, s in zip(sample_texts, sims[0]):
    # One "text : score" line followed by an empty line.
    print("{} : {}".format(t, s), end="\n\n")

In [None]:
img_emb.cpu().detach().numpy()[0].shape

In [None]:
# Shape of one text embedding, shown next to the image-embedding shape above.
# This indexes `sample_texts_emb` directly: the loop variable `s` left over
# from the similarity cell is a scalar score, whose shape would be ().
np.array(sample_texts_emb[0]).shape

In [None]:
print('dot product')
# The image vector is the same for every text, so extract it once up front.
img_vec = img_emb.cpu().detach().numpy()[0]
for text, s in zip(sample_texts, sample_texts_emb):
    sim = np.dot(img_vec, np.array(s))
    print(text, sim)

---

In [None]:
embeddings = {}

# The file is read line by line, each line being parsed as one JSON object
# whose items are stored as name -> NumPy array.
with open('img_embeddings_fashion_fine_tuned.json', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. an empty last line) are not valid JSON; skip
            # them instead of letting json.loads raise.
            continue

        # Convert the line to a dictionary
        embedding_dict = json.loads(line)

        # Convert each list back to a NumPy array
        for img_name, emb_list in embedding_dict.items():
            embeddings[img_name] = np.array(emb_list)

In [None]:
len(embeddings)

In [None]:
type(embeddings)

In [None]:
# Peek at a single entry: print the first key yielded by items() and stop.
# The loop variables `k` and `v` stay bound afterwards; the following cell
# reads `v.shape`.
for k,v in embeddings.items():
    print(k)
    break

In [None]:
v.shape