In [None]:
import json
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)

## 목차

### 1. VectorDB 구축

### 2. Text search

### 3. Image search

### 4. Hybrid search

---

#### Local DB 생성

In [None]:
# Read the two CSV inputs used to build the local DB.
df = pd.read_csv("clothes_final2.csv")
attributes = pd.read_csv("attribute_specific.csv")

In [None]:
# Load the pre-computed upsert records; the file holds one JSON object per line.
upsert_path = "upsert_vectors_fashion_fine_tuned.json"
data_read_f = []

with open(upsert_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines so a stray empty line does not crash json.loads.
        if not line.strip():
            continue
        data_read_f.append(json.loads(line))

# Report the path that was actually opened so the message cannot drift from the code.
print(f"Successfully read {len(data_read_f)} fashion-fine-tuned CLIP embeddings from {upsert_path}")


In [None]:
# Build the join key "<ImageId>_<entity_id>" that is matched against the upsert record ids.
df['vdb_id'] = df['ImageId'].astype(str) + "_" + df['entity_id'].astype(str)
# Drop the CSV's own `id` column (presumably so it cannot clash with the upsert records'
# `id` in the later merge). Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Turn the list of parsed JSON records into a DataFrame (one row per record).
upsert_df_f = pd.DataFrame(data_read_f)

In [None]:
upsert_df_f.head(2)

In [None]:
# Inner-join the clothing rows with their upsert records: df.vdb_id == upsert_df_f.id.
# Rows without a partner on either side are dropped (pandas' default `how`).
d = df.merge(upsert_df_f, how='inner', left_on='vdb_id', right_on='id')

In [None]:
# d.to_csv("local_db.csv", index=False)

In [None]:
d.head(2)

In [None]:
# Pull the two columns out as arrays so they can be walked in lockstep below.
metadata, names = d['metadata'].values, d['name'].values

In [None]:
# Collect the per-row metadata objects after tagging each one with its category name.
metadata_new = list()

# NOTE: each `m` is updated in place, so the objects held in `metadata` (presumably dicts
# parsed from JSON) are modified too. The loop variables `n` and `m` stay defined after
# the loop; the next cell displays `m`.
for n,m in zip(names, metadata):
    m['category'] = n  # store the row's `name` value under the 'category' key
    metadata_new.append(m)

# Write the tagged metadata back onto the merged frame.
d['metadata'] = metadata_new

In [None]:
# Show the last metadata object processed by the tagging loop above.
# NOTE(review): `m` only exists because the loop variable leaks into the notebook namespace.
m

In [None]:
d.head(2)

### PineconeDB에 업로드

- Pinecone upsert 형식에 맞게 내용을 변환
    - 각 카테고리별로 batch에 맞게 upsert

In [None]:
## Upsert into Pinecone
import os

from pinecone import Pinecone

# Read the API key from the environment instead of hardcoding it in the notebook.
# SECURITY: a key that was ever committed in this cell must be treated as leaked and rotated.
# Raises KeyError if PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# To list the available indexes:
# index_list = pc.list_indexes().indexes

# Connect to the target index and display its stats
index = pc.Index("fastcampus")
index.describe_index_stats()

- Max size for an upsert request is 2MB. Recommended upsert limit is 100 vectors per request.

In [None]:
# 각 카테고리별로 나눠서 저장함
# 이는 이후 index를 개별로 저장하기 위함

# upsert!!
def create_batches(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Map each category name to its upsert records, pre-split into batches of 100.
upsert_columns = ['id', 'values', 'sparse_values', 'metadata']

df_categories = {
    cat: list(
        create_batches(
            d.loc[d['name'] == cat, upsert_columns].to_dict('records'),
            100,  # batch size per upsert request
        )
    )
    for cat in tqdm(d['name'].unique())
}

In [None]:
df_categories.keys()

### Upsert 형식

```json
{"id" : "0838a48a7b0bfa789a5181ab0e8f4ee2_3040", # 이미지 파일 이름 + entity ID
 "values" : [-0.08405803143978119, -0.7088879346847534, ...], # CLIP embeddings
 "sparse_values" : {
    "indices" : [1045, 1062, ...], # non-zero index
    "values" : [1.3038887977600098, 0.304147332906723, ...] # non-zero values
    },
"metadata" : {
    # 이미지 파일 path
    "img_path": "imaterialist-fashion-2020-fgvc7/cropped_images/0838a48a7b0bfa789a5181ab0e8f4ee2_3040.jpg",
    "category": "coat"
} 
}

```

In [None]:
# for cat, batches in df_categories.items():
#     print(cat)
#     for batch in tqdm(batches):
#         index.upsert(vectors=batch)

- 각 카테고리별로 개별 index에 저장

---

# 1. Text to image search

- CLIP embedding을 활용
    - text와 image가 하나의 vector space에 함께 표현되어 있음
    - 또한, fashion dataset에 맞도록 fine-tune되어, 일반 plain CLIP보다 현재 use case에 적합
    - Fine-tune에 사용된 데이터 역시 옷의 다양한 attribute를 바탕으로 트레이닝되어, 다양한 속성을 반영한 검색이 가능함 (아래 데이터를 참고)

![Fine-tune 훈련 데이터](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fs41598-022-23052-9/MediaObjects/41598_2022_23052_Fig3_HTML.png?as=webp "Fine-tune 훈련 데이터")

(출처 : Contrastive language and vision learning of general fashion concepts)


In [None]:
from PIL import Image
import os
import json
from tqdm import tqdm
import numpy as np
from image_utils import fetch_clip, draw_images
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

In [None]:
from search_utils import gen_sparse_vector

In [None]:
from transformers import AutoTokenizer
from splade.splade.models.transformer_rep import Splade

# Checkpoint used for both the SPLADE model and its tokenizer.
# Alternative checkpoint: 'naver/splade-v3'
splade_model_id = 'naver/splade-cocondenser-ensembledistil'

splade_tokenizer = AutoTokenizer.from_pretrained(splade_model_id)

splade_model = Splade(splade_model_id, agg='max')
splade_model.to('cpu')  # move to GPU if possible
splade_model.eval()

In [None]:
# Load the "patrickjohncyh/fashion-clip" checkpoint through the project helper;
# the helper's three return values are unpacked as (model, processor, tokenizer).
model, processor, tokenizer = fetch_clip(model_name="patrickjohncyh/fashion-clip")

In [None]:
def get_single_text_embedding(text, model, tokenizer):
    """Embed ``text`` with the model's text tower and return it as nested Python lists.

    Args:
        text: Query string (or whatever the tokenizer accepts as input).
        model: Object exposing ``get_text_features(**inputs)`` that returns a tensor.
        tokenizer: Callable producing the keyword inputs for ``get_text_features``.

    Returns:
        The embedding tensor converted with ``.tolist()``. Callers index ``[0]`` to
        get the vector for a single text (presumably shape ``(1, dim)`` -- confirm).
    """
    # Local import: the notebook only imports torch in a later cell.
    import torch

    inputs = tokenizer(text, return_tensors="pt", padding=True)
    # Pure inference: disable autograd so no graph is built or kept in memory.
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)
    # Move to CPU and convert to plain lists so the result is JSON/Pinecone friendly.
    embedding_as_np = text_embeddings.cpu().detach().numpy()
    return embedding_as_np.tolist()

In [None]:
input_text = "Green dress with blue dots, long sleeve"

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "dress" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "nike"

# Other brand queries to try: vans, nike, adidas

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "shoe" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=5,
    filter={"category": {"$eq": "shoe"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "street fashion"

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Punk Fashion"

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Bohemian Fashion"

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "flower patterns, short sleeve"

# Keep the embedding under its own name: reusing `d` would overwrite the merged
# DataFrame that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

# Dense search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

- 장점
    - 유명 브랜드, 성별, 옷의 종류, 색깔 등의 카테고리를 수동으로 정하지 않아도 input 텍스트로 지정할 수 있다

- 한계점
    - attribute들의 단순 결합이기 때문에 옷의 부위 별 특징을 인식하지 못 함
        - 예) blue dots이라고 명시했지만, 파란색 드레스가 유사도에 표현됨
    - 스트릿, 보헤미안 패션 등 추상적인 단어들은 다양한 옷들의 조합임
    (CLIP은 <옷의 특징>-<옷의 사진> pair를 활용하여 학습. 따라서 "스트릿패션"과 같이 패션의 한 카테고리와 매칭이 되지 않는다.)

- 극복 방안
    - Sparse vector를 활용하여 옷의 특징에 weight를 더 주는 search
    - 옷의 특징이 아닌, 보다 추상적인 텍스트가 들어오는 경우, database 전체를 대상으로 search


---

# 2. Image to image search

- CLIP embedding을 활용
    - text와 image가 한 vector space에 함께 표현되어 있지만, Image-to-Image 유사도 측정도 가능함
    - 또한, fashion dataset에 맞도록 fine-tune되어, 일반 plain CLIP보다 현재 use case에 적합


In [None]:
from image_utils import extract_img_features

In [None]:
# NOTE(review): within the visible notebook `image` is first assigned in the next cell,
# so on a fresh kernel (Restart & Run All) this display cell raises NameError.
image

In [None]:
# Query image for the image-to-image search.
image = Image.open("test_images/test_image2.jpg")

# Embed the image with the CLIP model and convert the result to nested lists.
img_emb = extract_img_features(image, processor, model).tolist()

# Dense search restricted to the "top, t-shirt, sweatshirt" category.
result = index.query(
    vector=img_emb[0],
    top_k=5,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images(list(map(Image.open, paths)))

In [None]:
image

In [None]:
# Query image for the image-to-image search.
image = Image.open("test_images/test_image.png")

# Embed the image with the CLIP model and convert the result to nested lists.
img_emb = extract_img_features(image, processor, model).tolist()

# Dense search restricted to the "shirt, blouse" category.
result = index.query(
    vector=img_emb[0],
    top_k=5,
    filter={"category": {"$eq": "shirt, blouse"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images(list(map(Image.open, paths)))

- 한계점
    - 이미지에는 옷의 색, 사람들의 포즈, 빛 등 다양한 요소들이 모두 포함되어 있기 때문에 옷의 특징들만 선별하여 서치를 진행할 수 없음
    - 즉, 옷의 디테일을 간과할 가능성이 높음
- 극복 방안
    - 이미지로부터 옷의 특징을 텍스트 형식으로 추출, dense vector 또는 sparse vector로 변환하여 서치

# 3. Hybrid search (Dense & sparse vector search)

- splade를 활용하여 각 파트별 특징까지 고려

In [None]:
import torch
from transformers import AutoTokenizer
from splade.splade.models.transformer_rep import Splade

# Same checkpoint id as `splade_model_id` in the earlier SPLADE cell.
# Alternative checkpoint: 'naver/splade-v3'
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to('cpu')  # move to GPU if possible
sparse_model.eval()

In [None]:
# v-neck
input_text = "orange party dress with long sleeve, v neck"

# Dense-only baseline (no sparse vector) to compare against the hybrid query in the next cell.
# The embedding gets its own name: reusing `d` would overwrite the merged DataFrame
# that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
# v-neck
input_text = "orange party dress with long sleeve, v neck"

# Hybrid query: dense CLIP embedding plus a SPLADE sparse vector for the same text.
# The embedding gets its own name: reusing `d` would overwrite the merged DataFrame
# that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=5,
    filter={"category": {"$eq": "dress"}},
    sparse_vector=sparse,
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

# Collect the matched vector ids once; a bare list expression in the middle of a cell
# is evaluated and discarded, so it displayed nothing.
matched_ids = [match['id'] for match in result.matches]

# Show the attribute columns of the matched items (last expression = cell output).
df.loc[df['vdb_id'].isin(matched_ids), ['vdb_id', 'ImageId', 'AttributesNames', 'second_AttributesNames']]

In [None]:
[i['id'] for i in result.matches]

패션 스타일 관련 text field가 없기 때문에, sparse vector를 활용하더라도 큰 퍼포먼스 향상을 기대하기 어려움

In [None]:
input_text = "Punk Fashion"

# Dense-only baseline (no sparse vector) to compare against the hybrid query in the next cell.
# The embedding gets its own name: reusing `d` would overwrite the merged DataFrame
# that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)

result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
input_text = "Punk Fashion"

# Hybrid query: dense CLIP embedding plus a SPLADE sparse vector for the same text.
# The embedding gets its own name: reusing `d` would overwrite the merged DataFrame
# that the upsert cells above still depend on when they are re-run.
text_emb = get_single_text_embedding(input_text, model, tokenizer)
sparse = gen_sparse_vector(input_text, splade_model, splade_tokenizer)

result = index.query(
    vector=text_emb[0],  # the helper returns a nested list; take the first row
    top_k=10,
    sparse_vector=sparse,
    filter={"category": {"$eq": "top, t-shirt, sweatshirt"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

- 활용할 수 있는 attribute의 종류
```python
list_of_attributes = ['main_category', 'silhouette', 'silhouette_fit', 'waistline', 'length',
       'collar_type', 'neckline_type', 'sleeve_type', 'pocket_type',
       'opening_type', 'non-textile material type', 'leather',
       'textile finishing, manufacturing techniques', 'textile pattern']
```
<br>

- attribute으로 표현할 수 있는 document의 포맷

```json
silhouette_name : symmetrical,
collar_type_name : shirt (collar),
opening_type_name : single breasted,
non-textile material type_name : no non-textile material,
textile finishing, manufacturing techniques_name : no special manufacturing technique,
textile pattern_name : plain (pattern)

```

In [None]:
# Query image for the dense-only jacket search.
image = Image.open("test_images/test_image4.png")

# Embed the image with the CLIP model and convert the result to nested lists.
img_emb = extract_img_features(image, processor, model).tolist()

result = index.query(
    # Pass the first row, as the other image-search cells do: `img_emb` is the
    # nested list produced by `.tolist()`, while `vector` takes one flat vector.
    vector=img_emb[0],
    top_k=5,  # how many results to return
    filter={"category": {"$eq": "jacket"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])

In [None]:
image

In [None]:
# Query image for the hybrid (image + text hint) jacket search.
image = Image.open("test_images/test_image4.png")

# Embed the image with the CLIP model and convert the result to nested lists.
img_emb = extract_img_features(image, processor, model).tolist()

# Sparse vector for a short text hint describing the garment.
sparse_vector = gen_sparse_vector("suede jacket", sparse_model, splade_tokenizer)

result = index.query(
    # Pass the first row, as the other image-search cells do: `img_emb` is the
    # nested list produced by `.tolist()`, while `vector` takes one flat vector.
    vector=img_emb[0],
    sparse_vector=sparse_vector,
    top_k=5,  # how many results to return
    filter={"category": {"$eq": "jacket"}},
    include_metadata=True
)

paths = [match['metadata']['img_path'] for match in result.matches]

draw_images([Image.open(path) for path in paths])