In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import torch
import json
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import os
import openai
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

### 0. 왜 sparse vector를 사용할까?

In [None]:
from text_utils import create_embeddings

In [None]:
# Initialize OpenAI.
# SECURITY: the API key must never be hardcoded in the notebook. A key that has
# been committed in the source is leaked to everyone who can read the file and
# has to be revoked and rotated.
# The key is read from the environment; when it is missing we prompt for it
# interactively so it is never written into the notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the fallback prompt
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer

# Identifier of the SPLADE checkpoint; the same id is used for the model and the tokenizer.
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# Alternative checkpoint (not used):
# splade = 'naver/splade-v3'
# agg='max' is passed through to Splade (presumably the token aggregation mode — confirm in splade docs).
sparse_model = Splade(sparse_model_id, agg='max')
# sparse_model.to('cpu')  # move to GPU if possible
sparse_model.eval()

# Tokenizer loaded from the same checkpoint id as the model.
splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

In [None]:
def gen_sparse_vector(text):
    """Encode `text` with the SPLADE model and return its 'd_rep' output.

    Args:
        text: input passed as-is to `splade_tokenizer` (padding and truncation enabled).

    Returns:
        The model's 'd_rep' tensor with size-1 dimensions squeezed away.
    """
    encoded = splade_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to('cpu')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = sparse_model(d_kwargs=encoded)

    return model_output['d_rep'].squeeze()

In [None]:
input_text = "pants"
# input_text = "low waist"
# input_text = "trousers" # (pants의 동의어)

In [None]:
texts = [
   "pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), waistline_name : low waist,",
   "pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), geometric pattern",
   "symmetrical, silhouette_fit_name : regular (fit), waistline_name : low waist,",
   "symmetrical, silhouette_fit_name : regular (fit), waistline_name : high waist,"
]

In [None]:
# Embed the candidate texts and the query with create_embeddings (imported from
# text_utils; presumably a dense embedding model — confirm in text_utils).
embs = create_embeddings(texts)
input_emb = create_embeddings([input_text])

# Cosine similarity of the single query against every candidate text.
print("Dense vector similarities :", cosine_similarity(input_emb, embs))

In [None]:
# Echo the candidate texts so the similarity scores below can be read against them.
for i in texts:
    print('-', i)

print()
# One SPLADE vector per candidate text, plus one for the query (wrapped in a list
# so that cosine_similarity receives a 2-D input).
# NOTE(review): this relies on gen_sparse_vector returning a tensor (the first
# definition in this notebook); the later redefinition returns (indices, values).
s_embs = [gen_sparse_vector(t) for t in texts]
input_s_emb = [gen_sparse_vector(input_text)]

print("Sparse vector similarities :", cosine_similarity(input_s_emb, s_embs))

- 각 단어별 weight 살펴보기

In [None]:
# Tokenize the first example text for the model.
tokens = splade_tokenizer(texts[0], return_tensors="pt")
# Bare expression in the middle of a cell: evaluated, but not displayed.
splade_tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

# Run SPLADE without gradients and squeeze the 'd_rep' output.
with torch.no_grad():
    sparse_emb = sparse_model(
        d_kwargs=tokens.to('cpu')
    )['d_rep'].squeeze()
# Bare expression in the middle of a cell: evaluated, but not displayed.
sparse_emb.shape

# Positions and weights of the non-zero entries of the sparse vector.
indices = sparse_emb.nonzero().squeeze().cpu().tolist()
values = sparse_emb[indices].cpu().tolist()

# Number of non-zero entries.
print(len(indices))

# Invert the tokenizer vocabulary (token -> id) into id -> token.
idx2token = {idx: token for token, idx in splade_tokenizer.get_vocab().items()}

# Last expression of the cell: the literal text of texts[0], shown as the cell output for reference.
"pants, silhouette_name : straight, symmetrical,silhouette_fit_name : regular (fit), waistline_name : low waist,"

In [None]:
# Map every non-zero vocabulary id to its token with the weight rounded to two
# decimals, ordered from the heaviest token to the lightest.
sparse_dict_tokens = dict(
    sorted(
        (
            (idx2token[token_id], round(token_weight, 2))
            for token_id, token_weight in zip(indices, values)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
# sparse_dict_tokens

## 목차 - sparse vector

#### 1. 각 supercategory 별로 group
#### 2. 하나의 document로 변환
#### 3. SPLADE
#### 4. pineconeDB upsert 형태로 변환


- document : 

```text
silhouette_name : symmetrical,
silhouette_fit_name : regular (fit),
waistline_name : low waist,
length_name : maxi (length),
opening_type_name : fly (opening),
non-textile material type_name : no non-textile material
```

In [None]:
def listify(string, encap_type="()"):
    """Parse a stringified sequence of integers, e.g. "[1, 2, 3]", into a list of ints.

    Args:
        string: text such as "(1, 2)" or "[1, 2]".
        encap_type: the enclosing characters to strip from both ends of `string`.

    Returns:
        list[int]: the parsed integers; an empty list for an empty sequence ("[]").

    Raises:
        ValueError: if an element is not a valid integer literal.
    """
    # Drop surrounding whitespace first so the brackets are really at the ends.
    inner = string.strip().strip(encap_type)
    # Split on bare commas (not ", ") so "1,2" and "1, 2" both work; int() ignores
    # the padding whitespace. Empty pieces ("[]" or a trailing comma) are skipped.
    return [int(piece) for piece in inner.split(',') if piece.strip()]

In [None]:
# Attribute lookup table and the per-item clothes table.
attributes = pd.read_csv("attribute_specific.csv")
new_df = pd.read_csv("clothes_final2.csv")

# The bounding boxes are stored as "[a, b, c, d]" strings; parse them into int lists.
new_df['bbox'] = new_df['bbox'].apply(listify, encap_type="[]")
new_df['bbox_big'] = new_df['bbox_big'].apply(listify, encap_type="[]")

In [None]:
new_df.head(2)

In [None]:
new_df['name'].unique()

### Supercategory 별로 attribute를 구분하기

- 조금 더 자세히 살펴보면, 우리들이 갖고 있는 attribute은 몇 가지 레벨로 나눌 수 있다
	- 가장 큰 단위는 당연히 classID - 바지, 상의, 신발 등
	- 더 세밀하게 나눠보면 classID에 따른 특징들을 그룹화 할 수 있다
		- 하위 attribute들은 각자 다른 특징을 나타낸다
		- 핏감, 전체적인 옷의 형태, 질감, 마감, 길이 등
		- 따라서 이런 attribute들은 모두 각자의 그룹에 맞게 고려되어야 한다

In [None]:
attributes.head(3)

In [None]:
attributes['supercategory2'].unique()

attribute이 하나도 없는 항목에는 ID `999`, 이름 `normal`인 attribute을 임의로 부여한다

In [None]:
# Rows with no attribute at all get the placeholder id "999" and the name "normal".
# NOTE(review): "999" is later looked up in id_to_supercategory2 — presumably
# attribute_specific.csv contains that id; confirm.
new_df.loc[new_df['AttributesIds'].isna(), 'AttributesIds'] = "999"
new_df.loc[new_df['AttributesNames'].isna(), 'AttributesNames'] = "normal"

In [None]:
new_df.tail(2)

- Main attribute과 secondary attribute들을 하나로 묶음
    - 이는 우리가 supercategory2를 임의로 지정하여 sleeve, collar들에 해당하는 attribute ID를 별도로 처리할 수 있기 때문

In [None]:
def merge_columns(row):
    """Combine the main and secondary attribute-id strings of one row.

    Args:
        row: mapping with the keys 'AttributesIds' and 'second_AttributesIds'.

    Returns:
        np.nan when both values are missing, the single present value when only
        one is present, otherwise "<main>,<secondary>".
    """
    main_ids = row['AttributesIds']
    secondary_ids = row['second_AttributesIds']
    main_missing = pd.isna(main_ids)
    secondary_missing = pd.isna(secondary_ids)

    if main_missing and secondary_missing:
        return np.nan
    if main_missing:
        return secondary_ids
    if secondary_missing:
        return main_ids
    # Both present: join with a comma.
    return f"{main_ids},{secondary_ids}"

# Row-wise merge of the main and secondary attribute ids into a single column.
new_df['AttributesIds_merged'] = new_df.apply(merge_columns, axis=1)

In [None]:
new_df.head(2)

In [None]:
def convert2list(string):
    """Split a comma-separated id string into a list of unique ids.

    Duplicates are removed while keeping the first-seen order. The original used
    `set`, whose iteration order for strings changes between interpreter runs, so
    the generated documents were not reproducible. Whitespace around ids is
    stripped and empty pieces are dropped.

    Args:
        string: e.g. "12,7,12", or a missing value (NaN/None).

    Returns:
        list[str] of unique ids in first-seen order, or np.nan for a missing input.
    """
    if pd.isna(string):
        return np.nan
    stripped_ids = (piece.strip() for piece in string.split(','))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(id_ for id_ in stripped_ids if id_))

# Turn the merged id string of every row into a list of unique ids.
new_df['AttributesIds_list'] = new_df['AttributesIds_merged'].apply(convert2list)

In [None]:
new_df.head(2)

### 각 attribute들을 분류하여 각 컬럼에 배치
- `attributes`를 참고하며, 각 attribute ID를 attribute name으로 변환

In [None]:
attributes.head(2)

In [None]:
# Ids are compared against string ids split out of the CSV columns, so store them as str.
attributes['id'] = attributes['id'].astype(str)

# Lookup table: attribute id -> supercategory2
id_to_supercategory2 = dict(zip(attributes['id'], attributes['supercategory2']))

In [None]:
id_to_supercategory2

In [None]:
# Collect one dict per row of new_df: {supercategory2: "id,id,..."}.
category_distributions = list()

# The set of supercategories is identical for every row, so compute it once
# instead of calling .unique() inside the loop.
supercategories = attributes['supercategory2'].unique()

# Only the id list of each row is needed, so iterate that column directly;
# passing `total` lets tqdm display real progress.
for attr_ids in tqdm(new_df['AttributesIds_list'], total=len(new_df)):
    ids_by_supercat = {k: [] for k in supercategories}
    for attr in attr_ids:
        # A KeyError here means the id is missing from `attributes`.
        ids_by_supercat[id_to_supercategory2[attr]].append(attr)
    # Same output shape as before: comma-joined ids, '' for an empty category.
    category_distributions.append({k: ",".join(v) for k, v in ids_by_supercat.items()})

In [None]:
category_distributions[0]

In [None]:
# Build the frame in one call from the list of per-row dicts (one row per dict,
# default 0..n-1 index) instead of creating a one-row DataFrame per dict and
# concatenating them. Empty strings (categories with no attribute) become NaN.
category_dist_df = pd.DataFrame(category_distributions).replace('', np.nan)

In [None]:
category_dist_df.tail()

- 기존의 데이터셋과 결합

In [None]:
# Append the per-supercategory id columns to the main table (column-wise concat,
# rows are matched on the index).
# NOTE: new_df is reassigned to the widened frame, so re-running this cell appends
# the same columns a second time.
new_df = pd.concat([new_df, category_dist_df], axis=1)
new_df = new_df.replace('', np.nan)
new_df.head()

## Document 형태로 변환

In [None]:
category_dist_df.head()

In [None]:
# Lookup table: attribute id -> attribute name
id_to_name = pd.Series(attributes.name.values, index=attributes.id).to_dict()


def ids_to_names(ids, id_to_name=id_to_name):
    """Translate a comma-separated id string into a ', '-joined string of names.

    Args:
        ids: e.g. "12,7", or a missing value.
        id_to_name: id -> name mapping (bound to the module-level table at definition time).

    Returns:
        np.nan for a missing input, otherwise the names joined by ', ';
        ids absent from the mapping are rendered as 'Unknown'.
    """
    if pd.isna(ids):
        return np.nan
    return ', '.join(id_to_name.get(single_id, 'Unknown') for single_id in ids.split(','))


# Add a human-readable "<category>_name" column for every id column.
# The guard only exists to skip the "_name" columns created by a previous run of
# this cell, so test the suffix. The earlier substring test ('name' not in col)
# also skipped any real category whose label merely contains "name"
# (e.g. "nickname"), silently dropping it from the documents.
# Iterate over a snapshot of the columns because the loop adds columns.
for col in list(category_dist_df.columns):
    if not col.endswith('_name'):
        category_dist_df[col + "_name"] = category_dist_df[col].apply(ids_to_names)

In [None]:
category_dist_df.head(2)

In [None]:
# Keep only the human-readable "*_name" columns.
named_df = category_dist_df[[column for column in category_dist_df.columns if '_name' in column]]


def row_to_string(row):
    """Render one row as "col : value" lines joined by ',\\n', skipping missing values."""
    rendered = []
    for col in named_df.columns:
        cell = row[col]
        if pd.notna(cell):
            rendered.append(f"{col} : {cell}")
    return ',\n'.join(rendered)


# One document string per row of named_df.
list_of_strings = named_df.apply(row_to_string, axis=1).tolist()

In [None]:
print(list_of_strings[0])

In [None]:
new_df['doc'] = list_of_strings

In [None]:
new_df.head(2)

In [None]:
new_df.head(2)

In [None]:
## Save locally
# NOTE(review): a later cell reads "clothes_final_sparse_doc.csv". With this line
# commented out, a fresh run depends on a previously saved copy of that file.
# new_df.to_csv("clothes_final_sparse_doc.csv", index=False)

In [None]:
# Directory holding the cropped per-entity images.
base_path = "imaterialist-fashion-2020-fgvc7/cropped_images/"

# Cropped image file: "<ImageId>_<entity_id>.jpg" under base_path.
new_df['img_path'] = base_path + new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str) + ".jpg"
# Key used to join with the image-embedding dataframe.
new_df['img_id'] = new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str)

In [None]:
# Bare expression that is not the last statement of the cell: evaluated but not displayed.
new_df.loc[2022, 'img_path']

# Open one image from the train folder (hardcoded file name) for visual inspection.
Image.open("imaterialist-fashion-2020-fgvc7/train/054f0ae9527a9a79a4de6f3acc166e5b.jpg")

In [None]:
# Spot-check one row: print its category name and generated document, then show its cropped image.
i = 2020
print(new_df.loc[i, 'name'])
print(new_df.loc[i, 'doc'])
Image.open(new_df.loc[i, 'img_path'])

In [None]:
# Spot-check another row: print its category name and generated document, then show its cropped image.
i = 2022
print(new_df.loc[i, 'name'])
print(new_df.loc[i, 'doc'])
Image.open(new_df.loc[i, 'img_path'])

---

In [None]:
# Reload the document table from disk; this replaces the in-memory new_df.
new_df = pd.read_csv("clothes_final_sparse_doc.csv")

# The merge with the image embeddings further down joins on `img_id`, and the
# upsert loop reads it too. The (commented-out) save cell sits before the cell
# that creates `img_id` / `img_path`, so a CSV written there lacks both columns.
# Rebuild them when missing; an already complete CSV is left untouched.
if 'img_id' not in new_df.columns:
    new_df['img_id'] = new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str)
if 'img_path' not in new_df.columns:
    new_df['img_path'] = "imaterialist-fashion-2020-fgvc7/cropped_images/" + new_df['img_id'] + ".jpg"

### 이미지 embeddings와 함께 merge하여 하나의 dataframe으로 결합

In [None]:
# Load the image embeddings from a JSON-lines file: every line is a JSON object
# mapping image id -> embedding (list of numbers).
embeddings = {}

with open('img_embeddings_fashion_fine_tuned.json', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline); json.loads('') would raise.
            continue
        for img_name, emb_list in json.loads(line).items():
            embeddings[img_name] = np.array(emb_list)

# Build the two-column frame directly instead of creating a 1 x N wide frame and
# transposing it. The variable name (with three d's) is kept because later cells
# reference it.
image_embedddings = pd.DataFrame({
    "img_id": list(embeddings.keys()),
    "img_emb": list(embeddings.values()),
})

In [None]:
image_embedddings.head(2)

In [None]:
# Left join: every row of new_df is kept; rows without a matching img_id get NaN in img_emb.
new_df = pd.merge(new_df, image_embedddings, on='img_id', how='left')

In [None]:
new_df.head(2)

In [None]:
# 모두 잘 join 되었는지 확인
new_df.img_emb.isna().sum()

## CLIP : SPLADE = Dense : sparse vector

- hybrid search를 위해서는 dense vector와 sparse vector를 짝을 지어줘야 함

In [None]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer

# Same checkpoint id as in the introductory section; this cell re-creates the model and tokenizer.
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

# Alternative checkpoint (not used):
# splade = 'naver/splade-v3'
sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to('cpu')  # runs on CPU here; move to GPU if possible
sparse_model.eval()

# Tokenizer loaded from the same checkpoint id as the model.
splade_tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

### Upsert 형식

```python
{"id" : "0838a48a7b0bfa789a5181ab0e8f4ee2_3040", # 이미지 파일 이름 + entity ID
 "values" : [-0.08405803143978119, -0.7088879346847534, ...], # CLIP embeddings
 "sparse_values" : {
    "indices" : [1045, 1062, ...], # non-zero index
    "values" : [1.3038887977600098, 0.304147332906723, ...] # non-zero values
    },
"metadata" : {
    # 이미지 파일 path
    "img_path": "imaterialist-fashion-2020-fgvc7/cropped_images/0838a48a7b0bfa789a5181ab0e8f4ee2_3040.jpg",
    "category": "coat"
} 
}

```

In [None]:
def gen_sparse_vector(text):
    """Encode `text` with SPLADE and return its non-zero entries.

    Note: this redefines the `gen_sparse_vector` from the introductory section of
    this notebook, which returns the full tensor instead of (indices, values).

    Args:
        text: input passed as-is to `splade_tokenizer` (padding and truncation enabled).

    Returns:
        (indices, values): two lists of equal length — the positions of the
        non-zero entries of the squeezed 'd_rep' output and their weights.
        Both are always lists, even when only a single entry is non-zero.
    """
    tokens = splade_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        sparse_emb = sparse_model(
            d_kwargs=tokens.to('cpu')
        )['d_rep'].squeeze()

    # nonzero() returns one row per non-zero entry. Squeeze only the last
    # dimension: a bare squeeze() would also collapse the row dimension when there
    # is exactly one non-zero entry, turning indices/values into scalars.
    nonzero_idx = sparse_emb.nonzero().squeeze(-1)
    indices = nonzero_idx.cpu().tolist()
    values = sparse_emb[nonzero_idx].cpu().tolist()

    return indices, values

def upsert_format(id, text, img_emb):
    """Build one upsert record that pairs a dense image vector with a sparse text vector.

    Args:
        id: record id; also used to build the cropped-image path (must be a str).
        text: document text encoded with `gen_sparse_vector`.
        img_emb: dense vector stored under "values".

    Returns:
        dict with the keys "id", "values", "sparse_values" and "metadata".
    """
    nonzero_indices, nonzero_values = gen_sparse_vector(text)

    return {
        "id": id,
        "values": img_emb,
        "sparse_values": {
            "indices": nonzero_indices,
            "values": nonzero_values,
        },
        "metadata": {
            "img_path": "imaterialist-fashion-2020-fgvc7/cropped_images/" + id + ".jpg",
        },
    }

In [None]:
tmp = new_df.head(5)

In [None]:
# One upsert record per row of the sample frame; the embedding array is converted
# to a plain list for the "values" field.
upserts = [
    upsert_format(row['img_id'], row['doc'], row['img_emb'].tolist())
    for _, row in tqdm(tmp.iterrows(), total=len(tmp))
]

In [None]:
type(upserts[0]['values'])

In [None]:
type(upserts[0]['sparse_values'])

In [None]:
upserts[0]['sparse_values'].keys()

In [None]:
upserts[0].keys()

In [None]:
upserts[0]['id']

In [None]:
upserts[0]['metadata']

In [None]:
upserts[0]['sparse_values'].keys()

`01. generate_SPLADE_embeddings.ipynb` 참고

### 만들어진 sparse vector 읽어오기

In [None]:
# Records parsed from the JSON-lines file (one JSON object per line).
data_read = []

with open("upsert_vectors_fashion_fine_tuned.json", 'r') as file:
    data_read.extend(json.loads(line) for line in file)

# Report how many records were loaded.
print(f"Successfully read {len(data_read)} items from upsert_vectors_fashion_fine_tuned.json")

In [None]:
data_read[0].keys()

In [None]:
data_read[0]['metadata']