In [7]:
import os
import openai
from openai import OpenAI
from sklearn.cluster import KMeans
from utils import cosine_similarity
import json
from tqdm.notebook import tqdm
import numpy as np
from openai import AsyncOpenAI
import re

# initialize openai
# The API key must come from the environment (e.g. `export OPENAI_API_KEY=...`
# before starting the kernel). Never paste the secret into the notebook: it ends
# up in version control and in every shared copy. Any key that was previously
# committed here should be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

- 이미지에서 정보를 최대한 많이 추출해서 데이터 포인트를 많이 만들 것이다.
    - 이미지 전반적인 분위기
    - 이미지에 들어있는 가구의 종류 및 설명
    - 이미지에 들어있는 가구들의 위치
- search의 종류
    - step 1 (이미지 전반)
        - 분위기를 활용한 text의 유사도
        - 전체 이미지 유사도
    - step 2 (이미지 내에 있는 물건들을 활용)
        - 각 가구들끼리의 text 유사도 측정 후 평균 (GPT4v. 각 텍스트끼리 유사도 측정, input 이미지를 기준으로 각 text cos-sim의 max 값들을 평균)
        - 각 가구들끼리의 이미지 유사도 측정 후 평균 (마찬가지로 img-emb-sim 측정 후, max->평균)
    - 추가 (이미지 내의 가구들의 위치)
        - 각 가구들의 x,y 좌표를 활용 (euclidean distance 또는 IOU)

- images of furniture : https://www.kaggle.com/datasets/rhtsingh/130k-images-512x512-universal-image-embeddings?select=train.csv
- image of rooms : https://www.kaggle.com/datasets/annielu21/house-rooms

### Preprocessing

- 이미지에 캡션 달기 (caption generation)
- object detection (words)
- 각 이미지의 좌표 위치를 기반으로 object간의 관계를 구하기
    - 이미 CLIP이 이미지의 컨텍스트를 이해하긴 하지만, 이것만 집중적으로 이해하진 않는다

### 1. GPT-4V를 활용하여 다양한 정보 추출

- Rate limit을 고려하여 GPT-4V api call

참고 : https://platform.openai.com/docs/guides/vision
Rate limit 확인 : https://platform.openai.com/account/limits

In [2]:
# Limits for a Tier 2 account (per-minute token and request caps, per-day request cap).
TPM, RPM, RPD = 20000, 100, 1000

# Low resolution: each image is sent as a 512x512 input.
low_res = True

if not low_res:
    print("기본 65 토큰 + 512px 사이즈로 crop 된 이미지 개수 x 129 토큰")
else:
    # Token cost assumed per request: one image plus the text prompt.
    token_per_img = 65
    text_token = 191
    # Throughput per minute is capped by whichever limit is hit first:
    # the token budget or the request budget.
    print(
        "1분에 최대 {}개의 이미지 처리 가능. 하루에 최대 {}개 처리 가능.".format(
            min(TPM // (token_per_img + text_token), RPM),
            RPD,
        )
    )

1분에 최대 78개의 이미지 처리 가능. 하루에 최대 1000개 처리 가능.


In [3]:
import base64
import requests

def encode_image(image_path):
  """Read the file at `image_path` and return its bytes as a base64 text string."""
  with open(image_path, "rb") as handle:
    raw_bytes = handle.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')

- GPT-4V는 JSON 형태의 결과를 보장하지 않는다
    - 따라서 전반적인 image description을 받은 후 검증이 필요함

In [4]:
# Instruction sent together with the room images. It asks for one JSON-like
# dictionary per image with an 'atmosphere' section and a 'furniture' list, and
# shows the expected layout inside a ```json fence (the parsing code in this
# notebook looks for that fence).
text_prompt = """Please analyze the living room image provided.  
Under 'atmosphere', include 'Color Scheme', 'Lighting', 'Spatial Layout', and 'Architectural Features' with descriptions based on the room's characteristics.
Use attributes like color, material, and type to describe each furniture item. 
The output should be formatted in a JSON-like dictionary structure. Each image should be done separately.

Example output :

```json
{
  "atmosphere": {
    "Color Scheme": <Description about color scheme>,
    "Lighting": <Description about lighting>,
    "Spatial Layout": <Description about spatial layouts >,
    "Architectural Features": <Descrption about architectural features>
  },
  "furniture": [
    {
      "Item": "Sofa",
      "Attributes": {
        "Color": <color>,
        "Material": <material>,
        "Description": <One sentence description>
      }
    },
    ...
  ]
}
```
"""

# HTTP headers for the chat completions endpoint (JSON body, Bearer auth).
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {openai.api_key}"
}

# Two sample living-room images, base64-encoded for inline transport.
img = encode_image('room-dataset/living/living_18.jpg')
img2 = encode_image('room-dataset/living/living_5.jpg')

# One user message: the text prompt first, then each image as a JPEG data URL.
payload = {
  "model": "gpt-4-vision-preview",
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "text", "text": text_prompt},
        *[
          {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
          for encoded in (img, img2)
        ],
      ],
    },
  ],
  "max_tokens": 1000
}


In [6]:
# Send the two-image payload. requests has no default timeout, so set one
# explicitly to keep the cell from hanging indefinitely on a stalled connection.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=120,
)
print(response.json())

In [None]:
# Text of the first choice's message in the response (raises KeyError if the
# response body has no 'choices' entry).
output = response.json()['choices'][0]['message']['content']

In [None]:
# Rewrite the backtick fences as ''' and then capture everything between a
# '''json opener and the next ''' (DOTALL so the capture spans multiple lines).
output = output.replace("```", "'''")

pattern = re.compile(r"'''json\n(.*?)'''", flags=re.DOTALL)
outputs = pattern.findall(output)

# Decode each captured block; json.loads raises if a block is not valid JSON.
outputs = list(map(json.loads, outputs))

In [8]:
def describe_image(input_prompt, image_paths, openai_key,
                   model="gpt-4-vision-preview", max_tokens=1000, timeout=120):
  """Send a text prompt plus one or more images to the chat completions API.

  Args:
    input_prompt: Instruction text placed first in the user message.
    image_paths: Iterable of image file paths. Each file is base64-encoded
      with encode_image and attached as a JPEG data URL.
    openai_key: API key used for the Bearer Authorization header.
    model: Model name sent in the request.
    max_tokens: Upper bound on completion tokens sent in the request.
    timeout: Seconds to wait for the HTTP request before giving up.

  Returns:
    The text content of the first choice in the API response.

  Raises:
    requests.HTTPError: If the API answers with a non-success status; the
      message includes the status code and the response body.
    requests.Timeout: If the request exceeds `timeout`.
  """
  headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {openai_key}"
      }

  # The text prompt goes first, followed by one image entry per path.
  contents = [{"type": "text", "text": input_prompt}]
  for path in image_paths:
    contents.append({
      "type": "image_url",
      "image_url": {
        "url": f"data:image/jpeg;base64,{encode_image(path)}"
      }
    })

  payload = {
      "model": model,
      "messages": [{"role": "user", "content": contents}],
      "max_tokens": max_tokens
      }

  response = requests.post("https://api.openai.com/v1/chat/completions",
                           headers=headers, json=payload, timeout=timeout)
  # Surface API failures (rate limit, bad key, ...) with their message instead
  # of an opaque KeyError on 'choices' further down.
  if not response.ok:
    raise requests.HTTPError(
        f"OpenAI API request failed with status {response.status_code}: {response.text}",
        response=response)

  return response.json()['choices'][0]['message']['content']

In [19]:
def parse_response(output):
    """Pull the contents of every ```json fenced block out of a model reply.

    Returns a list with the raw text found between each json fence opener and
    the next fence; the text is not JSON-decoded.
    """
    # Backtick fences are first rewritten as ''' and matched in that form.
    normalized = output.replace("```", "'''")
    return re.findall(r"'''json\n(.*?)'''", normalized, flags=re.DOTALL)

- output을 검증하는 레이어
- output을 저장하는 레이어 

In [25]:
# Images numbered 1 through 100
# The index is parsed with a regex instead of split('_')/int() so that stray
# files in the folder (e.g. ".DS_Store", "README.md") are skipped rather than
# raising IndexError/ValueError.
_index_pattern = re.compile(r'^[^_]*_(\d+)(?=[._]|$)')

# os.walk yields nothing for a missing directory; report that explicitly.
_dir_listing = next(os.walk('room-dataset/living'), None)
if _dir_listing is None:
    raise FileNotFoundError("Image directory not found: room-dataset/living")

_index_matches = [(name, _index_pattern.match(name)) for name in _dir_listing[2]]
img_paths = [
    name for name, found in _index_matches
    if found and 1 <= int(found.group(1)) <= 100
]

def extract_number(filename):
    """Return the first run of digits in `filename` as an int, or 0 if it has none."""
    digits = re.search(r'\d+', filename)
    if digits is None:
        return 0
    return int(digits.group())

# Order the file names numerically, then prefix each with the dataset folder.
img_paths = [
    os.path.join('room-dataset/living', name)
    for name in sorted(img_paths, key=extract_number)
]

# Consecutive pairs of paths; the last batch is shorter if the count is odd.
batches = [img_paths[start:start + 2] for start in range(0, len(img_paths), 2)]
outputs = {}

- GPT-4V는 JSON 형태의 아웃풋을 강제할 수 없음
- 단순 이미지 description을 활용하여 유사도 측정을 해도 무방함

In [26]:
# Billing warning: every iteration sends a paid GPT-4V request.
for batch in tqdm(batches):
    batch_key = '#'.join(batch)
    # Skip batches that already have a stored response, so re-running this cell
    # after a mid-loop failure does not pay again for finished batches.
    if batch_key in outputs:
        continue
    r = describe_image(text_prompt, batch, openai.api_key)
    outputs[batch_key] = r

  0%|          | 0/50 [00:00<?, ?it/s]

In [27]:
# Persist the collected responses (batch key -> raw model reply) as JSON.
with open("room-dataset/room_descriptions.json", 'w') as file:
    file.write(json.dumps(outputs))

In [30]:
# Show the raw reply stored under the key for the living_1 / living_2 batch
# (keys are the batch's paths joined with '#').
print(outputs['room-dataset/living/living_1.jpg#room-dataset/living/living_2.jpg'])

Certainly! I'll evaluate each living room image separately.

**First Image:**

```json
{
  "atmosphere": {
    "Color Scheme": "Predominantly natural and earthy tones with accents of warm red and gold.",
    "Lighting": "Warm, ambient lighting emphasized by the natural light filtering through the windows and the glow from the fireplace.",
    "Spatial Layout": "Open concept with clear zones for sitting and viewing the fireplace. Elevated walkway adds a vertical dimension.",
    "Architectural Features": "Exposed wooden beams, a large stone fireplace, and rustic wooden railing on the upper level. A chandelier made of antlers is a striking focal point."
  },
  "furniture": [
    {
      "Item": "Sofa",
      "Attributes": {
        "Color": "Dark brown",
        "Material": "Leather",
        "Description": "A comfortable dark brown leather sofa with patterned throw pillows."
      }
    },
    {
      "Item": "Armchair",
      "Attributes": {
        "Color": "Rich red",
        "Materi

### 2. Yolo를 활용하여 가구 detect + 좌표 추출

- YOLO class?

In [None]:
from utils import detect_objects

In [None]:
import yolov5

# Source: https://pypi.org/project/yolov5/

# load pretrained model (weights file 'yolov5s.pt')
model = yolov5.load('yolov5s.pt')

# set model parameters (values and descriptions as given in the package usage example linked above)
model.conf = 0.3  # NMS confidence threshold
model.iou = 0.45  # NMS IoU threshold
model.agnostic = False  # NMS class-agnostic
model.multi_label = False  # NMS multiple labels per box
model.max_det = 1000  # maximum number of detections per image

In [None]:
# Run detect_objects (imported from utils) on one sample image with the YOLOv5 model.
# NOTE(review): the return structure is not visible here; later cells use
# detections[0].show() and index detections[1] with 'labels', 'boxes',
# 'scores' and 'categories' — confirm against utils.detect_objects.
detections = detect_objects('room-dataset/living/living_18.jpg', model)

In [None]:
# Display the first element of the detection result — presumably the image
# rendered with its boxes; confirm against utils.detect_objects.
detections[0].show()

In [None]:
def filter_furniture(detections):
    """Keep only the furniture entries of a detection result.

    Args:
        detections: Indexable whose element [1] is a mapping with the keys
            'labels' (sequence of class names) and 'boxes', 'scores',
            'categories' (containers that accept a boolean-mask index and are
            parallel to 'labels').

    Returns:
        dict with 'boxes', 'scores' and 'categories' reduced to the entries
        whose label is chair, couch, bed or dining table, plus the matching
        names under 'labels'. The historical misspelled key 'lables' is kept
        as an alias so existing callers keep working.
    """
    # Class names treated as furniture. The original code listed ids
    # 56, 57, 59, 60 next to these names (referencing detections[0].names);
    # only the names are needed for filtering.
    furniture_names = ('chair', 'couch', 'bed', 'dining table')

    result = detections[1]
    labels = result['labels']
    # One flag per detection; used as a boolean mask on the parallel containers.
    keep = [label in furniture_names for label in labels]
    kept_labels = [label for label, flag in zip(labels, keep) if flag]

    return {
        'boxes': result['boxes'][keep],
        'scores': result['scores'][keep],
        'categories': result['categories'][keep],
        'labels': kept_labels,
        'lables': list(kept_labels),  # deprecated alias (original typo)
    }

In [None]:
# Furniture-only subset of the sample image's detections.
a = filter_furniture(detections)