<a href="https://colab.research.google.com/github/ckz2011/ClothMatchMaker/blob/main/Clothing_Match_Maker_Assistant_Complete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧥 Clothing Match Maker Assistant using RAG + GPT-4 Vision

This notebook demonstrates a Clothing Recommendation system using image analysis and RAG with GPT-4 Vision.

In [33]:
!pip install -U langchain-community langchain-openai langchain openai tiktoken --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m61.4/70.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

# Download the sample clothing archive into the working directory and unpack it.
# `curl -O` saves the file under its remote name (sample_clothes.zip);
# `unzip -o` overwrites already-extracted files without prompting, so the cell can be re-run.
!curl -O https://raw.githubusercontent.com/anshupandey/Generative-AI-for-Professionals/main/datasets/sample_clothes.zip
!unzip -o sample_clothes.zip



In [43]:

import os
import base64
import pandas as pd
import numpy as np
import json
import ast
from typing import List
from tqdm import tqdm
from tenacity import retry, wait_random_exponential, stop_after_attempt
from IPython.display import Image, display, HTML
import concurrent.futures
import tiktoken
from google.colab import userdata, files

from langchain_community.chat_models.azure_openai import AzureChatOpenAI
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_core.messages import HumanMessage


# Config
# Chat model name handed to the AzureChatOpenAI client below.
# NOTE(review): analyze_image sends an image to this model, so it presumably has to be
# a vision-capable model/deployment — confirm against the Azure resource.
GPT_MODEL = "gpt-4"
# Embedding model name handed to the embeddings client below.
EMBEDDING_MODEL = "text-embedding-3-large"
# Price in USD per 1,000 tokens; only used for the cost estimate printed by embed_corpus.
EMBEDDING_COST_PER_1K_TOKENS = 0.00013
# Working directory at the time this cell runs; dataset and image paths are built from it.
curr_path = os.getcwd()


# Template for the Azure OpenAI settings that are loaded through python-dotenv further down.
# Replace each XXXXXXXXXXXXXX placeholder with a real value before running, and do not
# share or commit the notebook with real credentials filled in.
env_variables = """
AZURE_OPENAI_ENDPOINT=XXXXXXXXXXXXXX
AZURE_OPENAI_API_KEY=XXXXXXXXXXXXXX
OPENAI_API_VERSION=XXXXXXXXXXXXXX
"""

# Create the .env file in the current directory
# NOTE: mode 'w' truncates, so any existing .env in this directory is overwritten on every run.
with open('.env', 'w') as f:
    f.write(env_variables)



from dotenv import load_dotenv
import os

# Load the environment variables from the .env file.
# override=True matters in a notebook: by default load_dotenv() leaves variables that are
# already set in the process untouched, so after editing the credentials above and
# re-running this cell the kernel would silently keep the stale values.
load_dotenv(override=True)

# Access the environment variables
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
azure_openai_api_key = os.getenv('AZURE_OPENAI_API_KEY')
openai_api_version = os.getenv('OPENAI_API_VERSION')


# Print to verify (endpoint only — the API key is deliberately not printed)
print(f"Azure OpenAI Endpoint: {azure_openai_endpoint}")

# Chat client used for the image analysis step.
# The API version is taken from `openai_api_version`, which was read from OPENAI_API_VERSION —
# the only version variable the .env file defines. Looking up AZURE_OPENAI_API_VERSION
# instead yields None because that variable is never set.
client = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    api_version=openai_api_version,
    model=GPT_MODEL,
    temperature=0  # deterministic output so the JSON reply is as stable as possible
)

# Embedding client used for both the catalogue and the query descriptions.
# NOTE(review): this is the generic OpenAI embeddings class pointed at an Azure endpoint;
# langchain also provides AzureOpenAIEmbeddings — confirm which one this deployment needs.
embedding_client = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=azure_openai_api_key,
    openai_api_base=azure_openai_endpoint,
    openai_api_version=openai_api_version
)


Azure OpenAI Endpoint: https://eastus2.api.cognitive.microsoft.com/


In [24]:
# Read the product catalogue that ships inside the unpacked sample_clothes folder.
# Rows pandas cannot parse are dropped (on_bad_lines='skip') instead of aborting the load.
styles_filepath = os.path.join(curr_path, "sample_clothes", "sample_styles.csv")
styles_df = pd.read_csv(filepath_or_buffer=styles_filepath, on_bad_lines='skip')
print("Loaded Dataset with", styles_df.shape[0], "entries")


Loaded Dataset with 1000 entries


In [None]:

@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(10))
def get_embeddings(input: list):
    """Embed a batch of texts with the module-level ``embedding_client``.

    The call is wrapped in a tenacity retry: at most 10 attempts, separated by
    randomized exponential waits configured with min=1 and max=40.

    Args:
        input: list of texts forwarded to ``embedding_client.embed_documents``.

    Returns:
        Whatever ``embed_documents`` returns for the batch.

    Raises:
        Any exception from the client is printed and re-raised so the retry
        decorator can schedule another attempt.
    """
    try:
        vectors = embedding_client.embed_documents(input)
    except Exception as e:
        # Log every failed attempt, then hand the error back to the retry wrapper.
        print(f"Error in get_embeddings: {str(e)}")
        raise
    else:
        return vectors

def batchify(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    Args:
        iterable: a sized, sliceable sequence (list, str, ...).
        n: maximum number of items per slice.

    Yields:
        Slices of ``iterable`` in order; the last one may be shorter than ``n``.
    """
    size = len(iterable)
    # Slicing clamps at the end of the sequence, so the final chunk is simply shorter.
    yield from (iterable[start:start + n] for start in range(0, size, n))

def embed_corpus(corpus: List[str], batch_size=64, num_workers=8, max_context_len=8191):
    """Embed every text in ``corpus`` concurrently and return the vectors in input order.

    Args:
        corpus: texts to embed.
        batch_size: number of texts sent per ``get_embeddings`` call.
        num_workers: size of the thread pool issuing the calls.
        max_context_len: per-text token cap applied when estimating the cost.
            It only affects the printed estimate; the texts submitted for
            embedding are the original, untruncated strings.

    Returns:
        A flat list with one embedding per text, in the same order as ``corpus``.

    Raises:
        Whatever ``get_embeddings`` raises for a batch once its retries are used up.
    """
    # Token counts (cl100k_base encoding) are used for the cost estimate below.
    encoding = tiktoken.get_encoding("cl100k_base")
    encoded_corpus = [encoding.encode(text)[:max_context_len] for text in corpus]
    num_tokens = sum(len(article) for article in encoded_corpus)
    cost_to_embed_tokens = num_tokens / 1000 * EMBEDDING_COST_PER_1K_TOKENS
    print(f"num_articles={len(encoded_corpus)}, num_tokens={num_tokens}, est_embedding_cost={cost_to_embed_tokens:.4f} USD")

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Remember how many texts each future covers so the progress bar advances by the
        # real batch length; the final batch is usually shorter than batch_size, and
        # advancing by batch_size every time would overshoot the bar's total.
        batch_lengths = {
            executor.submit(get_embeddings, batch): len(batch)
            for batch in batchify(corpus, batch_size)
        }
        with tqdm(total=len(corpus)) as pbar:
            for finished in concurrent.futures.as_completed(batch_lengths):
                pbar.update(batch_lengths[finished])
        embeddings = []
        # Dicts keep insertion order, i.e. submission order, so results line up with corpus
        # regardless of the order in which the batches completed.
        for future in batch_lengths:
            embeddings.extend(future.result())
    return embeddings

def generate_embeddings(df, column_name):
    """Embed the text in ``df[column_name]`` and store the vectors in an 'embeddings' column.

    Args:
        df: DataFrame to annotate; it is modified in place.
        column_name: column whose values are cast to ``str`` and embedded.

    Returns:
        The same DataFrame object, now carrying the 'embeddings' column.
    """
    texts = df[column_name].astype(str).tolist()
    vectors = embed_corpus(texts)
    df['embeddings'] = vectors
    return df

# Embed every product display name, then persist the enriched catalogue next to the source CSV.
styles_df = generate_embeddings(styles_df, column_name='productDisplayName')
styles_df.to_csv(path_or_buf='sample_clothes/sample_styles_with_embeddings.csv', index=False)


In [None]:

def cosine_similarity_manual(vec1, vec2):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm.

    Args:
        vec1: first vector (any array-like accepted by ``np.array``).
        vec2: second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when a norm is zero.
    """
    a = np.array(vec1)
    b = np.array(vec2)
    inner = np.dot(a, b)
    len_a = np.linalg.norm(a)
    len_b = np.linalg.norm(b)
    # A zero-length vector has no direction; report 0.0 rather than dividing by zero.
    if not (len_a and len_b):
        return 0.0
    return inner / (len_a * len_b)

def find_similar_items(input_embedding, embeddings, threshold=0.5, top_k=2):
    """Rank ``embeddings`` by cosine similarity to ``input_embedding``.

    Args:
        input_embedding: query vector.
        embeddings: iterable of candidate vectors.
        threshold: minimum similarity a candidate needs to be kept.
        top_k: maximum number of results returned.

    Returns:
        Up to ``top_k`` ``(position, similarity)`` tuples, best match first, where
        ``position`` is the candidate's index within ``embeddings``.
    """
    candidates = []
    for position, candidate_vec in enumerate(embeddings):
        score = cosine_similarity_manual(input_embedding, candidate_vec)
        if score >= threshold:
            candidates.append((position, score))
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:top_k]

def find_matching_items_with_rag(df_items, item_descs):
    """Look up the catalogue rows most similar to each suggested item description.

    Args:
        df_items: DataFrame with an 'embeddings' column (one vector per row).
        item_descs: iterable of free-text item descriptions to match.

    Returns:
        A list of DataFrame rows (``df_items.iloc[i]``), grouped in the order of
        ``item_descs``. A row can appear more than once if it matches several
        descriptions. Empty when ``item_descs`` is empty.
    """
    embeddings = df_items['embeddings'].tolist()
    descriptions = list(item_descs)
    if not descriptions:
        # Nothing to look up; avoid sending an empty request to the embeddings service.
        return []

    # Embed all descriptions in one request instead of one round trip per description.
    # The result is consumed positionally: one vector per description, in input order.
    query_embeddings = get_embeddings(descriptions)

    similar_items = []
    for query_embedding in query_embeddings:
        matches = find_similar_items(query_embedding, embeddings)
        # Match indices are positions in `embeddings`, hence positional .iloc lookups.
        similar_items.extend(df_items.iloc[i] for i, _ in matches)
    return similar_items


In [None]:

# Ask the user for an image; the result is indexed by filename below.
uploaded = files.upload()
if not uploaded:
    # Cancelling the dialog leaves nothing to index; fail with an actionable message
    # instead of a bare IndexError.
    raise ValueError("No file was uploaded. Re-run this cell and choose an image.")
# Only the first uploaded file is analysed.
uploaded_path = next(iter(uploaded.keys()))

def encode_image_to_base64(image_path):
    """Read the file at ``image_path`` and return its contents as a base64 string.

    Args:
        image_path: path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Base64-encode the uploaded file so analyze_image can embed it in the request as a data URL.
encoded_image = encode_image_to_base64(uploaded_path)

def analyze_image(image_base64, subcategories, mime_type="image/jpeg"):
    """Ask the chat model which items would go with the clothing item in the image.

    Args:
        image_base64: base64-encoded image bytes.
        subcategories: collection of allowed category names, interpolated into the prompt.
        mime_type: MIME type placed in the data URL. Defaults to "image/jpeg";
            pass e.g. "image/png" when the uploaded file is not a JPEG.

    Returns:
        ``response.content`` from the model. The prompt asks for JSON with the keys
        'items', 'category' and 'gender', but the reply is returned unparsed.
    """
    messages = [
        HumanMessage(content=[
            {"type": "text", "text": f"""Given an image of an item of clothing, return JSON with: 'items', 'category', 'gender'.
            Choose category from: {subcategories}. Gender from: [Men, Women, Boys, Girls, Unisex].
            Example Input: A black leather jacket.
            Example Output: {{"items": ["White Tee", "Skinny Jeans"], "category": "Jackets", "gender": "Women"}}"""},
            # The image part uses the documented object form ({"url": ...}) rather than a bare string.
            {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}}
        ])
    ]
    response = client.invoke(messages)
    return response.content

# Categories the model may choose from: every distinct articleType in the catalogue.
unique_subcategories = styles_df['articleType'].unique()

raw_analysis = analyze_image(encoded_image, unique_subcategories).strip()
# Chat models often wrap JSON in a Markdown code fence (```json ... ```), which
# json.loads rejects. Peel the fence off before parsing.
if raw_analysis.startswith("```"):
    raw_analysis = raw_analysis.strip("`").strip()
    if raw_analysis[:4].lower() == "json":
        raw_analysis = raw_analysis[4:]
analysis = json.loads(raw_analysis)

# Suggested complementary items, plus the category and gender detected for the uploaded item.
item_descs = analysis['items']
item_category = analysis['category']
item_gender = analysis['gender']


In [None]:

# Candidate rows: same gender as the uploaded item (or Unisex) and a different article
# type, so the suggestions complement the item instead of duplicating it.
filtered_items = styles_df[(styles_df['gender'].isin([item_gender, 'Unisex'])) & (styles_df['articleType'] != item_category)]
matching_items = find_matching_items_with_rag(filtered_items, item_descs)

# Headings have to be passed to display(); a string that is only assigned is never rendered.
display(HTML("<h3>Uploaded Image:</h3>"))
display(Image(filename=uploaded_path))

display(HTML("<h3>Matching Items:</h3>"))
paths = []
# NOTE(review): the CSV is read from sample_clothes/ while images are read from
# sample_clothes/sample_clothes/sample_images — confirm this matches the archive layout.
image_folder = os.path.join(curr_path, "sample_clothes", "sample_clothes", "sample_images")
for item in matching_items:
    item_id = item['id']
    item_path = os.path.join(image_folder, f"{item_id}.jpg")
    paths.append(item_path)
    if not os.path.exists(item_path):
        # One missing picture should not abort the rest of the gallery.
        print(f"Image not found, skipping: {item_path}")
        continue
    display(Image(filename=item_path))
