In [10]:
!pip install chromadb langchain-community langchain requests together

Collecting langchain-community
  Using cached langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting together
  Using cached together-1.5.26-py3-none-any.whl.metadata (16 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Using cached langchain_community-0.3.29-py3-none-any.whl (2.5 MB)
Using cached together-1.5.26-py3-none-any.whl (107 kB)
Using cached dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Installing collected packages: dataclasses-json, together, langchain-community
Successfully installed dataclasses-json-0.6.7 langchain-community-0.3.29 together-1.5.26


In [11]:
import json
import os
import re
import uuid

import chromadb
import numpy as np
from chromadb import Client
from chromadb.config import Settings
from google.colab import drive
from langchain_community.embeddings import HuggingFaceEmbeddings
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Mount Google Drive at /content/drive; the data paths used later in this notebook live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# **Descriptions**

In [13]:
# Open a persistent Chroma client whose storage path is a folder on the mounted Drive.
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/pathpik Data/chromadb")

In [None]:
# Fetch the "ui_components" collection, creating it if it does not exist yet.
# NOTE(review): `comp_collection` is not referenced by any other cell visible here — confirm it is still needed.
comp_collection = chroma_client.get_or_create_collection(name="ui_components")

In [14]:
# Root folder that the ingestion cell walks for .svg files and their .txt descriptions.
base_path = "/content/drive/MyDrive/pathpik Data/svg with description"

In [36]:
# Collection that the cells below add components to, query, and update ("all_components").
all_comp_collection = chroma_client.get_or_create_collection(name="all_components")

In [16]:
# Sentence-embedding model; its encode() output is what the cells below store in and query against Chroma.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable so the real
# secret never has to be typed into (and saved with) the notebook. The original
# placeholder string is kept as the fallback when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "open_ai_key")

In [31]:
from openai import OpenAI
import json

# Shared OpenAI client; extract_metadata() below reads this module-level name.
client = OpenAI(api_key=api_key)

def _strip_code_fence(text: str) -> str:
    """Return ``text`` with a surrounding Markdown code fence removed, if present."""
    text = text.strip()
    if text[:7].lower() == "```json":
        text = text[7:]
    elif text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _slugify(value) -> str:
    """Lower-case ``value`` (as text), trim it, and replace spaces with hyphens."""
    return str(value).strip().lower().replace(" ", "-")


def _normalize_list_field(value) -> list:
    """Coerce ``value`` to a de-duplicated list of slug strings in first-seen order."""
    if value is None:
        return []
    if not isinstance(value, (list, tuple, set)):
        # A bare scalar (e.g. "navigation") is treated as a one-element list.
        value = [value]
    slugs = (_slugify(item) for item in value if item is not None)
    # dict.fromkeys de-duplicates while keeping order, so repeated runs return
    # the values in the same order (a set would not guarantee that).
    return list(dict.fromkeys(slug for slug in slugs if slug))


def extract_metadata(description: str):
    """Ask the chat model for structured metadata about a UI component description.

    Sends ``description`` to ``gpt-4o-mini`` through the module-level ``client``
    and parses the reply as JSON.

    Args:
        description: Free-text description of a UI component.

    Returns:
        dict: Always contains the string fields ``component_name``, ``device``,
        ``theme``, ``layout`` and ``domain`` (``"unspecified"`` when missing or
        not a usable string) and the list-of-string fields ``interactions``,
        ``accessibility``, ``style``, ``purpose`` and ``other`` (``[]`` when
        missing). Values are lower-cased with spaces replaced by hyphens. Any
        additional keys returned by the model are passed through untouched.

    Errors raised by the API call or by JSON parsing are caught, reported with
    ``print``, and result in the all-defaults dictionary.
    """
    prompt = f"""
    Extract detailed metadata from the following UI component description.
    Return only valid JSON without any markdown formatting or code blocks.
    Use this exact structure and format:

    {{
      "component_name": "string",
      "device": "string",
      "theme": "string",
      "layout": "string",
      "domain": "string",
      "interactions": ["array", "of", "strings"],
      "accessibility": ["array", "of", "strings"],
      "style": ["array", "of", "strings"],
      "purpose": ["array", "of", "strings"],
      "other": ["array", "of", "strings"]
    }}

    Use lowercase, hyphenated values where appropriate. Arrays should contain relevant string values, or be empty arrays [] if not applicable.

    Description: "{description}"
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        # The prompt asks for raw JSON, but strip a Markdown fence if one slipped in.
        content = _strip_code_fence(response.choices[0].message.content)
        metadata = json.loads(content)

        # Valid JSON is not necessarily an object; anything else cannot be used.
        if not isinstance(metadata, dict):
            raise ValueError(f"expected a JSON object, got {type(metadata).__name__}")

    except Exception as e:
        # Best-effort: report the problem and fall back to defaults.
        print(f"GPT parsing failed: {e}")
        metadata = {}

    scalar_keys = ("component_name", "device", "theme", "layout", "domain")
    list_keys = ("interactions", "accessibility", "style", "purpose", "other")

    # Scalar fields: a non-empty string is slugified, anything else becomes the default.
    for key in scalar_keys:
        value = metadata.get(key)
        slug = _slugify(value) if isinstance(value, str) else ""
        metadata[key] = slug or "unspecified"

    # Array fields: always end up as a list of unique slug strings.
    for key in list_keys:
        metadata[key] = _normalize_list_field(metadata.get(key))

    return metadata

In [32]:
# Smoke-test extract_metadata() on one hand-written component description.
meta = extract_metadata(
    description="A minimal mobile header with horizontal layout featuring hamburger menu icon positioned left, centered \"Pathpik\" logo with colorful geometric icon, and right-aligned section containing message icon, notification bell, and circular user profile avatar. Header maintains compact mobile proportions with dark theme and essential navigation elements optimized for touch interaction and limited screen width in educational applications."
)

In [33]:
# Display the metadata dict returned for the sample description.
meta

{'component_name': 'mobile-header',
 'device': 'mobile',
 'theme': 'dark',
 'layout': 'horizontal',
 'domain': 'educational',
 'interactions': ['click', 'touch'],
 'accessibility': ['screen-reader-friendly', 'high-contrast'],
 'style': ['colorful', 'minimal', 'compact'],
 'purpose': ['notifications', 'branding', 'navigation'],
 'other': ['user-profile-avatar', 'geometric-icon', 'hamburger-menu']}

In [None]:
def _find_description_file(base_name, filenames):
    """Return the ``.txt`` file in ``filenames`` that describes ``base_name``, or ``None``.

    An exact ``<base_name>.txt`` is preferred. Otherwise a ``.txt`` file is
    accepted only when ``base_name`` appears in its stem as a whole token (not
    touching another letter or digit), so ``1.svg`` is never paired with
    ``10.txt`` or ``21.txt`` the way a plain substring test would allow.
    """
    exact = f"{base_name}.txt"
    if exact in filenames:
        return exact

    whole_token = re.compile(
        rf"(?<![A-Za-z0-9]){re.escape(base_name)}(?![A-Za-z0-9])"
    )
    # Sorted so the choice is deterministic when several files qualify.
    for candidate in sorted(filenames):
        if candidate.endswith(".txt") and whole_token.search(os.path.splitext(candidate)[0]):
            return candidate
    return None


# Walk base_path and store every SVG, with its description and embedding, in Chroma.
for root, _dirs, files in os.walk(base_path):
    for file in files:
        if not file.endswith(".svg"):
            continue

        svg_path = os.path.join(root, file)
        base_name = os.path.splitext(file)[0]

        txt_name = _find_description_file(base_name, files)
        txt_path = os.path.join(root, txt_name) if txt_name else None

        # Read SVG content
        with open(svg_path, "r", encoding="utf-8") as f:
            svg_content = f.read()

        # Read description if exists, otherwise use a placeholder
        if txt_path:
            with open(txt_path, "r", encoding="utf-8") as f:
                description = f.read().strip()
        else:
            description = "No description available"

        # Derive the id from the SVG's path relative to base_path so that
        # re-running this cell targets the same record instead of inserting a
        # duplicate under a fresh random id.
        # NOTE(review): records already stored under random uuid4 ids are not
        # matched by these ids; clean those up separately if needed.
        relative_svg_path = os.path.relpath(svg_path, base_path)
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, relative_svg_path))

        # upsert = insert when the id is new, update when it already exists.
        # "filename" is stored because the metadata-enrichment cell preserves it.
        all_comp_collection.upsert(
            ids=[doc_id],
            documents=[description],
            metadatas=[{"svg_code": svg_content, "filename": file}],
            # One embedding per document, hence the enclosing list.
            embeddings=[model.encode(description, convert_to_numpy=True).tolist()],
        )

        print(f"Added {file} → description {'found' if txt_path else 'missing'} ✅")

Added 7.svg → description found ✅
Added 20.svg → description found ✅
Added 13.svg → description found ✅
Added 8.svg → description found ✅
Added 21.svg → description found ✅
Added 14.svg → description found ✅
Added 15.svg → description found ✅
Added 22.svg → description found ✅
Added 9.svg → description found ✅
Added 2.svg → description found ✅
Added 23.svg → description found ✅
Added 16.svg → description found ✅
Added 3.svg → description found ✅
Added 10.svg → description found ✅
Added 17.svg → description found ✅
Added 24.svg → description found ✅
Added 18.svg → description found ✅
Added 4.svg → description found ✅
Added 19.svg → description found ✅
Added 25.svg → description found ✅
Added 5.svg → description found ✅
Added 1.svg → description found ✅
Added 26.svg → description found ✅
Added 11.svg → description found ✅
Added 6.svg → description found ✅
Added 12.svg → description found ✅
Added 27.svg → description found ✅
Added footer.svg → description found ✅
Added feedback_section.sv

In [None]:
# Embed a free-text query with the same model used at ingestion time and ask
# the collection for its single closest match (n_results=1).
query = "dark modern footer with social links"
# The query is wrapped in a list, so the encoded result converts to a list of embeddings.
query_embedding = model.encode([query], convert_to_numpy=True).tolist()

results = all_comp_collection.query(
    query_embeddings=query_embedding,
    n_results=1
)

In [None]:
# Show the matched ids: a nested list, here one inner list holding the single requested match.
results['ids']

[['70b98c2e-00c8-4ff3-8502-eacef667e910']]

In [37]:
# Fetch items from the collection for the metadata-enrichment cell below.
# No ids or filter are passed, so this is meant to return the whole collection.
all_data = all_comp_collection.get(
    include=["documents", "metadatas"]  # Request documents and metadata only; embeddings are not needed here
)

# Report how many items came back.
print(f"Found {len(all_data['ids'])} items in collection")

Found 583 items in collection


In [40]:
# Enrich every stored component with GPT-extracted metadata.
# Set to True to re-run extraction for items that already carry extracted metadata;
# by default they are skipped so that re-running this cell does not repeat one
# API call per item.
REFRESH_EXISTING = False

for doc_id, description, existing_metadata in zip(
    all_data["ids"], all_data["documents"], all_data["metadatas"]
):
    # Treat a missing metadata entry as empty so the lookups below cannot fail.
    existing_metadata = existing_metadata or {}

    # "unspecified" is the default extract_metadata() falls back to, so it is
    # treated as "not extracted yet" and retried.
    already_enriched = existing_metadata.get("component_name", "unspecified") != "unspecified"
    if already_enriched and not REFRESH_EXISTING:
        print(f"Skipped document {doc_id} (metadata already extracted)")
        continue

    # Extract GPT metadata
    metadata = extract_metadata(description)

    # Convert lists to strings for Chroma, and keep only scalar values. This
    # covers any key the model returned, not just the five expected array fields.
    chroma_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (list, tuple, set)):
            value = ", ".join(str(item) for item in value)
        if isinstance(value, (str, int, float, bool)):
            chroma_metadata[key] = value

    # Preserve existing SVG code and filename
    for key in ("svg_code", "filename"):
        if key in existing_metadata:
            chroma_metadata[key] = existing_metadata[key]

    # Update metadata in Chroma
    all_comp_collection.update(
        ids=[doc_id],
        metadatas=[chroma_metadata]
    )

    print(f"Updated metadata for document {doc_id}")

Updated metadata for document e32ac9ce-666e-47a7-b268-ea6df2ce4b1f
Updated metadata for document 16e63960-8891-47ba-99d8-fc64282a7798
Updated metadata for document 196b655b-7d65-43c1-821f-3e8a1185239a
Updated metadata for document 30921aa7-99d2-496c-a91c-7eaa8ca17daf
Updated metadata for document 66ad0134-9ab8-4f4a-b858-32b7091ddf47
Updated metadata for document f7a05add-e979-4c7e-a06a-17c3880c4e0d
Updated metadata for document d190635b-8946-45d5-9da9-86f4df2d9072
Updated metadata for document e626b5e5-b474-4fd8-94a0-f7a7c08a09c3
Updated metadata for document fdf4a283-be29-43ce-8aca-322267af76bb
Updated metadata for document dc52c600-4ed6-4801-bdfd-55d47e8f1921
Updated metadata for document f1764359-62b5-4e88-b1e5-bac42cad8632
Updated metadata for document d5095fc5-bb4d-4c53-a52d-4fbf8366da4d
Updated metadata for document d108f60e-fb8c-4e99-887a-9117399d0061
Updated metadata for document 3c4c3674-dc30-4dbb-8d60-1a3ba24c39aa
Updated metadata for document 8f0e56b9-f534-4bb5-a47a-4ca6e1da