## Have a quick look at H&M catalogues

See https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data for more information about the dataset.

## Python Import

In [None]:
import pandas as pd

## Main CSV

### Load and easy checks

In [None]:
# Load the article catalogue into a DataFrame and preview its first rows.
df = pd.read_csv("../data/H&M/articles.csv")
df.head()

In [None]:
# (rows, columns) of the catalogue and its column labels.
df.shape, df.columns

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

### The data looks quite clean: only a few descriptions are missing!

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# List the column labels.
df.columns

In [None]:
# Horizontal Plotly histograms, one per categorical column
def create_histogram_plotly(df, column, title_suffix=""):
    """Build a horizontal Plotly histogram of the values in one column.

    Args:
        df: DataFrame holding the data.
        column: Name of the column whose values are counted.
        title_suffix: Text placed after "Distribution of" in the title; when
            empty, the title-cased column name is used instead.

    Returns:
        The configured Plotly figure. The figure is not shown here.
    """
    pretty_name = column.replace('_', ' ').title()

    # Roughly 20 px per distinct value, but never less than 400 px.
    distinct_values = len(df[column].unique())
    fig_height = max(400, distinct_values * 20)

    fig = px.histogram(
        df,
        y=column,
        title=f'Distribution of {title_suffix or pretty_name}',
        labels={'count': 'Frequency', column: pretty_name},
        height=fig_height,
        orientation='h',  # values on the y axis leave room for long names
    )

    fig.update_layout(
        xaxis_title="Frequency",
        yaxis_title=pretty_name,
        showlegend=False,
        margin={'l': 200, 'r': 50, 't': 50, 'b': 50},  # wide left margin for labels
        font={'size': 12},
    )

    # Order the bars by their total count.
    fig.update_yaxes(categoryorder="total ascending")

    return fig

# (column, title) pairs, listed in the order the figures are numbered.
histogram_specs = [
    # Colour-related columns
    ('colour_group_name', 'Colour Groups'),
    ('perceived_colour_value_name', 'Perceived Colour Values'),
    # Product-related distributions
    ('product_type_name', 'Product Types'),
    ('product_group_name', 'Product Groups'),
    ('garment_group_name', 'Garment Groups'),
    # Department and organization
    ('department_name', 'Departments'),
    ('section_name', 'Sections'),
    ('index_name', 'Index Names'),
    # Appearance-related
    ('graphical_appearance_name', 'Graphical Appearances'),
    ('perceived_colour_master_name', 'Master Colour Categories'),
]

(fig1, fig2, fig3, fig4, fig5,
 fig6, fig7, fig8, fig9, fig10) = [
    create_histogram_plotly(df, column, title) for column, title in histogram_specs
]

# Only fig3..fig10 are displayed by this cell; fig1 and fig2 are built but not shown.
for fig in (fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10):
    fig.show()


In [None]:
# Distinct colour group names present in the catalogue.
df['colour_group_name'].unique()

In [None]:
# Product type names, ordered from the most to the least frequent.
df["product_type_name"].value_counts().index.to_list()

## Ollama tagging

In [None]:
from pathlib import Path

In [None]:
# Collect every .jpg below the image directory (searched recursively).
image_paths = Path('../data/h-and-m-personalized-fashion-recommendations/images').rglob('*.jpg')
image_list = list(image_paths)
if image_list:
    print(f"Found {len(image_list)} images, {image_list[0]}")
else:
    # Indexing image_list[0] would raise IndexError here; a missing or
    # mistyped directory simply yields no matches.
    print("Found 0 images - check the image directory path")

In [None]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Show up to 25 images in a 5x5 grid; unused cells stay blank.
fig, axes = plt.subplots(5, 5, figsize=(15, 15))
axes = axes.flatten()  # 1-D view so the cells can be walked in order

n_shown = min(len(image_list), 25)

for i, ax in enumerate(axes):
    if i < n_shown:
        img = mpimg.imread(str(image_list[i]))
        ax.imshow(img)
        ax.set_title(f"{image_list[i].name}", fontsize=8)  # file name as caption
    # Axes are hidden for filled and empty cells alike.
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
import pickle

# Load the tag vocabulary from the pickle file and display it.
# Presumably pickle is used so that set values survive the round trip - confirm.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load tags.pkl when it comes from a trusted source.
with open('../results/tags.pkl', 'rb') as f:
    CLOTHING_CATEGORIES = pickle.load(f)

CLOTHING_CATEGORIES

In [None]:
def create_system_prompt(clothing_categories):
    """Assemble the system prompt that restricts the model to the allowed tags.

    Args:
        clothing_categories: Mapping of category name to its allowed tags.
            Only values that are a ``set`` or ``list`` get a section; other
            values are skipped.

    Returns:
        The prompt text: a fixed header, one upper-cased section per category
        with its sorted, double-quoted tags, and a closing rules section that
        lists the category names.
    """
    header = """You are a clothing analysis AI. Return ONLY valid JSON, no other text.

STRICT RULES: You MUST only use tags from these exact lists and include a description field as a string.

"""

    # One section per category, listing exactly the values that are allowed.
    sections = []
    for category, items in clothing_categories.items():
        if not isinstance(items, (set, list)):
            continue
        quoted_items = ', '.join(f'"{item}"' for item in sorted(items))
        sections.append(f'{category.upper()}: [{quoted_items}]\n\n')

    rules = f"""CRITICAL:
- Use ONLY the tags from the list: {list(clothing_categories.keys())}
- Each category must be a list of strings
- Add a "description" field with a short textual description of the clothing in the image
- If unsure about a category, use empty list []
- Return JSON only, no explanations

STRICT: Do not use any keys other than the exact category names provided.
Do not use combined or generic keys like "categories".
Every category must be present, even if empty like [].
"""

    return header + ''.join(sections) + rules


def create_user_prompt(num_categories=9):
    """Return the user prompt asking for the tagged JSON and a description.

    Args:
        num_categories: Number of categories mentioned in the prompt. The
            default of 9 reproduces the original fixed wording; pass the size
            of the category mapping when it differs.

    Returns:
        The prompt text.
    """
    return (
        f"Analyze this clothing image. Return JSON with the {num_categories} categories as lists of strings "
        'and a "description" field with a concise summary of the clothing. '
        "Use only allowed tags. JSON only, no other text."
    )

In [None]:
import json

In [None]:

def parse_json_simple(response_text: str) -> dict:
    """Parse *response_text* as JSON without any clean-up.

    Args:
        response_text: Raw text expected to be a complete JSON document.

    Returns:
        ``{'success': True, 'data': <parsed value>}`` when parsing works,
        otherwise ``{'success': False, 'data': None}``.
    """
    try:
        data = json.loads(response_text)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (malformed text), TypeError
        # covers non-text input such as None, RecursionError covers absurdly
        # deep nesting. Anything else (e.g. KeyboardInterrupt) propagates.
        return {'success': False, 'data': None}
    return {'success': True, 'data': data}


In [None]:
from PIL import Image
import io
import base64

def resize_and_encode_image(image_path, max_width=256):
    """Load an image, shrink it to at most *max_width*, and base64-encode it.

    Args:
        image_path: Path of the image file to open.
        max_width: Largest allowed width in pixels. Wider images are scaled
            down with their aspect ratio preserved; narrower ones keep their
            size.

    Returns:
        The image re-encoded as JPEG, as a base64 ASCII string.
    """
    # The context manager closes the underlying file once encoding is done.
    with Image.open(image_path) as source:
        image = source

        # JPEG cannot store alpha or palette modes; convert those up front so
        # saving does not fail for e.g. RGBA or P images.
        if image.mode not in ("RGB", "L", "CMYK"):
            image = image.convert("RGB")

        if image.width > max_width:
            ratio = max_width / image.width
            # Keep at least one pixel of height for extremely wide images.
            new_height = max(1, int(image.height * ratio))
            image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)

        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")

    return base64.b64encode(buffered.getvalue()).decode('utf-8')

In [None]:
import time
import ollama

def test_model(model_name: str, encoded_image: str, system_prompt: str, user_prompt: str) -> dict:
    """Send one image to one model and collect the outcome.

    Args:
        model_name: Name of the Ollama model to query.
        encoded_image: Base64-encoded image attached to the user message.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.

    Returns:
        A dict with the keys ``model``, ``duration`` (seconds), ``response``
        (raw reply text), ``data`` (parsed JSON or None) and ``json_success``.
        When the call raises, ``response`` and ``data`` are None,
        ``json_success`` is False and an ``error`` key holds the message, so
        both outcomes share the same base keys.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    try:
        response = ollama.chat(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt, "images": [encoded_image]},
            ],
        )
        duration = time.perf_counter() - start_time

        content = response.message.content
        parsed = parse_json_simple(content)

        return {
            "model": model_name,
            "duration": duration,
            "response": content,
            "data": parsed['data'],
            "json_success": parsed['success'],
        }

    except Exception as e:
        # Deliberate best-effort boundary: any failure of the call is reported
        # as a record instead of aborting the caller.
        return {
            "model": model_name,
            "duration": time.perf_counter() - start_time,
            "response": None,
            "error": str(e),
            "data": None,
            "json_success": False,
        }

In [None]:
import re
def extract_json_block(response_text):
    """Extract and parse the JSON object embedded in a model reply.

    The span from the first ``{`` to the last ``}`` is taken, the placeholders
    ``[""]`` and ``["]`` are rewritten to ``[]``, and the whole span is
    lower-cased before parsing. Lower-casing affects every key and every
    string value, including free text.

    Args:
        response_text: Raw reply text that may surround the JSON with prose.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If the input is not text, contains no ``{...}`` span, or
            the span is not valid JSON.
    """
    if not isinstance(response_text, str):
        raise ValueError("JSON block not found")

    match = re.search(r'(\{.*\})', response_text, re.DOTALL)
    if match is None:
        raise ValueError("JSON block not found")

    json_str = match.group(1).replace('[""]', '[]').replace('["]', '[]').lower()
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        # Keep the original decoding error attached as the cause.
        raise ValueError(f"Failed to parse JSON: {e}") from e

In [None]:
# Upper bound on the attempts call_ai_and_validate makes per model and image.
MAX_RETRIES = 10

from typing import List, Dict

def save_results(result, filename):
    """Append *result* to *filename* as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(filename, mode="a", encoding="utf-8") as sink:
        sink.write(json.dumps(result, ensure_ascii=False) + "\n")

def is_valid_structure(data, expected_keys):
    """Return True when *data* has exactly the keys in *expected_keys*.

    Any missing or additional key makes the result False; order and
    duplicates in *expected_keys* do not matter.
    """
    # The symmetric difference is empty only when both key sets are equal.
    mismatched_keys = set(data.keys()) ^ set(expected_keys)
    return not mismatched_keys

def call_ai_and_validate(model_name, encoded_image, system_prompt, user_prompt, expected_keys):
    """Query a model until its reply parses into the expected JSON structure.

    Every kind of failed attempt is retried: a failed model call, a reply
    without parseable JSON, and JSON whose keys differ from *expected_keys*.

    Args:
        model_name: Name of the model passed to ``test_model``.
        encoded_image: Base64-encoded image passed to ``test_model``.
        system_prompt: System prompt passed to ``test_model``.
        user_prompt: User prompt passed to ``test_model``.
        expected_keys: Keys the parsed JSON object must have, exactly.

    Returns:
        The record returned by ``test_model`` merged with the parsed JSON
        object (JSON keys win on a name clash).

    Raises:
        ValueError: If no attempt out of ``MAX_RETRIES`` produced a valid
            structure; the message names the last problem seen.
    """
    last_problem = "no attempt was made"

    for attempt in range(MAX_RETRIES):
        response = test_model(model_name, encoded_image, system_prompt, user_prompt)

        # A failed model call may carry no reply text at all.
        raw_text = response.get("response")
        if raw_text is None:
            last_problem = response.get("error") or "model returned no response text"
        else:
            try:
                json_data = extract_json_block(raw_text)
            except ValueError as exc:
                # An unparseable reply counts as a failed attempt, not as a
                # reason to give up immediately.
                last_problem = str(exc)
            else:
                if json_data and is_valid_structure(json_data, expected_keys):
                    return response | json_data
                last_problem = "response keys do not match the expected keys"

        print(f"Invalid response on attempt {attempt+1} ({last_problem}), retrying...")

    raise ValueError(
        f"Failed to get valid JSON structure after retries (last problem: {last_problem})"
    )

def _error_record(model_name, image_path, message):
    """Build the failure record stored for one (model, image) pair."""
    return {
        "model": model_name,
        "image_name": Path(image_path).name,
        "image_path": str(image_path),
        "error": message,
        "data": None,
        "json_success": False,
    }


def run_analysis_batch(image_paths: List[str], clothing_categories: dict, output_file: str) -> List[dict]:
    """Tag every image with every model, saving each record as it is produced.

    Args:
        image_paths: Paths of the images to analyse.
        clothing_categories: Mapping of category name to allowed tags; it
            drives both the system prompt and the keys expected in replies.
        output_file: JSON Lines file that each record is appended to.

    Returns:
        One record per (image, model) pair, in processing order. Successful
        records carry the model reply and parsed tags; failed ones carry an
        ``error`` message. Every returned record is also written to
        *output_file*.

    Raises:
        OSError: If a record cannot be written to *output_file*.
    """
    models = ["llava:7b", "qwen2.5vl:7b"]
    # Derived from the argument, not the notebook global, so the validation
    # always matches the categories the prompt was built from.
    expected_keys = list(clothing_categories.keys()) + ["description"]

    system_prompt = create_system_prompt(clothing_categories)
    user_prompt = create_user_prompt()

    results = []

    def record(result):
        # Keep the in-memory list and the file in step with each other.
        results.append(result)
        save_results(result, filename=output_file)

    for image_path in tqdm(image_paths, desc="Processing images"):
        # Only image loading is guarded here, so the "Image processing error"
        # label is never attached to a model failure.
        try:
            encoded_image = resize_and_encode_image(image_path)
        except Exception as e:
            for model_name in models:
                record(_error_record(model_name, image_path, f"Image processing error: {str(e)}"))
            continue

        for model_name in models:
            try:
                result = call_ai_and_validate(model_name, encoded_image, system_prompt, user_prompt, expected_keys)
            except ValueError as e:
                # Max retries reached, log and continue
                result = _error_record(model_name, image_path, f"Max retries reached: {str(e)}")
            except Exception as e:
                # A failure of one model must not discard the other models'
                # results for the same image.
                result = _error_record(model_name, image_path, f"Unexpected error: {str(e)}")
            else:
                result["image_name"] = Path(image_path).name
                result["image_path"] = str(image_path)
                result["json_success"] = True
            record(result)

    return results

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Tag the first 1000 images. Records are appended to the JSONL file, so
# re-running this cell adds further lines to an existing file.
results = run_analysis_batch(image_list[:1000], CLOTHING_CATEGORIES, output_file="../results/h&m.jsonl")