# Create Clustered Data

This notebook processes vehicle version data to add:
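- `model_id`: Lowercase, underscore-separated combination of make + model (e.g. `land_rover_discovery`)
- `price_range`: Lowest and highest `ex_showroom_price` across each model's versions, stored as the string `"<min>-<max>"`
- `variant`: `'base'` if the version's price equals its model's minimum, `'top'` if it equals the maximum, and `'mid'` otherwise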

In [1]:
import json
import os
import re
from collections import defaultdict
from typing import Any, Dict, List

In [2]:
def create_model_id(make: str, model: str) -> str:
    """
    Build a normalized identifier for a make/model pair.

    The two names are joined with an underscore and lowercased. Every run of
    characters outside ``a-z0-9`` then collapses into a single underscore,
    and no underscore is left at either end of the result.
    """
    lowered = f"{make}_{model}".lower()
    # Splitting on the separator runs leaves only the alphanumeric tokens;
    # empty strings produced by leading/trailing separators are discarded.
    tokens = [token for token in re.split(r'[^a-z0-9]+', lowered) if token]
    return '_'.join(tokens)

In [3]:
# File locations. OUTPUT_FILE is the same path as INPUT_FILE, so anything
# written to OUTPUT_FILE replaces the input file.
INPUT_FILE = 'version_data_with_images.json'
OUTPUT_FILE = 'version_data_with_images.json'

print(f"Loading data from {INPUT_FILE}...")
# Read the whole file as UTF-8 text and parse it as JSON
with open(INPUT_FILE, mode='r', encoding='utf-8') as input_handle:
    data = json.loads(input_handle.read())

print(f"Total documents loaded: {len(data)}")

Loading data from version_data_with_images.json...
Total documents loaded: 1974


In [4]:
# Attach a model_id to every document that has both a make and a model
print("Generating model_id for each document...")

for record in data:
    fields = record.get('_source', {})
    make_name = fields.get('make', '')
    model_name = fields.get('model', '')

    # Documents without a usable make/model pair are left untouched
    if not (make_name and model_name):
        continue

    fields['model_id'] = create_model_id(make_name, model_name)

print("model_id generation complete.")

# Show the first document as a quick sanity check
if data:
    sample = data[0]['_source']
    print(f"\nSample: make={sample.get('make')}, model={sample.get('model')}, model_id={sample.get('model_id')}")

Generating model_id for each document...
model_id generation complete.

Sample: make=land rover, model=discovery, model_id=land_rover_discovery


In [5]:
# Collect, per model_id, the lowest price, the highest price and the number
# of documents that contributed a price
print("\nGrouping by model_id and calculating price ranges...")

# New entries start at +inf/-inf so the first price seen replaces both bounds
model_price_stats = defaultdict(
    lambda: {'min_price': float('inf'), 'max_price': float('-inf'), 'count': 0}
)

for record in data:
    fields = record.get('_source', {})
    key = fields.get('model_id')
    price = fields.get('ex_showroom_price')

    # Only documents carrying both a model_id and a price contribute
    if key and price is not None:
        bucket = model_price_stats[key]
        if price < bucket['min_price']:
            bucket['min_price'] = price
        if price > bucket['max_price']:
            bucket['max_price'] = price
        bucket['count'] += 1

print(f"Total unique models found: {len(model_price_stats)}")

# Preview the first three models in insertion order
sample_models = list(model_price_stats.items())[:3]
for name, summary in sample_models:
    print(f"  {name}: {summary['count']} variants, price range: {summary['min_price']} - {summary['max_price']}")


Grouping by model_id and calculating price ranges...
Total unique models found: 267
  land_rover_discovery: 5 variants, price range: 12525583 - 13906000
  maruti_suzuki_ciaz: 9 variants, price range: 909054 - 1204027
  audi_q3: 4 variants, price range: 4323114 - 5247144


In [6]:
# Write price_range and variant onto every document that has a model_id and a price
print("\nAdding price_range and variant fields...")

variant_counts = dict.fromkeys(('base', 'mid', 'top', 'skipped'), 0)

for record in data:
    fields = record.get('_source', {})
    key = fields.get('model_id')
    price = fields.get('ex_showroom_price')

    # Documents lacking a model_id or a price are counted and left unchanged
    if not (key and price is not None):
        variant_counts['skipped'] += 1
        continue

    bounds = model_price_stats[key]
    lowest, highest = bounds['min_price'], bounds['max_price']

    # price_range is stored as a "<min>-<max>" string
    fields['price_range'] = f"{lowest}-{highest}"

    # The minimum is tested first, so when a model's minimum and maximum
    # coincide the document is labelled 'base' rather than 'top'
    if price == lowest:
        tier = 'base'
    elif price == highest:
        tier = 'top'
    else:
        tier = 'mid'

    fields['variant'] = tier
    variant_counts[tier] += 1

print("Fields added successfully.")
print("\nVariant distribution:")
print(f"  Base variants: {variant_counts['base']}")
print(f"  Mid variants: {variant_counts['mid']}")
print(f"  Top variants: {variant_counts['top']}")
print(f"  Skipped (missing data): {variant_counts['skipped']}")


Adding price_range and variant fields...
Fields added successfully.

Variant distribution:
  Base variants: 280
  Mid variants: 1491
  Top variants: 203
  Skipped (missing data): 0


In [7]:
# Print up to five versions of one model as a spot check
print("\n=== Sample Results ===")

# Take the first model whose stats report more than two versions (None if there is none)
multi_variant_model = next(
    (name for name, summary in model_price_stats.items() if summary['count'] > 2),
    None,
)

if multi_variant_model:
    print(f"\nSample model: {multi_variant_model}")
    sample_versions = [
        record
        for record in data
        if record.get('_source', {}).get('model_id') == multi_variant_model
    ]

    for position, record in enumerate(sample_versions[:5], start=1):
        fields = record['_source']
        print(f"\n  Version {position}:")
        print(f"    version_name: {fields.get('version_name')}")
        print(f"    ex_showroom_price: {fields.get('ex_showroom_price')}")
        print(f"    price_range: {fields.get('price_range')}")
        print(f"    variant: {fields.get('variant')}")


=== Sample Results ===

Sample model: land_rover_discovery

  Version 1:
    version_name: metropolitan edition
    ex_showroom_price: 13681666
    price_range: 12525583-13906000
    variant: mid

  Version 2:
    version_name: tempest edition
    ex_showroom_price: 13906000
    price_range: 12525583-13906000
    variant: top

  Version 3:
    version_name: s 3.0 diesel
    ex_showroom_price: 12525583
    price_range: 12525583-13906000
    variant: base

  Version 4:
    version_name: gemini edition
    ex_showroom_price: 12600000
    price_range: 12525583-13906000
    variant: mid

  Version 5:
    version_name: hse r-dynamic 3.0 diesel
    ex_showroom_price: 12972848
    price_range: 12525583-13906000
    variant: mid


In [8]:
# Save updated data back to file
print(f"\nSaving updated data to {OUTPUT_FILE}...")

# Opening OUTPUT_FILE directly with 'w' truncates it before any JSON is
# written. In this notebook OUTPUT_FILE is the same path as INPUT_FILE, so a
# failure or interrupt during json.dump would destroy the only copy of the
# data. Instead, write to a sibling temp file and swap it into place once the
# dump has completed.
tmp_output_file = f"{OUTPUT_FILE}.tmp"
try:
    with open(tmp_output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    # os.replace overwrites an existing destination; the temp file sits next
    # to the target so the rename stays on one filesystem.
    os.replace(tmp_output_file, OUTPUT_FILE)
finally:
    # After a successful replace the temp file no longer exists; on any
    # failure this removes the partial file and leaves OUTPUT_FILE untouched.
    if os.path.exists(tmp_output_file):
        os.remove(tmp_output_file)

print(f"Data successfully saved to {OUTPUT_FILE}")
print(f"\nTotal documents processed: {len(data)}")
print(f"Total unique models: {len(model_price_stats)}")


Saving updated data to version_data_with_images.json...
Data successfully saved to version_data_with_images.json

Total documents processed: 1974
Total unique models: 267


In [9]:
# Validation: count how many documents carry each of the new fields
print("\n=== Validation Summary ===")

# A field counts as missing when it is absent or falsy (e.g. None or an empty string)
sources = [doc.get('_source', {}) for doc in data]

missing_model_id = sum(1 for fields in sources if not fields.get('model_id'))
missing_price_range = sum(1 for fields in sources if not fields.get('price_range'))
missing_variant = sum(1 for fields in sources if not fields.get('variant'))

# A document is complete only when all three fields are present
complete_docs = sum(
    1
    for fields in sources
    if fields.get('model_id') and fields.get('price_range') and fields.get('variant')
)

print(f"Documents with all new fields: {complete_docs}")
print(f"Documents missing model_id: {missing_model_id}")
print(f"Documents missing price_range: {missing_price_range}")
print(f"Documents missing variant: {missing_variant}")

# Guard against division by zero when no documents were loaded
total_docs = len(data)
if total_docs > 0:
    completion_rate = complete_docs / total_docs * 100
else:
    completion_rate = 0
print(f"\nCompletion rate: {completion_rate:.2f}%")


=== Validation Summary ===
Documents with all new fields: 1974
Documents missing model_id: 0
Documents missing price_range: 0
Documents missing variant: 0

Completion rate: 100.00%
