# Enrich Vehicle Data with OpenAI Summaries

This notebook processes vehicle version data and uses an OpenAI chat model (the code calls `gpt-4.1`) to generate:
- `key_specs`: List of 5 important specs/features as keywords
- `stand_out_features`: List of 5 descriptive standout features (30-50 chars each)

## Features
- Processes one version_id at a time
- Progress bar tracking
- Error handling (skips failed versions and continues)
- Incremental saving (saves after each successful update)
- Skips already processed versions

In [None]:
# Install required packages
!pip install openai python-dotenv tqdm -q

In [15]:
import json
import os
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI client; fail fast when the key is missing or still the placeholder
api_key = os.getenv('OPENAI_API_KEY')
if not api_key or api_key == 'your_api_key_here':
    raise ValueError("Please set your OPENAI_API_KEY in the .env file")

client = OpenAI(api_key=api_key)

print("✓ Environment loaded")
print("✓ OpenAI client initialized")
# Report the model that call_openai_for_summary() actually requests
print("✓ Using model: gpt-4.1")

✓ Environment loaded
✓ OpenAI client initialized
✓ Using model: gpt-4o


In [16]:
def already_processed(doc):
    """
    Tell whether a document has already been enriched.

    A document counts as processed only when its '_source' mapping contains
    both 'key_specs' and 'stand_out_features'; a missing '_source' is treated
    as an empty mapping.
    """
    payload = doc.get('_source', {})
    required_fields = ('key_specs', 'stand_out_features')
    return all(field in payload for field in required_fields)


def extract_vehicle_context(doc):
    """
    Build the plain-text vehicle description that is sent to OpenAI.

    Args:
        doc: Document dict; all vehicle fields are read from doc['_source'].

    Returns:
        str: Header lines (vehicle name, version id, segment, body style,
        fuel type, transmission) followed by a "Key Features and
        Specifications:" section containing one "- Name: value" line for
        every usable entry of _source['features']. When there are no usable
        features the section header is still emitted, with nothing after it.
    """
    source = doc.get('_source') or {}

    def _display_name(field):
        # Missing or null names fall back to 'Unknown'; str() guards non-string values
        return str(source.get(field) or 'Unknown').title()

    # Basic vehicle info
    make = _display_name('make')
    model = _display_name('model')
    version_name = _display_name('version_name')
    version_id = source.get('version_id', 'Unknown')

    # Key attributes
    segment = source.get('segment', 'N/A')
    body_style = source.get('body_style', 'N/A')
    fuel_type = source.get('fuel_type', 'N/A')
    transmission = source.get('transmission', 'N/A')

    # Build context string
    context = f"""Vehicle: {make} {model} - {version_name}
Version ID: {version_id}
Segment: {segment}
Body Style: {body_style}
Fuel Type: {fuel_type}
Transmission: {transmission}

Key Features and Specifications:
"""

    # Placeholder strings that mean "feature absent"; compared case-insensitively.
    # Kept as a tuple so unhashable feature values (lists, dicts) can be tested with `in`.
    absent_markers = ('false', 'not available', 'na')

    features = source.get('features') or {}
    feature_lines = []
    if isinstance(features, dict):
        for key, value in features.items():
            normalized = value.strip().lower() if isinstance(value, str) else value
            if not normalized or normalized in absent_markers:
                continue
            # Format the key nicely, e.g. 'seating_capacity' -> 'Seating Capacity'
            readable_key = str(key).replace('_', ' ').title()
            feature_lines.append(f"- {readable_key}: {value}")

    # Append the collected features so the model actually sees them
    if feature_lines:
        context += "\n".join(feature_lines) + "\n"

    return context


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence (``` or ```json), if any."""
    if not text.startswith('```'):
        return text

    lines = text.split('\n')
    if len(lines) == 1:
        # Whole reply on one line, e.g. ```json {...}```
        inner = text.strip('`').strip()
        return inner[4:].strip() if inner.startswith('json') else inner

    # Drop the opening fence line (it may carry a language tag such as "json")
    lines = lines[1:]
    # Drop the closing fence only when it is really there, so a reply that was
    # cut off before the closing fence does not lose its last JSON line
    if lines and lines[-1].strip().startswith('```'):
        lines = lines[:-1]
    return '\n'.join(lines).strip()


def _validate_summary(result):
    """Raise ValueError unless `result` is a dict holding two 5-item lists."""
    if (not isinstance(result, dict)
            or 'key_specs' not in result
            or 'stand_out_features' not in result):
        raise ValueError("Response missing required fields")

    for field in ('key_specs', 'stand_out_features'):
        items = result[field]
        if not isinstance(items, list) or len(items) != 5:
            raise ValueError("Response must contain exactly 5 items in each list")


def call_openai_for_summary(vehicle_context, version_id, model="gpt-4.1"):
    """
    Ask an OpenAI chat model for key_specs and stand_out_features.

    Args:
        vehicle_context: Text description of the vehicle, embedded in the prompt.
        version_id: Not used by this function; kept so existing callers keep working.
        model: Chat model name passed to the API. Defaults to "gpt-4.1".

    Returns:
        dict with 'key_specs' and 'stand_out_features', each a list of 5 items.

    Raises:
        RuntimeError: The API request itself failed (original error is chained).
        ValueError: The reply was empty, was not valid JSON, or did not have
            the expected structure.
    """
    prompt = f"""Analyze the following vehicle specifications and generate a summary:

{vehicle_context}

Based on this information, provide:
1. key_specs: A list of exactly 5 important specs/features as SHORT KEYWORDS (1-3 words max each)
   Examples: "Turbocharged Engine", "AWD", "Leather Seats", "360° Camera"
   
2. stand_out_features: A list of exactly 5 descriptive standout features (30-50 characters each)
   Examples: "3.0L turbocharged mild-hybrid engine", "Ventilated leather seats with memory"

Return ONLY a valid JSON object in this exact format:
{{
  "key_specs": ["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"],
  "stand_out_features": ["feature1", "feature2", "feature3", "feature4", "feature5"]
}}

Do not include any markdown formatting or additional text."""

    # Only the network call is wrapped, so parsing/validation problems below
    # are reported as what they are instead of as an "API call failed" error
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a vehicle specification expert. You analyze vehicle data and provide concise, accurate summaries in JSON format."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=500
        )
    except Exception as e:
        raise RuntimeError(f"OpenAI API call failed: {str(e)}") from e

    # The message content can be absent; treat that as an unusable reply
    content = response.choices[0].message.content if response.choices else None
    if not content or not content.strip():
        raise ValueError("OpenAI returned an empty response")

    # Remove markdown code blocks if present
    response_text = _strip_code_fence(content.strip())

    try:
        result = json.loads(response_text)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse OpenAI response as JSON: {e}") from e

    _validate_summary(result)
    return result


# Confirm which helpers this cell has just defined
print("\n".join([
    "✓ Helper functions defined",
    "  - already_processed()",
    "  - extract_vehicle_context()",
    "  - call_openai_for_summary()",
]))

✓ Helper functions defined
  - already_processed()
  - extract_vehicle_context()
  - call_openai_for_summary()


In [17]:
# Configuration
INPUT_FILE = 'version_data_with_images.json'
OUTPUT_FILE = 'final_data.json'


def _save_progress(documents, path):
    """Write `documents` to `path` as JSON without ever leaving a half-written file."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(documents, f, indent=2, ensure_ascii=False)
    # Swap the finished file into place; an interrupt during the dump above
    # leaves the previous OUTPUT_FILE untouched
    os.replace(tmp_path, path)


# Resume from the previous run's output when it exists, so documents that were
# already enriched are skipped instead of being sent to the API again.
# Delete OUTPUT_FILE to start over from INPUT_FILE.
source_file = OUTPUT_FILE if os.path.exists(OUTPUT_FILE) else INPUT_FILE

# Load the JSON data
print(f"Loading data from {source_file}...")
with open(source_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

total_documents = len(data)
print(f"Total documents loaded: {total_documents}")

# Initialize counters
processed = 0
skipped = 0
errors = []

# Main processing loop
print("\nStarting processing...")
print("=" * 60)

for doc in tqdm(data, desc="Processing versions", unit="version"):
    version_id = None

    try:
        # Get version_id for error tracking
        version_id = doc.get('_source', {}).get('version_id', 'Unknown')

        # Check if already processed
        if already_processed(doc):
            skipped += 1
            continue

        # Extract vehicle context
        vehicle_context = extract_vehicle_context(doc)

        # Call OpenAI API
        result = call_openai_for_summary(vehicle_context, version_id)

        # Update document with results
        doc['_source']['key_specs'] = result['key_specs']
        doc['_source']['stand_out_features'] = result['stand_out_features']

        # Save progress immediately after each successful update
        _save_progress(data, OUTPUT_FILE)

        processed += 1

    except KeyboardInterrupt:
        print("\n\n⚠️  Process interrupted by user")
        print(f"Progress saved up to version_id: {version_id}")
        break

    except Exception as e:
        # Log error and continue with next document
        error_info = {
            'version_id': version_id if version_id else 'Unknown',
            'error': str(e),
            'error_type': type(e).__name__
        }
        errors.append(error_info)
        continue

print("\n" + "=" * 60)
print("Processing Complete!")
print("=" * 60)
print(f"✓ Successfully processed: {processed}")
print(f"⊘ Skipped (already processed): {skipped}")
print(f"✗ Errors encountered: {len(errors)}")
print(f"\nTotal documents: {total_documents}")
print(f"Documents with summaries: {processed + skipped}")
print(f"Remaining documents: {total_documents - processed - skipped - len(errors)}")

Loading data from version_data_with_images.json...
Total documents loaded: 1974

Starting processing...


Processing versions: 100%|██████████| 1974/1974 [1:10:09<00:00,  2.13s/version]


Processing Complete!
✓ Successfully processed: 1974
⊘ Skipped (already processed): 0
✗ Errors encountered: 0

Total documents: 1974
Documents with summaries: 1974
Remaining documents: 0





In [18]:
# Error Analysis and Reporting

banner = "=" * 60

if not errors:
    print("\n✓ No errors encountered during processing!")
else:
    print("\n" + banner)
    print("ERROR DETAILS")
    print(banner)

    # Bucket the recorded failures by their exception type name
    error_types = {}
    for error in errors:
        error_types.setdefault(error.get('error_type', 'Unknown'), []).append(error)

    # Show at most five examples per bucket, then how many were left out
    for error_type, error_list in error_types.items():
        print(f"\n{error_type} ({len(error_list)} occurrences):")

        preview = error_list[:5]
        for i, error in enumerate(preview, 1):
            print(f"  {i}. Version ID: {error['version_id']}")
            print(f"     Error: {error['error'][:100]}...")  # Message cut to 100 chars

        hidden = len(error_list) - 5
        if hidden > 0:
            print(f"  ... and {hidden} more")

    # Persist the complete error list for later review
    with open('openai_enrichment_errors.json', 'w', encoding='utf-8') as f:
        json.dump(errors, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Full error log saved to: openai_enrichment_errors.json")
    print(f"\nYou can retry failed version_ids by re-running the processing cell.")

# Display sample of enriched data
print("\n" + banner)
print("SAMPLE ENRICHED DATA")
print(banner)

# Pick the first document that carries generated key_specs, if there is one
sample_source = next(
    (d['_source'] for d in data if 'key_specs' in d.get('_source', {})),
    None,
)

if sample_source is not None:
    source = sample_source
    make = source.get('make', '').title()
    model = source.get('model', '').title()
    version_name = source.get('version_name', '').title()

    print(f"\nVehicle: {make} {model} - {version_name}")
    print(f"Version ID: {source.get('version_id', '')}")

    print(f"\nKey Specs:")
    for spec in source.get('key_specs', []):
        print(f"  • {spec}")

    print(f"\nStand-Out Features:")
    for feature in source.get('stand_out_features', []):
        print(f"  • {feature}")

    print("-" * 60)

print("\n✓ Processing complete! Check final_data.json for full results.")


✓ No errors encountered during processing!

SAMPLE ENRICHED DATA

Vehicle: Land Rover Discovery - Metropolitan Edition
Version ID: 15625

Key Specs:
  • Diesel Engine
  • Automatic Transmission
  • Full-Size SUV
  • Luxury Segment
  • All-Wheel Drive

Stand-Out Features:
  • Premium metropolitan edition trim level
  • Spacious seven-seat interior layout
  • Advanced automatic transmission system
  • Robust diesel powertrain for efficiency
  • Full-size SUV with luxury appointments
------------------------------------------------------------

✓ Processing complete! Check final_data.json for full results.
