In [7]:
# Install required library (uncomment if needed)
# !pip install opensearch-py

from opensearchpy import OpenSearch
import json
import warnings

# Suppress SSL warnings
warnings.filterwarnings('ignore')

In [3]:
def fetch_all_documents(client, index_name, output_file):
    """
    Fetch all documents from an OpenSearch index and save them to a file.
    
    Args:
        client: OpenSearch client instance
        index_name: Name of the index to fetch documents from
        output_file: Path to the output file (will be saved as JSON)
    
    Returns:
        int: Total number of documents fetched
    """
    all_documents = []
    scroll_size = 1000  # Number of documents per scroll
    
    # Initial search with scroll
    response = client.search(
        index=index_name,
        body={
            "query": {"match_all": {}},
            "size": scroll_size
        },
        scroll='2m'  # Keep scroll context alive for 2 minutes
    )
    
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']
    
    # Add initial batch of documents
    for hit in hits:
        all_documents.append({
            '_id': hit['_id'],
            '_source': hit['_source']
        })
    
    print(f"Fetched {len(hits)} documents...")
    
    # Continue scrolling until no more documents
    while len(hits) > 0:
        response = client.scroll(
            scroll_id=scroll_id,
            scroll='2m'
        )
        
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']
        
        for hit in hits:
            all_documents.append({
                '_id': hit['_id'],
                '_source': hit['_source']
            })
        
        if len(hits) > 0:
            print(f"Fetched {len(all_documents)} documents so far...")
    
    # Clear the scroll context
    client.clear_scroll(scroll_id=scroll_id)
    
    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_documents, f, indent=2, ensure_ascii=False)
    
    print(f"\nTotal documents fetched: {len(all_documents)}")
    print(f"Data saved to: {output_file}")
    
    return len(all_documents)

In [10]:
# OpenSearch configuration
config = {
    'hosts': ['https://os-dev.cwsystem.in/'],
    'http_auth': None,  # Add ('username', 'password') if authentication is required
    'use_ssl': True,
    'verify_certs': False,
    'ssl_assert_hostname': False,
    'ssl_show_warn': False,
    'timeout': 3000
}

# Create OpenSearch client
client = OpenSearch(**config)

# Test connection
try:
    info = client.info()
    print("Connected to OpenSearch successfully!")
    print(f"Cluster: {info['cluster_name']}")
    print(f"Version: {info['version']['number']}")
except Exception as e:
    print(f"Failed to connect: {e}")

Connected to OpenSearch successfully!
Cluster: 408531640850:opensearch-dev
Version: 7.10.2


In [5]:
# Fetch all documents from feb_2026_vehicle_versions index
index_name = "feb_2026_vehicle_versions"
output_file = "version_data.json"

try:
    total_docs = fetch_all_documents(client, index_name, output_file)
    print(f"\n✓ Successfully fetched and saved {total_docs} documents!")
except Exception as e:
    print(f"Error: {e}")

Fetched 1000 documents...
Fetched 1974 documents so far...

Total documents fetched: 1974
Data saved to: version_data.json

✓ Successfully fetched and saved 1974 documents!


In [4]:
# Inspect the structure of both files
print("=== Images Data Structure ===")
with open('images_data.json', 'r', encoding='utf-8') as f:
    images_data = json.load(f)
    print(f"Total images records: {len(images_data)}")
    print(f"First record sample:")
    print(json.dumps(images_data[0], indent=2)[:500])

print("\n=== Version Data Structure ===")
with open('version_data.json', 'r', encoding='utf-8') as f:
    version_data = json.load(f)
    print(f"Total version records: {len(version_data)}")
    print(f"First record sample:")
    print(json.dumps(version_data[0], indent=2)[:500])

=== Images Data Structure ===
Total images records: 1993
First record sample:
{
  "makeid": 36,
  "versionid": 9183,
  "fueltype": "Petrol",
  "averagerating": null,
  "version_name": "Modena S",
  "model_name": "Levante",
  "imagepath": "/n/cw/ec/26924/levante-exterior-right-front-three-quarter-4.png?isig=0",
  "modelid": 1090,
  "make_name": "Maserati"
}

=== Version Data Structure ===
Total version records: 1974
First record sample:
{
  "_id": "pbQDTJwBqBXe7gSKoBph",
  "_source": {
    "vehicle_id": "land_rover_discovery_metropolitan_edition_2023",
    "make": "land rover",
    "model": "discovery",
    "version_id": "15625",
    "version_name": "metropolitan edition",
    "segment": "luxury",
    "body_style": "suvfull-size suv",
    "fuel_type": "diesel",
    "transmission": "automatic (tc)",
    "model_trim": "hse",
    "version_status": "new",
    "features": {
      "ride_height_adjustment": "true",
      "anti_lock_br


In [5]:
def add_imagepath_to_version_data(version_data_file, images_data_file, output_file=None):
    """
    Add imagepath and averagerating from images_data.json to version_data.json in the _source field.
    
    Args:
        version_data_file: Path to version_data.json file
        images_data_file: Path to images_data.json file
        output_file: Path to save the updated data (optional, defaults to version_data_file)
    
    Returns:
        dict: Statistics about the operation
    """
    # Load both JSON files
    print("Loading data files...")
    with open(version_data_file, 'r', encoding='utf-8') as f:
        version_data = json.load(f)
    
    with open(images_data_file, 'r', encoding='utf-8') as f:
        images_data = json.load(f)
    
    print(f"Loaded {len(version_data)} version records")
    print(f"Loaded {len(images_data)} image records")
    
    # Create a mapping dictionary: versionid -> {imagepath, averagerating}
    # Handle potential multiple images per version by taking the first one
    version_to_data = {}
    for img_record in images_data:
        version_id = str(img_record.get('versionid', ''))
        if version_id and version_id not in version_to_data:
            version_to_data[version_id] = {
                'imagepath': img_record.get('imagepath', ''),
                'averagerating': img_record.get('averagerating')
            }
    
    print(f"Created mapping for {len(version_to_data)} unique version IDs")
    
    # Add imagepath and averagerating to each record's _source field
    matched_count = 0
    unmatched_count = 0
    
    for record in version_data:
        if '_source' in record:
            version_id = str(record['_source'].get('version_id', ''))
            
            if version_id in version_to_data:
                record['_source']['imagepath'] = version_to_data[version_id]['imagepath']
                record['_source']['averagerating'] = version_to_data[version_id]['averagerating']
                matched_count += 1
            else:
                record['_source']['imagepath'] = None
                record['_source']['averagerating'] = None
                unmatched_count += 1
    
    # Save the updated data
    if output_file is None:
        output_file = version_data_file
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(version_data, f, indent=2, ensure_ascii=False)
    
    stats = {
        'total_records': len(version_data),
        'matched': matched_count,
        'unmatched': unmatched_count,
        'output_file': output_file
    }
    
    print(f"\n{'='*50}")
    print(f"Operation Complete!")
    print(f"{'='*50}")
    print(f"Total records processed: {stats['total_records']}")
    print(f"Records with imagepath and averagerating added: {stats['matched']}")
    print(f"Records without matching data: {stats['unmatched']}")
    print(f"Updated data saved to: {stats['output_file']}")
    
    return stats

In [6]:
# Execute the function to add imagepath to version data
stats = add_imagepath_to_version_data(
    version_data_file='version_data.json',
    images_data_file='images_data.json',
    output_file='version_data_with_images.json'  # Save to a new file to preserve original
)

Loading data files...
Loaded 1974 version records
Loaded 1993 image records
Created mapping for 1993 unique version IDs

Operation Complete!
Total records processed: 1974
Records with imagepath and averagerating added: 1954
Records without matching data: 20
Updated data saved to: version_data_with_images.json


In [25]:
# Verify the result - show a sample record with imagepath
print("\n=== Sample Record with ImagePath ===")
with open('version_data_with_images.json', 'r', encoding='utf-8') as f:
    updated_data = json.load(f)
    
    # Find a record that has an imagepath
    for record in updated_data:
        if record.get('_source', {}).get('imagepath'):
            print(json.dumps(record, indent=2)[:1000])
            print("...")
            break


=== Sample Record with ImagePath ===
{
  "_id": "pbQDTJwBqBXe7gSKoBph",
  "_source": {
    "vehicle_id": "land_rover_discovery_metropolitan_edition_2023",
    "make": "land rover",
    "model": "discovery",
    "version_id": "15625",
    "version_name": "metropolitan edition",
    "segment": "luxury",
    "body_style": "suvfull-size suv",
    "fuel_type": "diesel",
    "transmission": "automatic (tc)",
    "model_trim": "hse",
    "version_status": "new",
    "features": {
      "ride_height_adjustment": "true",
      "anti_lock_braking_system": "true",
      "four_wheel_drive": "full-time",
      "hill_hold_control": "true",
      "limited_slip_differential": "false",
      "brake_assist": "true",
      "electronic_stability_program": "true",
      "electronic_brake_force_distribution": "true",
      "hill_descent_control": "true",
      "traction_control_system": "true",
      "front_suspension": "fully independent, double wishbones with coil springs",
      "rear_brake_type": "vent

In [11]:
def upload_to_opensearch_index(client, source_file, index_name, batch_size=500):
    """
    Upload documents to OpenSearch index from version_data_with_images.json.
    Extracts only the _source content (without _id and _source wrapper).
    
    Args:
        client: OpenSearch client instance
        source_file: Path to the JSON file containing documents
        index_name: Name of the target index in OpenSearch
        batch_size: Number of documents per bulk upload batch
    
    Returns:
        dict: Statistics about the upload operation
    """
    from opensearchpy import helpers
    
    print(f"{'='*60}")
    print(f"Starting upload to index: {index_name}")
    print(f"{'='*60}\n")
    
    # Load the source data
    print("Loading source data...")
    with open(source_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"Loaded {len(data)} documents from {source_file}\n")
    
    # Check if index exists
    if client.indices.exists(index=index_name):
        print(f"Index '{index_name}' already exists.")
        response = input("Do you want to delete and recreate it? (yes/no): ")
        if response.lower() in ['yes', 'y']:
            client.indices.delete(index=index_name)
            print(f"Deleted existing index '{index_name}'")
        else:
            print("Aborting upload. Index already exists.")
            return None
    
    # Create the new index
    print(f"Creating index '{index_name}'...")
    client.indices.create(index=index_name)
    print(f"Index '{index_name}' created successfully\n")
    
    # Prepare documents for bulk upload
    print("Preparing documents for upload...")
    actions = []
    
    for record in data:
        # Extract only the _source content
        doc_content = record.get('_source', {})
        
        # Use vehicle_id as the document ID for uniqueness
        doc_id = doc_content.get('vehicle_id', '')
        
        if doc_id:
            action = {
                '_index': index_name,
                '_id': doc_id,
                '_source': doc_content
            }
            actions.append(action)
    
    print(f"Prepared {len(actions)} documents for upload\n")
    
    # Bulk upload with progress tracking
    print("Starting bulk upload...")
    success_count = 0
    error_count = 0
    
    # Process in batches
    for i in range(0, len(actions), batch_size):
        batch = actions[i:i+batch_size]
        try:
            success, failed = helpers.bulk(client, batch, stats_only=True, raise_on_error=False)
            success_count += success
            error_count += len(batch) - success
            
            print(f"Progress: {i+len(batch)}/{len(actions)} documents processed "
                  f"(Success: {success_count}, Errors: {error_count})")
        except Exception as e:
            print(f"Error in batch {i}-{i+len(batch)}: {e}")
            error_count += len(batch)
    
    # Refresh the index to make documents searchable
    client.indices.refresh(index=index_name)
    
    # Get final count from index
    count_response = client.count(index=index_name)
    final_count = count_response['count']
    
    stats = {
        'total_documents': len(data),
        'documents_prepared': len(actions),
        'success_count': success_count,
        'error_count': error_count,
        'final_index_count': final_count,
        'index_name': index_name
    }
    
    print(f"\n{'='*60}")
    print(f"Upload Complete!")
    print(f"{'='*60}")
    print(f"Total documents in source file: {stats['total_documents']}")
    print(f"Documents prepared for upload: {stats['documents_prepared']}")
    print(f"Successfully uploaded: {stats['success_count']}")
    print(f"Errors: {stats['error_count']}")
    print(f"Final document count in index: {stats['final_index_count']}")
    
    return stats

In [None]:
# Upload documents to the new index
upload_stats = upload_to_opensearch_index(
    client=client,
    source_file='version_data_with_images.json',
    index_name='mcp_version_data',
    batch_size=500
)

Starting upload to index: mcp_version_data

Loading source data...
Loaded 1974 documents from final_data.json

Creating index 'mcp_version_data'...
Index 'mcp_version_data' created successfully

Preparing documents for upload...
Prepared 1974 documents for upload

Starting bulk upload...
Progress: 500/1974 documents processed (Success: 500, Errors: 0)
Progress: 1000/1974 documents processed (Success: 1000, Errors: 0)
Progress: 1500/1974 documents processed (Success: 1500, Errors: 0)
Progress: 1974/1974 documents processed (Success: 1974, Errors: 0)

Upload Complete!
Total documents in source file: 1974
Documents prepared for upload: 1974
Successfully uploaded: 1974
Errors: 0
Final document count in index: 1974


In [21]:
# Verify the upload - check document count and structure
print("\n" + "="*60)
print("Verification: Checking uploaded documents")
print("="*60 + "\n")

# Get index stats
index_name = 'version_data_mcp'
count_response = client.count(index=index_name)
print(f"Total documents in '{index_name}': {count_response['count']}\n")

# Fetch a sample document to verify structure
sample_query = {
    "query": {"match_all": {}},
    "size": 1
}

response = client.search(index=index_name, body=sample_query)

if response['hits']['hits']:
    sample_doc = response['hits']['hits'][0]
    print("Sample document structure:")
    print(f"Document ID: {sample_doc['_id']}")
    print(f"\nDocument Content (_source):")
    print(json.dumps(sample_doc['_source'], indent=2)[:1500])
    print("\n... (truncated)")
    
    # Verify that _source contains the vehicle data directly (not wrapped)
    source = sample_doc['_source']
    print(f"\n✓ Document contains 'vehicle_id': {source.get('vehicle_id', 'NOT FOUND')}")
    print(f"✓ Document contains 'make': {source.get('make', 'NOT FOUND')}")
    print(f"✓ Document contains 'model': {source.get('model', 'NOT FOUND')}")
    print(f"✓ Document contains 'imagepath': {source.get('imagepath', 'NOT FOUND')}")
    print(f"\n✓ Structure verified: Documents contain only _source content without wrapper!")
else:
    print("No documents found in index!")


Verification: Checking uploaded documents

Total documents in 'version_data_mcp': 1974

Sample document structure:
Document ID: audi_q3_40_tfsi_premium_plus_2024

Document Content (_source):
{
  "vehicle_id": "audi_q3_40_tfsi_premium_plus_2024",
  "make": "audi",
  "model": "q3",
  "version_id": "18875",
  "version_name": "40 tfsi premium plus",
  "segment": "luxury",
  "body_style": "suvcompact suv",
  "fuel_type": "petrol",
  "transmission": "automatic (dct)",
  "model_trim": "premium plus",
  "version_status": "new",
  "features": {
    "backrest_bolsters_in_out": "false",
    "seat_base_bolsters_in_out": "false",
    "backrest_tilt_forward_back": "manual",
    "headrest_forward_back": "false",
    "lumbar_forward_back": "false",
    "memory_presets": "not available",
    "shoulder_support_bolsters_in_out": "false",
    "extended_thigh_support_forward_back": "false",
    "lumbar_up_down": "false",
    "seat_base_sliding": "false",
    "headrest_up_down": "manual",
    "seat_adjustm