In [1]:
import json
from collections import defaultdict, Counter
from typing import Any, Dict, List, Set
import statistics

## Structure analysis of `poi_tree_with_uuids.json`

In [3]:
def _parse_price_range(price: Any):
    """Parse a ``"X.XX - Y.YY"`` price string into a ``(low, high)`` float pair.

    Returns ``None`` when ``price`` is not a string, does not consist of
    exactly two ``-``-separated parts, or either part is not a valid float.
    Both bounds are parsed before anything is returned, so a caller never
    records a minimum without its matching maximum.
    """
    if not isinstance(price, str):
        return None
    bounds = price.split('-')
    if len(bounds) != 2:
        return None
    try:
        return float(bounds[0].strip()), float(bounds[1].strip())
    except ValueError:
        return None


def _is_number(value: Any) -> bool:
    """Return True when ``value`` is an int or float (i.e. usable in numeric statistics)."""
    return isinstance(value, (int, float))


def analyze_json_structure(filepath: str) -> None:
    """Print a structural and statistical report for a POI-tree JSON file.

    The file is loaded with ``json.load`` and its top-level type is printed.
    When the top level is a dict containing a ``'level_0'`` key, that value is
    treated as a mapping of POI keys to POI entries (dicts) and the report
    covers: sample entries, field names, category / district / region counts,
    parent-container prefixes, price / popularity / coordinate statistics,
    UUID coverage and a schema template.

    Malformed optional values are skipped instead of aborting the report:
    unparsable price strings, non-string parents, and non-numeric popularity
    or coordinates do not contribute to the corresponding statistics.

    Args:
        filepath: Path to the JSON file; it is read as UTF-8.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """

    # Load the JSON data. The encoding is explicit so the result does not
    # depend on the platform's default locale encoding.
    print(f"Loading JSON file: {filepath}")
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("\n" + "="*80)
    print("JSON DATA STRUCTURE ANALYSIS")
    print("="*80)

    # Basic structure info
    print("\n1. TOP-LEVEL STRUCTURE")
    print("-" * 80)
    print(f"Type: {type(data).__name__}")

    if isinstance(data, dict):
        print(f"Number of top-level keys: {len(data)}")
        print(f"Top-level keys: {list(data.keys())[:10]}...")

        # Analyze level_0
        if 'level_0' in data:
            level_0 = data['level_0']
            print(f"\n'level_0' contains: {len(level_0)} items")

            # Sample a few POI entries (each dump is truncated to 500 chars)
            print("\n2. SAMPLE POI ENTRIES")
            print("-" * 80)
            sample_keys = list(level_0.keys())[:3]
            for key in sample_keys:
                print(f"\nKey: {key}")
                print(f"Structure: {json.dumps(level_0[key], indent=2)[:500]}...")

            # Analyze POI structure
            print("\n3. POI FIELD ANALYSIS")
            print("-" * 80)

            # Collect all unique fields
            all_fields = set()
            data_fields = set()
            types_seen = set()
            categories = Counter()
            districts = Counter()
            regions = Counter()
            parent_types = Counter()

            # Statistics
            prices_min = []
            prices_max = []
            popularities = []
            latitudes = []
            longitudes = []

            for poi_value in level_0.values():
                # Top-level fields
                all_fields.update(poi_value.keys())
                types_seen.add(poi_value.get('type'))

                # Data fields
                poi_data = poi_value.get('data')
                if isinstance(poi_data, dict):
                    data_fields.update(poi_data.keys())

                    # Collect statistics
                    if 'category' in poi_data:
                        categories[poi_data['category']] += 1
                    if 'district' in poi_data:
                        districts[poi_data['district']] += 1
                    if 'region' in poi_data:
                        regions[poi_data['region']] += 1

                    # Only numeric values can be averaged / formatted below;
                    # nulls and other non-numbers are skipped.
                    if _is_number(poi_data.get('popularity')):
                        popularities.append(poi_data['popularity'])
                    if _is_number(poi_data.get('latitude')):
                        latitudes.append(poi_data['latitude'])
                    if _is_number(poi_data.get('longitude')):
                        longitudes.append(poi_data['longitude'])

                    # Parse price range; min and max are recorded together or
                    # not at all so both lists describe the same POIs.
                    price_range = _parse_price_range(poi_data.get('price'))
                    if price_range is not None:
                        prices_min.append(price_range[0])
                        prices_max.append(price_range[1])

                # Parent analysis: count by the text before the first '_'
                # (the whole string when it has no '_'). Non-string parents
                # such as null are skipped.
                parent = poi_value.get('parent')
                if isinstance(parent, str):
                    parent_types[parent.split('_')[0]] += 1

            print(f"Total POI entries analyzed: {len(level_0)}")
            print("\nTop-level fields in POI entries:")
            for field in sorted(all_fields):
                print(f"  - {field}")

            print("\nFields in 'data' object:")
            for field in sorted(data_fields):
                print(f"  - {field}")

            print(f"\nPOI types found: {types_seen}")

            print("\n4. CATEGORY DISTRIBUTION")
            print("-" * 80)
            print(f"Total unique categories: {len(categories)}")
            print("Top 15 categories:")
            for cat, count in categories.most_common(15):
                # str() keeps the ':30s' format valid for non-string labels.
                print(f"  {str(cat):30s}: {count:6d} POIs")

            print("\n5. GEOGRAPHIC DISTRIBUTION")
            print("-" * 80)
            print(f"Total unique districts: {len(districts)}")
            print("Top 10 districts:")
            for dist, count in districts.most_common(10):
                print(f"  {str(dist):30s}: {count:6d} POIs")

            print("\nRegions:")
            # Sorting on str(label) gives the same order as before for string
            # labels while also tolerating mixed label types (e.g. null).
            for reg, count in sorted(regions.items(), key=lambda item: str(item[0])):
                print(f"  {str(reg):30s}: {count:6d} POIs")

            print("\n6. PARENT CONTAINER TYPES")
            print("-" * 80)
            for parent_type, count in sorted(parent_types.items(), key=lambda x: -x[1]):
                print(f"  {parent_type:20s}: {count:6d} POIs")

            print("\n7. STATISTICAL SUMMARY")
            print("-" * 80)

            if prices_min:
                print("Price Range (Min):")
                print(f"  Mean: ${statistics.mean(prices_min):.2f}")
                print(f"  Median: ${statistics.median(prices_min):.2f}")
                print(f"  Range: ${min(prices_min):.2f} - ${max(prices_min):.2f}")

            if prices_max:
                print("\nPrice Range (Max):")
                print(f"  Mean: ${statistics.mean(prices_max):.2f}")
                print(f"  Median: ${statistics.median(prices_max):.2f}")
                print(f"  Range: ${min(prices_max):.2f} - ${max(prices_max):.2f}")

            if popularities:
                print("\nPopularity:")
                print(f"  Mean: {statistics.mean(popularities):.2f}")
                print(f"  Median: {statistics.median(popularities):.2f}")
                print(f"  Range: {min(popularities)} - {max(popularities)}")
                print(f"  Distribution: {Counter(popularities)}")

            if latitudes and longitudes:
                print("\nGeographic Coordinates:")
                print(f"  Latitude range: {min(latitudes):.6f} to {max(latitudes):.6f}")
                print(f"  Longitude range: {min(longitudes):.6f} to {max(longitudes):.6f}")

            # Check for UUIDs
            uuids_count = sum(1 for poi in level_0.values() if 'uuid' in poi)
            print("\nUUIDs:")
            print(f"  POIs with UUID: {uuids_count} / {len(level_0)}")

            print("\n8. SCHEMA TEMPLATE")
            print("-" * 80)
            print("""
Each POI entry follows this structure:
{
  "poi_X_Name": {
    "name": string,
    "type": string (e.g., "individual_poi"),
    "parent": string (container reference),
    "data": {
      "category": string,
      "latitude": float,
      "longitude": float,
      "district": string,
      "street": string,
      "price": string (range format "X.XX - Y.YY"),
      "popularity": integer (1-5),
      "characteristic": string (hashtags),
      "region": string
    },
    "spatial": [latitude, longitude],
    "textual": string (description),
    "uuid": string (UUID v4)
  }
}
            """)

    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)


In [4]:
if __name__ == "__main__":
    # Run the full structure report on the POI tree file (path is relative
    # to the notebook's working directory).
    filepath = "./Files/poi_tree_with_uuids.json"
    analyze_json_structure(filepath=filepath)

Loading JSON file: ./Files/poi_tree_with_uuids.json

JSON DATA STRUCTURE ANALYSIS

1. TOP-LEVEL STRUCTURE
--------------------------------------------------------------------------------
Type: dict
Number of top-level keys: 4
Top-level keys: ['level_0', 'level_1', 'level_2', 'level_3']...

'level_0' contains: 4696 items

2. SAMPLE POI ENTRIES
--------------------------------------------------------------------------------

Key: poi_0_Giant
Structure: {
  "name": "Giant",
  "type": "individual_poi",
  "parent": "container_Paya_Lebar_Square",
  "data": {
    "category": "supermarket",
    "latitude": 1.3794345,
    "longitude": 103.88795,
    "district": "HOUGANG",
    "street": "Hougang Street 51",
    "price": "9.78 - 12.88",
    "popularity": 5,
    "characteristic": "#budget, #essentials",
    "region": "NORTH-EAST"
  },
  "spatial": [
    1.3794345,
    103.88795
  ],
  "textual": "Giant supermarket #budget, #essentials",
  "uuid": "d401d...

Key: poi_1_NTUC_FairPrice
Structure: {
 