In [3]:
import pandas as pd
import numpy as np
from typing import Dict, List, Set, Tuple
from collections import defaultdict
import json

In [4]:
class POITreeBuilder:
    def __init__(self, csv_file: str):
        """
        Initialize POI Tree Builder
        
        Args:
            csv_file: Path to CSV with columns:
                     name, category, latitude, longitude, district, street, 
                     price, popularity, characteristic, region
        """
        self.df = pd.read_csv(csv_file)
        self.df.columns = ['name', 'category', 'latitude', 'longitude', 
                          'district', 'street', 'price', 'popularity', 
                          'characteristic', 'region']
        
        # Container detection keywords
        self.CONTAINER_KEYWORDS = {
            "mall", "centre", "plaza", "tower", "hub", "center", "city", 
            "junction", "terminal", "complex", "central", "point", "square", 
            "village", "orchard", "building", "galleria", "arcade", "emporium"
        }
        
        self.CONTAINER_CATEGORIES = {
            "shopping_mall", "university", "hospital", "airport", "station",
            "stadium", "convention_center", "hotel"
        }
        
        self.tree = {
            'level_0': {},  # Individual POIs
            'level_1': {},  # Container POIs / Street clusters
            'level_2': {},  # Districts
            'level_3': {}   # Regions
        }
        
        self.container_pois = set()
        self.individual_pois = set()
        self.poi_to_container_map = {}
        
    def detect_containers(self) -> Set[str]:
        """
        Detect which POIs are containers using multiple strategies
        
        Returns:
            Set of container POI names
        """
        containers = set()
        
        # Strategy 1: Check name for container keywords
        for idx, row in self.df.iterrows():
            name_lower = str(row['name']).lower()
            if any(keyword in name_lower for keyword in self.CONTAINER_KEYWORDS):
                containers.add(row['name'])
                print(f"Container detected (keyword): {row['name']}")
        
        # Strategy 2: Check category
        category_containers = self.df[
            self.df['category'].isin(self.CONTAINER_CATEGORIES)
        ]['name'].tolist()
        containers.update(category_containers)
        for name in category_containers:
            print(f"Container detected (category): {name}")
        
        # Strategy 3: Check if POI name appears in street/address column
        all_streets = self.df['street'].dropna().unique()
        for street in all_streets:
            # Check if this street value matches any POI name
            matching_pois = self.df[
                self.df['name'].str.lower() == str(street).lower()
            ]['name'].tolist()
            
            if matching_pois:
                containers.update(matching_pois)
                print(f"Container detected (in address): {matching_pois[0]}")
            
            # Also check partial matches (e.g., "Suntec City" in street)
            for idx, row in self.df.iterrows():
                poi_name = str(row['name'])
                if poi_name not in containers and len(poi_name) > 5:
                    if poi_name.lower() in str(street).lower():
                        containers.add(poi_name)
                        print(f"Container detected (partial match): {poi_name}")
        
        return containers
    
    def map_pois_to_containers(self) -> Dict[str, str]:
        """
        Map individual POIs to their container POIs
        
        Returns:
            Dictionary mapping individual POI name -> container POI name
        """
        poi_container_map = {}
        
        for idx, row in self.df.iterrows():
            poi_name = row['name']
            street = str(row['street'])
            
            # Skip if this POI is itself a container
            if poi_name in self.container_pois:
                continue
            
            # Check if street matches a container POI name
            for container in self.container_pois:
                # Exact match
                if container.lower() == street.lower():
                    poi_container_map[poi_name] = container
                    break
                # Partial match (container name in street)
                elif container.lower() in street.lower() and len(container) > 5:
                    poi_container_map[poi_name] = container
                    break
                # Check if POI and container are at same location (within 50m)
                container_row = self.df[self.df['name'] == container].iloc[0]
                distance = self._haversine_distance(
                    row['latitude'], row['longitude'],
                    container_row['latitude'], container_row['longitude']
                )
                if distance < 0.05:  # Within 50 meters
                    poi_container_map[poi_name] = container
                    break
        
        return poi_container_map
    
    def _haversine_distance(self, lat1: float, lon1: float, 
                           lat2: float, lon2: float) -> float:
        """
        Calculate distance between two coordinates in kilometers
        """
        from math import radians, sin, cos, sqrt, atan2
        
        R = 6371  # Earth's radius in km
        
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        
        return R * c
    
    def build_level_0(self):
        """Build Level 0: Individual POIs"""
        print("\n=== Building Level 0: Individual POIs ===")
        
        for idx, row in self.df.iterrows():
            poi_name = row['name']
            
            # Skip containers at this level
            if poi_name in self.container_pois:
                continue
            
            poi_id = f"poi_{idx}_{poi_name.replace(' ', '_')}"
            
            # Determine parent (container or street cluster)
            parent_id = None
            if poi_name in self.poi_to_container_map:
                container_name = self.poi_to_container_map[poi_name]
                parent_id = f"container_{container_name.replace(' ', '_')}"
            else:
                # Use street as parent
                street_district = f"{row['street']}_{row['district']}"
                parent_id = f"street_{street_district.replace(' ', '_')}"
            
            self.tree['level_0'][poi_id] = {
                'name': poi_name,
                'type': 'individual_poi',
                'parent': parent_id,
                'data': {
                    'category': row['category'],
                    'latitude': row['latitude'],
                    'longitude': row['longitude'],
                    'district': row['district'],
                    'street': row['street'],
                    'price': row['price'],
                    'popularity': row['popularity'],
                    'characteristic': row['characteristic'],
                    'region': row['region']
                },
                'spatial': (row['latitude'], row['longitude']),
                'textual': f"{row['name']} {row['category']} {row['characteristic']}"
            }
            
            self.individual_pois.add(poi_name)
        
        print(f"Created {len(self.tree['level_0'])} individual POIs")
    
    def build_level_1(self):
        """Build Level 1: Container POIs and Street Clusters"""
        print("\n=== Building Level 1: Containers & Street Clusters ===")
        
        # First, add container POIs
        for idx, row in self.df.iterrows():
            poi_name = row['name']
            
            if poi_name not in self.container_pois:
                continue
            
            container_id = f"container_{poi_name.replace(' ', '_')}"
            
            # Find all children (individual POIs inside this container)
            children = [
                poi_id for poi_id, poi_data in self.tree['level_0'].items()
                if poi_data['parent'] == container_id
            ]
            
            # Aggregate textual features from children
            child_texts = [
                self.tree['level_0'][child]['textual'] 
                for child in children
            ]
            aggregated_text = f"{poi_name} {row['category']} {row['characteristic']} " + " ".join(child_texts)
            
            # Parent is district
            parent_id = f"district_{row['district'].replace(' ', '_')}_{row['region'].replace(' ', '_')}"
            
            self.tree['level_1'][container_id] = {
                'name': poi_name,
                'type': 'container_poi',
                'parent': parent_id,
                'children': children,
                'data': {
                    'category': row['category'],
                    'latitude': row['latitude'],
                    'longitude': row['longitude'],
                    'district': row['district'],
                    'street': row['street'],
                    'region': row['region'],
                    'num_entities': len(children)
                },
                'spatial': (row['latitude'], row['longitude']),
                'textual': aggregated_text
            }
        
        # Second, create street clusters for POIs without containers
        street_clusters = defaultdict(list)
        
        for poi_id, poi_data in self.tree['level_0'].items():
            if poi_data['parent'].startswith('street_'):
                street_clusters[poi_data['parent']].append(poi_id)
        
        for street_id, children in street_clusters.items():
            # Calculate centroid
            lats = [self.tree['level_0'][child]['spatial'][0] for child in children]
            lons = [self.tree['level_0'][child]['spatial'][1] for child in children]
            centroid = (np.mean(lats), np.mean(lons))
            
            # Get district and region from first child
            first_child = self.tree['level_0'][children[0]]
            district = first_child['data']['district']
            region = first_child['data']['region']
            street = first_child['data']['street']
            
            # Aggregate text
            child_texts = [self.tree['level_0'][child]['textual'] for child in children]
            aggregated_text = f"{street} " + " ".join(child_texts)
            
            parent_id = f"district_{district.replace(' ', '_')}_{region.replace(' ', '_')}"
            
            self.tree['level_1'][street_id] = {
                'name': street,
                'type': 'street_cluster',
                'parent': parent_id,
                'children': children,
                'data': {
                    'district': district,
                    'region': region,
                    'num_entities': len(children)
                },
                'spatial': centroid,
                'textual': aggregated_text
            }
        
        print(f"Created {len(self.tree['level_1'])} Level 1 nodes "
              f"({len([n for n in self.tree['level_1'].values() if n['type'] == 'container_poi'])} containers, "
              f"{len([n for n in self.tree['level_1'].values() if n['type'] == 'street_cluster'])} street clusters)")
    
    def build_level_2(self):
        """Build Level 2: Districts"""
        print("\n=== Building Level 2: Districts ===")
        
        district_data = defaultdict(lambda: {
            'children': [],
            'pois': [],
            'spatial_coords': [],
            'texts': []
        })
        
        # Aggregate from level 1
        for node_id, node_data in self.tree['level_1'].items():
            district = node_data['data']['district']
            region = node_data['data']['region']
            district_key = f"district_{district.replace(' ', '_')}_{region.replace(' ', '_')}"
            
            district_data[district_key]['children'].append(node_id)
            district_data[district_key]['spatial_coords'].append(node_data['spatial'])
            district_data[district_key]['texts'].append(node_data['textual'])
            district_data[district_key]['district'] = district
            district_data[district_key]['region'] = region
        
        # Create district nodes
        for district_id, data in district_data.items():
            # Calculate centroid
            lats = [coord[0] for coord in data['spatial_coords']]
            lons = [coord[1] for coord in data['spatial_coords']]
            centroid = (np.mean(lats), np.mean(lons))
            
            # Aggregate text
            aggregated_text = f"{data['district']} " + " ".join(data['texts'])
            
            parent_id = f"region_{data['region'].replace(' ', '_')}"
            
            self.tree['level_2'][district_id] = {
                'name': data['district'],
                'type': 'district',
                'parent': parent_id,
                'children': data['children'],
                'data': {
                    'region': data['region'],
                    'num_level1_nodes': len(data['children'])
                },
                'spatial': centroid,
                'textual': aggregated_text
            }
        
        print(f"Created {len(self.tree['level_2'])} districts")
    
    def build_level_3(self):
        """Build Level 3: Regions"""
        print("\n=== Building Level 3: Regions ===")
        
        region_data = defaultdict(lambda: {
            'children': [],
            'spatial_coords': [],
            'texts': []
        })
        
        # Aggregate from level 2
        for district_id, district_node in self.tree['level_2'].items():
            region = district_node['data']['region']
            region_key = f"region_{region.replace(' ', '_')}"
            
            region_data[region_key]['children'].append(district_id)
            region_data[region_key]['spatial_coords'].append(district_node['spatial'])
            region_data[region_key]['texts'].append(district_node['textual'])
            region_data[region_key]['region'] = region
        
        # Create region nodes
        for region_id, data in region_data.items():
            # Calculate centroid
            lats = [coord[0] for coord in data['spatial_coords']]
            lons = [coord[1] for coord in data['spatial_coords']]
            centroid = (np.mean(lats), np.mean(lons))
            
            # Aggregate text
            aggregated_text = f"{data['region']} " + " ".join(data['texts'])
            
            self.tree['level_3'][region_id] = {
                'name': data['region'],
                'type': 'region',
                'parent': None,  # Top level
                'children': data['children'],
                'data': {
                    'num_districts': len(data['children'])
                },
                'spatial': centroid,
                'textual': aggregated_text
            }
        
        print(f"Created {len(self.tree['level_3'])} regions")
    
    def build(self) -> Dict:
        """
        Build complete POI tree
        
        Returns:
            Complete tree structure
        """
        print("="*60)
        print("Starting POI Tree Construction")
        print("="*60)
        
        # Step 1: Detect containers
        print("\n=== Step 1: Container Detection ===")
        self.container_pois = self.detect_containers()
        print(f"\nTotal containers found: {len(self.container_pois)}")
        
        # Step 2: Map POIs to containers
        print("\n=== Step 2: Mapping POIs to Containers ===")
        self.poi_to_container_map = self.map_pois_to_containers()
        print(f"Mapped {len(self.poi_to_container_map)} POIs to containers")
        
        # Step 3: Build hierarchy levels
        self.build_level_0()
        self.build_level_1()
        self.build_level_2()
        self.build_level_3()
        
        print("\n" + "="*60)
        print("POI Tree Construction Complete!")
        print("="*60)
        self.print_summary()
        
        return self.tree
    
    def print_summary(self):
        """Print tree statistics"""
        print(f"\nTree Summary:")
        print(f"  Level 0 (Individual POIs): {len(self.tree['level_0'])} nodes")
        print(f"  Level 1 (Containers/Streets): {len(self.tree['level_1'])} nodes")
        print(f"  Level 2 (Districts): {len(self.tree['level_2'])} nodes")
        print(f"  Level 3 (Regions): {len(self.tree['level_3'])} nodes")
        print(f"  Total nodes: {sum(len(level) for level in self.tree.values())}")
    
    def save_tree(self, output_file: str = 'poi_tree.json'):
        """Save tree to JSON file"""
        # Convert to serializable format
        serializable_tree = {}
        for level, nodes in self.tree.items():
            serializable_tree[level] = {}
            for node_id, node_data in nodes.items():
                serializable_tree[level][node_id] = {
                    k: (str(v) if isinstance(v, (np.floating, np.integer)) else v)
                    for k, v in node_data.items()
                }
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(serializable_tree, f, indent=2, ensure_ascii=False)
        
        print(f"\nTree saved to {output_file}")
    
    def visualize_sample(self, num_samples: int = 3):
        """Print sample tree paths"""
        print(f"\n=== Sample Tree Paths ===")
        
        sample_pois = list(self.tree['level_0'].keys())[:num_samples]
        
        for poi_id in sample_pois:
            path = [f"L0: {self.tree['level_0'][poi_id]['name']}"]
            
            # Level 1
            parent_l1 = self.tree['level_0'][poi_id]['parent']
            if parent_l1 in self.tree['level_1']:
                path.append(f"L1: {self.tree['level_1'][parent_l1]['name']} "
                          f"({self.tree['level_1'][parent_l1]['type']})")
                
                # Level 2
                parent_l2 = self.tree['level_1'][parent_l1]['parent']
                if parent_l2 in self.tree['level_2']:
                    path.append(f"L2: {self.tree['level_2'][parent_l2]['name']}")
                    
                    # Level 3
                    parent_l3 = self.tree['level_2'][parent_l2]['parent']
                    if parent_l3 in self.tree['level_3']:
                        path.append(f"L3: {self.tree['level_3'][parent_l3]['name']}")
            
            print("\n" + " → ".join(reversed(path)))
    
    def get_node_by_name(self, name: str, level: int = None):
        """
        Find node by name
        
        Args:
            name: POI name
            level: Optional level to search in (0-3)
        
        Returns:
            (node_id, node_data, level) or None
        """
        levels_to_search = [f'level_{level}'] if level is not None else self.tree.keys()
        
        for level_key in levels_to_search:
            for node_id, node_data in self.tree[level_key].items():
                if node_data['name'].lower() == name.lower():
                    return (node_id, node_data, level_key)
        
        return None
    
    def get_ancestors(self, node_id: str, current_level: str) -> List[Tuple]:
        """
        Get all ancestor nodes
        
        Returns:
            List of (node_id, node_data, level) tuples
        """
        ancestors = []
        
        level_num = int(current_level.split('_')[1])
        current_node = self.tree[current_level][node_id]
        
        while current_node.get('parent') and level_num < 3:
            level_num += 1
            next_level = f'level_{level_num}'
            parent_id = current_node['parent']
            
            if parent_id in self.tree[next_level]:
                current_node = self.tree[next_level][parent_id]
                ancestors.append((parent_id, current_node, next_level))
            else:
                break
        
        return ancestors
    
    def get_descendants(self, node_id: str, current_level: str) -> List[Tuple]:
        """
        Get all descendant nodes (recursive)
        
        Returns:
            List of (node_id, node_data, level) tuples
        """
        descendants = []
        
        def _recurse(nid, lvl):
            level_num = int(lvl.split('_')[1])
            if level_num == 0:
                return
            
            node = self.tree[lvl][nid]
            if 'children' in node:
                child_level = f'level_{level_num - 1}'
                for child_id in node['children']:
                    if child_id in self.tree[child_level]:
                        child_node = self.tree[child_level][child_id]
                        descendants.append((child_id, child_node, child_level))
                        _recurse(child_id, child_level)
        
        _recurse(node_id, current_level)
        return descendants

In [6]:
if __name__ == "__main__":
    builder = POITreeBuilder('POI_with_region.csv')
    POI_tree = builder.build()

    builder.save_tree('poi_tree.json')
    builder.visualize_sample(num_samples=5)

    result = builder.get_node_by_name("Din Tai Fung")
    if result:
        node_id, node_data, level = result
        print(f"\nFound 'Din Tai Fung' at {level}")
        print(f"  Location: {node_data['spatial']}")
        print(f"  Parent: {node_data['parent']}")
        
        # Get ancestors
        ancestors = builder.get_ancestors(node_id, level)
        print(f"  Ancestors: {[a[1]['name'] for a in ancestors]}")
    
    # Find a container
    result = builder.get_node_by_name("Suntec City")
    if result:
        node_id, node_data, level = result
        print(f"\nFound 'Suntec City' at {level}")
        print(f"  Type: {node_data['type']}")
        print(f"  Children: {len(node_data.get('children', []))}")
        
        # Get descendants
        descendants = builder.get_descendants(node_id, level)
        print(f"  Total descendants: {len(descendants)}")


Starting POI Tree Construction

=== Step 1: Container Detection ===
Container detected (keyword): Good Price Centre
Container detected (keyword): Starhub
Container detected (keyword): The Poiz Centre
Container detected (keyword): Clarke Quay Central
Container detected (keyword): Changi Airport Terminal 1
Container detected (keyword): Sim Lim Square
Container detected (keyword): Broadway Plaza
Container detected (keyword): Rivervale Plaza
Container detected (keyword): Rivervale Mall
Container detected (keyword): Hougang Mall
Container detected (keyword): Heartland Mall
Container detected (keyword): Jubilee Square
Container detected (keyword): Raffles City Shopping Centre
Container detected (keyword): Orchard Central
Container detected (keyword): Orchard Towers
Container detected (keyword): Anchorpoint
Container detected (keyword): Cathay Cineleisure Orchard
Container detected (keyword): Katong Plaza
Container detected (keyword): Katong Shopping Centre
Container detected (keyword): Odeon

In [7]:
import uuid

def add_uuids_to_tree(builder):
    """
    Add UUIDs to all nodes in existing tree
    
    Args:
        builder: POITreeBuilder instance with built tree
    """
    print("Adding UUIDs to existing tree...")
    
    uuid_map = {}  # Maps old IDs to UUIDs for reference
    
    for level_name, level_nodes in builder.tree.items():
        print(f"\nProcessing {level_name}...")
        for node_id, node_data in level_nodes.items():
            # Generate UUID
            new_uuid = str(uuid.uuid4())
            
            # Add UUID to node data
            node_data['uuid'] = new_uuid
            
            # Store mapping
            uuid_map[node_id] = new_uuid
        
        print(f"Added UUIDs to {len(level_nodes)} nodes in {level_name}")
    
    print("\n✓ UUIDs added successfully!")
    return uuid_map

# Usage:
uuid_map = add_uuids_to_tree(builder)

# Save updated tree
builder.save_tree('poi_tree_with_uuids.json')

# Verify
sample_poi = list(builder.tree['level_0'].values())[0]
print(f"\nSample POI with UUID: {sample_poi['name']}")
print(f"UUID: {sample_poi['uuid']}")

Adding UUIDs to existing tree...

Processing level_0...
Added UUIDs to 4696 nodes in level_0

Processing level_1...
Added UUIDs to 1355 nodes in level_1

Processing level_2...
Added UUIDs to 44 nodes in level_2

Processing level_3...
Added UUIDs to 5 nodes in level_3

✓ UUIDs added successfully!

Tree saved to poi_tree_with_uuids.json

Sample POI with UUID: Giant
UUID: d401dde1-0898-43b2-ae19-0f2c3634f7d0
