In [3]:
import json
import pickle
import pandas as pd
from typing import Dict, List, Optional
from datetime import datetime
from pathlib import Path

In [63]:
class UnifiedMetadata:
    """
    Central hub for managing Embedding System metadata.

    Holds bi-directional ID <-> index mappings for users and for POIs at four
    hierarchy levels (0-3), the raw POI tree loaded from JSON, and a small
    config dict (embedding dimensions, version string, last-build timestamp).
    """

    # Number of POI hierarchy levels; the tree file is read via the keys
    # "level_0" .. "level_3".
    _NUM_LEVELS = 4

    # Column names accepted as the user-ID column, in order of preference.
    # "uudi" is kept as a fallback because the original code accepted it
    # (presumably a misspelled header in some source CSVs - TODO confirm).
    _USER_ID_COLUMNS = ("uuid", "uudi")

    def __init__(self, poi_dim: int = 64, user_dim: int = 71):
        """
        Create an empty metadata container.

        Args:
            poi_dim: Value stored under config["poi_dim"].
            user_dim: Value stored under config["user_dim"].
        """
        # Bi-directional mappings for Users and POIs (Levels 0-3)
        self.user_to_idx: Dict = {}
        self.idx_to_user: Dict = {}
        self.poi_to_idx: Dict[int, Dict] = {lv: {} for lv in range(self._NUM_LEVELS)}
        self.idx_to_poi: Dict[int, Dict] = {lv: {} for lv in range(self._NUM_LEVELS)}

        self.poi_tree_data = {}  # Stores the raw hierarchy for quick lookup

        self.config = {
            "poi_dim": poi_dim,
            "user_dim": user_dim,
            "version": "2.0",
            "updated_at": ""
        }

    def build(self, users_file: str, poi_tree_file: str):
        """
        Populate the mappings from a users CSV and a POI-tree JSON file.

        User mappings are replaced on every call. POI mappings are extended:
        node IDs that are already mapped keep their existing index.

        Args:
            users_file: Path to a CSV containing a user-ID column
                (one of ``_USER_ID_COLUMNS``).
            poi_tree_file: Path to a UTF-8 JSON file whose top-level keys
                "level_0" .. "level_3" each map node IDs to node data.

        Raises:
            KeyError: If the CSV has none of the accepted user-ID columns.
        """
        print(f"[{datetime.now()}] Building Unified Metadata...")

        # Parse User IDs
        users_df = pd.read_csv(users_file)
        col = next((c for c in self._USER_ID_COLUMNS if c in users_df.columns), None)
        if col is None:
            raise KeyError(
                f"No user ID column found in {users_file!r}; expected one of "
                f"{list(self._USER_ID_COLUMNS)}, got {list(users_df.columns)}"
            )
        # Blank IDs are read as NaN; NaN is not a reliable dict key, so rows
        # without an ID are skipped. First-appearance order is preserved.
        uids = users_df[col].dropna().unique().tolist()

        self.user_to_idx = {uid: i for i, uid in enumerate(uids)}
        self.idx_to_user = {i: uid for i, uid in enumerate(uids)}

        # Parse POI Tree - Maps IDs to indices across 4 levels of granularity
        with open(poi_tree_file, 'r', encoding='utf-8') as f:
            self.poi_tree_data = json.load(f)

        for lv in range(self._NUM_LEVELS):
            level_dict = self.poi_tree_data.get(f"level_{lv}", {})

            # A missing or non-dict level contributes no nodes.
            if not isinstance(level_dict, dict):
                continue

            for node_id in level_dict:
                if node_id not in self.poi_to_idx[lv]:
                    # Next free index == current size, keeping indices dense.
                    new_idx = len(self.poi_to_idx[lv])
                    self.poi_to_idx[lv][node_id] = new_idx
                    self.idx_to_poi[lv][new_idx] = node_id

        self.config["updated_at"] = datetime.now().isoformat()
        print(f"Build Complete: {len(self.user_to_idx)} Users mapped.")

        for lv in range(self._NUM_LEVELS):
            print(f"   Level {lv} POIs: {len(self.poi_to_idx[lv])}")

    def add_new_user(self, user_id: str) -> int:
        """
        Return the index for ``user_id``, assigning the next free index if the
        user has not been seen before.
        """
        if user_id not in self.user_to_idx:
            new_idx = len(self.user_to_idx)
            self.user_to_idx[user_id] = new_idx
            self.idx_to_user[new_idx] = user_id
            return new_idx
        return self.user_to_idx[user_id]

    def save(self, output_file: Optional[str] = None):
        """
        Pickle the mappings, raw POI tree and config as a plain dict.

        Args:
            output_file: Destination path. When omitted (or empty), the file
                is written to ``<repo root>/Sources/metadata.pkl``, where the
                repo root comes from ``find_repo_root()``.
        """
        if output_file:
            output_path = Path(output_file)
        else:
            output_path = UnifiedMetadata.find_repo_root() / "Sources" / "metadata.pkl"

        # Create the directory that will actually receive the file. An explicit
        # output_file no longer creates an unrelated Sources/ directory, and a
        # nested target directory no longer has to exist beforehand.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        save_data = {
            "user_to_idx": self.user_to_idx,
            "idx_to_user": self.idx_to_user,
            "poi_to_idx": self.poi_to_idx,
            "idx_to_poi": self.idx_to_poi,
            "poi_tree_data": self.poi_tree_data,
            "config": self.config,
        }

        with output_path.open("wb") as f:
            pickle.dump(save_data, f)

        print(f"💾 Metadata (as dict) saved to: {output_path}")

    @staticmethod
    def find_repo_root(start=None) -> Path:
        """
        Locate the repository root by walking upward until a ``.git`` entry
        is found.

        Args:
            start: Directory to start from; defaults to the current working
                directory. Relative paths are resolved first so that their
                ancestors can be searched.

        Returns:
            The first directory (``start`` or an ancestor) containing ``.git``,
            or the resolved ``start`` itself if none is found.
        """
        # Without resolve(), a relative start such as "." has no parents and
        # the upward search would never leave the starting directory.
        start = Path(start or Path.cwd()).resolve()
        for p in (start, *start.parents):
            if (p / ".git").exists():
                return p
        return start  # Fallback to the starting dir if .git is missing




In [66]:
# --- Execution Block ---
if __name__ == "__main__":
    import os

    # Input files, given relative to the current working directory.
    USER_PATH = "../../Sources/user_preferences.csv"
    POI_TREE_PATH = "../../Sources/poi_tree_with_uuids.json"

    # Metadata hub created with the embedding dimensions used by this project
    # (POI_DIM=64, USER_DIM=71).
    meta = UnifiedMetadata(poi_dim=64, user_dim=71)

    # Both inputs must be present before the build is attempted; otherwise a
    # diagnostic block is printed instead of letting build() fail.
    sources_found = os.path.exists(USER_PATH) and os.path.exists(POI_TREE_PATH)

    if not sources_found:
        print("\n" + "!" * 50)
        print("CRITICAL ERROR: Source files not found.")
        print(f"Current Working Directory: {os.getcwd()}")
        print(f"Targeted User Path: {os.path.abspath(USER_PATH)}")
        print(f"Targeted POI Path: {os.path.abspath(POI_TREE_PATH)}")
        print("Please verify your directory structure and try again.")
        print("!" * 50)
    else:
        # Populate the user and POI mappings from the two source files.
        meta.build(USER_PATH, POI_TREE_PATH)

        # With no argument, save() picks its own default output location.
        meta.save()

        print("\n" + "=" * 50)
        print("DEPLOYMENT READY: Central Metadata has been generated.")
        print("All downstream embedding modules should now use this .pkl for ID alignment.")
        print("=" * 50)

[2026-01-28 22:09:45.182609] Building Unified Metadata...
✅ Build Complete: 21 Users mapped.
   Level 0 POIs: 4696
   Level 1 POIs: 1355
   Level 2 POIs: 44
   Level 3 POIs: 5
💾 Metadata (as dict) saved to: C:\Users\syoon\SpatiaLynk_recommender\Sources\metadata.pkl

DEPLOYMENT READY: Central Metadata has been generated.
All downstream embedding modules should now use this .pkl for ID alignment.
