In [9]:
import os
import json
import pickle
import hashlib
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix


In [10]:
from pathlib import Path
from typing import Optional, Union, List

PathLike = Union[str, Path]

def find_repo_root(start: Optional[PathLike] = None) -> Path:
    start = Path(start or Path.cwd()).resolve()
    for p in [start] + list(start.parents):
        if (p / ".git").exists():
            return p
    raise RuntimeError(f"Cannot find repo root (.git not found). start={start}")

def find_sources_dir(root: Path, required_files: List[str]) -> Path:
    """
    Find the 'Sources' folder under root that contains all required_files.
    """
    candidates = []
    for d in root.rglob("Sources"):
        if not d.is_dir():
            continue
        ok = True
        for fn in required_files:
            if not (d / fn).exists():
                ok = False
                break
        if ok:
            candidates.append(d)

    if not candidates:
        # Debug hint: also show Sources candidates with missing files
        any_sources = [d for d in root.rglob("Sources") if d.is_dir()]
        hint = "\n".join(str(d) for d in any_sources[:10])
        raise FileNotFoundError(
            "Could not find a valid 'Sources' directory containing required files:\n"
            f"   {required_files}\n\n"
            "Found 'Sources' directories (showing up to 10):\n"
            f"{hint if hint else '(none found)'}\n\n"
            "Fix: set sources_dir directly or verify where the files are."
        )

    # If multiple, prefer the shortest path (closest to root)
    candidates.sort(key=lambda p: len(p.parts))
    return candidates[0]

In [11]:

class POIEmbeddings:
    def __init__(
        self,
        users_file: Optional[PathLike] = None,
        user_poi_interactions_file: Optional[PathLike] = None,
        poi_tree_file: Optional[PathLike] = None,
        repo_root: Optional[PathLike] = None,
        sources_dir: Optional[PathLike] = None,
        metadata_pkl: Optional[PathLike] = None,
        verbose: bool = True,
    ):
        self.repo_root = Path(repo_root).resolve() if repo_root else find_repo_root()

        required = ["user_preferences.csv", "user_poi_interactions.csv", "poi_tree_with_uuids.json"]

        # If sources_dir is not provided, auto-detect
        if sources_dir:
            self.sources_dir = Path(sources_dir).resolve()
        else:
            self.sources_dir = find_sources_dir(self.repo_root, required)

        def resolve_input(p: Optional[PathLike], default_name: str) -> Path:
            if p is None:
                return (self.sources_dir / default_name).resolve()
            p = Path(p)
            if p.is_absolute():
                return p.resolve()
            # If relative path, resolve against sources_dir
            return (self.sources_dir / p).resolve()

        self.users_file = resolve_input(users_file, "user_preferences.csv")
        self.user_poi_interactions_file = resolve_input(user_poi_interactions_file, "user_poi_interactions.csv")
        self.poi_tree_file = resolve_input(poi_tree_file, "poi_tree_with_uuids.json")

        if verbose:
            print("[POIEmbeddings] repo_root   =", self.repo_root)
            print("[POIEmbeddings] sources_dir =", self.sources_dir)
            print("[POIEmbeddings] users_file  =", self.users_file, "| exists:", self.users_file.exists())
            print("[POIEmbeddings] interactions_file =", self.user_poi_interactions_file, "| exists:", self.user_poi_interactions_file.exists())
            print("[POIEmbeddings] poi_tree_file =", self.poi_tree_file, "| exists:", self.poi_tree_file.exists())

        # Existence checks
        assert self.sources_dir.exists(), f"Missing Sources folder: {self.sources_dir}"
        assert self.users_file.exists(), f"Missing: {self.users_file}"
        assert self.user_poi_interactions_file.exists(), f"Missing: {self.user_poi_interactions_file}"
        assert self.poi_tree_file.exists(), f"Missing: {self.poi_tree_file}"

        # Load CSVs
        self.users_df = pd.read_csv(self.users_file)
        self.interactions_df = pd.read_csv(self.user_poi_interactions_file)

        # Load POI tree (JSON)
        with open(self.poi_tree_file, "r", encoding="utf-8") as f:
            self.poi_tree = json.load(f)

        # Load POI index maps from metadata.pkl for ordering
        if metadata_pkl is None:
            metadata_pkl = self.sources_dir / "metadata.pkl"

        self.meta_poi_to_idx = {}
        self.meta_idx_to_poi = {}
        if Path(metadata_pkl).exists():
            with Path(metadata_pkl).open("rb") as f:
                _meta = pickle.load(f)
            # {0: {poi_id: idx}, 1: {...}, ...}
            self.meta_poi_to_idx = _meta.get("poi_to_idx", {})
            self.meta_idx_to_poi = _meta.get("idx_to_poi", {})
            if verbose:
                print("[POIEmbeddings] loaded metadata.pkl from", metadata_pkl)
        else:
            if verbose:
                print("[POIEmbeddings] metadata.pkl not found at", metadata_pkl, "(fallback: local order)")

        # (Optional) Validate poi_tree structure: check level_0~level_3 keys
        for k in (f"level_{i}" for i in range(4)):
            if k not in self.poi_tree:
                raise KeyError(
                    f"poi_tree missing key '{k}'. "
                    f"available keys (sample) = {list(self.poi_tree.keys())[:20]}"
                )

        # (Optional) Initialize storage used in later build steps
        self.user_embeddings = {}
        self.poi_embeddings = {}
        self.encoders = {}
        
    def build_Y_A_level(self, level: int) -> Tuple[np.ndarray, List[str], List[str]]:
        """
        Build direct POI attribute matrix Y_A^l for level l
        
        Level-specific Explicit POI Attributes from poi_tree_with_uuids.json:
        
        Level 0 (Building): category, price, popularity, characteristics, spatial, textual
        Level 1 (Street): category, num_entities, spatial, textual
        Level 2 (District): num_level1_nodes, spatial, textual
        Level 3 (Region): num_districts, spatial, textual
        """
        print(f"\n{'=' * 60}")
        print(f"Building Y_A^{level}: Direct POI Attribute Matrix (Level {level})")
        print(f"{'=' * 60}")
        
        level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
        print(f"Granularity: {level_names.get(level, 'Unknown')}")
        
        level_key = f'level_{level}'
        pois_at_level = self.poi_tree[level_key]

        # Sort POI IDs by metadata.pkl idx order (fallback to default)
        if hasattr(self, "meta_idx_to_poi") and level in self.meta_idx_to_poi:
            idx2poi_level = self.meta_idx_to_poi[level]
            poi_ids = [idx2poi_level[i] for i in sorted(idx2poi_level.keys()) if idx2poi_level[i] in pois_at_level]
        else:
            poi_ids = list(pois_at_level.keys())

        n_pois = len(poi_ids)
        
        print(f"Number of nodes at level {level}: {n_pois}")
        
        features_list = []
        feature_names = []
        
        # Store encoders/scalers for later use
        self.encoders[level] = {}
        
        # COMMON FEATURES (All Levels): spatial, textual
        
        # 1. Spatial Features (lat, lon)
        spatial_features = self._extract_spatial_features(pois_at_level, poi_ids)
        spatial_scaler = StandardScaler()
        spatial_normalized = spatial_scaler.fit_transform(spatial_features)
        
        self.encoders[level]['spatial_scaler'] = spatial_scaler
        features_list.append(spatial_normalized)
        feature_names.extend(['spatial_lat_norm', 'spatial_lon_norm'])
        print(f"  [1] Spatial features: 2 dimensions (lat, lon normalized)")
        
        # LEVEL-SPECIFIC FEATURES
        
        if level == 0:
            self._build_level0_features(pois_at_level, poi_ids, features_list, feature_names)
        elif level == 1:
            self._build_level1_features(pois_at_level, poi_ids, features_list, feature_names)
        elif level == 2:
            self._build_level2_features(pois_at_level, poi_ids, features_list, feature_names)
        elif level == 3:
            self._build_level3_features(pois_at_level, poi_ids, features_list, feature_names)
        
        # TEXTUAL FEATURES (All Levels) - TF-IDF
        text_features, text_feature_names = self._extract_textual_features(
            pois_at_level, poi_ids, level
        )
        features_list.append(text_features)
        feature_names.extend(text_feature_names)
        
        # Concatenate all features
        Y_A_level = np.hstack(features_list)
        
        return Y_A_level, poi_ids, feature_names

    def _extract_spatial_features(self, pois_at_level: Dict, poi_ids: List[str]) -> np.ndarray:
        """Extract spatial (lat, lon) features from POI data"""
        spatial_features = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            spatial = poi_data['spatial']
            
            if isinstance(spatial, str):
                spatial = eval(spatial)
            elif isinstance(spatial, list):
                spatial = tuple(spatial)
            
            spatial_features.append([spatial[0], spatial[1]])
        
        return np.array(spatial_features, dtype=np.float32)

    def _build_level0_features(self, pois_at_level: Dict, poi_ids: List[str],
                               features_list: List, feature_names: List):
        """Build Level 0 (Building) specific features"""
        level = 0
        
        # Category (one-hot)
        categories = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            cat = poi_data.get('data', {}).get('category', 'unknown')
            categories.append(str(cat).lower().strip())
        
        category_encoder = LabelEncoder()
        category_encoded = category_encoder.fit_transform(categories)
        n_categories = len(category_encoder.classes_)
        category_onehot = np.eye(n_categories, dtype=np.float32)[category_encoded]
        
        self.encoders[level]['category_encoder'] = category_encoder
        features_list.append(category_onehot)
        feature_names.extend([f'category_{cls}' for cls in category_encoder.classes_])
        print(f"  [2] Category features: {n_categories} dimensions (one-hot)")
        
        # Price (normalized)
        prices = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            price_str = poi_data.get('data', {}).get('price', None)
            avg_price = self._parse_price(price_str)
            prices.append(avg_price)
        
        prices = np.array(prices, dtype=np.float32).reshape(-1, 1)
        price_scaler = StandardScaler()
        prices_normalized = price_scaler.fit_transform(prices)
        
        self.encoders[level]['price_scaler'] = price_scaler
        features_list.append(prices_normalized)
        feature_names.append('price_norm')
        print(f"  [3] Price feature: 1 dimension (normalized)")
        print(f"      Range: [{prices.min():.2f}, {prices.max():.2f}], mean: {prices.mean():.2f}")
        
        # Popularity (normalized)
        popularities = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            pop_val = poi_data.get('data', {}).get('popularity', 3.0)
            try:
                popularity = float(pop_val)
            except (ValueError, TypeError):
                popularity = 3.0
            popularities.append(popularity)
        
        popularities = np.array(popularities, dtype=np.float32).reshape(-1, 1)
        popularity_scaler = StandardScaler()
        popularities_normalized = popularity_scaler.fit_transform(popularities)
        
        self.encoders[level]['popularity_scaler'] = popularity_scaler
        features_list.append(popularities_normalized)
        feature_names.append('popularity_norm')
        print(f"  [4] Popularity feature: 1 dimension (normalized)")
        print(f"      Range: [{popularities.min():.2f}, {popularities.max():.2f}], mean: {popularities.mean():.2f}")
        
        # Characteristics (multi-hot)
        characteristics_list = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            char_str = poi_data.get('data', {}).get('characteristic', 
                    poi_data.get('data', {}).get('characteristics', ''))
            tags = self._parse_characteristics(char_str)
            characteristics_list.append(tags)
        
        mlb_chars = MultiLabelBinarizer()
        chars_multihot = mlb_chars.fit_transform(characteristics_list).astype(np.float32)
        
        self.encoders[level]['characteristics_encoder'] = mlb_chars
        features_list.append(chars_multihot)
        feature_names.extend([f'char_{cls}' for cls in mlb_chars.classes_])
        print(f"  [5] Characteristics features: {len(mlb_chars.classes_)} dimensions (multi-hot)")

    def _build_level1_features(self, pois_at_level: Dict, poi_ids: List[str],
                               features_list: List, feature_names: List):
        """Build Level 1 (Street) specific features"""
        level = 1
        
        # Category (one-hot)
        categories = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            cat = poi_data.get('data', {}).get('category', 'mixed')
            categories.append(str(cat).lower().strip())
        
        category_encoder = LabelEncoder()
        category_encoded = category_encoder.fit_transform(categories)
        n_categories = len(category_encoder.classes_)
        category_onehot = np.eye(n_categories, dtype=np.float32)[category_encoded]
        
        self.encoders[level]['category_encoder'] = category_encoder
        features_list.append(category_onehot)
        feature_names.extend([f'category_{cls}' for cls in category_encoder.classes_])
        print(f"  [2] Category features: {n_categories} dimensions (one-hot)")
        
        # Number of entities
        num_entities = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            count = poi_data.get('data', {}).get('num_entities', 1)
            try:
                count = int(count)
            except (ValueError, TypeError):
                count = 1
            num_entities.append(count)
        
        num_entities = np.array(num_entities, dtype=np.float32).reshape(-1, 1)
        entities_scaler = StandardScaler()
        entities_normalized = entities_scaler.fit_transform(num_entities)
        
        self.encoders[level]['num_entities_scaler'] = entities_scaler
        features_list.append(entities_normalized)
        feature_names.append('num_entities_norm')
        print(f"  [3] Num entities feature: 1 dimension (normalized)")
        print(f"      Range: [{num_entities.min():.0f}, {num_entities.max():.0f}], mean: {num_entities.mean():.1f}")

    def _build_level2_features(self, pois_at_level: Dict, poi_ids: List[str],
                               features_list: List, feature_names: List):
        """Build Level 2 (District) specific features"""
        level = 2
        
        # Number of Level 1 nodes
        num_streets = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            count = poi_data.get('data', {}).get('num_level1_nodes', 
                    poi_data.get('data', {}).get('num_streets', 1))
            try:
                count = int(count)
            except (ValueError, TypeError):
                count = 1
            num_streets.append(count)
        
        num_streets = np.array(num_streets, dtype=np.float32).reshape(-1, 1)
        streets_scaler = StandardScaler()
        streets_normalized = streets_scaler.fit_transform(num_streets)
        
        self.encoders[level]['num_level1_nodes_scaler'] = streets_scaler
        features_list.append(streets_normalized)
        feature_names.append('num_level1_nodes_norm')
        print(f"  [2] Num streets feature: 1 dimension (normalized)")
        print(f"      Range: [{num_streets.min():.0f}, {num_streets.max():.0f}], mean: {num_streets.mean():.1f}")

    def _build_level3_features(self, pois_at_level: Dict, poi_ids: List[str],
                               features_list: List, feature_names: List):
        """Build Level 3 (Region) specific features"""
        level = 3
        
        # Number of districts
        num_districts = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            count = poi_data.get('data', {}).get('num_districts', 
                    poi_data.get('data', {}).get('num_level2_nodes', 1))
            try:
                count = int(count)
            except (ValueError, TypeError):
                count = 1
            num_districts.append(count)
        
        num_districts = np.array(num_districts, dtype=np.float32).reshape(-1, 1)
        districts_scaler = StandardScaler()
        districts_normalized = districts_scaler.fit_transform(num_districts)
        
        self.encoders[level]['num_districts_scaler'] = districts_scaler
        features_list.append(districts_normalized)
        feature_names.append('num_districts_norm')
        print(f"  [2] Num districts feature: 1 dimension (normalized)")
        print(f"      Range: [{num_districts.min():.0f}, {num_districts.max():.0f}], mean: {num_districts.mean():.1f}")

    def _extract_textual_features(self, pois_at_level: Dict, poi_ids: List[str],
                                  level: int) -> Tuple[np.ndarray, List[str]]:
        """Extract TF-IDF textual features"""
        texts = []
        for poi_id in poi_ids:
            poi_data = pois_at_level[poi_id]
            text = poi_data.get('textual', '')
            if text is None:
                text = ''
            texts.append(str(text).lower())
        
        max_text_features = {0: 100, 1: 75, 2: 50, 3: 30}.get(level, 50)
        
        tfidf_vectorizer = TfidfVectorizer(
            max_features=max_text_features,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95
        )
        
        try:
            text_features = tfidf_vectorizer.fit_transform(texts).toarray().astype(np.float32)
            feature_names = [f'text_{word}' for word in tfidf_vectorizer.get_feature_names_out()]
            actual_features = text_features.shape[1]
        except ValueError:
            print(f"      Warning: TF-IDF vocabulary empty, using zero features")
            text_features = np.zeros((len(poi_ids), 1), dtype=np.float32)
            feature_names = ['text_empty']
            actual_features = 1
            tfidf_vectorizer = None
        
        self.encoders[level]['tfidf_vectorizer'] = tfidf_vectorizer
        print(f"  [T] Textual features: {actual_features} dimensions (TF-IDF)")
        
        return text_features, feature_names

    def _parse_price(self, price_str) -> float:
        """Parse price string to float value"""
        if price_str is None or price_str == '' or pd.isna(price_str):
            return 25.0
        
        try:
            price_str = str(price_str).strip()
            
            if '-' in price_str:
                parts = price_str.split('-')
                if len(parts) == 2:
                    low = float(parts[0].strip())
                    high = float(parts[1].strip())
                    return (low + high) / 2
            
            return float(price_str)
        
        except (ValueError, TypeError):
            return 25.0

    def _parse_characteristics(self, char_str) -> List[str]:
        """Parse characteristics string to list of tags"""
        if char_str is None or char_str == '' or pd.isna(char_str):
            return []
        
        char_str = str(char_str)
        
        if ',' in char_str:
            parts = char_str.split(',')
        elif ';' in char_str:
            parts = char_str.split(';')
        else:
            parts = [char_str]
        
        tags = []
        for part in parts:
            tag = part.strip().replace('#', '').lower()
            if tag and len(tag) > 1:
                tags.append(tag)
        
        return tags
    
    def build_Y_T_level(self, level: int, embedding_dim: int = 32) -> Tuple[np.ndarray, List[str]]:
        """Build derived POI attribute matrix"""
        print(f"\n{'='*60}")
        print(f"Building Y_T^{level}: Derived POI Attribute Matrix (Level {level})")
        print(f"{'='*60}")

        level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
        print(f"Granularity: {level_names.get(level, 'Unknown')}")
        
        level_key = f'level_{level}'
        pois_at_level = self.poi_tree[level_key]

        # Sort POI IDs by metadata.pkl idx order (fallback to default)
        if hasattr(self, "meta_idx_to_poi") and level in self.meta_idx_to_poi:
            idx2poi_level = self.meta_idx_to_poi[level]
            poi_ids = [idx2poi_level[i] for i in sorted(idx2poi_level.keys()) if idx2poi_level[i] in pois_at_level]
        else:
            poi_ids = list(pois_at_level.keys())
        
        n_pois = len(poi_ids)
        n_users = len(self.users_df)
        
        print(f"Number of POIs at level {level}: {n_pois}")
        print(f"Number of users: {n_users}")
        
        # Create index mappings
        poi_to_idx = {pid: idx for idx, pid in enumerate(poi_ids)}
        user_id_col = 'uuid' if 'uuid' in self.users_df.columns else 'uudi'
        user_to_idx = {uid: idx for idx, uid in enumerate(self.users_df[user_id_col])}
        
        # Step 1: Build User Preference Features
        print(f"\n  [Step 1] Building user preference features...")
        user_pref_features, user_pref_names = self._build_user_preference_features()
        print(f"    User preference matrix shape: {user_pref_features.shape}")
        
        # Step 2: Build POI-User Interaction Matrix
        print(f"\n  [Step 2] Building POI-User interaction matrix...")
        interaction_matrix, interaction_stats = self._build_poi_user_interaction_matrix(
            poi_ids, poi_to_idx, user_to_idx, level
        )
        print(f"    Interaction matrix shape: {interaction_matrix.shape}")
        print(f"    Density: {interaction_stats['density']:.2f}%")
        print(f"    Total interactions: {interaction_stats['total_interactions']}")
        
        # Step 3: Derive POI features
        print(f"\n  [Step 3] Deriving POI features from user preferences...")
        derived_features = []
        derived_names = []
        
        poi_user_pref_agg = self._aggregate_user_preferences_to_pois(
            interaction_matrix, user_pref_features, user_pref_names
        )
        derived_features.append(poi_user_pref_agg['features'])
        derived_names.extend(poi_user_pref_agg['names'])
        print(f"    Aggregated user preferences: {poi_user_pref_agg['features'].shape[1]} dims")
        
        diversity_features = self._compute_user_diversity_features(
            interaction_matrix, user_pref_features
        )
        derived_features.append(diversity_features['features'])
        derived_names.extend(diversity_features['names'])
        print(f"    User diversity features: {diversity_features['features'].shape[1]} dims")
        
        pattern_features = self._compute_interaction_pattern_features(
            poi_ids, poi_to_idx, level
        )
        derived_features.append(pattern_features['features'])
        derived_names.extend(pattern_features['names'])
        print(f"    Interaction pattern features: {pattern_features['features'].shape[1]} dims")
        
        # Step 4: Matrix Factorization
        print(f"\n  [Step 4] Computing latent embeddings via NMF...")
        latent_embeddings = self._compute_latent_embeddings(
            interaction_matrix, embedding_dim
        )
        derived_features.append(latent_embeddings['features'])
        derived_names.extend(latent_embeddings['names'])
        print(f"    Latent embeddings: {latent_embeddings['features'].shape[1]} dims")
        
        # Step 5: Concatenate all features
        Y_T_level = np.hstack(derived_features)
        
        final_scaler = StandardScaler()
        Y_T_level = final_scaler.fit_transform(Y_T_level)
        self.encoders[level]['Y_T_scaler'] = final_scaler

        return Y_T_level, poi_ids

    def _build_user_preference_features(self) -> Tuple[np.ndarray, List[str]]:
        """Build user preference features"""
        features_list = []
        feature_names = []
        
        user_id_col = 'uuid' if 'uuid' in self.users_df.columns else 'uudi'
        
        # 1. Interests (multi-hot)
        interests_list = []
        for _, row in self.users_df.iterrows():
            interests_str = row.get('interests', '')
            if pd.isna(interests_str) or interests_str == '':
                interests = []
            else:
                interests = [i.strip().lower() for i in str(interests_str).split(';')]
            interests_list.append(interests)
        
        mlb_interests = MultiLabelBinarizer()
        interests_encoded = mlb_interests.fit_transform(interests_list).astype(np.float32)
        
        self.encoders['user_interests_encoder'] = mlb_interests
        features_list.append(interests_encoded)
        feature_names.extend([f'interest_{cls}' for cls in mlb_interests.classes_])
        
        # 2. Age group (one-hot)
        age_groups = self.users_df['age_group'].fillna('unknown').astype(str).str.lower().tolist()
        age_encoder = LabelEncoder()
        age_encoded = age_encoder.fit_transform(age_groups)
        age_onehot = np.eye(len(age_encoder.classes_), dtype=np.float32)[age_encoded]
        
        self.encoders['user_age_encoder'] = age_encoder
        features_list.append(age_onehot)
        feature_names.extend([f'age_{cls}' for cls in age_encoder.classes_])
        
        # 3. Price sensitivity (one-hot)
        price_sens = self.users_df['price_sensitivity'].fillna('medium').astype(str).str.lower().tolist()
        price_encoder = LabelEncoder()
        price_encoded = price_encoder.fit_transform(price_sens)
        price_onehot = np.eye(len(price_encoder.classes_), dtype=np.float32)[price_encoded]
        
        self.encoders['user_price_encoder'] = price_encoder
        features_list.append(price_onehot)
        feature_names.extend([f'price_sens_{cls}' for cls in price_encoder.classes_])
        
        user_features = np.hstack(features_list)
        
        return user_features, feature_names

    def _build_poi_user_interaction_matrix(
        self, 
        poi_ids: List[str], 
        poi_to_idx: Dict[str, int],
        user_to_idx: Dict[str, int],
        level: int
    ) -> Tuple[csr_matrix, Dict]:
        """Build POI-User interaction matrix"""
        n_pois = len(poi_ids)
        n_users = len(user_to_idx)
        
        poi_user_scores = {}
        
        interaction_weights = {
            'visit': 1.0,
            'rating': 0.8,
            'search': 0.3,
            'click': 0.2,
            'bookmark': 0.5
        }
        
        total_interactions = 0
        
        for _, row in self.interactions_df.iterrows():
            user_id = row['user_id']
            poi_id = row['poi_id']
            
            if level > 0:
                mapped_poi_id = self._get_parent_at_level(poi_id, target_level=level)
            else:
                mapped_poi_id = poi_id
            
            if mapped_poi_id not in poi_to_idx or user_id not in user_to_idx:
                continue
            
            key = (mapped_poi_id, user_id)
            interaction_type = row.get('interaction_type', 'visit')
            
            base_weight = interaction_weights.get(interaction_type, 0.5)
            
            if interaction_type == 'rating':
                rating_value = row.get('value', 3)
                score = base_weight * (rating_value / 5.0)
            else:
                score = base_weight * row.get('value', 1)
            
            poi_user_scores[key] = poi_user_scores.get(key, 0) + score
            total_interactions += 1
        
        row_indices = []
        col_indices = []
        values = []
        
        for (poi_id, user_id), score in poi_user_scores.items():
            row_indices.append(poi_to_idx[poi_id])
            col_indices.append(user_to_idx[user_id])
            values.append(score)
        
        interaction_matrix = csr_matrix(
            (values, (row_indices, col_indices)),
            shape=(n_pois, n_users),
            dtype=np.float32
        )
        
        density = interaction_matrix.nnz / (n_pois * n_users) * 100 if n_pois * n_users > 0 else 0
        
        stats = {
            'density': density,
            'total_interactions': total_interactions,
            'nnz': interaction_matrix.nnz,
            'unique_poi_user_pairs': len(poi_user_scores)
        }
        
        return interaction_matrix, stats

    def _aggregate_user_preferences_to_pois(
        self,
        interaction_matrix: csr_matrix,
        user_pref_features: np.ndarray,
        user_pref_names: List[str]
    ) -> Dict:
        """Aggregate user preferences to POIs"""
        weighted_sum = interaction_matrix.dot(user_pref_features)
        
        weight_sums = np.array(interaction_matrix.sum(axis=1)).flatten()
        weight_sums[weight_sums == 0] = 1.0
        
        aggregated_features = weighted_sum / weight_sums.reshape(-1, 1)
        
        no_interaction_mask = np.array(interaction_matrix.sum(axis=1)).flatten() == 0
        if no_interaction_mask.any():
            global_avg = user_pref_features.mean(axis=0)
            aggregated_features[no_interaction_mask] = global_avg
        
        aggregated_names = [f'agg_user_{name}' for name in user_pref_names]
        
        return {
            'features': aggregated_features.astype(np.float32),
            'names': aggregated_names
        }

    def _compute_user_diversity_features(
        self,
        interaction_matrix: csr_matrix,
        user_pref_features: np.ndarray
    ) -> Dict:
        """Compute user diversity features"""
        n_pois = interaction_matrix.shape[0]
        
        diversity_features = []
        
        # 1. User count
        user_counts = np.array(interaction_matrix.getnnz(axis=1)).reshape(-1, 1).astype(np.float32)
        diversity_features.append(user_counts)
        
        # 2. Interaction strength variance
        interaction_variance = []
        for i in range(n_pois):
            row = interaction_matrix.getrow(i).toarray().flatten()
            nonzero_values = row[row > 0]
            if len(nonzero_values) > 1:
                variance = np.var(nonzero_values)
            else:
                variance = 0.0
            interaction_variance.append(variance)
        interaction_variance = np.array(interaction_variance, dtype=np.float32).reshape(-1, 1)
        diversity_features.append(interaction_variance)
        
        # 3. User preference diversity
        pref_diversity = []
        for i in range(n_pois):
            row = interaction_matrix.getrow(i)
            user_indices = row.indices
            
            if len(user_indices) > 1:
                poi_user_prefs = user_pref_features[user_indices]
                if poi_user_prefs.shape[0] > 1:
                    norms = np.linalg.norm(poi_user_prefs, axis=1, keepdims=True)
                    norms[norms == 0] = 1.0
                    normalized_prefs = poi_user_prefs / norms
                    similarity_matrix = normalized_prefs @ normalized_prefs.T
                    n = similarity_matrix.shape[0]
                    mean_similarity = (similarity_matrix.sum() - n) / (n * (n - 1)) if n > 1 else 1.0
                    diversity = 1.0 - mean_similarity
                else:
                    diversity = 0.0
            else:
                diversity = 0.0
            
            pref_diversity.append(diversity)
        
        pref_diversity = np.array(pref_diversity, dtype=np.float32).reshape(-1, 1)
        diversity_features.append(pref_diversity)
        
        all_diversity = np.hstack(diversity_features)
        
        return {
            'features': all_diversity,
            'names': ['user_count', 'interaction_variance', 'user_pref_diversity']
        }

    def _compute_interaction_pattern_features(
        self,
        poi_ids: List[str],
        poi_to_idx: Dict[str, int],
        level: int
    ) -> Dict:
        """Compute interaction pattern features"""
        n_pois = len(poi_ids)
        
        poi_stats = {pid: {
            'visits': 0,
            'ratings': [],
            'searches': 0,
            'total': 0,
            'user_visits': {}
        } for pid in poi_ids}
        
        for _, row in self.interactions_df.iterrows():
            poi_id = row['poi_id']
            user_id = row['user_id']
            
            if level > 0:
                poi_id = self._get_parent_at_level(poi_id, target_level=level)
            
            if poi_id not in poi_stats:
                continue
            
            stats = poi_stats[poi_id]
            stats['total'] += 1
            
            interaction_type = row.get('interaction_type', 'visit')
            
            if interaction_type == 'visit':
                stats['visits'] += 1
                stats['user_visits'][user_id] = stats['user_visits'].get(user_id, 0) + 1
            elif interaction_type == 'rating':
                stats['ratings'].append(row.get('value', 3))
            elif interaction_type == 'search':
                stats['searches'] += 1
        
        visit_ratios = []
        avg_ratings = []
        search_to_visit_ratios = []
        repeat_visitor_ratios = []
        
        for poi_id in poi_ids:
            stats = poi_stats[poi_id]
            
            visit_ratio = stats['visits'] / stats['total'] if stats['total'] > 0 else 0.5
            visit_ratios.append(visit_ratio)
            
            avg_rating = np.mean(stats['ratings']) if stats['ratings'] else 3.0
            avg_ratings.append(avg_rating)
            
            if stats['searches'] > 0:
                s2v_ratio = min(stats['visits'] / stats['searches'], 2.0)
            else:
                s2v_ratio = 1.0
            search_to_visit_ratios.append(s2v_ratio)
            
            if stats['user_visits']:
                repeat_count = sum(1 for v in stats['user_visits'].values() if v > 1)
                repeat_ratio = repeat_count / len(stats['user_visits'])
            else:
                repeat_ratio = 0.0
            repeat_visitor_ratios.append(repeat_ratio)
        
        pattern_features = np.column_stack([
            visit_ratios,
            avg_ratings,
            search_to_visit_ratios,
            repeat_visitor_ratios
        ]).astype(np.float32)
        
        return {
            'features': pattern_features,
            'names': ['visit_ratio', 'avg_rating', 'search_to_visit_ratio', 'repeat_visitor_ratio']
        }

    def _compute_latent_embeddings(
        self,
        interaction_matrix: csr_matrix,
        embedding_dim: int
    ) -> Dict:
        """Compute latent embeddings via NMF"""
        if interaction_matrix.nnz == 0:
            n_pois = interaction_matrix.shape[0]
            return {
                'features': np.zeros((n_pois, embedding_dim), dtype=np.float32),
                'names': [f'latent_{i}' for i in range(embedding_dim)],
                'reconstruction_error': 0.0
            }
        
        interaction_dense = interaction_matrix.toarray()
        interaction_dense = np.maximum(interaction_dense, 0)
        
        actual_dim = min(embedding_dim, min(interaction_dense.shape) - 1)
        actual_dim = max(actual_dim, 1)
        
        nmf = NMF(
            n_components=actual_dim,
            init='nndsvda',
            random_state=42,
            max_iter=300,
            l1_ratio=0.5,
            alpha_W=0.1,
            alpha_H=0.1
        )
        
        try:
            latent_features = nmf.fit_transform(interaction_dense)
            reconstruction_error = nmf.reconstruction_err_
        except Exception as e:
            print(f"    Warning: NMF failed ({e}), using SVD fallback")
            from sklearn.decomposition import TruncatedSVD
            svd = TruncatedSVD(n_components=actual_dim, random_state=42)
            latent_features = svd.fit_transform(interaction_matrix)
            latent_features = np.maximum(latent_features, 0)
            reconstruction_error = 0.0
        
        if actual_dim < embedding_dim:
            padding = np.zeros((latent_features.shape[0], embedding_dim - actual_dim), dtype=np.float32)
            latent_features = np.hstack([latent_features, padding])
        
        return {
            'features': latent_features.astype(np.float32),
            'names': [f'latent_{i}' for i in range(embedding_dim)],
            'reconstruction_error': reconstruction_error
        }

    def _get_parent_at_level(self, poi_id: str, target_level: int) -> str:
        """Get parent node at target level"""
        if target_level == 0:
            return poi_id
        
        current_level = 0
        current_id = poi_id
        
        while current_level < target_level:
            level_key = f'level_{current_level}'
            
            if level_key not in self.poi_tree:
                break
            
            if current_id not in self.poi_tree[level_key]:
                break
            
            parent = self.poi_tree[level_key][current_id].get('parent')
            
            if parent:
                current_id = parent
                current_level += 1
            else:
                break
        
        return current_id

    def build_poi_embeddings(self, levels: List[int] = [0, 1, 2, 3]):
        """Build POI embeddings for all levels"""
        print("\n" + "=" * 60)
        print("Building Complete POI Embeddings (All Levels)")
        print("=" * 60)
        
        level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
        
        for level in levels:
            print(f"\n{'#' * 60}")
            print(f"### Processing Level {level}: {level_names.get(level, 'Unknown')} ###")
            print(f"{'#' * 60}")
            
            Y_A_l, poi_ids_A, Y_A_feature_names = self.build_Y_A_level(level)
            Y_T_l, poi_ids_T = self.build_Y_T_level(level, embedding_dim=32)
            
            if poi_ids_A != poi_ids_T:
                print(f"  Warning: POI ID mismatch between Y_A and Y_T")
                poi_ids = poi_ids_A
            else:
                poi_ids = poi_ids_A
            
            Y_T_feature_names = self._get_Y_T_feature_names(Y_T_l.shape[1])
            
            Y_l = np.hstack([Y_A_l, Y_T_l])
            
            all_feature_names = Y_A_feature_names + Y_T_feature_names
            
            print(f"\n{'=' * 60}")
            print(f"Final POI Embedding Summary - Level {level}")
            print(f"{'=' * 60}")
            print(f"  Number of POIs: {len(poi_ids)}")
            print(f"  Y_A^{level} dimensions: {Y_A_l.shape[1]}")
            print(f"  Y_T^{level} dimensions: {Y_T_l.shape[1]}")
            print(f"  Total embedding dimensions: {Y_l.shape[1]}")
            
            self.poi_embeddings[f'level_{level}'] = {
                'embeddings': Y_l,
                'poi_ids': poi_ids,
                'Y_A': Y_A_l,
                'Y_T': Y_T_l,
                'Y_A_feature_names': Y_A_feature_names,
                'Y_T_feature_names': Y_T_feature_names,
                'all_feature_names': all_feature_names,
                'n_explicit_features': Y_A_l.shape[1],
                'n_derived_features': Y_T_l.shape[1],
                'level_name': level_names.get(level, 'Unknown')
            }

    def _get_Y_T_feature_names(self, n_features: int) -> List[str]:
        """Generate Y_T feature names"""
        feature_names = []
        
        if 'user_interests_encoder' in self.encoders:
            interest_classes = self.encoders['user_interests_encoder'].classes_
            feature_names.extend([f'agg_user_interest_{cls}' for cls in interest_classes])
        
        if 'user_age_encoder' in self.encoders:
            age_classes = self.encoders['user_age_encoder'].classes_
            feature_names.extend([f'agg_user_age_{cls}' for cls in age_classes])
        
        if 'user_price_encoder' in self.encoders:
            price_classes = self.encoders['user_price_encoder'].classes_
            feature_names.extend([f'agg_user_price_sens_{cls}' for cls in price_classes])
        
        feature_names.extend(['user_count', 'interaction_variance', 'user_pref_diversity'])
        
        feature_names.extend(['visit_ratio', 'avg_rating', 'search_to_visit_ratio', 'repeat_visitor_ratio'])
        
        current_count = len(feature_names)
        latent_count = n_features - current_count
        if latent_count > 0:
            feature_names.extend([f'latent_{i}' for i in range(latent_count)])
        
        return feature_names[:n_features]
        
    def save_embeddings(self, output_file: str = 'poi_embeddings.pkl'):
        """Save embeddings"""
        # If output_file is relative, resolve against self.sources_dir
        out_path = Path(output_file)
        if not out_path.is_absolute():
            out_path = self.sources_dir / out_path
        out_path.parent.mkdir(parents=True, exist_ok=True)

        print(f"\nSaving embeddings to: {out_path}")
        
        embedding_dims = {}
        for level_key, level_data in self.poi_embeddings.items():
            embedding_dims[level_key] = {
                'total': level_data['embeddings'].shape[1],
                'explicit': level_data['n_explicit_features'],
                'derived': level_data['n_derived_features']
            }
        
        poi_id_to_idx = {}
        for level_key, level_data in self.poi_embeddings.items():
            poi_id_to_idx[level_key] = {
                pid: idx for idx, pid in enumerate(level_data['poi_ids'])
            }
        
        feature_names_map = {}
        for level_key, level_data in self.poi_embeddings.items():
            feature_names_map[level_key] = {
                'all': level_data['all_feature_names'],
                'explicit': level_data['Y_A_feature_names'],
                'derived': level_data['Y_T_feature_names']
            }
        
        save_data = {
            'poi_embeddings': self.poi_embeddings,
            'encoders': self.encoders,
            'poi_tree': self.poi_tree,
            'metadata': {
                'levels': list(self.poi_embeddings.keys()),
                'n_users': len(self.users_df),
                'n_interactions': len(self.interactions_df),
                'created_at': pd.Timestamp.now().isoformat(),
                'embedding_dimensions': embedding_dims
            },
            'poi_id_to_idx': poi_id_to_idx,
            'feature_names': feature_names_map
        }
        
        with out_path.open('wb') as f:
            pickle.dump(save_data, f)
        
        file_size = os.path.getsize(out_path) / (1024 * 1024)
        print(f"  File size: {file_size:.2f} MB")
        print(f"  Levels saved: {list(self.poi_embeddings.keys())}")
        print("  Save complete!")

    def load_embeddings(self, input_file: str = 'poi_embeddings.pkl'):
        """Load embeddings"""
        in_path = Path(input_file)
        if not in_path.is_absolute():
            in_path = self.sources_dir / in_path
        print(f"\nLoading embeddings from: {in_path}")
        
        with in_path.open('rb') as f:
            data = pickle.load(f)
        
        self.poi_embeddings = data['poi_embeddings']
        self.encoders = data['encoders']
        self.poi_tree = data['poi_tree']
        
        print(f"  Loaded {len(self.poi_embeddings)} levels")
        print(f"  Loaded {len(self.encoders)} encoders")
        print(f"  Created at: {data['metadata']['created_at']}")
        
        return data
        

In [12]:
if __name__ == "__main__":
    
    ROOT = Path.cwd()
    SOURCES = ROOT / "Sources"
    
    user_preferences_file = (SOURCES / "user_preferences.csv").resolve()
    user_poi_interactions_file = (SOURCES / "user_poi_interactions.csv").resolve()
    poi_tree_file = (SOURCES / "poi_tree_with_uuids.json").resolve()
    
    print("user_preferences_file:", user_preferences_file, user_preferences_file.exists())
    print("user_poi_interactions_file:", user_poi_interactions_file, user_poi_interactions_file.exists())
    print("poi_tree_file:", poi_tree_file, poi_tree_file.exists())

    # Initialize embedding generator
    learner = POIEmbeddings()

    # Build embeddings for all levels
    learner.build_poi_embeddings(levels=[0, 1, 2, 3])

    output_file = "Sources/poi_embeddings.pkl"

    # Save embeddings
    learner.save_embeddings(output_file)
    
    loaded_data = learner.load_embeddings(output_file)
    
    print("\nEmbedding dimensions per level:")
    for level_key, dims in loaded_data['metadata']['embedding_dimensions'].items():
        print(f"  {level_key}: {dims['total']} total ({dims['explicit']} explicit + {dims['derived']} derived)")

user_preferences_file: C:\Users\syoon\SpatiaLynk_recommender\Attribute-based representation learning\POI_Embeddings\Sources\user_preferences.csv False
user_poi_interactions_file: C:\Users\syoon\SpatiaLynk_recommender\Attribute-based representation learning\POI_Embeddings\Sources\user_poi_interactions.csv False
poi_tree_file: C:\Users\syoon\SpatiaLynk_recommender\Attribute-based representation learning\POI_Embeddings\Sources\poi_tree_with_uuids.json False
[POIEmbeddings] repo_root   = C:\Users\syoon\SpatiaLynk_recommender
[POIEmbeddings] sources_dir = C:\Users\syoon\SpatiaLynk_recommender\Sources
[POIEmbeddings] users_file  = C:\Users\syoon\SpatiaLynk_recommender\Sources\user_preferences.csv | exists: True
[POIEmbeddings] interactions_file = C:\Users\syoon\SpatiaLynk_recommender\Sources\user_poi_interactions.csv | exists: True
[POIEmbeddings] poi_tree_file = C:\Users\syoon\SpatiaLynk_recommender\Sources\poi_tree_with_uuids.json | exists: True
[POIEmbeddings] loaded metadata.pkl from C:\


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


    Interaction pattern features: 4 dims

  [Step 4] Computing latent embeddings via NMF...
    Latent embeddings: 32 dims

Final POI Embedding Summary - Level 1
  Number of POIs: 1355
  Y_A^1 dimensions: 99
  Y_T^1 dimensions: 72
  Total embedding dimensions: 171

############################################################
### Processing Level 2: District ###
############################################################

Building Y_A^2: Direct POI Attribute Matrix (Level 2)
Granularity: District
Number of nodes at level 2: 44
  [1] Spatial features: 2 dimensions (lat, lon normalized)
  [2] Num streets feature: 1 dimension (normalized)
      Range: [1, 125], mean: 30.8
  [T] Textual features: 50 dimensions (TF-IDF)

Building Y_T^2: Derived POI Attribute Matrix (Level 2)
Granularity: District
Number of POIs at level 2: 44
Number of users: 21

  [Step 1] Building user preference features...
    User preference matrix shape: (21, 33)

  [Step 2] Building POI-User interaction matrix...
   