In [2]:
import ast
import json
import os
import pickle
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer

In [9]:
class POIEmbeddings:
	def __init__(self,
				users_file: str,
				user_poi_interactions_file: str,
				poi_tree_file: str
				):
		self.users_df = pd.read_csv(users_file)
		self.interactions_df = pd.read_csv(user_poi_interactions_file)

		with open(poi_tree_file, 'r') as f:
			self.poi_tree = json.load(f)
	
		self.user_embeddings = {}
		self.poi_embeddings = {}
		self.encoders = {}

	def build_Y_A_level(self, level: int) -> Tuple[np.ndarray, List[str], List[str]]:
		"""
		Build direct POI attribute matrix Y_A^l for level l

		Feature blocks are concatenated column-wise in this order: standardized
		spatial coordinates, the level-specific block (levels 0-3 only), then
		TF-IDF textual features.

		Level-specific Explicit POI Attributes from poi_tree_with_uuids.json:

		Level 0 (Building): data.category, data.price, data.popularity,
							data.characteristics, spatial, textual
		Level 1 (Street):   data.category, data.num_entities, spatial, textual
		Level 2 (District): data.num_level1_nodes, spatial, textual
		Level 3 (Region):   data.num_districts, spatial, textual

		Side effects:
			Fitted scalers/encoders are stored in ``self.encoders[level]``.
			Entries already stored under that level by other builders
			(e.g. ``'Y_T_scaler'``) are kept. Progress is printed.

		Args:
			level: Tree level (0=building, 1=street, 2=district, 3=region)

		Returns:
			Y_A^l: (num_pois_at_level, num_features) matrix
			poi_ids: List of POI IDs in order
			feature_names: List of feature names

		Raises:
			KeyError: if ``self.poi_tree`` has no ``'level_<level>'`` entry.
		"""
		print(f"\n{'=' * 60}")
		print(f"Building Y_A^{level}: Direct POI Attribute Matrix (Level {level})")
		print(f"{'=' * 60}")

		level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
		print(f"Granularity: {level_names.get(level, 'Unknown')}")

		pois_at_level = self.poi_tree[f'level_{level}']
		poi_ids = list(pois_at_level.keys())

		print(f"Number of nodes at level {level}: {len(poi_ids)}")

		features_list = []
		feature_names = []

		# Store encoders/scalers for later use (inference time).
		# setdefault instead of a fresh dict: re-building Y_A must not discard
		# entries that other builders stored for this level. Every key written
		# below is overwritten on each run, so nothing stale survives.
		self.encoders.setdefault(level, {})

		# ================================================================
		# COMMON FEATURES (All Levels): spatial, textual
		# ================================================================

		# 1. Spatial Features (lat, lon) - Available at ALL levels
		spatial_features = self._extract_spatial_features(pois_at_level, poi_ids)
		spatial_scaler = StandardScaler()
		spatial_normalized = spatial_scaler.fit_transform(spatial_features)

		self.encoders[level]['spatial_scaler'] = spatial_scaler
		features_list.append(spatial_normalized)
		feature_names.extend(['spatial_lat_norm', 'spatial_lon_norm'])
		print(f"  [1] Spatial features: 2 dimensions (lat, lon normalized)")

		# ================================================================
		# LEVEL-SPECIFIC FEATURES
		# ================================================================
		# Each builder appends its blocks to features_list / feature_names in
		# place. Levels without a builder get only spatial + textual features.
		level_builders = {
			0: self._build_level0_features,  # category, price, popularity, characteristics
			1: self._build_level1_features,  # category, num_entities
			2: self._build_level2_features,  # num_level1_nodes
			3: self._build_level3_features,  # num_districts
		}
		build_level_features = level_builders.get(level)
		if build_level_features is not None:
			build_level_features(pois_at_level, poi_ids, features_list, feature_names)

		# ================================================================
		# TEXTUAL FEATURES (All Levels) - TF-IDF
		# ================================================================
		text_features, text_feature_names = self._extract_textual_features(
			pois_at_level, poi_ids, level
		)
		features_list.append(text_features)
		feature_names.extend(text_feature_names)

		# ================================================================
		# Concatenate all features
		# ================================================================
		Y_A_level = np.hstack(features_list)

		return Y_A_level, poi_ids, feature_names


	def _extract_spatial_features(self, pois_at_level: Dict, poi_ids: List[str]) -> np.ndarray:
		"""Extract spatial (lat, lon) features from POI data"""
		spatial_features = []
		for poi_id in poi_ids:
			poi_data = pois_at_level[poi_id]
			spatial = poi_data['spatial']
			
			# Handle different representations
			if isinstance(spatial, str):
				spatial = eval(spatial)
			elif isinstance(spatial, list):
				spatial = tuple(spatial)
			
			spatial_features.append([spatial[0], spatial[1]])  # (lat, lon)
		
		return np.array(spatial_features, dtype=np.float32)


	def _build_level0_features(self, pois_at_level: Dict, poi_ids: List[str],
							features_list: List, feature_names: List):
		"""
		Append the Level 0 (Building) attribute blocks to the shared lists.

		Blocks are appended in this order:
		- data.category (one-hot; missing -> 'unknown')
		- data.price (standardized; parsed by ``_parse_price``)
		- data.popularity (standardized; missing or unparseable -> 3.0)
		- data.characteristic / data.characteristics (multi-hot)

		``features_list`` and ``feature_names`` are extended in place, every
		fitted encoder/scaler is stored in ``self.encoders[0]`` and a summary
		line is printed per block. Nothing is returned.
		"""
		level = 0
		# The 'data' sub-record of every POI, in poi_ids order.
		data_records = [pois_at_level[poi_id].get('data', {}) for poi_id in poi_ids]

		# 2. Category (one-hot)
		category_labels = [
			str(record.get('category', 'unknown')).lower().strip()
			for record in data_records
		]
		category_encoder = LabelEncoder()
		category_codes = category_encoder.fit_transform(category_labels)
		n_categories = len(category_encoder.classes_)
		# Row i of the identity matrix is the one-hot vector of class i.
		category_onehot = np.eye(n_categories, dtype=np.float32)[category_codes]

		self.encoders[level]['category_encoder'] = category_encoder
		features_list.append(category_onehot)
		feature_names.extend(f'category_{cls}' for cls in category_encoder.classes_)
		print(f"  [2] Category features: {n_categories} dimensions (one-hot)")

		# 3. Price (normalized)
		price_column = np.array(
			[self._parse_price(record.get('price', None)) for record in data_records],
			dtype=np.float32
		).reshape(-1, 1)
		price_scaler = StandardScaler()
		price_scaled = price_scaler.fit_transform(price_column)

		self.encoders[level]['price_scaler'] = price_scaler
		features_list.append(price_scaled)
		feature_names.append('price_norm')
		print(f"  [3] Price feature: 1 dimension (normalized)")
		print(f"      Range: [{price_column.min():.2f}, {price_column.max():.2f}], mean: {price_column.mean():.2f}")

		# 4. Popularity (normalized)
		def _popularity_of(record) -> float:
			raw = record.get('popularity', 3.0)
			try:
				return float(raw)
			except (ValueError, TypeError):
				return 3.0

		popularity_column = np.array(
			[_popularity_of(record) for record in data_records],
			dtype=np.float32
		).reshape(-1, 1)
		popularity_scaler = StandardScaler()
		popularity_scaled = popularity_scaler.fit_transform(popularity_column)

		self.encoders[level]['popularity_scaler'] = popularity_scaler
		features_list.append(popularity_scaled)
		feature_names.append('popularity_norm')
		print(f"  [4] Popularity feature: 1 dimension (normalized)")
		print(f"      Range: [{popularity_column.min():.2f}, {popularity_column.max():.2f}], mean: {popularity_column.mean():.2f}")

		# 5. Characteristics (multi-hot); the singular key wins over the plural one
		tag_lists = [
			self._parse_characteristics(
				record.get('characteristic', record.get('characteristics', ''))
			)
			for record in data_records
		]
		tag_binarizer = MultiLabelBinarizer()
		tags_multihot = tag_binarizer.fit_transform(tag_lists).astype(np.float32)

		self.encoders[level]['characteristics_encoder'] = tag_binarizer
		features_list.append(tags_multihot)
		feature_names.extend(f'char_{cls}' for cls in tag_binarizer.classes_)
		print(f"  [5] Characteristics features: {len(tag_binarizer.classes_)} dimensions (multi-hot)")


	def _build_level1_features(self, pois_at_level: Dict, poi_ids: List[str],
							features_list: List, feature_names: List):
		"""
		Append the Level 1 (Street) attribute blocks to the shared lists.

		Blocks are appended in this order:
		- data.category (one-hot; missing -> 'mixed')
		- data.num_entities (standardized; missing or unparseable -> 1)

		``features_list`` and ``feature_names`` are extended in place, the
		fitted encoder/scaler are stored in ``self.encoders[1]`` and a summary
		is printed per block. Nothing is returned.
		"""
		level = 1
		# The 'data' sub-record of every node, in poi_ids order.
		data_records = [pois_at_level[poi_id].get('data', {}) for poi_id in poi_ids]

		# 2. Category (one-hot) - street-level category
		category_labels = [
			str(record.get('category', 'mixed')).lower().strip()
			for record in data_records
		]
		category_encoder = LabelEncoder()
		category_codes = category_encoder.fit_transform(category_labels)
		n_categories = len(category_encoder.classes_)
		# Row i of the identity matrix is the one-hot vector of class i.
		category_onehot = np.eye(n_categories, dtype=np.float32)[category_codes]

		self.encoders[level]['category_encoder'] = category_encoder
		features_list.append(category_onehot)
		feature_names.extend(f'category_{cls}' for cls in category_encoder.classes_)
		print(f"  [2] Category features: {n_categories} dimensions (one-hot)")

		# 3. Number of entities (normalized)
		def _entity_count(record) -> int:
			raw = record.get('num_entities', 1)
			try:
				return int(raw)
			except (ValueError, TypeError):
				return 1

		entity_counts = np.array(
			[_entity_count(record) for record in data_records],
			dtype=np.float32
		).reshape(-1, 1)
		entities_scaler = StandardScaler()
		entities_scaled = entities_scaler.fit_transform(entity_counts)

		self.encoders[level]['num_entities_scaler'] = entities_scaler
		features_list.append(entities_scaled)
		feature_names.append('num_entities_norm')
		print(f"  [3] Num entities feature: 1 dimension (normalized)")
		print(f"      Range: [{entity_counts.min():.0f}, {entity_counts.max():.0f}], mean: {entity_counts.mean():.1f}")


	def _build_level2_features(self, pois_at_level: Dict, poi_ids: List[str],
							features_list: List, feature_names: List):
		"""
		Append the Level 2 (District) attribute block to the shared lists.

		The single block is a standardized count read from
		``data.num_level1_nodes``, falling back to ``data.num_streets`` and
		then to 1 (also used when the value cannot be converted to int).

		``features_list`` and ``feature_names`` are extended in place, the
		fitted scaler is stored in ``self.encoders[2]`` and a summary is
		printed. Nothing is returned.
		"""
		level = 2

		# 2. Number of Level 1 nodes (streets) - normalized
		def _street_count(record) -> int:
			raw = record.get('num_level1_nodes', record.get('num_streets', 1))
			try:
				return int(raw)
			except (ValueError, TypeError):
				return 1

		street_counts = np.array(
			[_street_count(pois_at_level[poi_id].get('data', {})) for poi_id in poi_ids],
			dtype=np.float32
		).reshape(-1, 1)
		streets_scaler = StandardScaler()
		streets_scaled = streets_scaler.fit_transform(street_counts)

		self.encoders[level]['num_level1_nodes_scaler'] = streets_scaler
		features_list.append(streets_scaled)
		feature_names.append('num_level1_nodes_norm')
		print(f"  [2] Num streets (level1 nodes) feature: 1 dimension (normalized)")
		print(f"      Range: [{street_counts.min():.0f}, {street_counts.max():.0f}], mean: {street_counts.mean():.1f}")


	def _build_level3_features(self, pois_at_level: Dict, poi_ids: List[str],
							features_list: List, feature_names: List):
		"""
		Append the Level 3 (Region) attribute block to the shared lists.

		The single block is a standardized count read from
		``data.num_districts``, falling back to ``data.num_level2_nodes`` and
		then to 1 (also used when the value cannot be converted to int).

		``features_list`` and ``feature_names`` are extended in place, the
		fitted scaler is stored in ``self.encoders[3]`` and a summary is
		printed. Nothing is returned.
		"""
		level = 3

		# 2. Number of districts - normalized
		def _district_count(record) -> int:
			raw = record.get('num_districts', record.get('num_level2_nodes', 1))
			try:
				return int(raw)
			except (ValueError, TypeError):
				return 1

		district_counts = np.array(
			[_district_count(pois_at_level[poi_id].get('data', {})) for poi_id in poi_ids],
			dtype=np.float32
		).reshape(-1, 1)
		districts_scaler = StandardScaler()
		districts_scaled = districts_scaler.fit_transform(district_counts)

		self.encoders[level]['num_districts_scaler'] = districts_scaler
		features_list.append(districts_scaled)
		feature_names.append('num_districts_norm')
		print(f"  [2] Num districts feature: 1 dimension (normalized)")
		print(f"      Range: [{district_counts.min():.0f}, {district_counts.max():.0f}], mean: {district_counts.mean():.1f}")


	def _extract_textual_features(self, pois_at_level: Dict, poi_ids: List[str],
								level: int) -> Tuple[np.ndarray, List[str]]:
		"""
		Turn each POI's 'textual' field into TF-IDF features.

		Missing or ``None`` text is treated as an empty string and all text is
		lower-cased. If the vectorizer raises ``ValueError``, a single all-zero
		column named ``'text_empty'`` is returned instead. The fitted
		vectorizer (or ``None`` after the fallback) is stored in
		``self.encoders[level]['tfidf_vectorizer']``.

		Args:
			pois_at_level: POI data dictionary
			poi_ids: List of POI IDs (defines row order)
			level: Current tree level; selects the vocabulary size cap

		Returns:
			text_features: float32 TF-IDF matrix, one row per POI
			feature_names: column names, each prefixed with 'text_'
		"""
		raw_texts = (pois_at_level[poi_id].get('textual', '') for poi_id in poi_ids)
		documents = [str('' if raw is None else raw).lower() for raw in raw_texts]

		# Vocabulary cap per level: finer levels get more features
		vocabulary_caps = {0: 100, 1: 75, 2: 50, 3: 30}

		tfidf_vectorizer = TfidfVectorizer(
			max_features=vocabulary_caps.get(level, 50),
			stop_words='english',
			ngram_range=(1, 2),
			min_df=2,
			max_df=0.95
		)

		try:
			tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
			text_features = tfidf_matrix.toarray().astype(np.float32)
			vocabulary = tfidf_vectorizer.get_feature_names_out()
			feature_names = [f'text_{word}' for word in vocabulary]
			actual_features = text_features.shape[1]
		except ValueError:
			# Fallback: one placeholder column so downstream stacking still works
			print(f"      Warning: TF-IDF vocabulary empty, using zero features")
			tfidf_vectorizer = None
			feature_names = ['text_empty']
			actual_features = 1
			text_features = np.zeros((len(poi_ids), actual_features), dtype=np.float32)

		self.encoders[level]['tfidf_vectorizer'] = tfidf_vectorizer
		print(f"  [T] Textual features: {actual_features} dimensions (TF-IDF)")

		return text_features, feature_names

	def _parse_price(self, price_str) -> float:
		"""
		Parse price string to float value
		
		Handles formats:
		- "25.99" -> 25.99
		- "25.85 - 30.99" -> 28.42 (average)
		- None/empty -> 25.0 (default)
		
		Args:
			price_str: Price string from POI data
		
		Returns:
			Float price value
		"""
		if price_str is None or price_str == '' or pd.isna(price_str):
			return 25.0  # Default mid-range price
		
		try:
			price_str = str(price_str).strip()
			
			# Handle range format "25.85 - 30.99"
			if '-' in price_str:
				parts = price_str.split('-')
				if len(parts) == 2:
					low = float(parts[0].strip())
					high = float(parts[1].strip())
					return (low + high) / 2
			
			# Single value
			return float(price_str)
		
		except (ValueError, TypeError):
			return 25.0  # Default on parse error

	def _parse_characteristics(self, char_str) -> List[str]:
		"""
		Parse characteristics string to list of tags
		
		Handles formats:
		- "#food, #restaurant, #local" -> ['food', 'restaurant', 'local']
		- "food; restaurant; local" -> ['food', 'restaurant', 'local']
		- Empty/None -> []
		
		Args:
			char_str: Characteristics string from POI data
		
		Returns:
			List of cleaned tag strings
		"""
		if char_str is None or char_str == '' or pd.isna(char_str):
			return []
		
		char_str = str(char_str)
		
		# Handle different separators
		if ',' in char_str:
			parts = char_str.split(',')
		elif ';' in char_str:
			parts = char_str.split(';')
		else:
			parts = [char_str]
		
		# Clean each tag
		tags = []
		for part in parts:
			# Remove hashtags, whitespace, and normalize
			tag = part.strip().replace('#', '').lower()
			# Remove empty tags and very short ones
			if tag and len(tag) > 1:
				tags.append(tag)
		
		return tags
	
	def build_Y_T_level(self, level: int, embedding_dim: int = 32) -> Tuple[np.ndarray, List[str]]:
		"""
		Build the inverse/derived POI attribute matrix Y_T^l for level l

		The matrix is derived from user data rather than from the POIs' own
		attributes. Column blocks, in order:
		  1. user preferences aggregated per POI, weighted by interaction score
		  2. user diversity features
		  3. interaction pattern features
		  4. latent embeddings of the POI-user interaction matrix
		The concatenated matrix is standardized column-wise before returning.

		Side effects:
			Stores the final scaler in ``self.encoders[level]['Y_T_scaler']``
			(creating the per-level dict if it does not exist yet), stores the
			user encoders via ``_build_user_preference_features`` and prints
			progress.

		Args:
			level: Tree level; nodes are read from ``self.poi_tree['level_<level>']``
			embedding_dim: Number of latent embedding columns

		Returns:
			Y_T^l: (num_pois_at_level, num_derived_features) matrix
			poi_ids: List of POI IDs in row order

		Raises:
			KeyError: if ``self.poi_tree`` has no ``'level_<level>'`` entry.
		"""
		print(f"\n{'='*60}")
		print(f"Building Y_T^{level}: Inverse/Derived POI Attribute Matrix (Level {level})")
		print(f"{'='*60}")

		level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
		print(f"Granularity: {level_names.get(level, 'Unknown')}")

		level_key = f'level_{level}'
		pois_at_level = self.poi_tree[level_key]
		poi_ids = list(pois_at_level.keys())

		n_pois = len(poi_ids)
		n_users = len(self.users_df)

		print(f"Number of POIs at level {level}: {n_pois}")
		print(f"Number of users: {n_users}")

		# Create index mappings
		poi_to_idx = {pid: idx for idx, pid in enumerate(poi_ids)}
		# Falls back to the 'uudi' column name when no 'uuid' column exists
		user_id_col = 'uuid' if 'uuid' in self.users_df.columns else 'uudi'
		user_to_idx = {uid: idx for idx, uid in enumerate(self.users_df[user_id_col])}

		# ================================================================
		# Step 1: Build User Preference Feature Matrix
		# ================================================================
		print(f"\n  [Step 1] Building user preference features...")
		user_pref_features, user_pref_names = self._build_user_preference_features()
		print(f"    User preference matrix shape: {user_pref_features.shape}")
		print(f"    Features: {user_pref_names}")

		# ================================================================
		# Step 2: Build POI-User Interaction Matrix
		# ================================================================
		print(f"\n  [Step 2] Building POI-User interaction matrix...")
		interaction_matrix, interaction_stats = self._build_poi_user_interaction_matrix(
			poi_ids, poi_to_idx, user_to_idx, level
		)
		print(f"    Interaction matrix shape: {interaction_matrix.shape}")
		print(f"    Density: {interaction_stats['density']:.2f}%")
		print(f"    Total interactions: {interaction_stats['total_interactions']}")

		# ================================================================
		# Step 3: Derive POI features from user preferences
		# ================================================================
		print(f"\n  [Step 3] Deriving POI features from user preferences...")
		derived_features = []
		derived_names = []

		# 3a. Aggregate user preferences weighted by interaction strength
		poi_user_pref_agg = self._aggregate_user_preferences_to_pois(
			interaction_matrix, user_pref_features, user_pref_names
		)
		derived_features.append(poi_user_pref_agg['features'])
		derived_names.extend(poi_user_pref_agg['names'])
		print(f"    Aggregated user preferences: {poi_user_pref_agg['features'].shape[1]} dims")

		# 3b. User diversity features (how diverse are the users visiting this POI)
		diversity_features = self._compute_user_diversity_features(
			interaction_matrix, user_pref_features
		)
		derived_features.append(diversity_features['features'])
		derived_names.extend(diversity_features['names'])
		print(f"    User diversity features: {diversity_features['features'].shape[1]} dims")

		# 3c. Interaction pattern features
		pattern_features = self._compute_interaction_pattern_features(
			poi_ids, poi_to_idx, level
		)
		derived_features.append(pattern_features['features'])
		derived_names.extend(pattern_features['names'])
		print(f"    Interaction pattern features: {pattern_features['features'].shape[1]} dims")

		# ================================================================
		# Step 4: Matrix Factorization for latent embeddings
		# ================================================================
		print(f"\n  [Step 4] Computing latent embeddings via NMF...")
		latent_embeddings = self._compute_latent_embeddings(
			interaction_matrix, embedding_dim
		)
		derived_features.append(latent_embeddings['features'])
		derived_names.extend(latent_embeddings['names'])
		print(f"    Latent embeddings: {latent_embeddings['features'].shape[1]} dims")
		print(f"    Reconstruction error: {latent_embeddings['reconstruction_error']:.4f}")

		# ================================================================
		# Step 5: Concatenate all derived features
		# ================================================================
		Y_T_level = np.hstack(derived_features)

		# Normalize final matrix
		final_scaler = StandardScaler()
		Y_T_level = final_scaler.fit_transform(Y_T_level)
		# self.encoders[level] only exists once build_Y_A_level(level) has run;
		# setdefault avoids a KeyError when Y_T is built first.
		self.encoders.setdefault(level, {})['Y_T_scaler'] = final_scaler

		return Y_T_level, poi_ids


	def _build_user_preference_features(self) -> Tuple[np.ndarray, List[str]]:
		"""
		Encode the rows of ``self.users_df`` as a user preference matrix.

		Column blocks, in order:
		- interests: multi-hot, from the ';'-separated 'interests' value
		- age_group: one-hot (missing -> 'unknown')
		- price_sensitivity: one-hot (missing -> 'medium')

		The three fitted encoders are stored in ``self.encoders`` under
		'user_interests_encoder', 'user_age_encoder' and 'user_price_encoder'.

		Returns:
			user_features: (n_users, n_features) matrix
			feature_names: List of feature names
		"""
		def _split_interests(raw) -> List[str]:
			# Missing or empty value -> no interests at all
			if pd.isna(raw) or raw == '':
				return []
			return [token.strip().lower() for token in str(raw).split(';')]

		def _one_hot(labels: List[str]):
			# Fit a LabelEncoder and index an identity matrix with its codes
			encoder = LabelEncoder()
			codes = encoder.fit_transform(labels)
			return encoder, np.eye(len(encoder.classes_), dtype=np.float32)[codes]

		blocks = []
		names = []

		# 1. Interests (multi-hot encoding)
		interests_per_user = [
			_split_interests(row.get('interests', ''))
			for _, row in self.users_df.iterrows()
		]
		interests_binarizer = MultiLabelBinarizer()
		interests_multihot = interests_binarizer.fit_transform(interests_per_user).astype(np.float32)

		self.encoders['user_interests_encoder'] = interests_binarizer
		blocks.append(interests_multihot)
		names.extend(f'interest_{cls}' for cls in interests_binarizer.classes_)

		# 2. Age group (one-hot encoding)
		age_labels = self.users_df['age_group'].fillna('unknown').astype(str).str.lower().tolist()
		age_encoder, age_onehot = _one_hot(age_labels)

		self.encoders['user_age_encoder'] = age_encoder
		blocks.append(age_onehot)
		names.extend(f'age_{cls}' for cls in age_encoder.classes_)

		# 3. Price sensitivity (one-hot encoding)
		sensitivity_labels = self.users_df['price_sensitivity'].fillna('medium').astype(str).str.lower().tolist()
		price_encoder, price_onehot = _one_hot(sensitivity_labels)

		self.encoders['user_price_encoder'] = price_encoder
		blocks.append(price_onehot)
		names.extend(f'price_sens_{cls}' for cls in price_encoder.classes_)

		return np.hstack(blocks), names


	def _build_poi_user_interaction_matrix(
		self, 
		poi_ids: List[str], 
		poi_to_idx: Dict[str, int],
		user_to_idx: Dict[str, int],
		level: int
	) -> Tuple[csr_matrix, Dict]:
		"""
		Build POI-User interaction matrix with weighted interaction scores
		
		For level > 0, maps fine-grained POI IDs to parent nodes at target level
		
		Args:
			poi_ids: List of POI IDs at current level
			poi_to_idx: POI ID to index mapping
			user_to_idx: User ID to index mapping
			level: Current tree level
		
		Returns:
			interaction_matrix: Sparse (n_pois, n_users) matrix
			stats: Dictionary with statistics
		"""
		n_pois = len(poi_ids)
		n_users = len(user_to_idx)
		
		# Aggregate interaction scores
		poi_user_scores = {}
		
		# Interaction type weights
		interaction_weights = {
			'visit': 1.0,
			'rating': 0.8,  # Will be scaled by rating value
			'search': 0.3,
			'click': 0.2,
			'bookmark': 0.5
		}
		
		total_interactions = 0
		
		for _, row in self.interactions_df.iterrows():
			user_id = row['user_id']
			poi_id = row['poi_id']
			
			# For higher levels, map fine-grained POI to parent at target level
			if level > 0:
				mapped_poi_id = self._get_parent_at_level(poi_id, target_level=level)
			else:
				mapped_poi_id = poi_id
			
			# Skip if POI or user not in our index
			if mapped_poi_id not in poi_to_idx or user_id not in user_to_idx:
				continue
			
			key = (mapped_poi_id, user_id)
			interaction_type = row.get('interaction_type', 'visit')
			
			# Calculate weighted score
			base_weight = interaction_weights.get(interaction_type, 0.5)
			
			if interaction_type == 'rating':
				# Scale by rating value (1-5 -> 0.2-1.0)
				rating_value = row.get('value', 3)
				score = base_weight * (rating_value / 5.0)
			else:
				score = base_weight * row.get('value', 1)
			
			poi_user_scores[key] = poi_user_scores.get(key, 0) + score
			total_interactions += 1
		
		# Build sparse matrix
		row_indices = []
		col_indices = []
		values = []
		
		for (poi_id, user_id), score in poi_user_scores.items():
			row_indices.append(poi_to_idx[poi_id])
			col_indices.append(user_to_idx[user_id])
			values.append(score)
		
		interaction_matrix = csr_matrix(
			(values, (row_indices, col_indices)),
			shape=(n_pois, n_users),
			dtype=np.float32
		)
		
		density = interaction_matrix.nnz / (n_pois * n_users) * 100 if n_pois * n_users > 0 else 0
		
		stats = {
			'density': density,
			'total_interactions': total_interactions,
			'nnz': interaction_matrix.nnz,
			'unique_poi_user_pairs': len(poi_user_scores)
		}
		
		return interaction_matrix, stats


	def _aggregate_user_preferences_to_pois(
		self,
		interaction_matrix: csr_matrix,
		user_pref_features: np.ndarray,
		user_pref_names: List[str]
	) -> Dict:
		"""
		Aggregate user preferences to POIs weighted by interaction strength
		
		For each POI, compute weighted average of user preferences based on
		interaction scores.
		
		Args:
			interaction_matrix: (n_pois, n_users) sparse matrix
			user_pref_features: (n_users, n_user_features) matrix
			user_pref_names: List of user feature names
		
		Returns:
			Dictionary with 'features' and 'names'
		"""
		n_pois = interaction_matrix.shape[0]
		n_user_features = user_pref_features.shape[1]
		
		# Weighted aggregation: POI_features = (interaction_matrix @ user_features) / sum(weights)
		weighted_sum = interaction_matrix.dot(user_pref_features)
		
		# Compute sum of weights per POI for normalization
		weight_sums = np.array(interaction_matrix.sum(axis=1)).flatten()
		weight_sums[weight_sums == 0] = 1.0  # Avoid division by zero
		
		# Normalize
		aggregated_features = weighted_sum / weight_sums.reshape(-1, 1)
		
		# Handle POIs with no interactions (fill with global average)
		no_interaction_mask = np.array(interaction_matrix.sum(axis=1)).flatten() == 0
		if no_interaction_mask.any():
			global_avg = user_pref_features.mean(axis=0)
			aggregated_features[no_interaction_mask] = global_avg
		
		# Rename features
		aggregated_names = [f'agg_user_{name}' for name in user_pref_names]
		
		return {
			'features': aggregated_features.astype(np.float32),
			'names': aggregated_names
		}


	def _compute_user_diversity_features(
		self,
		interaction_matrix: csr_matrix,
		user_pref_features: np.ndarray
	) -> Dict:
		"""
		Compute user diversity features for each POI
		
		Measures how diverse the users visiting each POI are in terms of their preferences.
		Higher diversity = POI appeals to broader audience.
		
		Features:
		- user_count: Number of unique users
		- user_entropy: Entropy of user distribution
		- pref_variance: Variance in user preferences
		
		Returns:
			Dictionary with 'features' and 'names'
		"""
		n_pois = interaction_matrix.shape[0]
		
		diversity_features = []
		
		# 1. User count (normalized)
		user_counts = np.array(interaction_matrix.getnnz(axis=1)).reshape(-1, 1).astype(np.float32)
		diversity_features.append(user_counts)
		
		# 2. Interaction strength variance
		interaction_variance = []
		for i in range(n_pois):
			row = interaction_matrix.getrow(i).toarray().flatten()
			nonzero_values = row[row > 0]
			if len(nonzero_values) > 1:
				variance = np.var(nonzero_values)
			else:
				variance = 0.0
			interaction_variance.append(variance)
		interaction_variance = np.array(interaction_variance, dtype=np.float32).reshape(-1, 1)
		diversity_features.append(interaction_variance)
		
		# 3. User preference diversity (mean pairwise distance of users)
		pref_diversity = []
		for i in range(n_pois):
			row = interaction_matrix.getrow(i)
			user_indices = row.indices
			
			if len(user_indices) > 1:
				# Get preferences of users who interacted with this POI
				poi_user_prefs = user_pref_features[user_indices]
				# Compute mean pairwise cosine distance
				if poi_user_prefs.shape[0] > 1:
					norms = np.linalg.norm(poi_user_prefs, axis=1, keepdims=True)
					norms[norms == 0] = 1.0
					normalized_prefs = poi_user_prefs / norms
					similarity_matrix = normalized_prefs @ normalized_prefs.T
					# Mean off-diagonal similarity
					n = similarity_matrix.shape[0]
					mean_similarity = (similarity_matrix.sum() - n) / (n * (n - 1)) if n > 1 else 1.0
					diversity = 1.0 - mean_similarity  # Convert to diversity
				else:
					diversity = 0.0
			else:
				diversity = 0.0
			
			pref_diversity.append(diversity)
		
		pref_diversity = np.array(pref_diversity, dtype=np.float32).reshape(-1, 1)
		diversity_features.append(pref_diversity)
		
		# Concatenate
		all_diversity = np.hstack(diversity_features)
		
		return {
			'features': all_diversity,
			'names': ['user_count', 'interaction_variance', 'user_pref_diversity']
		}


	def _compute_interaction_pattern_features(
		self,
		poi_ids: List[str],
		poi_to_idx: Dict[str, int],
		level: int
	) -> Dict:
		"""
		Compute interaction pattern features for each POI
		
		Features:
		- visit_ratio: Ratio of visits vs other interaction types
		- avg_rating: Average rating received
		- search_to_visit_ratio: Conversion rate from search to visit
		- repeat_visitor_ratio: Ratio of users with multiple visits
		
		Returns:
			Dictionary with 'features' and 'names'
		"""
		n_pois = len(poi_ids)
		
		# Initialize accumulators
		poi_stats = {pid: {
			'visits': 0,
			'ratings': [],
			'searches': 0,
			'total': 0,
			'user_visits': {}  # user_id -> visit count
		} for pid in poi_ids}
		
		for _, row in self.interactions_df.iterrows():
			poi_id = row['poi_id']
			user_id = row['user_id']
			
			# Map to parent level if needed
			if level > 0:
				poi_id = self._get_parent_at_level(poi_id, target_level=level)
			
			if poi_id not in poi_stats:
				continue
			
			stats = poi_stats[poi_id]
			stats['total'] += 1
			
			interaction_type = row.get('interaction_type', 'visit')
			
			if interaction_type == 'visit':
				stats['visits'] += 1
				stats['user_visits'][user_id] = stats['user_visits'].get(user_id, 0) + 1
			elif interaction_type == 'rating':
				stats['ratings'].append(row.get('value', 3))
			elif interaction_type == 'search':
				stats['searches'] += 1
		
		# Compute features
		visit_ratios = []
		avg_ratings = []
		search_to_visit_ratios = []
		repeat_visitor_ratios = []
		
		for poi_id in poi_ids:
			stats = poi_stats[poi_id]
			
			# Visit ratio
			visit_ratio = stats['visits'] / stats['total'] if stats['total'] > 0 else 0.5
			visit_ratios.append(visit_ratio)
			
			# Average rating
			avg_rating = np.mean(stats['ratings']) if stats['ratings'] else 3.0
			avg_ratings.append(avg_rating)
			
			# Search to visit ratio (conversion)
			if stats['searches'] > 0:
				s2v_ratio = min(stats['visits'] / stats['searches'], 2.0)  # Cap at 2
			else:
				s2v_ratio = 1.0
			search_to_visit_ratios.append(s2v_ratio)
			
			# Repeat visitor ratio
			if stats['user_visits']:
				repeat_count = sum(1 for v in stats['user_visits'].values() if v > 1)
				repeat_ratio = repeat_count / len(stats['user_visits'])
			else:
				repeat_ratio = 0.0
			repeat_visitor_ratios.append(repeat_ratio)
		
		# Stack features
		pattern_features = np.column_stack([
			visit_ratios,
			avg_ratings,
			search_to_visit_ratios,
			repeat_visitor_ratios
		]).astype(np.float32)
		
		return {
			'features': pattern_features,
			'names': ['visit_ratio', 'avg_rating', 'search_to_visit_ratio', 'repeat_visitor_ratio']
		}


	def _compute_latent_embeddings(
		self,
		interaction_matrix: csr_matrix,
		embedding_dim: int
	) -> Dict:
		"""
		Compute latent embeddings via Non-negative Matrix Factorization
		
		Args:
			interaction_matrix: (n_pois, n_users) sparse matrix
			embedding_dim: Number of latent dimensions
		
		Returns:
			Dictionary with 'features', 'names', and 'reconstruction_error'
		"""
		# Handle case where matrix is empty or very sparse
		if interaction_matrix.nnz == 0:
			n_pois = interaction_matrix.shape[0]
			return {
				'features': np.zeros((n_pois, embedding_dim), dtype=np.float32),
				'names': [f'latent_{i}' for i in range(embedding_dim)],
				'reconstruction_error': 0.0
			}
		
		# Ensure non-negative values for NMF
		interaction_dense = interaction_matrix.toarray()
		interaction_dense = np.maximum(interaction_dense, 0)
		
		# Adjust embedding_dim if needed
		actual_dim = min(embedding_dim, min(interaction_dense.shape) - 1)
		actual_dim = max(actual_dim, 1)
		
		# Apply NMF
		nmf = NMF(
			n_components=actual_dim,
			init='nndsvda',  # Better initialization
			random_state=42,
			max_iter=300,
			l1_ratio=0.5,  # Mix of L1 and L2 regularization
			alpha_W=0.1,
			alpha_H=0.1
		)
		
		try:
			latent_features = nmf.fit_transform(interaction_dense)
			reconstruction_error = nmf.reconstruction_err_
		except Exception as e:
			print(f"    Warning: NMF failed ({e}), using SVD fallback")
			# Fallback to truncated SVD
			from sklearn.decomposition import TruncatedSVD
			svd = TruncatedSVD(n_components=actual_dim, random_state=42)
			latent_features = svd.fit_transform(interaction_matrix)
			latent_features = np.maximum(latent_features, 0)  # Make non-negative
			reconstruction_error = 0.0
		
		# Pad if actual_dim < embedding_dim
		if actual_dim < embedding_dim:
			padding = np.zeros((latent_features.shape[0], embedding_dim - actual_dim), dtype=np.float32)
			latent_features = np.hstack([latent_features, padding])
		
		return {
			'features': latent_features.astype(np.float32),
			'names': [f'latent_{i}' for i in range(embedding_dim)],
			'reconstruction_error': reconstruction_error
		}


	def _get_parent_at_level(self, poi_id: str, target_level: int) -> str:
		"""
		Get parent node of poi_id at target_level
		
		Traverses the POI tree from level 0 upward until reaching target_level.
		
		Args:
			poi_id: POI ID at level 0
			target_level: Target level to find parent at
		
		Returns:
			Parent POI ID at target_level, or original poi_id if not found
		"""
		if target_level == 0:
			return poi_id
		
		current_level = 0
		current_id = poi_id
		
		while current_level < target_level:
			level_key = f'level_{current_level}'
			
			if level_key not in self.poi_tree:
				break
			
			if current_id not in self.poi_tree[level_key]:
				break
			
			parent = self.poi_tree[level_key][current_id].get('parent')
			
			if parent:
				current_id = parent
				current_level += 1
			else:
				# No parent found, return current
				break
		
		return current_id

	def build_poi_embeddings(self, levels: List[int] = [0, 1, 2, 3]):
		print("\n" + "=" * 60)
		print("Building Complete POI Embeddings (All Levels)")
		print("=" * 60)
		
		level_names = {0: 'Building', 1: 'Street', 2: 'District', 3: 'Region'}
		
		for level in levels:
			print(f"\n{'#' * 60}")
			print(f"### Processing Level {level}: {level_names.get(level, 'Unknown')} ###")
			print(f"{'#' * 60}")
			
			# Build explicit POI attributes (Y_A)
			Y_A_l, poi_ids_A, Y_A_feature_names = self.build_Y_A_level(level)
			
			# Build derived POI attributes (Y_T)
			Y_T_l, poi_ids_T = self.build_Y_T_level(level, embedding_dim=32)
			
			# Verify POI ID consistency
			if poi_ids_A != poi_ids_T:
				print(f"  Warning: POI ID mismatch between Y_A and Y_T at level {level}")
				print(f"    Y_A POIs: {len(poi_ids_A)}, Y_T POIs: {len(poi_ids_T)}")
				# Use Y_A poi_ids as reference (from poi_tree)
				poi_ids = poi_ids_A
			else:
				poi_ids = poi_ids_A
			
			# Generate Y_T feature names
			Y_T_feature_names = self._get_Y_T_feature_names(Y_T_l.shape[1])
			
			# Concatenate Y_A and Y_T to form complete embedding
			Y_l = np.hstack([Y_A_l, Y_T_l])
			
			# Combined feature names
			all_feature_names = Y_A_feature_names + Y_T_feature_names
			
			# Print summary
			print(f"\n{'=' * 60}")
			print(f"Final POI Embedding Summary - Level {level} ({level_names.get(level, 'Unknown')})")
			print(f"{'=' * 60}")
			print(f"  Number of POIs: {len(poi_ids)}")
			print(f"  Y_A^{level} (Explicit) dimensions: {Y_A_l.shape[1]}")
			print(f"  Y_T^{level} (Derived) dimensions: {Y_T_l.shape[1]}")
			print(f"  Total embedding dimensions: {Y_l.shape[1]}")
			print(f"  Final shape: {Y_l.shape}")
			
			# Store embeddings with all metadata
			self.poi_embeddings[f'level_{level}'] = {
				'embeddings': Y_l,
				'poi_ids': poi_ids,
				'Y_A': Y_A_l,
				'Y_T': Y_T_l,
				'Y_A_feature_names': Y_A_feature_names,
				'Y_T_feature_names': Y_T_feature_names,
				'all_feature_names': all_feature_names,
				'n_explicit_features': Y_A_l.shape[1],
				'n_derived_features': Y_T_l.shape[1],
				'level_name': level_names.get(level, 'Unknown')
			}

	def _get_Y_T_feature_names(self, n_features: int) -> List[str]:
			"""
			Generate feature names for Y_T matrix based on the build_Y_T_level structure
			
			Args:
					n_features: Total number of Y_T features
			
			Returns:
					List of feature names
			"""
			feature_names = []
			
			# Get user preference feature names if available
			if 'user_interests_encoder' in self.encoders:
					interest_classes = self.encoders['user_interests_encoder'].classes_
					feature_names.extend([f'agg_user_interest_{cls}' for cls in interest_classes])
			
			if 'user_age_encoder' in self.encoders:
					age_classes = self.encoders['user_age_encoder'].classes_
					feature_names.extend([f'agg_user_age_{cls}' for cls in age_classes])
			
			if 'user_price_encoder' in self.encoders:
					price_classes = self.encoders['user_price_encoder'].classes_
					feature_names.extend([f'agg_user_price_sens_{cls}' for cls in price_classes])
			
			# Diversity features
			feature_names.extend(['user_count', 'interaction_variance', 'user_pref_diversity'])
			
			# Interaction pattern features
			feature_names.extend(['visit_ratio', 'avg_rating', 'search_to_visit_ratio', 'repeat_visitor_ratio'])
			
			# Latent features (fill remaining)
			current_count = len(feature_names)
			latent_count = n_features - current_count
			if latent_count > 0:
					feature_names.extend([f'latent_{i}' for i in range(latent_count)])
			
			# Truncate if we have too many names
			return feature_names[:n_features]
		
	def save_embeddings(self, output_file: str = 'poi_embeddings.pkl'):
		print(f"\nSaving embeddings to: {output_file}")
		
		# Build embedding dimensions dict separately to avoid variable shadowing
		embedding_dims = {}
		for level_key, level_data in self.poi_embeddings.items():
				embedding_dims[level_key] = {
						'total': level_data['embeddings'].shape[1],
						'explicit': level_data['n_explicit_features'],
						'derived': level_data['n_derived_features']
				}
		
		# Build POI ID to index mappings
		poi_id_to_idx = {}
		for level_key, level_data in self.poi_embeddings.items():
				poi_id_to_idx[level_key] = {
						pid: idx for idx, pid in enumerate(level_data['poi_ids'])
				}
		
		# Build feature name mappings
		feature_names_map = {}
		for level_key, level_data in self.poi_embeddings.items():
				feature_names_map[level_key] = {
						'all': level_data['all_feature_names'],
						'explicit': level_data['Y_A_feature_names'],
						'derived': level_data['Y_T_feature_names']
				}
		
		save_data = {
				# POI embeddings for all levels
				'poi_embeddings': self.poi_embeddings,
				
				# Encoders and scalers for inference
				'encoders': self.encoders,
				
				# POI tree structure (for parent lookups during inference)
				'poi_tree': self.poi_tree,
				
				# Metadata
				'metadata': {
						'levels': list(self.poi_embeddings.keys()),
						'n_users': len(self.users_df),
						'n_interactions': len(self.interactions_df),
						'created_at': pd.Timestamp.now().isoformat(),
						'embedding_dimensions': embedding_dims
				},
				
				# POI ID to index mappings for fast lookup
				'poi_id_to_idx': poi_id_to_idx,
				
				# Feature name mappings
				'feature_names': feature_names_map
		}
		
		with open(output_file, 'wb') as f:
				pickle.dump(save_data, f)
	
		# Print save summary
		file_size = os.path.getsize(output_file) / (1024 * 1024)  # MB
		print(f"  File size: {file_size:.2f} MB")
		print(f"  Levels saved: {list(self.poi_embeddings.keys())}")
		print(f"  Encoders saved: {list(self.encoders.keys())}")
		print("  Save complete!")


	def load_embeddings(self, input_file: str = 'poi_embeddings.pkl'):
		print(f"\nLoading embeddings from: {input_file}")
		
		with open(input_file, 'rb') as f:
			data = pickle.load(f)
		
		self.poi_embeddings = data['poi_embeddings']
		self.encoders = data['encoders']
		self.poi_tree = data['poi_tree']
		
		print(f"  Loaded {len(self.poi_embeddings)} levels")
		print(f"  Loaded {len(self.encoders)} encoders")
		print(f"  Created at: {data['metadata']['created_at']}")
		
		return data

In [None]:
# Driver cell: build POI embeddings for all four tree levels, pickle them,
# then load the pickle back and print the per-level embedding widths.
if __name__ == "__main__":
    # Input paths are relative to the working directory the notebook runs in.
    user_preferences_file = "../../Sources/user_preferences.csv"
    user_poi_interactions_file = "../../Sources/user_poi_interactions.csv"
    poi_tree_file = "../../Sources/poi_tree_with_uuids.json"
    
    # The pickle is written next to the notebook (relative path).
    output_file = "poi_embeddings.pkl"

    # Initialize embedding generator (reads both CSVs and the tree JSON)
    learner = POIEmbeddings(
        users_file=user_preferences_file,
        user_poi_interactions_file=user_poi_interactions_file,
        poi_tree_file=poi_tree_file
    )
    
    # Build embeddings for all levels (0=building ... 3=region)
    learner.build_poi_embeddings(levels=[0, 1, 2, 3])
    
    # Save embeddings
    learner.save_embeddings(output_file)
    
    # Round trip: read the file just written. This also replaces learner's
    # poi_embeddings, encoders and poi_tree with the unpickled copies.
    loaded_data = learner.load_embeddings(output_file)
    
    # Summarise the column counts recorded in the saved metadata.
    print("\nEmbedding dimensions per level:")
    for level_key, dims in loaded_data['metadata']['embedding_dimensions'].items():
        print(f"  {level_key}: {dims['total']} total ({dims['explicit']} explicit + {dims['derived']} derived)")


Building Complete POI Embeddings (All Levels)

############################################################
### Processing Level 0: Building ###
############################################################

Building Y_A^0: Direct POI Attribute Matrix (Level 0)
Granularity: Building
Number of nodes at level 0: 4696
  [1] Spatial features: 2 dimensions (lat, lon normalized)
  [2] Category features: 23 dimensions (one-hot)
  [3] Price feature: 1 dimension (normalized)
      Range: [4.20, 57.15], mean: 29.99
  [4] Popularity feature: 1 dimension (normalized)
      Range: [1.00, 5.00], mean: 2.99
  [5] Characteristics features: 22 dimensions (multi-hot)
  [T] Textual features: 100 dimensions (TF-IDF)

Building Y_T^0: Inverse/Derived POI Attribute Matrix (Level 0)
Granularity: Building
Number of POIs at level 0: 4696
Number of users: 21

  [Step 1] Building user preference features...
    User preference matrix shape: (21, 33)
    Features: ['interest_arcades', 'interest_bars', 'interest_bo