In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
import json
from scipy.sparse import csr_matrix
import pickle
from typing import Optional, List
from pathlib import Path

# Width of the interaction-based block X_T; build_user_embeddings() passes it
# to build_X_T() as `embedding_dim` (the number of NMF components).
FINAL_EMBEDDING_DIM = 32

# Width every final user embedding is padded/projected to by
# _align_embedding_dim(). Presumably the POI embeddings use the same width so
# the two can be compared directly -- TODO confirm against the POI pipeline.
TARGET_EMBEDDING_DIM = 64

In [3]:
class UserEmbeddings:
	"""Build, hold and persist one embedding vector per user.

	A user embedding is the horizontal concatenation of two blocks, aligned to
	TARGET_EMBEDDING_DIM columns by `_align_embedding_dim`:

	- X_A: one-hot / multi-hot encoding of the profile columns of the users CSV
	  (`age_group`, `interests`, `transportation_modes`, `price_sensitivity`).
	- X_T: standardized NMF factors of the user x level-0-POI interaction matrix.

	Every matrix produced by `build_user_embeddings` uses the row order returned
	by `_get_ordered_user_ids()` (metadata order when available, CSV order
	otherwise).
	"""

	def __init__(self,
			users_file: str,
			user_poi_interactions_file: str,
			poi_tree_file: str,
			metadata_pkl: Optional[str] = None):
		"""Load the input tables and the optional ordering metadata.

		Args:
			users_file: CSV with one row per user; must contain one of the id
				columns accepted by `_resolve_user_id_col`.
			user_poi_interactions_file: CSV of interactions; `build_X_T` reads
				its `user_id`, `poi_id`, `interaction_type` and `value` columns.
			poi_tree_file: JSON file; only its "level_0" mapping is used here.
			metadata_pkl: optional pickle with "user_to_idx", "idx_to_user" and
				"idx_to_poi" entries. Defaults to `metadata.pkl` next to
				`users_file`; silently ignored when the file does not exist.

		Raises:
			KeyError: if the users CSV has no recognised user id column.
		"""
		self.users_df = pd.read_csv(users_file)
		self.interactions_df = pd.read_csv(user_poi_interactions_file)

		with open(poi_tree_file, 'r', encoding='utf-8') as f:
			self.poi_tree = json.load(f)

		# Auto-detect user_id column (uuid / uudi / user_id)
		self.user_id_col = self._resolve_user_id_col()

		# Load metadata.pkl (use for ordering if present)
		self.meta_user_to_idx = {}
		self.meta_idx_to_user = {}
		self.meta_idx_to_poi = {}
		if metadata_pkl is None:
			# Best effort: fall back to "no metadata" if the sibling path
			# cannot be derived from users_file.
			try:
				metadata_pkl = str(Path(users_file).resolve().parent / "metadata.pkl")
			except Exception:
				metadata_pkl = None

		if metadata_pkl and Path(metadata_pkl).exists():
			# NOTE(security): pickle.load executes arbitrary code; only point
			# this at metadata files produced by this project.
			with open(metadata_pkl, "rb") as f:
				_meta = pickle.load(f)
			self.meta_user_to_idx = _meta.get("user_to_idx", {})
			self.meta_idx_to_user = _meta.get("idx_to_user", {})
			self.meta_idx_to_poi = _meta.get("idx_to_poi", {})

		self.user_embeddings = {}
		self.poi_embeddings = {}

	def _resolve_user_id_col(self) -> str:
		"""Return the first of 'uuid', 'uudi', 'user_id' present in the users table.

		Raises:
			KeyError: if none of the candidate columns exists.
		"""
		for col in ("uuid", "uudi", "user_id"):
			if col in self.users_df.columns:
				return col
		raise KeyError("Missing user id column: expected one of ['uuid','uudi','user_id']")

	def _get_ordered_user_ids(self) -> List[str]:
		"""Return the canonical row order of user ids.

		When metadata provides `idx_to_user`, its values are returned in
		ascending index order, restricted to ids that occur in the users table.
		If that yields nothing (or there is no metadata) the ids of the users
		table are returned in file order, cast to `str`.
		"""
		if self.meta_idx_to_user:
			idx2user = self.meta_idx_to_user
			# Build the membership set once instead of once per candidate id.
			known_ids = set(self.users_df[self.user_id_col].astype(str))
			ordered = [idx2user[i] for i in sorted(idx2user.keys()) if idx2user[i] in known_ids]
			if ordered:
				return ordered
		return self.users_df[self.user_id_col].astype(str).tolist()

	def _get_ordered_level0_poi_ids(self) -> List[str]:
		"""Return the level-0 POI ids, in metadata order when available.

		Metadata ids that are not keys of the tree's "level_0" mapping are
		dropped; without usable metadata the tree's own key order is used.
		"""
		lvl0 = self.poi_tree.get("level_0", {})
		if self.meta_idx_to_poi and 0 in self.meta_idx_to_poi:
			idx2poi = self.meta_idx_to_poi[0]
			ordered = [idx2poi[i] for i in sorted(idx2poi.keys()) if idx2poi[i] in lvl0]
			if ordered:
				return ordered
		return list(lvl0.keys())

	def _align_embedding_dim(self, X: np.ndarray, target_dim: int, label: str) -> np.ndarray:
		"""Project/pad embeddings to target_dim for user/poi alignment.

		Args:
			X: 2-D matrix, one row per entity.
			target_dim: required number of columns; `None` disables alignment.
			label: tag used in the progress messages.

		Returns:
			`X` itself when it already has `target_dim` columns (or
			`target_dim` is None); otherwise a new matrix with exactly
			`target_dim` columns: zero-padded on the right when too narrow,
			reduced by Gaussian random projection when there are at most
			`target_dim` rows, and by truncated SVD when there are more.
		"""
		if target_dim is None:
			return X
		n_samples, n_features = X.shape
		if n_features == target_dim:
			return X
		if n_features < target_dim:
			pad = np.zeros((n_samples, target_dim - n_features), dtype=X.dtype)
			print(f"[align] {label}: padding {n_features} -> {target_dim}")
			return np.hstack([X, pad])
		# n_features > target_dim: reduce
		if n_samples == 0:
			# No rows to fit a projection on; an empty matrix of the target
			# width still honours the "exactly target_dim columns" contract.
			print(f"[align] {label}: empty input, returning 0 x {target_dim}")
			return np.zeros((0, target_dim), dtype=X.dtype)
		if n_samples <= target_dim:
			# Too few rows for an SVD with target_dim components. A random
			# projection only depends on the column count, so it also covers
			# the single-row case (previously returned un-projected, i.e. with
			# the wrong width).
			print(f"[align] {label}: random projection {n_features} -> {target_dim}")
			rp = GaussianRandomProjection(n_components=target_dim, random_state=42)
			return rp.fit_transform(X)
		print(f"[align] {label}: SVD projection {n_features} -> {target_dim}")
		svd = TruncatedSVD(n_components=target_dim, random_state=42)
		return svd.fit_transform(X)

	@staticmethod
	def _split_multi_value(cell) -> List[str]:
		"""Split a ';'-separated cell into stripped tokens; a missing cell gives []."""
		if pd.isna(cell):
			return []
		return [token.strip() for token in str(cell).split(';')]

	def _align_rows_to_user_order(self, X: np.ndarray, user_ids: List[str]) -> np.ndarray:
		"""Reorder rows of a users-table-ordered matrix to follow `user_ids`.

		`X` must have one row per row of the users table, in file order. The
		result has one row per entry of `user_ids`; when an id occurs several
		times in the users table its first row is used.
		"""
		file_ids = self.users_df[self.user_id_col].astype(str).tolist()
		if file_ids == list(user_ids):
			return X
		first_row = {}
		for pos, uid in enumerate(file_ids):
			first_row.setdefault(uid, pos)
		picks = np.array([first_row[str(uid)] for uid in user_ids], dtype=np.intp)
		return X[picks]

	def build_X_A(self) -> np.ndarray:
		"""
		Features from user profile:
		- age_group (one-hot)
		- interests (multi-hot, ';'-separated)
		- transportation_modes (multi-hot, ';'-separated)
		- price_sensitivity (one-hot)

		Rows follow the order of the users table. The generated column names
		are kept in `self.X_A_feature_names`.
		"""

		print("\n" + "="*60)
		print("Building X_A: Direct User Attribute Matrix")
		print("="*60)

		features_list = []
		feature_names = []

		age_encoder = LabelEncoder()
		age_encoded = age_encoder.fit_transform(self.users_df['age_group'])
		age_onehot = np.eye(len(age_encoder.classes_))[age_encoded]
		features_list.append(age_onehot)
		feature_names.extend([f'age_group_{cls}' for cls in age_encoder.classes_])
		print(f"Added age_group features: {len(age_encoder.classes_)} dimensions")

		# Missing cells contribute no tokens instead of raising on `.split`.
		interests_list = [self._split_multi_value(cell) for cell in self.users_df['interests']]
		mlb_interests = MultiLabelBinarizer()
		interests_onehot = mlb_interests.fit_transform(interests_list)
		features_list.append(interests_onehot)
		feature_names.extend([f'interest_{cls}' for cls in mlb_interests.classes_])
		print(f"Added interests features: {len(mlb_interests.classes_)} dimensions")

		transport_list = [self._split_multi_value(cell) for cell in self.users_df['transportation_modes']]
		mlb_transport = MultiLabelBinarizer()
		transport_onehot = mlb_transport.fit_transform(transport_list)
		features_list.append(transport_onehot)
		feature_names.extend([f'transport_{cls}' for cls in mlb_transport.classes_])
		print(f"Added transportation_modes features: {len(mlb_transport.classes_)} dimensions")

		price_encoder = LabelEncoder()
		price_encoded = price_encoder.fit_transform(self.users_df['price_sensitivity'])
		price_onehot = np.eye(len(price_encoder.classes_))[price_encoded]
		features_list.append(price_onehot)
		feature_names.extend([f'price_{cls}' for cls in price_encoder.classes_])
		print(f"Added price_sensitivity features: {len(price_encoder.classes_)} dimensions")

		X_A = np.hstack(features_list)

		print(f"\nX_A shape: {X_A.shape}")
		print(f"Total features: {len(feature_names)}")

		self.X_A_feature_names = feature_names

		return X_A

	def _build_interaction_matrix(self, user_to_idx: dict, poi_to_idx: dict, shape: tuple) -> csr_matrix:
		"""Aggregate the interaction log into a sparse user x POI score matrix.

		Each row of the interactions table adds to the (user, poi) cell:
		1.0 for a 'visit', `value / 5.0` for a 'rating', 0.3 for a 'search'.
		Rows with an unknown user, unknown POI, another interaction type, or a
		rating without a value are ignored.

		Args:
			user_to_idx: user id (str) -> row index.
			poi_to_idx: POI id (str) -> column index.
			shape: (n_users, n_pois) of the resulting matrix.
		"""
		scores = {}
		for rec in self.interactions_df.to_dict('records'):
			# Both lookup tables are keyed by str, so normalise the ids read
			# from the CSV (pandas may have parsed them as numbers).
			user_id = str(rec['user_id'])
			poi_id = str(rec['poi_id'])
			if user_id not in user_to_idx or poi_id not in poi_to_idx:
				continue

			kind = rec['interaction_type']
			if kind == 'visit':
				weight = 1.0
			elif kind == 'rating':
				rating = rec['value']
				if pd.isna(rating):
					# A NaN score would poison the whole factorization.
					continue
				# Normalize rating to 0-1 scale
				weight = rating / 5.0
			elif kind == 'search':
				weight = 0.3
			else:
				continue

			cell = (user_to_idx[user_id], poi_to_idx[poi_id])
			scores[cell] = scores.get(cell, 0) + weight

		rows = np.array([cell[0] for cell in scores], dtype=np.int64)
		cols = np.array([cell[1] for cell in scores], dtype=np.int64)
		vals = np.array(list(scores.values()), dtype=float)
		return csr_matrix((vals, (rows, cols)), shape=shape)

	def build_X_T(self, embedding_dim: int = 64) -> np.ndarray:
		"""
		Source 1: Sources/Files/user_poi_interactions.csv
		- user_id, poi_id, interaction_type, value
		Source 2: Sources/Files/poi_tree_with_uuids.json (level 0)
		- only the level-0 POI ids are used here

		Factorizes the user x POI score matrix with NMF (`embedding_dim`
		components) and standardizes the user factors column-wise. Rows follow
		`_get_ordered_user_ids()`. Also sets `self.X_T`, `self.X_T_model` and
		`self.poi_to_idx`.
		"""

		print("\n" + "="*60)
		print("Building X_T: Inverse User Attribute Matrix")
		print("="*60)

		# Get unique users and POIs (prefer metadata ordering)
		unique_users = self._get_ordered_user_ids()

		# Get all level 0 POI IDs from tree (prefer metadata ordering)
		all_poi_ids = self._get_ordered_level0_poi_ids()

		user_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
		poi_to_idx = {pid: idx for idx, pid in enumerate(all_poi_ids)}

		n_users = len(unique_users)
		n_pois = len(all_poi_ids)

		print(f"Building interaction matrix: {n_users} users × {n_pois} POIs")

		# Aggregate interactions (visits + weighted ratings + searches)
		interaction_matrix = self._build_interaction_matrix(user_to_idx, poi_to_idx, (n_users, n_pois))

		total_cells = n_users * n_pois
		density = interaction_matrix.nnz / total_cells * 100 if total_cells else 0.0
		print(f"Interaction matrix density: {density:.2f}%")

		# Matrix factorization to learn latent user features
		# Using NMF (Non-negative Matrix Factorization)
		print(f"Performing matrix factorization (embedding_dim={embedding_dim})...")

		nmf = NMF(n_components=embedding_dim, init='random', random_state=42, max_iter=200)
		X_T = nmf.fit_transform(interaction_matrix)

		# Normalize
		scaler = StandardScaler()
		X_T = scaler.fit_transform(X_T)

		print(f"X_T shape: {X_T.shape}")
		print(f"Reconstruction error: {nmf.reconstruction_err_:.4f}")

		self.X_T = X_T
		self.X_T_model = nmf
		self.poi_to_idx = poi_to_idx

		return X_T

	def build_user_embeddings(self) -> np.ndarray:
		"""Build X_A and X_T, concatenate them and align to TARGET_EMBEDDING_DIM.

		Returns the final matrix `X` (one row per id of
		`_get_ordered_user_ids()`) and stores `X`, `X_A`, `X_T`, `user_ids`,
		`user_id_to_idx`, `idx_to_user` and `user_embeddings` on the instance.
		"""
		print("\n" + "="*60)
		print("Building Complete User Embeddings")
		print("="*60)

		user_ids = self._get_ordered_user_ids()

		# build_X_A works in users-table order while build_X_T works in
		# `user_ids` order; bring X_A to the same order (and the same subset
		# of users) before concatenating, otherwise rows of different users
		# would be glued together.
		X_A = self._align_rows_to_user_order(self.build_X_A(), user_ids)
		X_T = self.build_X_T(embedding_dim=FINAL_EMBEDDING_DIM)

		# Concatenate
		X = np.hstack([X_A, X_T])
		X = self._align_embedding_dim(X, TARGET_EMBEDDING_DIM, "user")

		# store for saving
		self.X_A = X_A
		self.X_T = X_T

		print(f"\nFinal user embedding shape: {X.shape}")
		print(f"  X_A dimensions: {X_A.shape[1]}")
		print(f"  X_T dimensions: {X_T.shape[1]}")
		print(f"  Total dimensions: {X.shape[1]}")

		self.X = X

		# Store user embeddings in dictionary (metadata ordering)
		self.user_ids = user_ids
		self.user_id_to_idx = {uid: i for i, uid in enumerate(self.user_ids)}
		self.idx_to_user = {i: uid for i, uid in enumerate(self.user_ids)}

		# Rebuild rather than update so a re-run leaves no stale users behind.
		self.user_embeddings = {user_id: X[idx] for idx, user_id in enumerate(self.user_ids)}

		return X

	def save_embeddings(self, output_file: str = 'user_embeddings.pkl'):
		"""Pickle the embeddings and their index maps to `output_file`.

		If no POI embeddings are held and `output_file` is an absolute path,
		`<grandparent of output_file>/Sources/poi_embeddings.pkl` is loaded
		first (when it exists) so that its "poi_embeddings" entry is saved
		alongside the user data.
		"""
		# If poi_embeddings is empty, try loading from Sources/poi_embeddings.pkl
		if not self.poi_embeddings:
			root = Path(output_file).resolve().parent.parent if Path(output_file).is_absolute() else None
			if root is not None:
				cand = root / "Sources" / "poi_embeddings.pkl"
				if cand.exists():
					# NOTE(security): pickle.load executes arbitrary code;
					# the file must come from a trusted source.
					with cand.open("rb") as f:
						poi_data = pickle.load(f)
					self.poi_embeddings = poi_data.get("poi_embeddings", {})

		data = {
			'user_embeddings': self.user_embeddings,
			'user_id_to_idx': getattr(self, 'user_id_to_idx', {}),
			'idx_to_user': getattr(self, 'idx_to_user', {}),
			'X': getattr(self, 'X', None),
			'X_A': getattr(self, 'X_A', None),
			'X_T': getattr(self, 'X_T', None),
			'user_id_col': self.user_id_col,
			'poi_embeddings': self.poi_embeddings,
		}

		with open(output_file, 'wb') as f:
			pickle.dump(data, f, protocol=4)

	def load_embeddings(self, input_file: str = 'user_embeddings.pkl'):
		"""Restore everything `save_embeddings` wrote from `input_file`.

		Missing entries fall back to empty dicts / None; `user_id_col` keeps
		its current value when the file does not carry one. `user_ids` is
		rebuilt from `idx_to_user` in ascending index order.
		"""
		# NOTE(security): pickle.load executes arbitrary code; only load files
		# written by save_embeddings from a trusted location.
		with open(input_file, 'rb') as f:
			data = pickle.load(f)

		self.user_embeddings = data.get('user_embeddings', {})
		self.poi_embeddings = data.get('poi_embeddings', {})
		self.user_id_to_idx = data.get('user_id_to_idx', {})
		self.idx_to_user = data.get('idx_to_user', {})
		self.user_ids = [self.idx_to_user[i] for i in sorted(self.idx_to_user)]
		self.X = data.get('X', None)
		# X_A / X_T are written by save_embeddings, so restore them as well.
		self.X_A = data.get('X_A', None)
		self.X_T = data.get('X_T', None)
		self.user_id_col = data.get('user_id_col', getattr(self, 'user_id_col', None))


In [None]:
from pathlib import Path


def find_repo_root(start=None) -> Path:
	"""Locate the closest enclosing directory that contains a `.git` entry.

	Args:
		start: directory to begin the search from; the current working
			directory is used when it is omitted (or falsy).

	Returns:
		The resolved `start` itself or its nearest ancestor holding `.git`.

	Raises:
		RuntimeError: when neither `start` nor any ancestor contains `.git`.
	"""
	origin = Path(start or Path.cwd()).resolve()
	# Walk from the starting directory outwards and stop at the first match.
	repo_dir = next(
		(folder for folder in (origin, *origin.parents) if (folder / ".git").exists()),
		None,
	)
	if repo_dir is None:
		raise RuntimeError(f"Cannot find repo root from {origin}")
	return repo_dir


if __name__ == "__main__":
	# All inputs and outputs live under <repo root>/Sources.
	ROOT = find_repo_root()
	SOURCES = ROOT.joinpath("Sources")
	DATA_DIR = SOURCES.joinpath("Files")

	user_preferences_file = DATA_DIR.joinpath("user_preferences.csv")
	user_poi_interactions_file = DATA_DIR.joinpath("user_poi_interactions.csv")
	poi_tree_file = DATA_DIR.joinpath("poi_tree_with_uuids.json")

	# Echo each input path together with whether it exists, so a wrong
	# checkout layout is visible before the loaders fail.
	print(
		"user_preferences_file:",
		user_preferences_file,
		user_preferences_file.exists(),
	)
	print(
		"user_poi_interactions_file:",
		user_poi_interactions_file,
		user_poi_interactions_file.exists(),
	)
	print(
		"poi_tree_file:",
		poi_tree_file,
		poi_tree_file.exists(),
	)

	# The class takes plain string paths; metadata.pkl is auto-discovered
	# next to the users file.
	learner = UserEmbeddings(
		str(user_preferences_file),
		str(user_poi_interactions_file),
		str(poi_tree_file),
	)

	# Compute the embeddings, then persist them under the Sources folder.
	learner.build_user_embeddings()

	out_pkl = SOURCES.joinpath("user_embeddings.pkl")
	learner.save_embeddings(output_file=str(out_pkl))
	print("Saved user embeddings to:", out_pkl)

user_preferences_file: C:\Users\syoon\SpatiaLynk_recommender\Sources\Files\user_preferences.csv True
user_poi_interactions_file: C:\Users\syoon\SpatiaLynk_recommender\Sources\Files\user_poi_interactions.csv True
poi_tree_file: C:\Users\syoon\SpatiaLynk_recommender\Sources\Files\poi_tree_with_uuids.json True

Building Complete User Embeddings

Building X_A: Direct User Attribute Matrix
Added interests features: 25 dimensions
Added transportation_modes features: 6 dimensions
Added price_sensitivity features: 3 dimensions

X_A shape: (21, 39)
Total features: 39

Building X_T: Inverse User Attribute Matrix
Building interaction matrix: 21 users × 4696 POIs
Interaction matrix density: 0.26%
Performing matrix factorization (embedding_dim=32)...
X_T shape: (21, 32)
Reconstruction error: 0.0033
[align] user: random projection 71 -> 64

Final user embedding shape: (21, 64)
  X_A dimensions: 39
  X_T dimensions: 32
  Total dimensions: 64
Saved user embeddings to: C:\Users\syoon\SpatiaLynk_recomme