In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import NMF
import json
from scipy.sparse import csr_matrix
import pickle

In [None]:
class UserEmbeddigns:
	"""Build, persist and reload per-user embedding vectors.

	A user's embedding is the horizontal concatenation of two matrices:

	- X_A: one-hot / multi-hot encoding of the profile columns
	  ``age_group``, ``interests``, ``transportation_modes`` and
	  ``price_sensitivity`` of the users CSV.
	- X_T: standardised NMF factors of the user x POI interaction-score matrix.

	NOTE(review): the class name is misspelled ("Embeddigns"). It is kept
	unchanged because callers reference it by this name.
	"""

	def __init__(self,
			users_file: str,
			user_poi_interactions_file: str,
			poi_tree_file: str):
		"""Load the three input files into memory.

		Args:
			users_file: CSV with one row per user. The build methods read the
				columns ``uudi``, ``age_group``, ``interests``,
				``transportation_modes`` and ``price_sensitivity``.
			user_poi_interactions_file: CSV of interactions. ``build_X_T`` reads
				``user_id``, ``poi_id``, ``interaction_type`` and, for ratings,
				``value``.
			poi_tree_file: JSON file; ``build_X_T`` uses the keys of its
				``level_0`` object as the POI ids.
		"""
		self.users_df = pd.read_csv(users_file)
		self.interactions_df = pd.read_csv(user_poi_interactions_file)

		# Explicit UTF-8 so the result does not depend on the platform's default encoding.
		with open(poi_tree_file, 'r', encoding='utf-8') as f:
			self.poi_tree = json.load(f)

		self.user_embeddings = {}
		self.poi_embeddings = {}
		# Populated by the build methods (or by load_embeddings).
		self.user_id_to_idx = {}
		self.feature_names = []
		self.X = None

	@staticmethod
	def _split_multi(cell) -> list:
		"""Split a ';'-separated cell into stripped tokens; a missing cell yields []."""
		if not isinstance(cell, str):
			# pd.read_csv returns NaN (a float) for empty cells, which has no .split().
			if pd.isna(cell):
				return []
			cell = str(cell)
		return [token.strip() for token in cell.split(';')]

	@staticmethod
	def _label_one_hot(column, prefix: str):
		"""One-hot encode a single-valued column; returns (matrix, feature names)."""
		encoder = LabelEncoder()
		codes = encoder.fit_transform(column)
		# Row i of the identity matrix is the one-hot vector of class i.
		matrix = np.eye(len(encoder.classes_))[codes]
		return matrix, [f'{prefix}_{label}' for label in encoder.classes_]

	@classmethod
	def _multi_hot(cls, column, prefix: str):
		"""Multi-hot encode a ';'-separated column; returns (matrix, feature names)."""
		token_lists = [cls._split_multi(cell) for cell in column]
		binarizer = MultiLabelBinarizer()
		matrix = binarizer.fit_transform(token_lists)
		return matrix, [f'{prefix}_{label}' for label in binarizer.classes_]

	def build_X_A(self) -> np.ndarray:
		"""Build the direct user-attribute matrix from the user profile columns.

		Features, in column order:
		- age_group (one-hot)
		- interests (multi-hot, ';'-separated in the CSV)
		- transportation_modes (multi-hot, ';'-separated in the CSV)
		- price_sensitivity (one-hot)

		A missing ``interests`` / ``transportation_modes`` cell contributes no
		active label instead of raising. The feature names are kept in
		``self.feature_names``.

		Returns:
			Array with one row per row of the users CSV.
		"""

		print("\n" + "="*60)
		print("Building X_A: Direct User Attribute Matrix")
		print("="*60)

		features_list = []
		feature_names = []

		age_onehot, age_names = self._label_one_hot(self.users_df['age_group'], 'age_group')
		features_list.append(age_onehot)
		feature_names.extend(age_names)

		interests_onehot, interest_names = self._multi_hot(self.users_df['interests'], 'interest')
		features_list.append(interests_onehot)
		feature_names.extend(interest_names)
		print(f"Added interests features: {len(interest_names)} dimensions")

		transport_onehot, transport_names = self._multi_hot(self.users_df['transportation_modes'], 'transport')
		features_list.append(transport_onehot)
		feature_names.extend(transport_names)
		print(f"Added transportation_modes features: {len(transport_names)} dimensions")

		price_onehot, price_names = self._label_one_hot(self.users_df['price_sensitivity'], 'price')
		features_list.append(price_onehot)
		feature_names.extend(price_names)
		print(f"Added price_sensitivity features: {len(price_names)} dimensions")

		X_A = np.hstack(features_list)

		print(f"\nX_A shape: {X_A.shape}")
		print(f"Total features: {len(feature_names)}")

		self.feature_names = feature_names

		return X_A

	def build_X_T(self, embedding_dim: int = 32) -> np.ndarray:
		"""Build latent user features by factorising the user x POI score matrix.

		Source 1: the interactions CSV
		- user_id, poi_id, interaction_type, value
		Source 2: the POI tree JSON
		- only the keys of ``level_0`` are used, as the POI id universe

		Each (user, POI) pair accumulates a score: +1.0 per ``visit``,
		+value/5.0 per ``rating`` and +0.3 per ``search``. Rows whose user or
		POI is unknown, or whose interaction type is none of these, are
		ignored. The score matrix is factorised with NMF and the user factors
		are standardised column-wise.

		Side effects: sets ``self.X_T``, ``self.X_T_model``, ``self.poi_to_idx``
		and ``self.user_id_to_idx``.

		Args:
			embedding_dim: number of NMF components.

		Returns:
			Array of shape (number of users, embedding_dim).

		Raises:
			ValueError: if there are no users or no level-0 POIs.
		"""

		print("\n" + "="*60)
		print("Building X_T: Inverse User Attribute Matrix")
		print("="*60)

		# Get unique users and POIs.
		# NOTE(review): 'uudi' looks like a typo for 'uuid' but has to match the
		# CSV header, so it is used as is.
		unique_users = self.users_df['uudi'].tolist()

		# Get all level 0 POI IDs from tree
		all_poi_ids = list(self.poi_tree['level_0'].keys())

		user_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
		poi_to_idx = {pid: idx for idx, pid in enumerate(all_poi_ids)}

		n_users = len(unique_users)
		n_pois = len(all_poi_ids)

		if n_users == 0 or n_pois == 0:
			# Fail with a clear message instead of a ZeroDivisionError further down.
			raise ValueError(
				f"Cannot build interaction matrix with {n_users} users and {n_pois} POIs"
			)

		print(f"Building interaction matrix: {n_users} users × {n_pois} POIs")

		# Aggregate interactions (visits + weighted ratings + searches).
		fixed_weights = {'visit': 1.0, 'search': 0.3}
		user_poi_scores = {}

		# itertuples avoids building a Series per row, unlike iterrows.
		for row in self.interactions_df.itertuples(index=False):
			user_id = row.user_id
			poi_id = row.poi_id

			if user_id not in user_to_idx or poi_id not in poi_to_idx:
				continue

			kind = row.interaction_type
			if kind == 'rating':
				# Normalize rating to 0-1 scale (assumes ratings are on a 0-5 scale).
				increment = row.value / 5.0
			elif kind in fixed_weights:
				increment = fixed_weights[kind]
			else:
				continue

			key = (user_id, poi_id)
			user_poi_scores[key] = user_poi_scores.get(key, 0) + increment

		# Build sparse matrix
		row_indices = [user_to_idx[user_id] for user_id, _ in user_poi_scores]
		col_indices = [poi_to_idx[poi_id] for _, poi_id in user_poi_scores]
		values = list(user_poi_scores.values())

		interaction_matrix = csr_matrix(
			(values, (row_indices, col_indices)),
			shape=(n_users, n_pois)
		)

		print(f"Interaction matrix density: {interaction_matrix.nnz / (n_users * n_pois) * 100:.2f}%")

		# Matrix factorization to learn latent user features
		# Using NMF (Non-negative Matrix Factorization)
		print(f"Performing matrix factorization (embedding_dim={embedding_dim})...")

		nmf = NMF(n_components=embedding_dim, init='random', random_state=42, max_iter=200)
		X_T = nmf.fit_transform(interaction_matrix)

		# Normalize
		scaler = StandardScaler()
		X_T = scaler.fit_transform(X_T)

		print(f"X_T shape: {X_T.shape}")
		print(f"Reconstruction error: {nmf.reconstruction_err_:.4f}")

		self.X_T = X_T
		self.X_T_model = nmf
		self.poi_to_idx = poi_to_idx
		self.user_id_to_idx = user_to_idx

		return X_T

	def build_user_embeddings(self, embedding_dim: int = 32) -> np.ndarray:
		"""Build X = [X_A | X_T] and index its rows by user id.

		Side effects: sets ``self.X`` and rebuilds ``self.user_embeddings``
		from scratch, so entries from an earlier build or load do not linger.

		Args:
			embedding_dim: number of NMF components passed to ``build_X_T``.

		Returns:
			The concatenated embedding matrix, one row per user.
		"""
		print("\n" + "="*60)
		print("Building Complete User Embeddings")
		print("="*60)

		X_A = self.build_X_A()
		X_T = self.build_X_T(embedding_dim=embedding_dim)

		# Concatenate
		X = np.hstack([X_A, X_T])

		print(f"\nFinal user embedding shape: {X.shape}")
		print(f"  X_A dimensions: {X_A.shape[1]}")
		print(f"  X_T dimensions: {X_T.shape[1]}")
		print(f"  Total dimensions: {X.shape[1]}")

		self.X = X

		# Store user embeddings in dictionary (row order of the users CSV).
		self.user_embeddings = {
			user_id: X[idx] for idx, user_id in enumerate(self.users_df['uudi'])
		}

		return X

	def save_embeddings(self, output_file: str = 'user_embeddings.pkl'):
		"""Pickle the embeddings together with everything load_embeddings restores.

		Args:
			output_file: path of the pickle file to write.
		"""
		data = {
			'user_embeddings': self.user_embeddings,
			'poi_embeddings': self.poi_embeddings,
			# getattr keeps saving usable on instances that have not built these yet.
			'user_id_to_idx': getattr(self, 'user_id_to_idx', {}),
			'X': getattr(self, 'X', None),
		}

		with open(output_file, 'wb') as f:
			pickle.dump(data, f)

	def load_embeddings(self, input_file: str = 'user_embeddings.pkl'):
		"""Restore embeddings written by save_embeddings.

		Files that only hold ``user_embeddings`` and ``poi_embeddings`` are
		accepted too: ``user_id_to_idx`` and ``X`` are then rebuilt from the
		insertion order of ``user_embeddings``.

		SECURITY: unpickling can execute arbitrary code. Only load files that
		come from a trusted source.

		Args:
			input_file: path of the pickle file to read.
		"""
		with open(input_file, 'rb') as f:
			data = pickle.load(f)  # trusted input only, see docstring

		self.user_embeddings = data['user_embeddings']
		self.poi_embeddings = data['poi_embeddings']

		if 'user_id_to_idx' in data:
			self.user_id_to_idx = data['user_id_to_idx']
		else:
			# The dict was filled in row order, so its key order gives the row index.
			self.user_id_to_idx = {
				user_id: idx for idx, user_id in enumerate(self.user_embeddings)
			}

		if 'X' in data:
			self.X = data['X']
		elif self.user_embeddings:
			self.X = np.vstack(list(self.user_embeddings.values()))
		else:
			self.X = None


In [None]:
if __name__ == "__main__":
	# Input files, given relative to the directory the notebook runs in.
	poi_tree_file = "../../Sources/poi_tree_with_uuids.json"
	user_poi_interactions_file = "../../Sources/user_poi_interactions.csv"
	user_preferences_file = "../../Sources/user_preferences.csv"

	# Arguments in constructor order: users, interactions, POI tree.
	learner = UserEmbeddigns(
		user_preferences_file,
		user_poi_interactions_file,
		poi_tree_file,
	)

	# Compute the embeddings, then write them to disk.
	learner.build_user_embeddings()
	learner.save_embeddings("user_embeddings.pkl")


Building Complete User Embeddings

Building X_A: Direct User Attribute Matrix
Added interests features: 25 dimensions
Added transportation_modes features: 6 dimensions
Added price_sensitivity features: 3 dimensions

X_A shape: (21, 39)
Total features: 39

Building X_T: Inverse User Attribute Matrix
Building interaction matrix: 21 users × 4696 POIs
Interaction matrix density: 0.26%
Performing matrix factorization (embedding_dim=32)...
X_T shape: (21, 32)
Reconstruction error: 0.0033

Final user embedding shape: (21, 71)
  X_A dimensions: 39
  X_T dimensions: 32
  Total dimensions: 71
