In [None]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler

In [None]:
class UserEmbeddings:
	"""Build per-user embeddings from profile attributes and POI interactions.

	The final embedding ``X`` is the horizontal concatenation of two parts:

	* ``X_A`` - one-hot / multi-hot encoding of the ``age_group``,
	  ``interests``, ``transportation_modes`` and ``price_sensitivity``
	  columns of the users table.
	* ``X_T`` - standardized NMF factors of the user x POI interaction matrix.

	Rows of every matrix follow the row order of the users table. The fitted
	encoders, the NMF model, the scaler and the POI index mapping are kept on
	the instance so that ``save_state`` can persist them.
	"""

	# Score added to a (user, POI) cell by one interaction of the given type.
	VISIT_WEIGHT = 1.0
	SEARCH_WEIGHT = 0.3
	# A rating contributes ``value / RATING_SCALE``.
	# NOTE(review): presumably ratings are on a 0-5 scale - confirm with the data.
	RATING_SCALE = 5.0
	# Separator between labels in the multi-valued user columns.
	MULTI_VALUE_SEPARATOR = ';'

	def __init__(self,
			users_file: str,
			user_poi_interactions_file: str,
			poi_tree_file: str):
		"""Load the input tables and initialise all fitted state to ``None``.

		Args:
			users_file: CSV with one row per user.
			user_poi_interactions_file: CSV with one row per interaction.
			poi_tree_file: JSON file; its ``'level_0'`` mapping supplies the
				POI ids used as columns of the interaction matrix.
		"""
		self.users_df = pd.read_csv(users_file)
		self.interactions_df = pd.read_csv(user_poi_interactions_file)
		# JSON is UTF-8 by specification; do not depend on the platform default.
		with open(poi_tree_file, 'r', encoding='utf-8') as f:
			self.poi_tree = json.load(f)

		# Embedding matrices (filled by the build_* methods)
		self.X_A = None
		self.X_T = None
		self.X = None

		# Attribute encoders (fitted in build_X_A)
		self.age_encoder = None
		self.mlb_interests = None
		self.mlb_transport = None
		self.price_encoder = None

		# NMF components (fitted in build_X_T)
		self.X_T_model = None
		self.X_T_scaler = None
		self.poi_to_idx = None

	@staticmethod
	def _split_multi_value(cell, separator: str = ';') -> list:
		"""Split one multi-valued cell into stripped, non-empty labels.

		Missing cells (NaN / None) yield an empty list instead of raising, and
		empty tokens (e.g. from a trailing separator) are dropped so they do
		not become a spurious ``''`` label.
		"""
		if not isinstance(cell, str):
			if pd.isna(cell):
				return []
			cell = str(cell)
		tokens = (token.strip() for token in cell.split(separator))
		return [token for token in tokens if token]

	@staticmethod
	def _one_hot_column(column):
		"""Fit a LabelEncoder on ``column``; return ``(encoder, one-hot matrix)``."""
		encoder = LabelEncoder()
		codes = encoder.fit_transform(column)
		return encoder, np.eye(len(encoder.classes_))[codes]

	def _multi_hot_column(self, column):
		"""Fit a MultiLabelBinarizer on ``column``; return ``(binarizer, matrix)``."""
		label_sets = [
			self._split_multi_value(cell, self.MULTI_VALUE_SEPARATOR)
			for cell in column
		]
		binarizer = MultiLabelBinarizer()
		return binarizer, binarizer.fit_transform(label_sets)

	def build_X_A(self) -> np.ndarray:
		"""Encode the user attribute columns into the matrix ``X_A``.

		Fits and stores one encoder per attribute, then stacks the encoded
		blocks in the order: age group, interests, transportation modes,
		price sensitivity.

		Returns:
			Array with one row per row of the users table.
		"""
		print("\n" + "="*60)
		print("Building X_A: Direct User Attribute Matrix")
		print("="*60)

		features_list = []

		# ===== AGE ======
		self.age_encoder, age_onehot = self._one_hot_column(self.users_df['age_group'])
		features_list.append(age_onehot)
		print(f"Age groups: {len(self.age_encoder.classes_)} dimensions")

		# ===== INTEREST ======
		self.mlb_interests, interests_onehot = self._multi_hot_column(self.users_df['interests'])
		features_list.append(interests_onehot)
		print(f"Interests: {len(self.mlb_interests.classes_)} dimensions")

		# ===== TRANSPORTATION MODE ======
		self.mlb_transport, transport_onehot = self._multi_hot_column(self.users_df['transportation_modes'])
		features_list.append(transport_onehot)
		print(f"Transport modes: {len(self.mlb_transport.classes_)} dimensions")

		# ===== PRICE SENSITIVITY ======
		self.price_encoder, price_onehot = self._one_hot_column(self.users_df['price_sensitivity'])
		features_list.append(price_onehot)
		print(f"Price sensitivity: {len(self.price_encoder.classes_)} dimensions")

		self.X_A = np.hstack(features_list)
		print(f"\nX_A shape: {self.X_A.shape}")
		return self.X_A

	def _build_interaction_matrix(self, user_to_idx: dict, poi_to_idx: dict,
			n_users: int, n_pois: int) -> csr_matrix:
		"""Aggregate the interaction log into a sparse user x POI score matrix.

		Each interaction adds a weight to its (user, POI) cell: ``VISIT_WEIGHT``
		for ``'visit'``, ``SEARCH_WEIGHT`` for ``'search'`` and
		``value / RATING_SCALE`` for ``'rating'``. Rows are skipped when the
		user or POI is not in the given mappings, when the interaction type is
		not one of those three, or when a rating has a missing / non-numeric
		``value`` (which would otherwise inject NaN into the matrix).
		"""
		interactions = self.interactions_df
		kinds = interactions['interaction_type']

		# NaN marks "no contribution"; such rows are filtered out below.
		weights = np.full(len(interactions), np.nan, dtype=float)
		weights[kinds.eq('visit').to_numpy()] = self.VISIT_WEIGHT
		weights[kinds.eq('search').to_numpy()] = self.SEARCH_WEIGHT
		is_rating = kinds.eq('rating').to_numpy()
		if is_rating.any():
			# Only touch the 'value' column when there is a rating to score.
			rating_values = pd.to_numeric(interactions['value'], errors='coerce').to_numpy(dtype=float)
			weights[is_rating] = rating_values[is_rating] / self.RATING_SCALE

		# Ids absent from the mappings map to NaN and are filtered with the rest.
		rows = interactions['user_id'].map(user_to_idx).to_numpy(dtype=float)
		cols = interactions['poi_id'].map(poi_to_idx).to_numpy(dtype=float)
		keep = ~(np.isnan(rows) | np.isnan(cols) | np.isnan(weights))

		# Building CSR from (data, (row, col)) triplets sums duplicate cells,
		# which performs the per-(user, POI) accumulation.
		matrix = csr_matrix(
			(weights[keep], (rows[keep].astype(int), cols[keep].astype(int))),
			shape=(n_users, n_pois)
		)
		# Drop stored zeros (e.g. a rating of 0) so nnz reflects real signal.
		matrix.eliminate_zeros()
		return matrix

	def build_X_T(self, embedding_dim: int = 32) -> np.ndarray:
		"""Build standardized NMF user factors from the interaction log.

		Rows follow the order of the users table; columns of the underlying
		interaction matrix follow the key order of ``poi_tree['level_0']``.
		The fitted NMF model, the StandardScaler and the POI index mapping are
		stored on the instance.

		Args:
			embedding_dim: Number of NMF components (columns of ``X_T``).

		Returns:
			Array of shape ``(n_users, embedding_dim)``.

		Raises:
			ValueError: If there are no users or no POIs to factorise.
		"""
		print("\n" + "="*60)
		print("Building X_T: Inverse User Attribute Matrix")
		print("="*60)

		unique_users = self.users_df['uudi'].tolist()
		all_poi_ids = list(self.poi_tree['level_0'].keys())

		user_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
		self.poi_to_idx = {pid: idx for idx, pid in enumerate(all_poi_ids)}

		n_users = len(unique_users)
		n_pois = len(all_poi_ids)
		if n_users == 0 or n_pois == 0:
			raise ValueError(
				f"Cannot build X_T from an empty interaction matrix "
				f"({n_users} users, {n_pois} POIs)"
			)
		print(f"Building interaction matrix: {n_users} users × {n_pois} POIs")

		interaction_matrix = self._build_interaction_matrix(
			user_to_idx, self.poi_to_idx, n_users, n_pois
		)

		print(f"Matrix density: {interaction_matrix.nnz / (n_users * n_pois) * 100:.2f}%")

		# NMF
		print(f"Performing NMF (dim={embedding_dim})...")
		nmf = NMF(n_components=embedding_dim, init='random', random_state=42, max_iter=200)
		X_T = nmf.fit_transform(interaction_matrix)

		# Normalize and keep the scaler so it can be saved with the state
		self.X_T_scaler = StandardScaler()
		X_T = self.X_T_scaler.fit_transform(X_T)

		print(f"X_T shape: {X_T.shape}")
		print(f"Reconstruction error: {nmf.reconstruction_err_:.4f}")

		self.X_T = X_T
		self.X_T_model = nmf

		return X_T

	def build_user_embeddings(self, embedding_dim: int = 32) -> np.ndarray:
		"""Build ``X_A`` and ``X_T`` and store their concatenation as ``X``.

		Args:
			embedding_dim: Number of NMF components used for ``X_T``.

		Returns:
			Array with the ``X_A`` columns followed by the ``X_T`` columns.
		"""
		print("\n" + "="*60)
		print("Building Complete User Embeddings")
		print("="*60)

		X_A = self.build_X_A()
		X_T = self.build_X_T(embedding_dim=embedding_dim)

		# Concatenate and STORE
		self.X = np.hstack([X_A, X_T])
		print(f"\nFinal embedding shape: {self.X.shape}")
		print(f"  X_A: {X_A.shape[1]} dims | X_T: {X_T.shape[1]} dims")

		return self.X

	def save_embeddings_csv(self, output_file: str = '../Sources/Files v3/user_embeddings.csv') -> None:
		"""Write ``X`` to CSV, indexed by user id, with columns ``dim_<i>``.

		Missing parent directories of ``output_file`` are created.

		Raises:
			ValueError: If ``build_user_embeddings()`` has not been run.
		"""
		if self.X is None:
			raise ValueError("Run build_user_embeddings() first")

		df = pd.DataFrame(
			self.X,
			index=self.users_df['uudi'],
			columns=[f'dim_{i}' for i in range(self.X.shape[1])]
		)
		Path(output_file).parent.mkdir(parents=True, exist_ok=True)
		df.to_csv(output_file)
		print(f"Saved embeddings CSV to {output_file}")

	def save_state(self, output_file: str = '../Sources/Embeddings v3/user_embeddings.pkl') -> None:
		"""Pickle the matrices, encoders, NMF model, scaler, mappings and user ids.

		Attributes that have not been built yet are stored as ``None``.
		Missing parent directories of ``output_file`` are created.
		"""
		data = {
			# Matrices
			'X_A': self.X_A,
			'X_T': self.X_T,
			'X': self.X,

			# Encoders for new users
			'age_encoder': self.age_encoder,
			'mlb_interests': self.mlb_interests,
			'mlb_transport': self.mlb_transport,
			'price_encoder': self.price_encoder,

			# NMF components
			'nmf_model': self.X_T_model,
			'X_T_scaler': self.X_T_scaler,
			'poi_to_idx': self.poi_to_idx,

			# Metadata
			'user_ids': self.users_df['uudi'].tolist()
		}

		Path(output_file).parent.mkdir(parents=True, exist_ok=True)
		with open(output_file, 'wb') as f:
			pickle.dump(data, f)
		print(f"Saved state to {output_file}")


In [None]:
if __name__ == "__main__":
	# Input artefacts: user profiles, the interaction log and the POI tree.
	user_preferences_file = "../Sources/Files/user_preferences.csv"
	user_poi_interactions_file = "../Sources/Files/user_poi_interactions.csv"
	poi_tree_file = "../Sources/Files/poi_tree_with_uuids.json"

	# Constructor arguments are passed positionally in declaration order:
	# users file, interactions file, POI tree file.
	learner = UserEmbeddings(
		user_preferences_file,
		user_poi_interactions_file,
		poi_tree_file,
	)

	# Fit the encoders / NMF and assemble the full embedding matrix.
	learner.build_user_embeddings()

	# Persist the embeddings as CSV, then the full pickled state.
	learner.save_embeddings_csv('../Sources/Embeddings v3 csv/user_embeddings.csv')
	learner.save_state('../Sources/Embeddings v3/user_embeddings.pkl')


Building Complete User Embeddings

Building X_A: Direct User Attribute Matrix
Age groups: 5 dimensions
Interests: 25 dimensions
Transport modes: 6 dimensions
Price sensitivity: 3 dimensions

X_A shape: (21, 39)

Building X_T: Inverse User Attribute Matrix
Building interaction matrix: 21 users × 4696 POIs
Matrix density: 0.26%
Performing NMF (dim=32)...
X_T shape: (21, 32)
Reconstruction error: 0.0033

Final embedding shape: (21, 71)
  X_A: 39 dims | X_T: 32 dims
Saved embeddings CSV to ../Sources/Files/user_embeddings.csv
Saved state to ../Sources/Embeddings v3/user_embedding_state.pkl
