In [3]:
import ast
import json
import pickle
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler

In [4]:
class AttributeBasedRepresentationLearning:
    def __init__(self, 
                 users_file: str,
                 interactions_file: str,
                 poi_tree_file: str):
        """
        Initialize representation learning
        
        Args:
            users_file: Path to user preferences CSV
            interactions_file: Path to user-POI interactions CSV
            poi_tree_file: Path to POI tree JSON
        """
        self.users_df = pd.read_csv(users_file)
        self.interactions_df = pd.read_csv(interactions_file)
        
        with open(poi_tree_file, 'r') as f:
            self.poi_tree = json.load(f)
        
        self.user_embeddings = {}
        self.poi_embeddings = {}
        
    # ========================================================================
    # USER REPRESENTATION LEARNING
    # ========================================================================
    
    def build_X_A(self) -> np.ndarray:
        """
        Build direct user attribute matrix X_A
        
        Features from user profiles, concatenated in this order:
        - age_group (one-hot)
        - area_of_residence (one-hot)
        - interests (multi-hot, ';'-separated in the CSV)
        - transportation_modes (multi-hot, ';'-separated in the CSV)
        - price_sensitivity (one-hot)
        
        Missing (NaN) cells in the multi-valued columns are treated as "no
        labels" and blank tokens (e.g. from a trailing ';') are dropped, so
        they neither crash the build nor create an empty-named feature.
        
        Side effects:
            Sets self.X_A, self.X_A_feature_names and self.user_id_to_idx
            (maps each 'uudi' value to its row index in users_df order).
        
        Returns:
            X_A: (num_users, num_features) matrix
        """
        print("\n" + "="*60)
        print("Building X_A: Direct User Attribute Matrix")
        print("="*60)
        
        features_list = []
        feature_names = []
        
        def add_one_hot(column: str, prefix: str) -> None:
            # One column per distinct value, in the encoder's class order.
            encoder = LabelEncoder()
            codes = encoder.fit_transform(self.users_df[column])
            features_list.append(np.eye(len(encoder.classes_))[codes])
            feature_names.extend(f'{prefix}_{cls}' for cls in encoder.classes_)
            print(f"Added {column} features: {len(encoder.classes_)} dimensions")
        
        def add_multi_hot(column: str, prefix: str) -> None:
            # Split each cell on ';'; NaN cells become an empty label list.
            labels_per_user = [
                [] if pd.isna(cell)
                else [tok.strip() for tok in str(cell).split(';') if tok.strip()]
                for cell in self.users_df[column]
            ]
            binarizer = MultiLabelBinarizer()
            features_list.append(binarizer.fit_transform(labels_per_user))
            feature_names.extend(f'{prefix}_{cls}' for cls in binarizer.classes_)
            print(f"Added {column} features: {len(binarizer.classes_)} dimensions")
        
        # Order matters: it defines the column layout of X_A.
        add_one_hot('age_group', 'age')
        add_one_hot('area_of_residence', 'area')
        add_multi_hot('interests', 'interest')
        add_multi_hot('transportation_modes', 'transport')
        add_one_hot('price_sensitivity', 'price')
        
        # Concatenate all features
        X_A = np.hstack(features_list)
        
        print(f"\nX_A shape: {X_A.shape}")
        print(f"Total features: {len(feature_names)}")
        
        # Store for later use
        self.X_A = X_A
        self.X_A_feature_names = feature_names
        self.user_id_to_idx = {uid: idx for idx, uid in enumerate(self.users_df['uudi'])}
        
        return X_A
    
    def build_X_T(self, embedding_dim: int = 32) -> np.ndarray:
        """
        Build inverse user attribute matrix X_T
        
        A users × level-0-POIs score matrix is accumulated from the
        interaction log and factorized with NMF; the per-user factors are
        then standardized column-wise.
        
        Scoring per interaction row: 'visit' adds 1.0, 'rating' adds
        value / 5.0, 'search' adds 0.3; any other type is ignored, as are
        rows whose user or POI is not known.
        
        Side effects:
            Sets self.X_T, self.X_T_model (the fitted NMF) and self.poi_to_idx.
        
        Args:
            embedding_dim: Dimensionality of learned embeddings
        
        Returns:
            X_T: (num_users, embedding_dim) matrix
        """
        banner = "="*60
        print("\n" + banner)
        print("Building X_T: Inverse User Attribute Matrix")
        print(banner)
        
        # Row axis: users in users_df order. Column axis: level-0 POIs in tree order.
        user_ids = self.users_df['uudi'].tolist()
        level0_poi_ids = list(self.poi_tree['level_0'].keys())
        
        user_to_idx = {uid: pos for pos, uid in enumerate(user_ids)}
        poi_to_idx = {pid: pos for pos, pid in enumerate(level0_poi_ids)}
        
        n_users = len(user_ids)
        n_pois = len(level0_poi_ids)
        
        print(f"Building interaction matrix: {n_users} users × {n_pois} POIs")
        
        # Accumulate one score per (user, POI) pair.
        scores = {}
        for _, record in self.interactions_df.iterrows():
            uid = record['user_id']
            pid = record['poi_id']
            
            if not (uid in user_to_idx and pid in poi_to_idx):
                continue
            
            kind = record['interaction_type']
            if kind == 'visit':
                gain = 1.0
            elif kind == 'rating':
                # Normalize rating to 0-1 scale
                gain = record['value'] / 5.0
            elif kind == 'search':
                gain = 0.3
            else:
                continue
            
            pair = (uid, pid)
            scores[pair] = scores.get(pair, 0) + gain
        
        # Coordinate lists for the sparse matrix, in dict insertion order.
        row_indices = [user_to_idx[uid] for uid, _ in scores]
        col_indices = [poi_to_idx[pid] for _, pid in scores]
        values = list(scores.values())
        
        interaction_matrix = csr_matrix(
            (values, (row_indices, col_indices)),
            shape=(n_users, n_pois)
        )
        
        print(f"Interaction matrix density: {interaction_matrix.nnz / (n_users * n_pois) * 100:.2f}%")
        
        # Matrix factorization to learn latent user features (NMF)
        print(f"Performing matrix factorization (embedding_dim={embedding_dim})...")
        
        nmf = NMF(n_components=embedding_dim, init='random', random_state=42, max_iter=200)
        X_T = StandardScaler().fit_transform(nmf.fit_transform(interaction_matrix))
        
        print(f"X_T shape: {X_T.shape}")
        print(f"Reconstruction error: {nmf.reconstruction_err_:.4f}")
        
        self.X_T = X_T
        self.X_T_model = nmf
        self.poi_to_idx = poi_to_idx
        
        return X_T
    
    def build_user_embeddings(self) -> np.ndarray:
        """
        Build complete user embeddings: X = [X_A | X_T]
        
        Returns:
            X: (num_users, dim_X_A + dim_X_T) matrix
        """
        print("\n" + "="*60)
        print("Building Complete User Embeddings")
        print("="*60)
        
        X_A = self.build_X_A()
        X_T = self.build_X_T(embedding_dim=32)
        
        # Concatenate
        X = np.hstack([X_A, X_T])
        
        print(f"\nFinal user embedding shape: {X.shape}")
        print(f"  X_A dimensions: {X_A.shape[1]}")
        print(f"  X_T dimensions: {X_T.shape[1]}")
        print(f"  Total dimensions: {X.shape[1]}")
        
        self.X = X
        
        # Store user embeddings in dictionary
        for idx, user_id in enumerate(self.users_df['uudi']):
            self.user_embeddings[user_id] = X[idx]
        
        return X
    
    # ========================================================================
    # POI REPRESENTATION LEARNING (Multi-level)
    # ========================================================================
    
    def build_Y_A_level(self, level: int) -> Tuple[np.ndarray, List[str], List[str]]:
        """
        Build direct POI attribute matrix Y_A^l for level l
        
        Every level gets standardized spatial coordinates (lat, lon).
        Level 0 (individual POIs) additionally gets:
        - category (one-hot)
        - price (standardized; "low-high" ranges are averaged)
        - popularity (standardized)
        - characteristic (multi-hot over comma-separated hashtags)
        - region (one-hot)
        Levels above 0 instead get TF-IDF features (at most 50 terms) of each
        node's 'textual' field.
        
        A price or popularity that is missing, unparseable or non-finite
        falls back to a fixed default (25.0 and 3.0 respectively).
        
        Args:
            level: Tree level (0=finest, 3=coarsest)
        
        Returns:
            Tuple (Y_A^l, poi_ids, feature_names):
            - Y_A^l: (num_pois_at_level, num_features) matrix
            - poi_ids: node IDs in row order
            - feature_names: column names in column order
        """
        print(f"\n{'='*60}")
        print(f"Building Y_A^{level}: Direct POI Attribute Matrix (Level {level})")
        print(f"{'='*60}")
        
        # Fallbacks for POIs whose price / popularity cannot be used as-is.
        default_price = 25.0
        default_popularity = 3.0
        
        level_key = f'level_{level}'
        pois_at_level = self.poi_tree[level_key]
        
        poi_ids = list(pois_at_level.keys())
        n_pois = len(poi_ids)
        
        print(f"Number of POIs at level {level}: {n_pois}")
        
        features_list = []
        feature_names = []
        
        # 1. Spatial features (lat, lon)
        spatial_features = []
        for poi_id in poi_ids:
            spatial = pois_at_level[poi_id]['spatial']
            if isinstance(spatial, str):
                # SECURITY: this text comes from a data file. literal_eval only
                # parses Python literals such as "[lat, lon]" and, unlike
                # eval(), cannot execute arbitrary code.
                spatial = ast.literal_eval(spatial)
            spatial_features.append(spatial)
        
        spatial_features = np.array(spatial_features)
        # Normalize
        spatial_scaler = StandardScaler()
        spatial_normalized = spatial_scaler.fit_transform(spatial_features)
        features_list.append(spatial_normalized)
        feature_names.extend(['lat_norm', 'lon_norm'])
        print(f"Added spatial features: 2 dimensions")
        
        # For level 0 (individual POIs), add more detailed features
        if level == 0:
            # 2. Category (one-hot)
            categories = [pois_at_level[pid]['data']['category'] for pid in poi_ids]
            category_encoder = LabelEncoder()
            category_encoded = category_encoder.fit_transform(categories)
            category_onehot = np.eye(len(category_encoder.classes_))[category_encoded]
            features_list.append(category_onehot)
            feature_names.extend([f'cat_{cls}' for cls in category_encoder.classes_])
            print(f"Added category features: {len(category_encoder.classes_)} dimensions")
            
            # 3. Price (normalized)
            prices = []
            for poi_id in poi_ids:
                price_raw = pois_at_level[poi_id]['data']['price']
                try:
                    price_text = str(price_raw)
                    if '-' in price_text:
                        # "low-high" range: use the midpoint of the first two parts.
                        price_vals = price_text.split('-')
                        avg_price = (float(price_vals[0].strip()) + float(price_vals[1].strip())) / 2
                    else:
                        avg_price = float(price_raw)
                except (TypeError, ValueError):
                    avg_price = default_price
                # NaN/inf would otherwise leak into the scaler and the embeddings.
                if not np.isfinite(avg_price):
                    avg_price = default_price
                prices.append(avg_price)
            
            prices = np.array(prices).reshape(-1, 1)
            price_scaler = StandardScaler()
            prices_normalized = price_scaler.fit_transform(prices)
            features_list.append(prices_normalized)
            feature_names.append('price_norm')
            print(f"Added price feature: 1 dimension")
            
            # 4. Popularity (normalized)
            popularities = []
            for poi_id in poi_ids:
                try:
                    pop = float(pois_at_level[poi_id]['data']['popularity'])
                except (KeyError, TypeError, ValueError):
                    # Missing key or non-numeric value.
                    pop = default_popularity
                if not np.isfinite(pop):
                    pop = default_popularity
                popularities.append(pop)
            
            popularities = np.array(popularities).reshape(-1, 1)
            pop_scaler = StandardScaler()
            popularities_normalized = pop_scaler.fit_transform(popularities)
            features_list.append(popularities_normalized)
            feature_names.append('popularity_norm')
            print(f"Added popularity feature: 1 dimension")
            
            # 5. Characteristics (multi-hot from hashtags)
            characteristics_list = []
            for poi_id in poi_ids:
                char_str = str(pois_at_level[poi_id]['data']['characteristic'])
                # Extract hashtags
                tags = [tag.strip().replace('#', '') for tag in char_str.split(',')]
                characteristics_list.append(tags)
            
            mlb_chars = MultiLabelBinarizer()
            chars_onehot = mlb_chars.fit_transform(characteristics_list)
            features_list.append(chars_onehot)
            feature_names.extend([f'char_{cls}' for cls in mlb_chars.classes_])
            print(f"Added characteristic features: {len(mlb_chars.classes_)} dimensions")
            
            # 6. Region (one-hot)
            regions = [pois_at_level[pid]['data']['region'] for pid in poi_ids]
            region_encoder = LabelEncoder()
            region_encoded = region_encoder.fit_transform(regions)
            region_onehot = np.eye(len(region_encoder.classes_))[region_encoded]
            features_list.append(region_onehot)
            feature_names.extend([f'region_{cls}' for cls in region_encoder.classes_])
            print(f"Added region features: {len(region_encoder.classes_)} dimensions")
        
        # For higher levels, use aggregated text features
        else:
            # Use textual representation (simpler for higher levels)
            texts = [pois_at_level[pid]['textual'] for pid in poi_ids]
            tfidf = TfidfVectorizer(max_features=50, stop_words='english')
            text_features = tfidf.fit_transform(texts).toarray()
            features_list.append(text_features)
            feature_names.extend([f'text_{i}' for i in range(text_features.shape[1])])
            print(f"Added text features: {text_features.shape[1]} dimensions")
        
        # Concatenate all features
        Y_A_l = np.hstack(features_list)
        
        print(f"\nY_A^{level} shape: {Y_A_l.shape}")
        print(f"Total features: {len(feature_names)}")
        
        return Y_A_l, poi_ids, feature_names
    
    def build_Y_T_level(self, level: int, embedding_dim: int = 32) -> np.ndarray:
        """
        Build inverse POI attribute matrix Y_T^l for level l
        
        A POIs × users score matrix is accumulated from the interaction log
        and factorized with NMF; the per-POI factors are then standardized
        column-wise. For level > 0 each interaction's POI is first mapped to
        its ancestor at that level via _get_parent_at_level().
        
        Scoring per interaction row: 'visit' adds 1.0, 'rating' adds
        value / 5.0, 'search' adds 0.3; any other type is ignored, as are
        rows whose POI (after mapping) or user is not known.
        
        Args:
            level: Tree level
            embedding_dim: Dimensionality of learned embeddings
        
        Returns:
            Y_T^l: (num_pois_at_level, embedding_dim) matrix
        """
        rule = '=' * 60
        print(f"\n{rule}")
        print(f"Building Y_T^{level}: Inverse POI Attribute Matrix (Level {level})")
        print(f"{rule}")
        
        level_poi_ids = list(self.poi_tree[f'level_{level}'].keys())
        
        # Row axis: nodes of this level in tree order. Column axis: users in users_df order.
        n_pois = len(level_poi_ids)
        n_users = len(self.users_df)
        
        print(f"Building interaction matrix: {n_pois} POIs × {n_users} users")
        
        poi_row = {pid: pos for pos, pid in enumerate(level_poi_ids)}
        user_col = {uid: pos for pos, uid in enumerate(self.users_df['uudi'])}
        
        # Accumulate one score per (POI, user) pair.
        scores = {}
        for _, record in self.interactions_df.iterrows():
            uid = record['user_id']
            pid = record['poi_id']
            
            # For higher levels, map fine-grained POI to coarser level
            if level > 0:
                pid = self._get_parent_at_level(pid, target_level=level)
            
            if not (pid in poi_row and uid in user_col):
                continue
            
            kind = record['interaction_type']
            if kind == 'visit':
                gain = 1.0
            elif kind == 'rating':
                gain = record['value'] / 5.0
            elif kind == 'search':
                gain = 0.3
            else:
                continue
            
            pair = (pid, uid)
            scores[pair] = scores.get(pair, 0) + gain
        
        # Coordinate lists for the sparse matrix, in dict insertion order.
        row_indices = [poi_row[pid] for pid, _ in scores]
        col_indices = [user_col[uid] for _, uid in scores]
        values = list(scores.values())
        
        interaction_matrix = csr_matrix(
            (values, (row_indices, col_indices)),
            shape=(n_pois, n_users)
        )
        
        print(f"Interaction matrix density: {interaction_matrix.nnz / (n_pois * n_users) * 100:.2f}%")
        
        # Matrix factorization
        print(f"Performing matrix factorization (embedding_dim={embedding_dim})...")
        
        nmf = NMF(n_components=embedding_dim, init='random', random_state=42, max_iter=200)
        Y_T_l = StandardScaler().fit_transform(nmf.fit_transform(interaction_matrix))
        
        print(f"Y_T^{level} shape: {Y_T_l.shape}")
        print(f"Reconstruction error: {nmf.reconstruction_err_:.4f}")
        
        return Y_T_l
    
    def _get_parent_at_level(self, poi_id: str, target_level: int) -> str:
        """
        Get parent node of poi_id at target_level
        """
        # Start from level 0
        current_level = 0
        current_id = poi_id
        
        while current_level < target_level:
            level_key = f'level_{current_level}'
            if current_id in self.poi_tree[level_key]:
                parent = self.poi_tree[level_key][current_id].get('parent')
                if parent:
                    current_id = parent
                    current_level += 1
                else:
                    break
            else:
                break
        
        return current_id
    
    def build_poi_embeddings(self, levels: List[int] = [0, 1, 2, 3]):
        """
        Build complete POI embeddings for all specified levels
        
        Y^l = [Y_A^l | Y_T^l] for each level l
        
        Args:
            levels: List of tree levels to build embeddings for
        """
        print("\n" + "="*60)
        print("Building Complete POI Embeddings (All Levels)")
        print("="*60)
        
        for level in levels:
            print(f"\n--- Processing Level {level} ---")
            
            Y_A_l, poi_ids, feature_names = self.build_Y_A_level(level)
            Y_T_l = self.build_Y_T_level(level, embedding_dim=32)
            
            # Concatenate
            Y_l = np.hstack([Y_A_l, Y_T_l])
            
            print(f"\nFinal POI embedding for level {level}:")
            print(f"  Y_A^{level} dimensions: {Y_A_l.shape[1]}")
            print(f"  Y_T^{level} dimensions: {Y_T_l.shape[1]}")
            print(f"  Total dimensions: {Y_l.shape[1]}")
            
            # Store
            self.poi_embeddings[f'level_{level}'] = {
                'embeddings': Y_l,
                'poi_ids': poi_ids,
                'Y_A': Y_A_l,
                'Y_T': Y_T_l,
                'feature_names': feature_names
            }
    
    # ========================================================================
    # SAVE & LOAD
    # ========================================================================
    
    def save_embeddings(self, output_file: str = 'embeddings.pkl'):
        """Save all embeddings to file"""
        data = {
            'user_embeddings': self.user_embeddings,
            'poi_embeddings': self.poi_embeddings,
            'user_id_to_idx': self.user_id_to_idx,
            'X': self.X,
            'X_A': self.X_A,
            'X_T': self.X_T,
        }
        
        with open(output_file, 'wb') as f:
            pickle.dump(data, f)
        
        print(f"\nEmbeddings saved to {output_file}")
    
    def load_embeddings(self, input_file: str = 'embeddings.pkl'):
        """Load embeddings from file"""
        with open(input_file, 'rb') as f:
            data = pickle.load(f)
        
        self.user_embeddings = data['user_embeddings']
        self.poi_embeddings = data['poi_embeddings']
        self.user_id_to_idx = data['user_id_to_idx']
        self.X = data['X']
        self.X_A = data['X_A']
        self.X_T = data['X_T']
        
        print(f"\nEmbeddings loaded from {input_file}")

In [7]:
# Driver cell: runs the full pipeline end to end (user embeddings, POI
# embeddings for levels 0-3, save to disk) and prints two sanity checks.
# The three input files are looked up relative to the current working directory.
if __name__ == "__main__":
    # Load the users CSV, the interactions CSV and the POI tree JSON
    learner = AttributeBasedRepresentationLearning(
        users_file='user_preferences.csv',
        interactions_file='user_poi_interactions.csv',
        poi_tree_file='poi_tree_with_uuids.json'
    )
    
    # Build user embeddings X = [X_A | X_T]; also fills learner.user_embeddings
    X = learner.build_user_embeddings()
    
    # Build POI embeddings for all levels; results land in learner.poi_embeddings
    learner.build_poi_embeddings(levels=[0, 1, 2, 3])
    
    # Save embeddings (pickle file in the current working directory)
    learner.save_embeddings('embeddings.pkl')
    
    # Example: Get embedding for a specific user (first row of the users CSV)
    user_id = learner.users_df.iloc[0]['uudi']
    user_embedding = learner.user_embeddings[user_id]
    print(f"\nUser {user_id} embedding shape: {user_embedding.shape}")
    
    # Example: Get embeddings for level 0 POIs (one row per level-0 node)
    level_0_embeddings = learner.poi_embeddings['level_0']['embeddings']
    print(f"Level 0 POI embeddings shape: {level_0_embeddings.shape}")


Building Complete User Embeddings

Building X_A: Direct User Attribute Matrix
Added age_group features: 5 dimensions
Added area_of_residence features: 11 dimensions
Added interests features: 25 dimensions
Added transportation_modes features: 6 dimensions
Added price_sensitivity features: 3 dimensions

X_A shape: (21, 50)
Total features: 50

Building X_T: Inverse User Attribute Matrix
Building interaction matrix: 21 users × 4696 POIs
Interaction matrix density: 0.25%
Performing matrix factorization (embedding_dim=32)...
X_T shape: (21, 32)
Reconstruction error: 0.0047

Final user embedding shape: (21, 82)
  X_A dimensions: 50
  X_T dimensions: 32
  Total dimensions: 82

Building Complete POI Embeddings (All Levels)

--- Processing Level 0 ---

Building Y_A^0: Direct POI Attribute Matrix (Level 0)
Number of POIs at level 0: 4696
Added spatial features: 2 dimensions
Added category features: 23 dimensions
Added price feature: 1 dimension
Added popularity feature: 1 dimension
Added charact

