In [2]:
import json
import pickle
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Set, Optional
from datetime import datetime
from collections import defaultdict
from scipy.sparse import csr_matrix, lil_matrix
import warnings


In [3]:
class InteractionGenerator:
    """
    Generate training interactions at all hierarchy levels
    """
    
    def __init__(self, 
				interactions_file: str,
				poi_tree_file: str,
				metadata_file: str):
        print("=" * 60)
        print("Initializing Interaction Generator")
        print("=" * 60)
        
        # Load interactions
        print(f"\nLoading interactions from: {interactions_file}")
        self.interactions_df = pd.read_csv(interactions_file)
        print(f"  Loaded {len(self.interactions_df)} raw interactions")
        
        # Load POI tree
        print(f"\nLoading POI tree from: {poi_tree_file}")
        with open(poi_tree_file, 'r', encoding='utf-8') as f:
            self.poi_tree = json.load(f)
        
        # Load metadata for ID mappings
        print(f"\nLoading metadata from: {metadata_file}")
        with open(metadata_file, 'rb') as f:
            self.metadata = pickle.load(f)
        
        self.mappings = self.metadata['mappings']
        
        # Build parent lookup cache
        print("\nBuilding parent lookup cache...")
        self.parent_cache = self._build_parent_cache()
        
        print("\nInitialization complete!")
    
    def _build_parent_cache(self) -> Dict[str, Dict[int, str]]:
        """
        Build a cache mapping POI ID -> {level: parent_id_at_level}
        
        For each level-0 POI, precompute its parent at each higher level.
        
        Returns:
            Dictionary: poi_id -> {1: parent_l1, 2: parent_l2, 3: parent_l3}
        """
        parent_cache = {}
        
        # Get all level 0 POIs
        level_0_pois = self.poi_tree.get('level_0', {})
        
        for poi_id, poi_data in level_0_pois.items():
            parent_cache[poi_id] = {}
            
            current_id = poi_id
            current_level = 0
            
            # Traverse up the tree
            while current_level < 3:
                level_key = f'level_{current_level}'
                
                if level_key not in self.poi_tree:
                    break
                
                if current_id not in self.poi_tree[level_key]:
                    break
                
                parent_id = self.poi_tree[level_key][current_id].get('parent')
                
                if parent_id:
                    parent_cache[poi_id][current_level + 1] = parent_id
                    current_id = parent_id
                    current_level += 1
                else:
                    break
        
        # Statistics
        pois_with_full_hierarchy = sum(
            1 for cache in parent_cache.values() 
            if len(cache) == 3
        )
        print(f"  Cached parents for {len(parent_cache)} level-0 POIs")
        print(f"  POIs with full hierarchy (up to level 3): {pois_with_full_hierarchy}")
        
        return parent_cache
    
    def _get_parent_at_level(self, poi_id: str, target_level: int) -> Optional[str]:
        """
        Get parent of poi_id at target_level using cache
        
        Args:
            poi_id: Level 0 POI ID
            target_level: Target level (1, 2, or 3)
        
        Returns:
            Parent POI ID at target level, or None if not found
        """
        if target_level == 0:
            return poi_id
        
        if poi_id in self.parent_cache:
            return self.parent_cache[poi_id].get(target_level)
        
        return None
    
    def _propagate_interactions_to_level(self, 
										level_0_interactions: List[Tuple],
										target_level: int) -> List[Tuple]:
        """
        Propagate level-0 interactions to a higher level
        
        Args:
            level_0_interactions: List of (user_id, poi_id, timestamp, interaction_type, value)
            target_level: Target level (1, 2, or 3)
        
        Returns:
            List of propagated interactions at target level
        """
        propagated = []
        
        for user_id, poi_id, timestamp, interaction_type, value in level_0_interactions:
            parent_id = self._get_parent_at_level(poi_id, target_level)
            
            if parent_id:
                propagated.append((user_id, parent_id, timestamp, interaction_type, value))
        
        return propagated
    
    def generate_interactions(self) -> Dict:
        """
        Generate interaction data for all levels
        
        Returns:
            Dictionary containing interactions at all levels
        """
        print("\n" + "=" * 60)
        print("Generating Interactions for All Levels")
        print("=" * 60)
        
        # Sort by timestamp
        print("\n[Step 1] Sorting interactions by timestamp...")
        self.interactions_df['timestamp'] = pd.to_datetime(self.interactions_df['timestamp'])
        self.interactions_df = self.interactions_df.sort_values('timestamp')
        
        # Extract level 0 interactions
        print("\n[Step 2] Extracting level-0 interactions...")
        level_0_interactions = []
        
        for _, row in self.interactions_df.iterrows():
            user_id = row['user_id']
            poi_id = row['poi_id']
            timestamp = row['timestamp']
            interaction_type = row.get('interaction_type', 'visit')
            value = row.get('value', 1)
            
            level_0_interactions.append((
                user_id, poi_id, timestamp, interaction_type, value
            ))
        
        print(f"  Extracted {len(level_0_interactions)} level-0 interactions")
        
        # Generate interactions for each level
        interactions_by_level = {}
        
        for level in range(4):
            print(f"\n[Step 3.{level}] Processing Level {level}...")
            
            if level == 0:
                level_interactions = level_0_interactions
            else:
                level_interactions = self._propagate_interactions_to_level(
                    level_0_interactions, level
                )
            
            # Process and aggregate interactions
            processed = self._process_level_interactions(level_interactions, level)
            interactions_by_level[f'level_{level}'] = processed
            
            print(f"  Level {level}: {processed['stats']['total_interactions']} interactions")
            print(f"  Level {level}: {processed['stats']['unique_users']} unique users")
            print(f"  Level {level}: {processed['stats']['unique_pois']} unique POIs")
        
        return interactions_by_level
    
    def _process_level_interactions(self, 
                                    interactions: List[Tuple],
                                    level: int) -> Dict:
        """
        Process interactions for a specific level
        
        Creates:
        - Sparse interaction matrix
        - Edge lists
        - Aggregated scores
        - Train/val/test splits
        
        Args:
            interactions: List of (user_id, poi_id, timestamp, type, value)
            level: Current level
        
        Returns:
            Dictionary with processed interaction data
        """
        level_key = f'level_{level}'
        
        # Get mappings
        user_to_idx = self.mappings['user']['id_to_idx']
        poi_to_idx = self.mappings['poi'][level_key]['id_to_idx']
        
        n_users = self.mappings['user']['count']
        n_pois = self.mappings['poi'][level_key]['count']
        
        # Aggregate interactions by (user, poi) pair
        pair_data = defaultdict(lambda: {
            'visits': 0,
            'ratings': [],
            'searches': 0,
            'total_value': 0,
            'timestamps': [],
            'interaction_types': []
        })
        
        skipped_users = set()
        skipped_pois = set()
        
        for user_id, poi_id, timestamp, interaction_type, value in interactions:
            # Check if user and POI exist in mappings
            if user_id not in user_to_idx:
                skipped_users.add(user_id)
                continue
            if poi_id not in poi_to_idx:
                skipped_pois.add(poi_id)
                continue
            
            key = (user_id, poi_id)
            data = pair_data[key]
            
            data['timestamps'].append(timestamp)
            data['interaction_types'].append(interaction_type)
            
            if interaction_type == 'visit':
                data['visits'] += 1
                data['total_value'] += 1.0
            elif interaction_type == 'rating':
                data['ratings'].append(value)
                data['total_value'] += value / 5.0
            elif interaction_type == 'search':
                data['searches'] += 1
                data['total_value'] += 0.3
            else:
                data['total_value'] += 0.5
        
        if skipped_users:
            print(f"    Warning: Skipped {len(skipped_users)} unknown users")
        if skipped_pois:
            print(f"    Warning: Skipped {len(skipped_pois)} unknown POIs")
        
        # Build various representations
        
        # 1. Edge list (for GNN)
        edge_user_ids = []
        edge_poi_ids = []
        edge_user_indices = []
        edge_poi_indices = []
        edge_weights = []
        edge_timestamps = []
        
        # 2. Sparse matrix (for matrix factorization)
        interaction_matrix = lil_matrix((n_users, n_pois), dtype=np.float32)
        
        # 3. Binary interaction matrix
        binary_matrix = lil_matrix((n_users, n_pois), dtype=np.float32)
        
        # 4. User -> POI mapping
        user_to_pois = defaultdict(list)
        
        # 5. POI -> User mapping
        poi_to_users = defaultdict(list)
        
        for (user_id, poi_id), data in pair_data.items():
            user_idx = user_to_idx[user_id]
            poi_idx = poi_to_idx[poi_id]
            
            # Compute aggregated score
            score = data['total_value']
            if data['ratings']:
                score += np.mean(data['ratings']) / 5.0
            
            # Edge list
            edge_user_ids.append(user_id)
            edge_poi_ids.append(poi_id)
            edge_user_indices.append(user_idx)
            edge_poi_indices.append(poi_idx)
            edge_weights.append(score)
            edge_timestamps.append(max(data['timestamps']))  # Latest timestamp
            
            # Matrices
            interaction_matrix[user_idx, poi_idx] = score
            binary_matrix[user_idx, poi_idx] = 1.0
            
            # Mappings
            user_to_pois[user_idx].append({
                'poi_idx': poi_idx,
                'poi_id': poi_id,
                'score': score,
                'visits': data['visits'],
                'avg_rating': np.mean(data['ratings']) if data['ratings'] else None
            })
            
            poi_to_users[poi_idx].append({
                'user_idx': user_idx,
                'user_id': user_id,
                'score': score
            })
        
        # Convert to CSR for efficiency
        interaction_matrix = interaction_matrix.tocsr()
        binary_matrix = binary_matrix.tocsr()
        
        # Create train/val/test splits based on timestamp
        edge_data = list(zip(
            edge_user_indices, 
            edge_poi_indices, 
            edge_weights,
            edge_timestamps
        ))
        edge_data.sort(key=lambda x: x[3])  # Sort by timestamp
        
        n_edges = len(edge_data)
        train_end = int(n_edges * 0.7)
        val_end = int(n_edges * 0.85)
        
        train_edges = edge_data[:train_end]
        val_edges = edge_data[train_end:val_end]
        test_edges = edge_data[val_end:]
        
        # Statistics
        stats = {
            'total_interactions': len(pair_data),
            'unique_users': len(set(edge_user_indices)),
            'unique_pois': len(set(edge_poi_indices)),
            'total_visits': sum(d['visits'] for d in pair_data.values()),
            'total_ratings': sum(len(d['ratings']) for d in pair_data.values()),
            'total_searches': sum(d['searches'] for d in pair_data.values()),
            'avg_interactions_per_user': len(pair_data) / max(len(set(edge_user_indices)), 1),
            'avg_interactions_per_poi': len(pair_data) / max(len(set(edge_poi_indices)), 1),
            'matrix_density': interaction_matrix.nnz / (n_users * n_pois) * 100,
            'train_size': len(train_edges),
            'val_size': len(val_edges),
            'test_size': len(test_edges)
        }
        
        return {
            # Edge lists
            'edges': {
                'user_ids': edge_user_ids,
                'poi_ids': edge_poi_ids,
                'user_indices': np.array(edge_user_indices, dtype=np.int64),
                'poi_indices': np.array(edge_poi_indices, dtype=np.int64),
                'weights': np.array(edge_weights, dtype=np.float32),
                'timestamps': edge_timestamps
            },
            
            # Sparse matrices
            'matrices': {
                'interaction': interaction_matrix,
                'binary': binary_matrix
            },
            
            # Mappings
            'user_to_pois': dict(user_to_pois),
            'poi_to_users': dict(poi_to_users),
            
            # Train/Val/Test splits
            'splits': {
                'train': {
                    'user_indices': np.array([e[0] for e in train_edges], dtype=np.int64),
                    'poi_indices': np.array([e[1] for e in train_edges], dtype=np.int64),
                    'weights': np.array([e[2] for e in train_edges], dtype=np.float32)
                },
                'val': {
                    'user_indices': np.array([e[0] for e in val_edges], dtype=np.int64),
                    'poi_indices': np.array([e[1] for e in val_edges], dtype=np.int64),
                    'weights': np.array([e[2] for e in val_edges], dtype=np.float32)
                },
                'test': {
                    'user_indices': np.array([e[0] for e in test_edges], dtype=np.int64),
                    'poi_indices': np.array([e[1] for e in test_edges], dtype=np.int64),
                    'weights': np.array([e[2] for e in test_edges], dtype=np.float32)
                }
            },
            
            # Statistics
            'stats': stats,
            
            # Dimensions
            'n_users': n_users,
            'n_pois': n_pois
        }
    
    def generate_negative_samples(self, 
								interactions_by_level: Dict,
								neg_ratio: int = 4) -> Dict:
        """
        Generate negative samples for training
        
        For each positive (user, poi) pair, sample neg_ratio negative POIs
        that the user has NOT interacted with.
        
        Args:
            interactions_by_level: Output from generate_interactions()
            neg_ratio: Number of negative samples per positive
        
        Returns:
            Dictionary with negative samples for each level
        """
        print("\n" + "=" * 60)
        print(f"Generating Negative Samples (ratio={neg_ratio})")
        print("=" * 60)
        
        negative_samples = {}
        
        for level in range(4):
            level_key = f'level_{level}'
            level_data = interactions_by_level[level_key]
            
            print(f"\n  Processing Level {level}...")
            
            n_users = level_data['n_users']
            n_pois = level_data['n_pois']
            
            # Get positive interactions per user
            user_positive_pois = defaultdict(set)
            for user_idx, poi_idx in zip(
                level_data['edges']['user_indices'],
                level_data['edges']['poi_indices']
            ):
                user_positive_pois[user_idx].add(poi_idx)
            
            # Generate negative samples for each split
            level_negatives = {}
            
            for split_name in ['train', 'val', 'test']:
                split_data = level_data['splits'][split_name]
                user_indices = split_data['user_indices']
                poi_indices = split_data['poi_indices']
                
                neg_user_indices = []
                neg_poi_indices = []
                
                all_pois = set(range(n_pois))
                
                for user_idx, pos_poi_idx in zip(user_indices, poi_indices):
                    # Get POIs user hasn't interacted with
                    positive_pois = user_positive_pois[user_idx]
                    negative_pois = list(all_pois - positive_pois)
                    
                    if len(negative_pois) >= neg_ratio:
                        sampled_negs = np.random.choice(
                            negative_pois, 
                            size=neg_ratio, 
                            replace=False
                        )
                    else:
                        # If not enough negatives, sample with replacement
                        sampled_negs = np.random.choice(
                            negative_pois if negative_pois else list(all_pois),
                            size=neg_ratio,
                            replace=True
                        )
                    
                    for neg_poi_idx in sampled_negs:
                        neg_user_indices.append(user_idx)
                        neg_poi_indices.append(neg_poi_idx)
                
                level_negatives[split_name] = {
                    'user_indices': np.array(neg_user_indices, dtype=np.int64),
                    'poi_indices': np.array(neg_poi_indices, dtype=np.int64)
                }
                
                print(f"    {split_name}: {len(neg_user_indices)} negative samples")
            
            negative_samples[level_key] = level_negatives
        
        return negative_samples
    
    def save_interactions(self, 
						interactions_by_level: Dict,
						negative_samples: Dict,
						output_file: str = 'interactions.pkl'):
        """
        Save all interaction data to pickle file
        
        Args:
            interactions_by_level: Interaction data for all levels
            negative_samples: Negative samples for all levels
            output_file: Output file path
        """
        print(f"\n" + "=" * 60)
        print(f"Saving interactions to: {output_file}")
        print("=" * 60)
        
        save_data = {
            # Main interaction data
            'interactions': interactions_by_level,
            
            # Negative samples
            'negative_samples': negative_samples,
            
            # Quick access to dimensions
            'dimensions': {
                f'level_{level}': {
                    'n_users': interactions_by_level[f'level_{level}']['n_users'],
                    'n_pois': interactions_by_level[f'level_{level}']['n_pois'],
                    'n_interactions': interactions_by_level[f'level_{level}']['stats']['total_interactions']
                }
                for level in range(4)
            },
            
            # Metadata
            'info': {
                'created_at': datetime.now().isoformat(),
                'negative_ratio': 4,  # Default ratio used
                'split_ratios': {'train': 0.7, 'val': 0.15, 'test': 0.15},
                'levels': ['level_0', 'level_1', 'level_2', 'level_3'],
                'level_names': {
                    'level_0': 'Building',
                    'level_1': 'Street',
                    'level_2': 'District',
                    'level_3': 'Region'
                }
            }
        }
        
        with open(output_file, 'wb') as f:
            pickle.dump(save_data, f)
        
        # Print summary
        import os
        file_size = os.path.getsize(output_file) / (1024 * 1024)
        print(f"\n  File size: {file_size:.2f} MB")
        
        print("\n  Interactions per level:")
        for level in range(4):
            level_key = f'level_{level}'
            stats = interactions_by_level[level_key]['stats']
            print(f"    Level {level}:")
            print(f"      - Total interactions: {stats['total_interactions']}")
            print(f"      - Train/Val/Test: {stats['train_size']}/{stats['val_size']}/{stats['test_size']}")
            print(f"      - Matrix density: {stats['matrix_density']:.4f}%")
        
        print("\n  Save complete!")


def generate_interactions_pkl(interactions_file: str,
                              poi_tree_file: str,
                              metadata_file: str,
                              output_file: str = 'interactions.pkl',
                              neg_ratio: int = 4):
    """
    Run the full pipeline: load inputs, build interactions, sample
    negatives, and write the result to ``output_file``.

    Args:
        interactions_file: Path to user_poi_interactions.csv
        poi_tree_file: Path to poi_tree_with_uuids.json
        metadata_file: Path to metadata.pkl
        output_file: Output file path
        neg_ratio: Negative sampling ratio

    Returns:
        Tuple of (interactions_by_level, negative_samples)
    """
    # Stage 1: load inputs and build the parent cache
    pipeline = InteractionGenerator(
        interactions_file=interactions_file,
        poi_tree_file=poi_tree_file,
        metadata_file=metadata_file,
    )

    # Stage 2: positives for every level
    positives = pipeline.generate_interactions()

    # Stage 3: negatives matched to those positives
    negatives = pipeline.generate_negative_samples(positives, neg_ratio=neg_ratio)

    # Stage 4: persist everything
    pipeline.save_interactions(positives, negatives, output_file)

    return positives, negatives


# Utility class for loading and using interactions
class InteractionLoader:
    """
    Read-only accessor around a saved interactions pickle.

    The pickle is expected to hold the keys ``'interactions'``,
    ``'negative_samples'``, ``'dimensions'`` and ``'info'``; each is exposed
    as an attribute of the same name, and the raw payload as ``data``.
    """

    def __init__(self, interactions_file: str = 'interactions.pkl'):
        """
        Load interactions from pickle file

        Args:
            interactions_file: Path to interactions.pkl
        """
        # NOTE: pickle.load can execute arbitrary code contained in the
        # file, so only load files produced by a trusted pipeline run.
        with open(interactions_file, 'rb') as handle:
            self.data = pickle.load(handle)

        payload = self.data
        self.interactions = payload['interactions']
        self.negative_samples = payload['negative_samples']
        self.dimensions = payload['dimensions']
        self.info = payload['info']

    def _split_pair(self, level: int, split: str) -> Dict:
        """Bundle the positive and negative samples of one split at one level"""
        key = f'level_{level}'
        return {
            'positive': self.interactions[key]['splits'][split],
            'negative': self.negative_samples[key][split]
        }

    def get_train_data(self, level: int) -> Dict:
        """Get training data for a specific level"""
        return self._split_pair(level, 'train')

    def get_val_data(self, level: int) -> Dict:
        """Get validation data for a specific level"""
        return self._split_pair(level, 'val')

    def get_test_data(self, level: int) -> Dict:
        """Get test data for a specific level"""
        return self._split_pair(level, 'test')

    def get_interaction_matrix(self, level: int, binary: bool = False) -> csr_matrix:
        """Get the weighted (default) or binary interaction matrix for a level"""
        if binary:
            which = 'binary'
        else:
            which = 'interaction'
        return self.interactions[f'level_{level}']['matrices'][which]

    def get_user_history(self, user_idx: int, level: int) -> List[Dict]:
        """Get interaction history for a user at specific level ([] if none)"""
        per_user = self.interactions[f'level_{level}']['user_to_pois']
        return per_user.get(user_idx, [])

    def get_poi_users(self, poi_idx: int, level: int) -> List[Dict]:
        """Get users who interacted with a POI at specific level ([] if none)"""
        per_poi = self.interactions[f'level_{level}']['poi_to_users']
        return per_poi.get(poi_idx, [])

    def get_stats(self, level: int) -> Dict:
        """Get statistics for a specific level"""
        return self.interactions[f'level_{level}']['stats']

    def get_all_edges(self, level: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Get all edges for a level

        Returns:
            Tuple of (user_indices, poi_indices, weights)
        """
        edge_block = self.interactions[f'level_{level}']['edges']
        users = edge_block['user_indices']
        pois = edge_block['poi_indices']
        weights = edge_block['weights']
        return (users, pois, weights)


In [8]:
# Notebook driver cell: runs the whole generation pipeline, then reloads the
# written pickle through InteractionLoader and prints a few level-0 facts.
if __name__ == "__main__":
    # Input/output locations; the relative paths resolve against the
    # kernel's current working directory.
    user_poi_interactions_file = "../../Sources/Files/user_poi_interactions.csv"
    poi_tree_file = "../../Sources/Files/poi_tree_with_uuids.json"
    metadata_file = "../../Sources/Embeddings/metadata.pkl"
    output_file = "interactions.pkl"
    
    print("=" * 60)
    print("INTERACTION GENERATION PIPELINE")
    print("=" * 60)
    print(f"\nInput files:")
    print(f"  - Interactions: {user_poi_interactions_file}")
    print(f"  - POI Tree: {poi_tree_file}")
    print(f"  - Metadata: {metadata_file}")
    print(f"\nOutput file: {output_file}")
    
    # Generate interactions
    # (also writes output_file as a side effect of the pipeline's save step)
    interactions_by_level, negative_samples = generate_interactions_pkl(
        interactions_file=user_poi_interactions_file,
        poi_tree_file=poi_tree_file,
        metadata_file=metadata_file,
        output_file=output_file,
        neg_ratio=4
    )
    
    # Demo: Using the InteractionLoader
    print("\n" + "=" * 60)
    print("DEMO: Using InteractionLoader")
    print("=" * 60)
    
    # Re-read the file that was just written rather than reusing the
    # in-memory results, so the demo exercises the saved artefact.
    loader = InteractionLoader(output_file)
    
    # Example usage
    print("\nLevel 0 Statistics:")
    stats = loader.get_stats(level=0)
    for key, value in stats.items():
        print(f"  {key}: {value}")
    
    print("\nTraining data shape (Level 0):")
    train_data = loader.get_train_data(level=0)
    print(f"  Positive samples: {len(train_data['positive']['user_indices'])}")
    print(f"  Negative samples: {len(train_data['negative']['user_indices'])}")
    
    print("\nInteraction matrix info (Level 0):")
    matrix = loader.get_interaction_matrix(level=0)
    print(f"  Shape: {matrix.shape}")
    print(f"  Non-zero entries: {matrix.nnz}")
    # Density is reported as a percentage of all user x POI cells.
    print(f"  Density: {matrix.nnz / (matrix.shape[0] * matrix.shape[1]) * 100:.4f}%")
    
    print("\n" + "=" * 60)
    print("PIPELINE COMPLETE")
    print("=" * 60)

INTERACTION GENERATION PIPELINE

Input files:
  - Interactions: ../../Sources/Files/user_poi_interactions.csv
  - POI Tree: ../../Sources/Files/poi_tree_with_uuids.json
  - Metadata: ../../Sources/Embeddings/metadata.pkl

Output file: interactions.pkl
Initializing Interaction Generator

Loading interactions from: ../../Sources/Files/user_poi_interactions.csv
  Loaded 567 raw interactions

Loading POI tree from: ../../Sources/Files/poi_tree_with_uuids.json

Loading metadata from: ../../Sources/Embeddings/metadata.pkl

Building parent lookup cache...
  Cached parents for 4696 level-0 POIs
  POIs with full hierarchy (up to level 3): 4696

Initialization complete!

Generating Interactions for All Levels

[Step 1] Sorting interactions by timestamp...

[Step 2] Extracting level-0 interactions...
  Extracted 567 level-0 interactions

[Step 3.0] Processing Level 0...
  Level 0: 257 interactions
  Level 0: 21 unique users
  Level 0: 235 unique POIs

[Step 3.1] Processing Level 1...
  Level 1: 2