In [58]:
import pickle
import pandas as pd
import numpy as np
import networkx as nx
from scipy.sparse import lil_matrix, csr_matrix, save_npz, diags
from collections import defaultdict
from tqdm import tqdm

### Step 1 : Extract User-POI interaction matrix

In [59]:
def build_base_interaction_matrix(interactions_data, meta_data, level):
    """
    Build the base user-POI interaction matrix for a single level.

    A pre-built matrix found under ``level_data['matrices']['interaction']``
    is used when present (converted to CSR if it is not already CSR).
    Otherwise the matrix is assembled from the level's edge list; repeated
    (user, POI) pairs have their weights summed, and an edge without explicit
    weights counts as 1.0.

    Args:
        interactions_data: Dict whose ``'interactions'`` entry maps level names
            to level dicts. Each level dict provides ``'n_users'``,
            ``'n_pois'`` and either ``'matrices'['interaction']`` or
            ``'edges'`` (``'user_indices'``, ``'poi_indices'`` and optionally
            ``'weights'``).
        meta_data: Dict providing ``'user_ids'`` and ``'poi_ids'``; the latter
            is indexed by the integer level number.
        level: Level name of the form ``'level_<n>'``.

    Returns:
        Tuple ``(interaction_matrix, user_ids, poi_ids)``; the matrix is CSR.

    Raises:
        ValueError: If the edge arrays do not all have the same length.
    """
    level_data = interactions_data['interactions'][level]

    # Dimensions come from the data itself
    n_users = level_data['n_users']
    n_pois = level_data['n_pois']

    print(f"  {level}: {n_users} users × {n_pois} POIs")

    # OPTION 1: Use pre-built matrix if available
    if 'matrices' in level_data and 'interaction' in level_data['matrices']:
        print("  Using pre-built interaction matrix")
        interaction_matrix = level_data['matrices']['interaction']

        # Ensure it's in CSR format
        if not isinstance(interaction_matrix, csr_matrix):
            interaction_matrix = csr_matrix(interaction_matrix)

    # OPTION 2: Build from edges
    else:
        print("  Building matrix from edges...")

        edges = level_data['edges']
        # Explicit integer dtype keeps empty edge lists usable as index arrays
        user_indices = np.asarray(edges['user_indices'], dtype=np.int64)
        poi_indices = np.asarray(edges['poi_indices'], dtype=np.int64)
        weights = edges.get('weights', None)

        print(f"  Processing {len(user_indices)} edges")

        if weights is None:
            # Unweighted edges: every occurrence contributes 1.0
            weights = np.ones(len(user_indices), dtype=np.float32)
        else:
            weights = np.asarray(weights, dtype=np.float32)

        if not (len(user_indices) == len(poi_indices) == len(weights)):
            raise ValueError(
                f"Edge arrays for {level} have mismatched lengths: "
                f"{len(user_indices)} user indices, {len(poi_indices)} POI indices, "
                f"{len(weights)} weights"
            )

        # Vectorised construction: the (data, (row, col)) form sums duplicate
        # coordinates, which replaces the per-edge `+=` on a LIL matrix.
        interaction_matrix = csr_matrix(
            (weights, (user_indices, poi_indices)),
            shape=(n_users, n_pois),
            dtype=np.float32,
        )
        interaction_matrix.sum_duplicates()
        # Drop explicitly stored zeros so nnz only counts real interactions
        interaction_matrix.eliminate_zeros()

    print(f"  ✓ Matrix shape: {interaction_matrix.shape}")
    print(f"  ✓ Non-zero interactions: {interaction_matrix.nnz}")

    # Get ordered IDs
    user_ids = meta_data['user_ids']
    level_num = int(level.split('_')[1])
    poi_ids = meta_data['poi_ids'][level_num]

    return interaction_matrix, user_ids, poi_ids


### Step 2 : Graph-based POI Representation Propagation

In [70]:
def propagate_poi_representations_via_graph(poi_context_graph, poi_embeddings, poi_ids, 
                                            alpha=0.15, max_iter=10, tol=1e-4):
    """
    Propagate POI representations through the context graph.

    Iterates ``G <- (1 - alpha) * A_norm @ G + alpha * E`` where ``E`` is the
    input embedding matrix and ``A_norm`` is the row-normalised, symmetrised
    adjacency matrix restricted to the POIs of this level.

    POIs that have no neighbour at this level are given a self-loop so that
    they keep their original embedding. Without it their row of ``A_norm`` is
    all zeros and every iteration would shrink them to ``alpha * E``, which
    disagrees with the "no edges -> embeddings unchanged" early return.

    Args:
        poi_context_graph: Graph object exposing ``subgraph``; the subgraph
            must expose ``number_of_edges()`` and ``edges(data=True)``
            (e.g. a NetworkX graph). Any other object is rejected with a
            warning.
        poi_embeddings: (n_pois, d) embeddings for POIs at this level
        poi_ids: List of POI IDs at this level, in the row order of
            ``poi_embeddings``
        alpha: Restart probability (how much to weight original embeddings)
        max_iter: Maximum iterations for propagation
        tol: Convergence tolerance on the largest absolute entry change

    Returns:
        G_l: Enhanced POI representations (n_pois, d). The input array itself
        is returned when the graph is not recognised or has no edges between
        POIs of this level.
    """
    n_pois, d = poi_embeddings.shape
    print(f"Propagating {n_pois} POIs with dimension {d}")

    if not hasattr(poi_context_graph, 'subgraph'):
        print("Warning: Graph format not recognized, returning original embeddings")
        return poi_embeddings

    # Map each POI ID to its row index at this level
    poi_id_to_idx = {poi_id: idx for idx, poi_id in enumerate(poi_ids)}

    # Restrict the graph to the POIs of this level
    level_subgraph = poi_context_graph.subgraph(set(poi_ids))
    n_edges = level_subgraph.number_of_edges()

    print(f"    Found {n_edges} edges between POIs at this level")

    if n_edges == 0:
        print("    No edges found - returning original embeddings")
        return poi_embeddings

    # Build the symmetric adjacency matrix for this level
    A = lil_matrix((n_pois, n_pois), dtype=np.float32)

    for u, v, data in level_subgraph.edges(data=True):
        u_idx = poi_id_to_idx.get(u)
        v_idx = poi_id_to_idx.get(v)
        if u_idx is None or v_idx is None:
            continue
        weight = data.get('weight', 1.0)
        A[u_idx, v_idx] = weight
        A[v_idx, u_idx] = weight  # Symmetric

    A = A.tocsr()

    # Self-loops for POIs with no stored neighbour: they then average only
    # over themselves and therefore stay at their original embedding.
    isolated = np.diff(A.indptr) == 0
    if isolated.any():
        A = A + diags(isolated.astype(np.float32), format='csr')

    # Normalize adjacency matrix (row-wise)
    row_sums = np.array(A.sum(axis=1)).flatten()
    row_sums[row_sums == 0] = 1  # Avoid division by zero (weights cancelling out)
    D_inv = diags(1.0 / row_sums)
    A_norm = D_inv @ A

    # Iterative propagation
    G_l = poi_embeddings.copy()

    for iteration in range(max_iter):
        G_l_new = (1 - alpha) * (A_norm @ G_l) + alpha * poi_embeddings

        # Convergence is measured on the largest single-entry change
        delta = np.abs(G_l_new - G_l).max()
        G_l = G_l_new

        if delta < tol:
            print(f"    Converged at iteration {iteration + 1}")
            break
    else:
        # for/else: only reached when the loop never hit `break`
        print(f"    Reached max iterations ({max_iter})")

    return G_l


def build_graph_enhanced_poi_matrix(poi_context_graph, poi_embeddings_data, 
                                    meta_data, level):
    """
    Build the graph-enhanced POI representation matrix G^l for one level.

    The level's embedding rows are re-ordered to follow the POI order in
    ``meta_data['poi_ids']``; POIs that have no embedding row get a zero
    vector. The aligned matrix is then passed through
    ``propagate_poi_representations_via_graph``.

    Args:
        poi_context_graph: Graph handed on to the propagation step.
        poi_embeddings_data: Dict whose ``'poi_embeddings'[level]`` entry holds
            ``'embeddings'`` (a 2-D array) and ``'poi_ids'`` (row order of
            that array).
        meta_data: Dict whose ``'poi_ids'`` entry is indexed by level number.
        level: Level name of the form ``'level_<n>'``.

    Returns:
        The propagated POI representation matrix.
    """
    level_index = int(level.split('_')[1])

    # Raw embeddings and the POI order they were stored in
    level_entry = poi_embeddings_data['poi_embeddings'][level]
    raw_embeddings = level_entry['embeddings']
    embedding_poi_ids = level_entry['poi_ids']

    # Canonical POI order for this level
    poi_ids = meta_data['poi_ids'][level_index]

    print(f"  Building G^{level_index} with {len(poi_ids)} POIs")
    print(f"  POI embeddings shape: {raw_embeddings.shape}")

    # Row of each POI inside the raw embedding matrix
    row_lookup = {poi_id: row for row, poi_id in enumerate(embedding_poi_ids)}

    # Resolve every canonical POI to its raw row (None when it has none)
    source_rows = [row_lookup.get(poi_id) for poi_id in poi_ids]
    missing_count = sum(row is None for row in source_rows)

    ordered_embeddings = np.array([
        raw_embeddings[row] if row is not None else np.zeros(raw_embeddings.shape[1])
        for row in source_rows
    ])

    if missing_count > 0:
        print(f"  Warning: {missing_count} POIs missing from embeddings (using zero vectors)")

    print(f"  Ordered embeddings shape: {ordered_embeddings.shape}")

    # Propagate through graph
    return propagate_poi_representations_via_graph(
        poi_context_graph, ordered_embeddings, poi_ids
    )


### Step 3 : Build User Representation with Explicit Attributes

In [67]:
def build_user_representation_matrix(base_interaction_matrix, user_embeddings_data,
                                    meta_data, G_l, beta=0.5, projection_seed=42):
    """
    Build the user representation matrix U^l.

    ``U^l = beta * interaction_based + (1 - beta) * explicit`` where

    * ``interaction_based`` is ``base_interaction_matrix @ G_l`` divided, per
      user, by that user's total interaction weight (users without any
      interaction are divided by 1), and
    * ``explicit`` holds the users' explicit embeddings in the order of
      ``meta_data['user_ids']``; users without an embedding get a zero row.

    ``user_embeddings_data['user_embeddings']`` may be
      1. a dict with ``'embeddings'`` (2-D array) and ``'user_ids'``,
      2. a 2-D ``np.ndarray`` used as is, or
      3. a mapping ``user_id -> embedding``.
    When the key is absent, zero vectors are used.

    If the explicit embedding width differs from ``G_l.shape[1]`` the
    embeddings are mapped through a random Gaussian projection (scale 0.01).
    The projection is drawn from a generator seeded with ``projection_seed``
    so that repeated calls (e.g. one per level) use the same projection and
    results are reproducible.

    Args:
        base_interaction_matrix: (n_users, n_pois) interaction matrix.
        user_embeddings_data: Dict as described above.
        meta_data: Dict providing ``'user_ids'`` (canonical user order).
        G_l: (n_pois, d) POI representation matrix.
        beta: Weight of the interaction-based component.
        projection_seed: Seed for the random projection; ``None`` draws a
            fresh, non-reproducible projection.

    Returns:
        U_l: (n_users, d) user representation matrix.

    Raises:
        ValueError: If the number of explicit embedding rows differs from the
            number of rows of ``base_interaction_matrix``.
    """
    n_users = base_interaction_matrix.shape[0]
    d = G_l.shape[1]

    print(f"  Building U matrix: {n_users} users, dimension {d}")

    # Component 1: Interaction-based user representation
    interaction_based = base_interaction_matrix @ G_l

    # Normalize by each user's total interaction weight
    user_interaction_counts = np.array(
        base_interaction_matrix.sum(axis=1)
    ).flatten()
    user_interaction_counts[user_interaction_counts == 0] = 1

    interaction_based = interaction_based / user_interaction_counts[:, np.newaxis]

    print(f"  Interaction-based representation shape: {interaction_based.shape}")

    def _stack_in_user_order(has_embedding, get_embedding, zero_dim):
        """Stack one row per canonical user; users without a row get zeros."""
        rows = []
        missing_count = 0
        for user_id in meta_data['user_ids']:
            if has_embedding(user_id):
                rows.append(get_embedding(user_id))
            else:
                rows.append(np.zeros(zero_dim))
                missing_count += 1
        if missing_count > 0:
            print(f"  Warning: {missing_count} users missing from embeddings")
        return np.array(rows)

    # Component 2: Explicit user attributes
    if 'user_embeddings' in user_embeddings_data:
        user_emb_data = user_embeddings_data['user_embeddings']

        if isinstance(user_emb_data, dict) and 'embeddings' in user_emb_data:
            # Same structure as POI embeddings: matrix + row order
            user_embeddings = user_emb_data['embeddings']
            user_id_to_embedding_idx = {
                user_id: idx for idx, user_id in enumerate(user_emb_data['user_ids'])
            }
            user_explicit_embeddings = _stack_in_user_order(
                lambda user_id: user_id in user_id_to_embedding_idx,
                lambda user_id: user_embeddings[user_id_to_embedding_idx[user_id]],
                user_embeddings.shape[1],
            )

        elif isinstance(user_emb_data, np.ndarray):
            # Direct array, assumed to already follow the canonical user order
            user_explicit_embeddings = user_emb_data

        else:
            # Mapping user_id -> embedding. The zero-vector width is taken
            # once from the first stored embedding; an empty mapping falls
            # back to the target width d.
            if len(user_emb_data) > 0:
                zero_dim = len(user_emb_data[next(iter(user_emb_data))])
            else:
                zero_dim = d
            user_explicit_embeddings = _stack_in_user_order(
                lambda user_id: user_id in user_emb_data,
                lambda user_id: user_emb_data[user_id],
                zero_dim,
            )

    else:
        # No user embeddings, use zero vectors
        print("  No user embeddings found, using zero vectors")
        user_explicit_embeddings = np.zeros((n_users, d))

    print(f"  User explicit embeddings shape: {user_explicit_embeddings.shape}")

    # A single-row array would otherwise broadcast silently over all users
    if user_explicit_embeddings.shape[0] != n_users:
        raise ValueError(
            f"Explicit user embeddings have {user_explicit_embeddings.shape[0]} rows "
            f"but the interaction matrix has {n_users} users"
        )

    # Ensure dimensions match
    explicit_dim = user_explicit_embeddings.shape[1]
    if explicit_dim != d:
        print(f"  Projecting user embeddings from {explicit_dim} to {d}")
        # Seeded random projection: identical for every call with the same seed
        rng = np.random.default_rng(projection_seed)
        projection = rng.standard_normal((explicit_dim, d)) * 0.01
        user_explicit_embeddings = user_explicit_embeddings @ projection

    # Combine both components
    U_l = beta * interaction_based + (1 - beta) * user_explicit_embeddings

    print(f"  Final U^l shape: {U_l.shape}")

    return U_l

### Step 4 : Compute Historical Check-in Matrix

In [68]:
def compute_historical_checkin_matrix(U_l, G_l):
    """
    Score every user against every POI: H^l = U^l · (G^l)^T.

    Rows of the result follow the rows of ``U_l`` and columns follow the rows
    of ``G_l``.
    """
    return U_l @ G_l.transpose()

### Step 5 : Build Pipeline

In [69]:
def build_complete_historical_checkin_pipeline(interactions_data, meta_data, 
                                                poi_embeddings_data, user_embeddings_data,
                                                poi_context_graph,
                                                levels=None,
                                                output_path='historical_checkin_matrices.pkl'):
    """
    Run the complete historical check-in pipeline for every requested level.

    For each level the four steps are executed in order:
      1. base user-POI interaction matrix,
      2. graph-enhanced POI matrix G^l,
      3. user representation matrix U^l,
      4. historical check-in matrix H^l = U^l · (G^l)^T.
    All per-level results are collected and pickled to ``output_path``.

    Args:
        interactions_data: Input of ``build_base_interaction_matrix``.
        meta_data: Dict providing ``'user_ids'`` and per-level ``'poi_ids'``.
        poi_embeddings_data: Input of ``build_graph_enhanced_poi_matrix``.
        user_embeddings_data: Input of ``build_user_representation_matrix``.
        poi_context_graph: POI context graph used for propagation.
        levels: Iterable of level names (``'level_<n>'``) to process.
            Defaults to ``level_0`` .. ``level_3``.
        output_path: Destination of the pickle file; ``None`` skips saving.

    Returns:
        Dict with the keys ``'base_matrices'``, ``'G_matrices'``,
        ``'U_matrices'``, ``'H_matrices'`` and ``'poi_ids'`` (each keyed by
        level name) plus ``'user_ids'``.
    """
    if levels is None:
        levels = ['level_0', 'level_1', 'level_2', 'level_3']

    print("="*70)
    print("BUILDING HISTORICAL CHECK-IN MATRICES")
    print("="*70)

    results = {
        'base_matrices': {},
        'G_matrices': {},
        'U_matrices': {},
        'H_matrices': {},
        'poi_ids': {},
        'user_ids': meta_data['user_ids']
    }

    for level in levels:
        print(f"\n{'='*70}")
        print(f"Processing {level}...")
        print(f"{'='*70}")

        # Step 1: Base interaction matrix
        print("Step 1: Building base interaction matrix...")
        base_matrix, user_ids, poi_ids = build_base_interaction_matrix(
            interactions_data, meta_data, level
        )
        results['base_matrices'][level] = base_matrix
        results['poi_ids'][level] = poi_ids
        # An empty matrix has zero cells; report density 0 instead of dividing by zero
        n_cells = base_matrix.shape[0] * base_matrix.shape[1]
        density = base_matrix.nnz / n_cells if n_cells else 0.0
        print(f"  ✓ Base matrix shape: {base_matrix.shape}")
        print(f"  ✓ Non-zero interactions: {base_matrix.nnz}")
        print(f"  ✓ Density: {density:.6f}")

        # Step 2: Graph-enhanced POI representations
        print("\nStep 2: Building graph-enhanced POI matrix G^l...")
        G_l = build_graph_enhanced_poi_matrix(
            poi_context_graph, poi_embeddings_data, meta_data, level
        )
        results['G_matrices'][level] = G_l
        print(f"  ✓ G^l shape: {G_l.shape}")

        # Step 3: User representation matrix
        print("\nStep 3: Building user representation matrix U^l...")
        U_l = build_user_representation_matrix(
            base_matrix, user_embeddings_data, meta_data, G_l
        )
        results['U_matrices'][level] = U_l
        print(f"  ✓ U^l shape: {U_l.shape}")

        # Step 4: Historical check-in matrix
        print("\nStep 4: Computing historical check-in matrix H^l...")
        H_l = compute_historical_checkin_matrix(U_l, G_l)
        results['H_matrices'][level] = H_l
        print(f"  ✓ H^l shape: {H_l.shape}")
        print(f"  ✓ H^l stats: min={H_l.min():.4f}, max={H_l.max():.4f}, mean={H_l.mean():.4f}")

    # Save results
    print(f"\n{'='*70}")
    if output_path is not None:
        print("Saving results...")
        with open(output_path, 'wb') as f:
            pickle.dump({
                'H_matrices': results['H_matrices'],
                'U_matrices': results['U_matrices'],
                'G_matrices': results['G_matrices'],
                'base_matrices': results['base_matrices'],
                'poi_ids': results['poi_ids'],
                'user_ids': results['user_ids'],
                'metadata': {
                    'formula': 'H^l = U^l · (G^l)^T',
                    'U_l_components': ['interaction_based', 'explicit_user_attributes'],
                    'G_l_components': ['explicit_poi_embeddings', 'graph_propagation'],
                    'created_at': str(np.datetime64('now'))
                }
            }, f)

        print(f"✓ Saved to '{output_path}'")
    print(f"{'='*70}")
    print("PIPELINE COMPLETE!")
    print(f"{'='*70}\n")

    return results

In [71]:
if __name__ == "__main__":
    # Directory holding the pickled inputs of the pipeline
    EMBEDDINGS_DIR = '../../Sources/Embeddings'

    def _load_pickle(filename):
        """Load one pickled artefact from EMBEDDINGS_DIR.

        NOTE(review): pickle.load can execute arbitrary code while
        deserialising; only point this at files from a trusted source.
        """
        with open(f"{EMBEDDINGS_DIR}/{filename}", 'rb') as f:
            return pickle.load(f)

    interactions_data = _load_pickle('interactions.pkl')
    meta_data = _load_pickle('metadata.pkl')
    poi_embeddings_data = _load_pickle('poi_embeddings.pkl')
    user_embeddings_data = _load_pickle('user_embeddings.pkl')
    poi_context_graph = _load_pickle('poi_context_graph.pkl')

    print("="*70)
    print("Testing POI embeddings structure...")
    print("="*70)

    # Smoke-test build_graph_enhanced_poi_matrix on a single level
    level = 'level_0'
    print(f"\nTesting {level}:")
    G_l = build_graph_enhanced_poi_matrix(
        poi_context_graph, poi_embeddings_data, meta_data, level
    )
    print(f"✓ G^l shape: {G_l.shape}")
    print(f"✓ G^l stats: min={G_l.min():.4f}, max={G_l.max():.4f}, mean={G_l.mean():.4f}")

    # Smoke-test the user representation on the same level
    print("\n" + "="*70)
    print("Testing user representation matrix...")
    print("="*70)

    base_matrix, user_ids, poi_ids = build_base_interaction_matrix(
        interactions_data, meta_data, level
    )
    print(f"\nBase matrix: {base_matrix.shape}")

    U_l = build_user_representation_matrix(
        base_matrix, user_embeddings_data, meta_data, G_l
    )
    print(f"✓ U^l shape: {U_l.shape}")

    print("="*70)
    print("ANALYZING POI CONTEXT GRAPH FOR EACH LEVEL")
    print("="*70)

    results = build_complete_historical_checkin_pipeline(
        interactions_data=interactions_data,
        meta_data=meta_data,
        poi_embeddings_data=poi_embeddings_data,
        user_embeddings_data=user_embeddings_data,
        poi_context_graph=poi_context_graph
    )

    # Original embeddings, re-ordered to the canonical POI order used for the
    # G matrices. The raw matrix follows the embedding file's own 'poi_ids'
    # order, so comparing it row-by-row without alignment could pair up
    # different POIs. POIs without a raw embedding get a zero row, mirroring
    # build_graph_enhanced_poi_matrix.
    level_embeddings = poi_embeddings_data['poi_embeddings'][level]
    raw_emb = level_embeddings['embeddings']
    raw_row_of = {poi_id: row for row, poi_id in enumerate(level_embeddings['poi_ids'])}
    original_emb = np.array([
        raw_emb[raw_row_of[poi_id]] if poi_id in raw_row_of else np.zeros(raw_emb.shape[1])
        for poi_id in results['poi_ids'][level]
    ])

    # Graph-enhanced embeddings (canonical order)
    enhanced_emb = results['G_matrices'][level]

    # Element-wise change introduced by the propagation
    diff = np.abs(enhanced_emb - original_emb)

    print(f"\n{level}:")
    print(f"  Original embedding stats:")
    print(f"    Mean: {original_emb.mean():.4f}, Std: {original_emb.std():.4f}")
    print(f"  Enhanced embedding stats:")
    print(f"    Mean: {enhanced_emb.mean():.4f}, Std: {enhanced_emb.std():.4f}")
    print(f"  Absolute difference:")
    print(f"    Mean: {diff.mean():.4f}, Max: {diff.max():.4f}")
    print(f"  % of embeddings changed (>0.01): {(diff.max(axis=1) > 0.01).mean()*100:.1f}%")


    print("\n" + "="*70)
    print("SUMMARY OF RESULTS")
    print("="*70)
    # Iterate over what the pipeline actually produced (insertion order = processing order)
    for level, H_l in results['H_matrices'].items():
        print(f"\n{level}:")
        print(f"  H^l shape: {H_l.shape}")
        print(f"  Min score: {H_l.min():.4f}")
        print(f"  Max score: {H_l.max():.4f}")
        print(f"  Mean score: {H_l.mean():.4f}")
        print(f"  Std score: {H_l.std():.4f}")

Testing POI embeddings structure...

Testing level_0:
  Building G^0 with 4696 POIs
  POI embeddings shape: (4696, 221)
  Ordered embeddings shape: (4696, 221)
Propagating 4696 POIs with dimension 221
    Found 0 edges between POIs at this level
    No edges found - returning original embeddings
✓ G^l shape: (4696, 221)
✓ G^l stats: min=-7.7627, max=64.9592, mean=0.0289

Testing user representation matrix...
  level_0: 21 users × 4696 POIs
  Using pre-built interaction matrix
  ✓ Matrix shape: (21, 4696)
  ✓ Non-zero interactions: 257

Base matrix: (21, 4696)
  Building U matrix: 21 users, dimension 221
  Interaction-based representation shape: (21, 221)
  User explicit embeddings shape: (21, 71)
  Projecting user embeddings from 71 to 221
  Final U^l shape: (21, 221)
✓ U^l shape: (21, 221)
ANALYZING POI CONTEXT GRAPH FOR EACH LEVEL
BUILDING HISTORICAL CHECK-IN MATRICES

Processing level_0...
Step 1: Building base interaction matrix...
  level_0: 21 users × 4696 POIs
  Using pre-built 

  interactions_data = pickle.load(f)
  poi_embeddings_data = pickle.load(f)
