In [None]:
import pickle
import pandas as pd
import numpy as np
import networkx as nx
import json
from scipy.sparse import csr_matrix, lil_matrix, diags
from collections import defaultdict
from tqdm import tqdm

In [4]:
def build_historical_matrices(
    csv_path, 
    poi_tree_path, 
    poi_emb_path, 
    user_emb_path, 
    graph_path, 
    output_path
):
    print("="*70)
    print("LOADING DATA")
    print("="*70)
    
    # Load CSV
    df = pd.read_csv(csv_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    print(f"✓ Loaded {len(df)} interactions from CSV")
    
    # Load POI Tree (for ID mapping poi_X_Name -> UUID)
    with open(poi_tree_path, 'r') as f:
        poi_tree = json.load(f)
    
    # Build mapping: poi_key -> uuid
    poi_key_to_uuid = {}
    uuid_to_level = {}
    for level_key in poi_tree.keys():
        if level_key.startswith('level_'):
            level_num = int(level_key.split('_')[1])
            for poi_key, poi_info in poi_tree[level_key].items():
                if 'uuid' in poi_info:
                    poi_key_to_uuid[poi_key] = poi_info['uuid']
                    uuid_to_level[poi_info['uuid']] = level_num
    
    print(f"✓ POI Tree: {len(poi_key_to_uuid)} POIs mapped")
    
    # Load embeddings and graph
    with open(poi_emb_path, 'rb') as f:
        poi_emb_data = pickle.load(f)
    
    with open(user_emb_path, 'rb') as f:
        user_emb_data = pickle.load(f)
        X_A = user_emb_data['X_A']  # (n_users, 39)
        X_T = user_emb_data['X_T']  # (n_users, 32)
        U_explicit = np.concatenate([X_A, X_T], axis=1)
        n_users = U_explicit.shape[0]
        # Get user UUIDs
        user_ids = user_emb_data.get('user_ids', [f"user_{i}" for i in range(n_users)])
        user_to_idx = {uid: i for i, uid in enumerate(user_ids)}
    
    with open(graph_path, 'rb') as f:
        graph_container = pickle.load(f)
        G = graph_container.get('graph', graph_container) if isinstance(graph_container, dict) else graph_container
    
    print(f"✓ Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    print(f"✓ Users: {n_users}")
    
    # Filter CSV to valid users and POIs
    print("\nProcessing CSV...")
    
    # Map user IDs
    df['user_idx'] = df['user_id'].map(user_to_idx)
    
    # Map POI IDs (poi_X_Name -> UUID)
    df['poi_uuid'] = df['poi_id'].map(poi_key_to_uuid)
    
    # Get level for each UUID
    df['level'] = df['poi_uuid'].map(uuid_to_level)
    
    # Filter valid rows
    valid_df = df[(df['user_idx'].notna()) & (df['poi_uuid'].notna())].copy()
    valid_df['user_idx'] = valid_df['user_idx'].astype(int)
    
    print(f"✓ Valid interactions: {len(valid_df)}/{len(df)}")
    print(f"  Users: {valid_df['user_idx'].nunique()}")
    print(f"  POIs: {valid_df['poi_uuid'].nunique()}")
    
    # Group by level to create R^l matrices
    results = {}
    
    for level in range(4):
        level_key = f'level_{level}'
        print(f"\n{'='*70}")
        print(f"Building Level {level} (G^{level})")
        print(f"{'='*70}")
        
        # Get level-specific data
        level_df = valid_df[valid_df['level'] == level]
        
        if level_df.empty:
            print(f"  No interactions for level {level}, skipping")
            continue
        
        # Get POI info for this level
        if level_key not in poi_emb_data['poi_embeddings']:
            print(f"  No embeddings for {level_key}, skipping")
            continue
            
        level_poi_data = poi_emb_data['poi_embeddings'][level_key]
        poi_embeddings = level_poi_data['embeddings']
        poi_ids = level_poi_data['poi_ids']
        n_pois = len(poi_ids)
        poi_to_idx = {pid: i for i, pid in enumerate(poi_ids)}
        
        print(f"  POIs at level: {n_pois}")
        print(f"  Interactions: {len(level_df)}")
        
        # Step 1: Build R^l (Interaction Matrix)
        print("  Building R^l (interaction matrix)...")
        
        # Aggregate by user-POI pairs (sum visit values)
        agg = level_df.groupby(['user_idx', 'poi_uuid'])['value'].sum().reset_index()
        
        # Map POI UUID to index
        agg['poi_idx'] = agg['poi_uuid'].map(poi_to_idx)
        agg = agg[agg['poi_idx'].notna()]
        agg['poi_idx'] = agg['poi_idx'].astype(int)
        
        # Build sparse matrix
        rows = agg['user_idx'].values
        cols = agg['poi_idx'].values
        data = agg['value'].values
        
        R = csr_matrix((data, (rows, cols)), shape=(n_users, n_pois))
        print(f"    R^{level} shape: {R.shape}, nnz: {R.nnz}, density: {R.nnz/(R.shape[0]*R.shape[1]):.4f}")
        
        # Step 2: Propagate POI embeddings through graph (F^l)
        print("  Propagating POI embeddings through graph...")
        
        # Find which POIs are in graph
        graph_nodes = set(G.nodes())
        level_poi_set = set(poi_ids)
        valid_pois = list(level_poi_set & graph_nodes)
        
        if len(valid_pois) > 0:
            print(f"    Graph coverage: {len(valid_pois)}/{n_pois} ({100*len(valid_pois)/n_pois:.1f}%)")
            
            # Build subgraph adjacency
            valid_indices = [poi_to_idx[p] for p in valid_pois]
            subG = G.subgraph(valid_pois)
            
            # Local index mapping
            valid_list = sorted(valid_pois)
            local_idx = {p: i for i, p in enumerate(valid_list)}
            
            A = lil_matrix((len(valid_pois), len(valid_pois)))
            for u, v, d in subG.edges(data=True):
                if u in local_idx and v in local_idx:
                    w = d.get('weight_normalized', d.get('weight', 1.0))
                    A[local_idx[u], local_idx[v]] = w
            
            A = A.tocsr()
            row_sums = np.array(A.sum(axis=1)).flatten()
            row_sums[row_sums == 0] = 1
            A_norm = diags(1.0/row_sums) @ A
            
            # Propagate
            alpha = 0.15
            F_valid = poi_embeddings[valid_indices]
            F_prop = F_valid.copy()
            
            for it in range(10):
                F_new = (1-alpha) * (A_norm @ F_prop) + alpha * F_valid
                if np.abs(F_new - F_prop).max() < 1e-4:
                    break
                F_prop = F_new
            
            # Merge back
            F_l = poi_embeddings.copy()
            F_l[valid_indices] = F_prop
        else:
            print(f"    No graph coverage, using original embeddings")
            F_l = poi_embeddings
        
        # Step 3: Build U^l (User representations)
        print("  Building U^l (user representations)...")
        
        beta = 0.5
        
        # Component 1: Interaction-based (R @ F normalized)
        interaction_comp = R @ F_l
        row_sums = np.array(R.sum(axis=1)).flatten()
        row_sums[row_sums == 0] = 1
        interaction_comp = interaction_comp / row_sums[:, np.newaxis]
        
        # Component 2: Project explicit user features to match dimension
        if U_explicit.shape[1] != F_l.shape[1]:
            # Simple linear projection (could use PCA or learned weights)
            proj = np.random.randn(U_explicit.shape[1], F_l.shape[1]) * 0.01
            U_proj = U_explicit @ proj
        else:
            U_proj = U_explicit
        
        U_l = beta * interaction_comp + (1 - beta) * U_proj
        
        # Step 4: Compute G^l = U^l @ F^l^T
        print("  Computing G^l = U^l @ F^l^T...")
        G_l = U_l @ F_l.T
        
        print(f"    G^{level} shape: {G_l.shape}")
        print(f"    Stats: min={G_l.min():.2f}, max={G_l.max():.2f}, mean={G_l.mean():.2f}")
        
        results[level_key] = {
            'G_matrix': G_l,
            'U_matrix': U_l,
            'F_matrix': F_l,
            'R_matrix': R,
            'poi_ids': poi_ids,
            'user_ids': user_ids
        }
    
    # Save results
    print(f"\n{'='*70}")
    print("SAVING RESULTS")
    print(f"{'='*70}")
    
    save_data = {
        'G_matrices': {k: v['G_matrix'] for k, v in results.items()},
        'U_matrices': {k: v['U_matrix'] for k, v in results.items()},
        'F_matrices': {k: v['F_matrix'] for k, v in results.items()},
        'R_matrices_sparse': {k: v['R_matrix'] for k, v in results.items()},
        'metadata': {
            'n_users': n_users,
            'levels': list(results.keys()),
            'formula': 'G^l = (beta*(R^l @ F^l / norm) + (1-beta)*U_explicit) @ F^l^T',
            'beta': 0.5,
            'alpha': 0.15
        }
    }
    
    with open(output_path, 'wb') as f:
        pickle.dump(save_data, f)
    
    print(f"✓ Saved to {output_path}")
    
    # Summary
    print("\nSUMMARY:")
    for level_key, data in results.items():
        G = data['G_matrix']
        R_density = data['R_matrix'].nnz / (data['R_matrix'].shape[0] * data['R_matrix'].shape[1])
        print(f"{level_key}: G∈{G.shape}, R density={R_density:.4f}, G range=[{G.min():.1f}, {G.max():.1f}]")
    
    return results

In [7]:
if __name__ == "__main__":
    # Run the full pipeline on the project's v3 artefacts. Arguments are given
    # positionally, in the order the function signature declares them.
    results = build_historical_matrices(
        '../../Sources/Files/user_poi_interactions.csv',                    # csv_path
        '../../Sources/Files/poi_tree_with_uuids.json',                     # poi_tree_path
        '../../Sources/Embeddings v3/poi_embeddings.pkl',                   # poi_emb_path
        '../../Sources/Embeddings v3/user_embeddings.pkl',                  # user_emb_path
        '../../Sources/Embeddings v3/poi_context_graph.pkl',                # graph_path
        '../../Sources/Embeddings v3/historical_checkin_matrices.pkl',      # output_path
    )

LOADING DATA
✓ Loaded 567 interactions from CSV
✓ POI Tree: 6100 POIs mapped
✓ Graph: 235 nodes, 11471 edges
✓ Users: 21

Processing CSV...
✓ Valid interactions: 567/567
  Users: 21
  POIs: 235

Building Level 0 (G^0)
  POIs at level: 4696
  Interactions: 567
  Building R^l (interaction matrix)...
    R^0 shape: (21, 4696), nnz: 0, density: 0.0000
  Propagating POI embeddings through graph...
    No graph coverage, using original embeddings
  Building U^l (user representations)...
  Computing G^l = U^l @ F^l^T...
    G^0 shape: (21, 4696)
    Stats: min=-3.80, max=4.17, mean=0.01

Building Level 1 (G^1)
  No interactions for level 1, skipping

Building Level 2 (G^2)
  No interactions for level 2, skipping

Building Level 3 (G^3)
  No interactions for level 3, skipping

SAVING RESULTS
✓ Saved to ../../Sources/Embeddings v3/historical_checkin_matrices.pkl

SUMMARY:
level_0: G∈(21, 4696), R density=0.0000, G range=[-3.8, 4.2]
