In [9]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import json
from datetime import timedelta
from collections import defaultdict
from tqdm import tqdm
from scipy.spatial import cKDTree
import torch
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')

In [10]:
CO_SEARCH_TIME_WINDOW = timedelta(minutes=30)
CO_VISIT_TIME_WINDOW = timedelta(hours=3)
CO_VISIT_MAX_DISTANCE = 10  # km
PROXIMITY_THRESHOLD = 2  # km for geospatial edges
 
def haversine(lat1, lon1, lat2, lon2):
    """Calculate great circle distance in km."""
    R = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c
 
def calculate_bearing(lat1, lon1, lat2, lon2):
    """Calculate bearing from point 1 to point 2 in degrees."""
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    x = np.sin(dlon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
    bearing = np.degrees(np.arctan2(x, y))
    return (bearing + 360) % 360

In [11]:
print("Loading data...")
interactions_df = pd.read_csv('../../../Sources/Files/user_poi_interactions.csv')
interactions_df['timestamp'] = pd.to_datetime(interactions_df['timestamp'])
 
with open('../../../Sources/Files/poi_tree_with_uuids.json', 'r') as f:
    poi_tree = json.load(f)
 
print(f"Total interactions: {len(interactions_df)}")
 
poi_key_to_uuid = {}
uuid_to_poi_key = {}
poi_name_to_uuid = {}
poi_data_by_uuid = {}
 
for level_key in poi_tree.keys():
    if level_key.startswith('level_'):
        for poi_key, poi_info in poi_tree[level_key].items():
            uuid = poi_info.get('uuid')
            if uuid and 'data' in poi_info:
                poi_key_to_uuid[poi_key] = uuid
                uuid_to_poi_key[uuid] = poi_key
                if poi_info.get('name'):
                    poi_name_to_uuid[poi_info['name']] = uuid
                
                poi_data_by_uuid[uuid] = {
                    'lat': poi_info['data'].get('latitude'),
                    'long': poi_info['data'].get('longitude'),
                    'name': poi_info['data'].get('name'),
                    'category': poi_info['data'].get('category'),
                    'level': level_key,
                    'key': poi_key
                }
 
interaction_poi_ids = set(interactions_df['poi_id'].unique())
uuid_matches = len(interaction_poi_ids & set(uuid_to_poi_key.keys()))
key_matches = len(interaction_poi_ids & set(poi_key_to_uuid.keys()))
 
if uuid_matches >= key_matches:
    print("✓ Using UUID format")
    id_mapping = {k: k for k in uuid_to_poi_key.keys()}
else:
    print("✓ Converting Key format to UUID")
    id_mapping = poi_key_to_uuid
    interactions_df['poi_id'] = interactions_df['poi_id'].map(id_mapping)
    interactions_df = interactions_df.dropna(subset=['poi_id'])
 
# Ensure all POIs have coordinates
interaction_poi_ids = set(interactions_df['poi_id'].unique())
pois_with_coords = {}
missing_coords = []
 
for pid in interaction_poi_ids:
    if pid in poi_data_by_uuid:
        lat = poi_data_by_uuid[pid]['lat']
        long = poi_data_by_uuid[pid]['long']
        if lat is not None and long is not None:
            pois_with_coords[pid] = poi_data_by_uuid[pid]
        else:
            missing_coords.append(pid)
 
if missing_coords:
    print(f"⚠️  Warning: {len(missing_coords)} POIs missing coordinates. Removing from graph.")
    interactions_df = interactions_df[~interactions_df['poi_id'].isin(missing_coords)]
 
if len(pois_with_coords) == 0:
    raise ValueError("No POIs with valid coordinates!")
 
poi_coords = pois_with_coords
print(f"Final POIs with coordinates: {len(poi_coords)}")
 
# Initialize graph
G = nx.MultiDiGraph()
G.add_nodes_from(poi_coords.keys())
print(f"\nGraph initialized with {G.number_of_nodes()} nodes")
 

Loading data...
Total interactions: 567
✓ Converting Key format to UUID
Final POIs with coordinates: 235

Graph initialized with 235 nodes


In [12]:
def build_cosearch_edges_fixed(interactions_df, G, time_window):
    print("\n" + "="*70)
    print("BUILDING CO-SEARCH EDGES (Fixed)")
    print("="*70)
    
    searches = interactions_df[interactions_df['interaction_type'] == 'search'].copy()
    print(f"Searches: {len(searches)} ({searches['user_id'].nunique()} users)")
    
    cosearch_edges = defaultdict(int)
    
    for user_id, user_searches in tqdm(searches.groupby('user_id'), desc="Co-searches"):
        user_searches = user_searches.sort_values('timestamp').reset_index(drop=True)
        
        if len(user_searches) < 2:
            continue
            
        # Sliding window: for each search, look at subsequent searches within window
        for i in range(len(user_searches)):
            poi_i = user_searches.iloc[i]['poi_id']
            time_i = user_searches.iloc[i]['timestamp']
            
            # Look forward in time window
            mask = (user_searches['timestamp'] > time_i) & \
				(user_searches['timestamp'] <= time_i + time_window)
            window_searches = user_searches[mask]
            
            for _, row_j in window_searches.iterrows():
                poi_j = row_j['poi_id']
                if poi_i != poi_j:
                    # Sort to ensure undirected edge consistency
                    edge_key = tuple(sorted([poi_i, poi_j]))
                    cosearch_edges[edge_key] += 1
    
    print(f"Unique co-search pairs: {len(cosearch_edges)}")
    
    if len(cosearch_edges) == 0:
        return G
    
    # Symmetric normalization (Jaccard-like)
    # Calculate node degrees for normalization
    node_counts = defaultdict(int)
    for (poi_i, poi_j), count in cosearch_edges.items():
        node_counts[poi_i] += count
        node_counts[poi_j] += count
    
    # Add symmetric edges with normalized weights
    for (poi_i, poi_j), count in cosearch_edges.items():
        # Symmetric weight: count / (total searches of i + total searches of j - count)
        union = node_counts[poi_i] + node_counts[poi_j] - count
        weight = count / union if union > 0 else 0
        
        # Add edge features for GNN
        # For co-search: [weight, normalized_count, type_encoding=0]
        edge_attr = np.array([weight, count / max(cosearch_edges.values()), 0.0])
        
        # Add both directions explicitly with same weight
        G.add_edge(poi_i, poi_j, edge_type='co-search', weight=weight, 
				raw_count=count, edge_attr=edge_attr)
        G.add_edge(poi_j, poi_i, edge_type='co-search', weight=weight,
				raw_count=count, edge_attr=edge_attr)
    
    print(f"Added {len(cosearch_edges)} bidirectional co-search edges")
    return G

In [13]:
def build_covisit_edges_fixed(interactions_df, G, poi_coords, time_window, max_distance):
    
    print("\n" + "="*70)
    print("BUILDING CO-VISIT EDGES (Fixed)")
    print("="*70)
    
    visits = interactions_df[interactions_df['interaction_type'] == 'visit'].copy()
    print(f"Visits: {len(visits)} ({visits['user_id'].nunique()} users)")
    
    covisit_edges = defaultdict(lambda: {'count': 0, 'distances': [], 'time_gaps': []})
    skipped = set()
    
    for user_id, user_visits in tqdm(visits.groupby('user_id'), desc="Co-visits"):
        user_visits = user_visits.sort_values('timestamp').reset_index(drop=True)
        
        if len(user_visits) < 2:
            continue
        
        for i in range(len(user_visits) - 1):
            poi_i = user_visits.iloc[i]['poi_id']
            time_i = user_visits.iloc[i]['timestamp']
            
            if poi_i not in poi_coords:
                skipped.add(poi_i)
                continue
            
            coords_i = poi_coords[poi_i]
            
            for j in range(i + 1, len(user_visits)):
                poi_j = user_visits.iloc[j]['poi_id']
                time_j = user_visits.iloc[j]['timestamp']
                
                if time_j - time_i > time_window:
                    break
                
                if poi_j not in poi_coords:
                    skipped.add(poi_j)
                    continue
                
                # Calculate distance
                coords_j = poi_coords[poi_j]
                distance = haversine(coords_i['lat'], coords_i['long'],
								coords_j['lat'], coords_j['long'])
                
                if distance <= max_distance and poi_i != poi_j:
                    edge_key = tuple(sorted([poi_i, poi_j]))
                    covisit_edges[edge_key]['count'] += 1
                    covisit_edges[edge_key]['distances'].append(distance)
                    covisit_edges[edge_key]['time_gaps'].append((time_j - time_i).seconds / 3600)
    
    print(f"Unique co-visit pairs: {len(covisit_edges)}")
    print(f"Skipped POIs: {len(skipped)}")
    
    if len(covisit_edges) == 0:
        return G
    
    # Symmetric normalization
    node_counts = defaultdict(int)
    for (poi_i, poi_j), data in covisit_edges.items():
        count = data['count']
        node_counts[poi_i] += count
        node_counts[poi_j] += count
    
    for (poi_i, poi_j), data in covisit_edges.items():
        count = data['count']
        union = node_counts[poi_i] + node_counts[poi_j] - count
        weight = count / union if union > 0 else 0
        
        # Rich edge features
        avg_dist = np.mean(data['distances'])
        avg_time = np.mean(data['time_gaps'])
        # Features: [weight, normalized_count, avg_distance/10, avg_time/3, type_encoding=1]
        edge_attr = np.array([
            weight,
            count / max(v['count'] for v in covisit_edges.values()),
            avg_dist / 10.0,  # Normalize to ~0-1
            avg_time / 3.0,   # Normalize hours
            1.0               # Type indicator for co-visit
        ])
        
        # Add bidirectional
        G.add_edge(poi_i, poi_j, edge_type='co-visit', weight=weight,
				raw_count=count, avg_distance=avg_dist, avg_time_gap=avg_time,
				edge_attr=edge_attr)
        G.add_edge(poi_j, poi_i, edge_type='co-visit', weight=weight,
				raw_count=count, avg_distance=avg_dist, avg_time_gap=avg_time,
				edge_attr=edge_attr)
    
    print(f"Added {len(covisit_edges)} bidirectional co-visit edges")
    return G

In [14]:
def build_geospatial_edges_optimized(G, poi_coords, threshold_km):
    print("\n" + "="*70)
    print("BUILDING GEOSPATIAL EDGES (Optimized with KDTree)")
    print("="*70)
    
    poi_list = list(poi_coords.keys())
    n_pois = len(poi_list)
    
    # Build coordinate array (lat, lon)
    coords = np.array([[poi_coords[p]['lat'], poi_coords[p]['long']] for p in poi_list])
    
    # Convert km to approximate degrees for initial query (1 deg lat ~ 111 km)
    threshold_deg = threshold_km / 111.0
    
    # Build KD-tree
    tree = cKDTree(coords)
    
    # Query pairs within threshold (much faster than nested loops)
    pairs = tree.query_pairs(r=threshold_deg, output_type='ndarray')
    print(f"Found {len(pairs)} candidate pairs within {threshold_km} km")
    
    geospatial_edges = 0
    for i, j in tqdm(pairs, desc="Processing geospatial edges"):
        poi_i = poi_list[i]
        poi_j = poi_list[j]
        
        # Verify exact distance (more accurate than degree approximation)
        lat_i, lon_i = coords[i]
        lat_j, lon_j = coords[j]
        dist = haversine(lat_i, lon_i, lat_j, lon_j)
        
        if dist <= threshold_km and poi_i != poi_j:
            # Calculate bearing for directional feature
            bearing = calculate_bearing(lat_i, lon_i, lat_j, lon_j)
            
            # Weight: inverse distance with exponential decay
            weight = np.exp(-dist / threshold_km)  # Range: [e^-1, 1] ~ [0.368, 1]
            
            # Edge features for GNN
            # [weight, normalized_distance, bearing/360, type_encoding=2]
            edge_attr = np.array([
                weight,
                dist / threshold_km,  # Normalized 0-1
                bearing / 360.0,      # Normalized 0-1
                2.0                   # Type indicator for geospatial
            ])
            
            # Add both directions (geospatial is symmetric but features differ by direction!)
            # Note: bearing is directional, so i->j and j->i have different bearings
            G.add_edge(poi_i, poi_j, edge_type='geospatial', weight=weight,
                       distance_km=dist, bearing=bearing, edge_attr=edge_attr)
            
            # Reverse direction: bearing + 180
            bearing_rev = (bearing + 180) % 360
            edge_attr_rev = np.array([
                weight,
                dist / threshold_km,
                bearing_rev / 360.0,
                2.0
            ])
            G.add_edge(poi_j, poi_i, edge_type='geospatial', weight=weight,
                       distance_km=dist, bearing=bearing_rev, edge_attr=edge_attr_rev)
            
            geospatial_edges += 1
    
    print(f"Added {geospatial_edges} bidirectional geospatial edges")
    return G

In [15]:
def add_self_loops(G):
    print("\n" + "="*70)
    print("ADDING SELF-LOOPS (Critical for GNN)")
    print("="*70)
    
    self_loops = 0
    for node in G.nodes():
        # Check if self-loop exists
        if not G.has_edge(node, node):
            # Self-loop weight = 1.0, features = [1, 0, 0, 3] (type_encoding=3 for self)
            edge_attr = np.array([1.0, 0.0, 0.0, 3.0])
            G.add_edge(node, node, edge_type='self', weight=1.0, 
					raw_count=1, edge_attr=edge_attr)
            self_loops += 1
    
    print(f"Added {self_loops} self-loops")
    return G

def normalize_edge_weights_by_type(G):
    """
    Geospatial: ~0.5 avg, Behavioral: 1.0
    Normalize each type to have mean 1.0 for balanced GNN aggregation
    """
    print("\n" + "="*70)
    print("NORMALIZING EDGE WEIGHTS BY TYPE")
    print("="*70)
    
    edge_types = defaultdict(list)
    for u, v, data in G.edges(data=True):
        edge_types[data['edge_type']].append(data['weight'])
    
    # Calculate normalization factors
    type_stats = {}
    for edge_type, weights in edge_types.items():
        mean_w = np.mean(weights)
        std_w = np.std(weights)
        type_stats[edge_type] = {'mean': mean_w, 'std': std_w}
        print(f"  {edge_type}: mean={mean_w:.4f}, std={std_w:.4f}")
    
    # Normalize weights in place
    for u, v, data in G.edges(data=True):
        etype = data['edge_type']
        if type_stats[etype]['mean'] > 0:
            # Normalize to mean 1.0, then scale by original std if needed
            normalized = data['weight'] / type_stats[etype]['mean']
            data['weight_normalized'] = normalized
        else:
            data['weight_normalized'] = data['weight']
    
    return G, type_stats

In [17]:
if __name__ == "__main__":
    print("\n" + "="*70)
    print("STARTING POI CONTEXT GRAPH CONSTRUCTION")
    print("="*70)
    
    # Build edges with fixed functions
    G = build_cosearch_edges_fixed(interactions_df, G, CO_SEARCH_TIME_WINDOW)
    G = build_covisit_edges_fixed(interactions_df, G, poi_coords, 
								CO_VISIT_TIME_WINDOW, CO_VISIT_MAX_DISTANCE)
    G = build_geospatial_edges_optimized(G, poi_coords, PROXIMITY_THRESHOLD)
    
    G = add_self_loops(G)
    G, type_stats = normalize_edge_weights_by_type(G)
    
    # Check for isolated nodes (should be none after self-loops)
    isolated = [n for n in G.nodes() if G.degree(n) == 0]
    if isolated:
        print(f"\n⚠️  WARNING: {len(isolated)} isolated nodes detected!")
    else:
        print("\n✓ No isolated nodes (all have self-loops)")
    
    # Statistics
    print("\n" + "="*70)
    print("GRAPH STATISTICS (FIXED)")
    print("="*70)
    print(f"Nodes: {G.number_of_nodes()}")
    print(f"Edges: {G.number_of_edges()}")
    
    edge_types = defaultdict(int)
    for u, v, data in G.edges(data=True):
        edge_types[data['edge_type']] += 1
    
    print("\nEdges by type:")
    for edge_type, count in edge_types.items():
        print(f"  - {edge_type}: {count}")
    
    # Degree analysis
    in_degrees = [d for n, d in G.in_degree()]
    out_degrees = [d for n, d in G.out_degree()]
    
    print(f"\nDegree statistics:")
    print(f"  Average in-degree: {np.mean(in_degrees):.2f}")
    print(f"  Average out-degree: {np.mean(out_degrees):.2f}")
    print(f"  Min in-degree: {np.min(in_degrees)} (should be >= 1 with self-loops)")
    print(f"  Max in-degree: {np.max(in_degrees)}")
    
    # Check connectivity
    if nx.is_strongly_connected(G):
        print("  ✓ Graph is strongly connected")
    else:
        print("  ⚠️  Graph is not strongly connected (check for disconnected components)")
        # Find largest component
        largest = max(nx.strongly_connected_components(G), key=len)
        print(f"  Largest SCC: {len(largest)} nodes")
    
    # Save graph with edge attributes
    print("\n" + "="*70)
    print("SAVING GRAPH (with edge features for GNN)")
    print("="*70)
    
    with open('../../../Sources/Embeddings v3/poi_context_graph.pkl', 'wb') as f:
        pickle.dump({
            'graph': G,
            'edge_type_stats': type_stats,
            'poi_coords': poi_coords,
            'config': {
                'co_search_window': str(CO_SEARCH_TIME_WINDOW),
                'co_visit_window': str(CO_VISIT_TIME_WINDOW),
                'co_visit_max_dist': CO_VISIT_MAX_DISTANCE,
                'proximity_threshold': PROXIMITY_THRESHOLD
            }
        }, f, pickle.HIGHEST_PROTOCOL)
    print("✓ Saved to 'poi_context_graph.pkl'")
    
    # Export edge details with features
    print("\nExporting edge details to CSV...")
    edge_data = []
    for u, v, data in G.edges(data=True):
        row = {
            'source': u,
            'target': v,
            'edge_type': data['edge_type'],
            'weight': data['weight'],
            'weight_normalized': data.get('weight_normalized', data['weight']),
            'raw_count': data.get('raw_count', 1),
            'edge_attr': list(data['edge_attr']) if 'edge_attr' in data else None
        }
        # Add specific features based on type
        if data['edge_type'] == 'geospatial':
            row['distance_km'] = data.get('distance_km')
            row['bearing'] = data.get('bearing')
        elif data['edge_type'] == 'co-visit':
            row['avg_distance'] = data.get('avg_distance')
            row['avg_time_gap'] = data.get('avg_time_gap')
        
        edge_data.append(row)
    
    edges_df = pd.DataFrame(edge_data)
    edges_df.to_csv('../../../Sources/Embeddings v3 csv/poi_context_graph_edges.csv', index=False)
    print("✓ Saved detailed edge list to 'poi_context_graph_edges.csv'")
    
    # Prepare PyTorch Geometric format (optional but recommended)
    print("\nPreparing PyTorch Geometric format...")
    try:
        # Convert to PyG Data object
        node_list = list(G.nodes())
        node_idx = {node: i for i, node in enumerate(node_list)}
        
        # Edge index: [2, num_edges]
        edge_index = []
        edge_weight = []
        edge_attr = []
        
        for u, v, data in G.edges(data=True):
            edge_index.append([node_idx[u], node_idx[v]])
            edge_weight.append(data.get('weight_normalized', data['weight']))
            edge_attr.append(data['edge_attr'] if 'edge_attr' in data else [0,0,0,0])
        
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_weight = torch.tensor(edge_weight, dtype=torch.float)
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
        
        pyg_data = {
            'num_nodes': len(node_list),
            'edge_index': edge_index,
            'edge_weight': edge_weight,
            'edge_attr': edge_attr,
            'node_mapping': node_idx
        }
        
        torch.save(pyg_data, 'poi_graph_pyg_format.pt')
        print("✓ Saved PyTorch Geometric format to 'poi_graph_pyg_format.pt'")
        
    except Exception as e:
        print(f"⚠️  Could not create PyG format: {e}")
    
    print("\n" + "="*70)
    print("POI CONTEXT GRAPH CONSTRUCTION COMPLETE!")
    print("="*70)


STARTING POI CONTEXT GRAPH CONSTRUCTION

BUILDING CO-SEARCH EDGES (Fixed)
Searches: 94 (20 users)


Co-searches: 100%|██████████| 20/20 [00:00<00:00, 331.64it/s]


Unique co-search pairs: 1
Added 1 bidirectional co-search edges

BUILDING CO-VISIT EDGES (Fixed)
Visits: 264 (21 users)


Co-visits: 100%|██████████| 21/21 [00:00<00:00, 328.47it/s]


Unique co-visit pairs: 4
Skipped POIs: 0
Added 4 bidirectional co-visit edges

BUILDING GEOSPATIAL EDGES (Optimized with KDTree)
Found 2809 candidate pairs within 2 km


Processing geospatial edges: 100%|██████████| 2809/2809 [00:00<00:00, 46561.75it/s]


Added 2804 bidirectional geospatial edges

ADDING SELF-LOOPS (Critical for GNN)
Added 0 self-loops

NORMALIZING EDGE WEIGHTS BY TYPE
  geospatial: mean=0.5924, std=0.1651
  self: mean=1.0000, std=0.0000
  co-visit: mean=1.0000, std=0.0000
  co-search: mean=1.0000, std=0.0000

✓ No isolated nodes (all have self-loops)

GRAPH STATISTICS (FIXED)
Nodes: 235
Edges: 11471

Edges by type:
  - geospatial: 11216
  - self: 235
  - co-visit: 16
  - co-search: 4

Degree statistics:
  Average in-degree: 48.81
  Average out-degree: 48.81
  Min in-degree: 1 (should be >= 1 with self-loops)
  Max in-degree: 135
  ⚠️  Graph is not strongly connected (check for disconnected components)
  Largest SCC: 183 nodes

SAVING GRAPH (with edge features for GNN)
✓ Saved to 'poi_context_graph.pkl'

Exporting edge details to CSV...
✓ Saved detailed edge list to 'poi_context_graph_edges.csv'

Preparing PyTorch Geometric format...
⚠️  Could not create PyG format: expected sequence of length 4 at dim 1 (got 5)

POI CO