In [2]:
import ast
import json
import math
import random
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

In [3]:
class UserPOIInteractionGenerator:
    def __init__(self, poi_tree_file: str, users_file: str):
        """
        Initialize interaction generator
        
        Args:
            poi_tree_file: Path to POI tree JSON file
            users_file: Path to user preferences CSV
        """
        # Load POI tree
        with open(poi_tree_file, 'r', encoding='utf-8') as f:
            self.poi_tree = json.load(f)
        
        # Load users
        self.users_df = pd.read_csv(users_file)
        
        # Define Singapore bounding box for random location generation
        self.singapore_bounds = {
            'lat_min': 1.22,
            'lat_max': 1.47,
            'lon_min': 103.60,
            'lon_max': 104.05
        }
        
        # Define common areas in Singapore where users might be located
        # These serve as "hotspots" for weighted random location generation
        self.location_hotspots = [
            {'name': 'Orchard', 'lat': 1.3048, 'lon': 103.8318, 'weight': 1.5},
            {'name': 'Marina Bay', 'lat': 1.2838, 'lon': 103.8591, 'weight': 1.3},
            {'name': 'Bugis', 'lat': 1.3009, 'lon': 103.8558, 'weight': 1.2},
            {'name': 'Jurong East', 'lat': 1.3329, 'lon': 103.7436, 'weight': 1.0},
            {'name': 'Tampines', 'lat': 1.3496, 'lon': 103.9568, 'weight': 1.0},
            {'name': 'Woodlands', 'lat': 1.4382, 'lon': 103.7891, 'weight': 0.8},
            {'name': 'Bishan', 'lat': 1.3526, 'lon': 103.8352, 'weight': 1.1},
            {'name': 'Ang Mo Kio', 'lat': 1.3691, 'lon': 103.8454, 'weight': 1.0},
            {'name': 'Bedok', 'lat': 1.3236, 'lon': 103.9273, 'weight': 1.0},
            {'name': 'Clementi', 'lat': 1.3152, 'lon': 103.7649, 'weight': 1.0},
            {'name': 'Toa Payoh', 'lat': 1.3343, 'lon': 103.8563, 'weight': 1.0},
            {'name': 'Sengkang', 'lat': 1.3868, 'lon': 103.8914, 'weight': 0.9},
            {'name': 'Punggol', 'lat': 1.4043, 'lon': 103.9021, 'weight': 0.8},
            {'name': 'Serangoon', 'lat': 1.3554, 'lon': 103.8679, 'weight': 1.0},
            {'name': 'Yishun', 'lat': 1.4304, 'lon': 103.8354, 'weight': 0.9},
            {'name': 'Hougang', 'lat': 1.3612, 'lon': 103.8863, 'weight': 0.9},
            {'name': 'Pasir Ris', 'lat': 1.3721, 'lon': 103.9474, 'weight': 0.8},
            {'name': 'Clarke Quay', 'lat': 1.2906, 'lon': 103.8465, 'weight': 1.2},
            {'name': 'Sentosa', 'lat': 1.2494, 'lon': 103.8303, 'weight': 0.7},
            {'name': 'Changi', 'lat': 1.3644, 'lon': 103.9915, 'weight': 0.6},
        ]
        
        # Interest to POI characteristic mapping
        self.interest_mapping = {
            'food': ['dining', 'restaurant', 'food', 'hawker', 'cafe', 'eatery', 'cuisine'],
            'shopping': ['shopping', 'retail', 'fashion', 'mall', 'store', 'boutique'],
            'movies': ['cinema', 'movie', 'film', 'theatre', 'entertainment'],
            'cafes': ['cafe', 'coffee', 'tea', 'bakery', 'dessert'],
            'cycling': ['cycling', 'bike', 'sports', 'outdoor', 'park'],
            'photography': ['scenic', 'park', 'nature', 'attraction', 'view'],
            'museums': ['museum', 'gallery', 'art', 'culture', 'heritage', 'exhibition'],
            'books': ['book', 'library', 'store', 'reading'],
            'nightlife': ['bar', 'pub', 'club', 'nightlife', 'lounge'],
            'bars': ['bar', 'pub', 'lounge', 'drinks', 'alcohol'],
            'concerts': ['music', 'entertainment', 'venue', 'performance'],
            'family activities': ['family', 'kid', 'playground', 'park', 'entertainment'],
            'playgrounds': ['playground', 'park', 'family', 'children'],
            'malls': ['mall', 'shopping', 'retail'],
            'sports': ['sports', 'gym', 'fitness', 'athletic'],
            'gyms': ['gym', 'fitness', 'sports', 'workout'],
            'healthy eating': ['healthy', 'salad', 'organic', 'wellness', 'fresh'],
            'gaming': ['gaming', 'arcade', 'entertainment', 'game'],
            'arcades': ['arcade', 'game', 'entertainment'],
            'budget food': ['hawker', 'food court', 'budget', 'cheap', 'affordable'],
            'tech stores': ['tech', 'electronics', 'gadget', 'computer', 'mobile'],
            'coworking': ['coworking', 'cafe', 'workspace', 'work'],
            'local food': ['hawker', 'local', 'traditional', 'food court'],
            'parks': ['park', 'nature', 'outdoor', 'garden'],
            'community events': ['community', 'event', 'recreation', 'centre']
        }
        
        # Transportation mode to max distance mapping (km)
        self.transport_distance = {
            'MRT': 15.0,
            'bus': 10.0,
            'car': 25.0,
            'walking': 2.0,
            'bicycle': 5.0,
            'ride-hailing': 20.0
        }
        
        # Price sensitivity to price range mapping
        self.price_ranges = {
            'low': (0, 20),
            'medium': (10, 40),
            'high': (20, 100)
        }
    
    def _generate_random_location(self) -> Tuple[float, float, str]:
        """
        Generate a random location within Singapore.
        Uses weighted hotspots to make locations more realistic.
        
        Returns:
            Tuple of (latitude, longitude, nearest_area_name)
        """
        # 70% chance to be near a hotspot, 30% chance fully random
        if random.random() < 0.7:
            # Select a hotspot based on weights
            weights = [h['weight'] for h in self.location_hotspots]
            total_weight = sum(weights)
            weights = [w / total_weight for w in weights]
            
            selected_hotspot = random.choices(self.location_hotspots, weights=weights)[0]
            
            # Add random offset (within ~2km radius using normal distribution)
            lat_offset = random.gauss(0, 0.01)  # ~1km standard deviation
            lon_offset = random.gauss(0, 0.01)
            
            lat = selected_hotspot['lat'] + lat_offset
            lon = selected_hotspot['lon'] + lon_offset
            
            # Clamp to Singapore bounds
            lat = max(self.singapore_bounds['lat_min'], 
                     min(self.singapore_bounds['lat_max'], lat))
            lon = max(self.singapore_bounds['lon_min'], 
                     min(self.singapore_bounds['lon_max'], lon))
            
            return (lat, lon, selected_hotspot['name'])
        else:
            # Fully random within Singapore bounds
            lat = random.uniform(self.singapore_bounds['lat_min'], 
                                self.singapore_bounds['lat_max'])
            lon = random.uniform(self.singapore_bounds['lon_min'], 
                                self.singapore_bounds['lon_max'])
            
            # Find nearest hotspot for area name
            min_dist = float('inf')
            nearest_area = 'Unknown'
            for hotspot in self.location_hotspots:
                dist = self._haversine_distance(lat, lon, hotspot['lat'], hotspot['lon'])
                if dist < min_dist:
                    min_dist = dist
                    nearest_area = hotspot['name']
            
            return (lat, lon, nearest_area)
    
    def _haversine_distance(self, lat1: float, lon1: float, 
                           lat2: float, lon2: float) -> float:
        """Calculate distance between coordinates in km"""
        R = 6371
        lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
        return R * c
    
    def _get_max_travel_distance(self, transport_modes: str) -> float:
        """Get maximum travel distance based on transportation modes"""
        modes = [m.strip() for m in transport_modes.split(';')]
        # Return max distance among all modes
        distances = [self.transport_distance.get(mode, 10.0) for mode in modes]
        return max(distances)
    
    def _match_interest_score(self, user_interests: str, poi_text: str) -> float:
        """
        Calculate how well a POI matches user interests
        Returns score from 0 to 1
        """
        interests = [i.strip().lower() for i in user_interests.split(';')]
        poi_text_lower = poi_text.lower()
        
        total_matches = 0
        total_keywords = 0
        
        for interest in interests:
            if interest in self.interest_mapping:
                keywords = self.interest_mapping[interest]
                total_keywords += len(keywords)
                matches = sum(1 for keyword in keywords if keyword in poi_text_lower)
                total_matches += matches
        
        if total_keywords == 0:
            return 0.0
        
        return min(total_matches / (total_keywords * 0.3), 1.0)  # Normalize
    
    def _match_price_sensitivity(self, user_price_sens: str, poi_price: str) -> bool:
        """Check if POI price matches user's price sensitivity"""
        if pd.isna(poi_price) or poi_price == '':
            return True  # Assume affordable if no price info
        
        try:
            # Parse price range (e.g., "25.85 - 30.99")
            if '-' in str(poi_price):
                prices = str(poi_price).split('-')
                avg_price = (float(prices[0].strip()) + float(prices[1].strip())) / 2
            else:
                avg_price = float(poi_price)
            
            price_range = self.price_ranges.get(user_price_sens.lower(), (0, 100))
            return price_range[0] <= avg_price <= price_range[1]
        except:
            return True
    
    def _get_candidate_pois(self, user: pd.Series, 
                           current_location: Tuple[float, float]) -> List[Tuple[str, Dict, float, float, float]]:
        """
        Get candidate POIs for a user based on their preferences and current location
        
        Args:
            user: User data series
            current_location: Tuple of (latitude, longitude) for user's current position
        
        Returns:
            List of (poi_id, poi_data, score, distance, interest_score) tuples
        """
        user_lat, user_lon = current_location
        max_distance = self._get_max_travel_distance(user['transportation_modes'])
        user_interests = user['interests']
        user_price_sens = user['price_sensitivity']
        
        candidates = []
        
        # Iterate through Level 0 POIs (individual POIs)
        for poi_id, poi_data in self.poi_tree['level_0'].items():
            # Get POI coordinates
            poi_spatial = poi_data['spatial']
            if isinstance(poi_spatial, str):
                poi_spatial = eval(poi_spatial)  # Convert string tuple to tuple
            
            poi_lat, poi_lon = poi_spatial
            
            # Step 1: Spatial Filtering - Calculate distance from current location
            distance = self._haversine_distance(user_lat, user_lon, poi_lat, poi_lon)
            
            # Filter by distance (based on user's transportation modes)
            if distance > max_distance:
                continue
            
            # Step 2: Interest Matching - Calculate interest match score
            poi_text = poi_data.get('textual', '')
            interest_score = self._match_interest_score(user_interests, poi_text)
            
            if interest_score < 0.1:  # Skip if very low interest match
                continue
            
            # Step 3: Price Filtering - Check price sensitivity
            poi_price = poi_data['data'].get('price', '')
            if not self._match_price_sensitivity(user_price_sens, poi_price):
                continue
            
            # Step 4: Scoring - Calculate overall score
            # Get popularity
            try:
                popularity = float(poi_data['data'].get('popularity', 3))
            except:
                popularity = 3.0
            
            # Scoring formula: interest_match * 3 + popularity * 0.5 - distance * 0.2
            # This prioritizes interest match, then popularity, with distance penalty
            score = (interest_score * 3.0) + (popularity * 0.5) - (distance * 0.2)
            
            candidates.append((poi_id, poi_data, score, distance, interest_score))
        
        # Sort by score (highest first)
        candidates.sort(key=lambda x: x[2], reverse=True)
        
        return candidates
    
    def generate_interactions(self, 
                            min_interactions_per_user: int = 5,
                            max_interactions_per_user: int = 20,
                            days_back: int = 90) -> pd.DataFrame:
        """
        Generate synthetic user-POI interactions with randomized current locations
        
        Args:
            min_interactions_per_user: Minimum number of interactions per user
            max_interactions_per_user: Maximum number of interactions per user
            days_back: Generate interactions for past N days
        
        Returns:
            DataFrame with interaction data including current_location info
        """
        interactions = []
        interaction_counter = 0
        
        print("=" * 60)
        print("Generating User-POI Interactions")
        print("=" * 60)
        print("\nPipeline: Random Location -> Spatial Filtering -> Interest Matching -> Price Filtering -> Scoring")
        print("=" * 60)
        
        for idx, user in self.users_df.iterrows():
            user_id = user['uuid'] if 'uuid' in user else user['uudi']  # Handle both spellings
            user_name = user['name']
            
            print(f"\nProcessing user {idx+1}/{len(self.users_df)}: {user_name}")
            
            # Generate multiple interaction sessions for this user
            num_sessions = random.randint(
                min_interactions_per_user,
                max_interactions_per_user
            )
            
            session_interactions = 0
            attempts = 0
            max_attempts = num_sessions * 3  # Prevent infinite loops
            
            while session_interactions < num_sessions and attempts < max_attempts:
                attempts += 1
                
                # Generate a random current location for this interaction session
                current_lat, current_lon, area_name = self._generate_random_location()
                
                # Get candidate POIs based on current location
                candidates = self._get_candidate_pois(user, (current_lat, current_lon))
                
                if not candidates:
                    continue  # Try a different location
                
                # Select a POI using weighted random (higher score = higher probability)
                scores = [c[2] for c in candidates]
                # Shift scores to be positive and apply exponential weighting
                min_score = min(scores)
                weights = [math.exp(score - min_score) for score in scores]
                weight_sum = sum(weights)
                weights = [w / weight_sum for w in weights]
                
                selected_idx = random.choices(range(len(candidates)), weights=weights)[0]
                poi_id, poi_data, score, distance, interest_score = candidates[selected_idx]
                
                # Generate visit timestamp
                days_ago = random.randint(0, days_back)
                hours = random.randint(8, 22)
                minutes = random.randint(0, 59)
                visit_time = datetime.now() - timedelta(days=days_ago, hours=hours, minutes=minutes)
                
                # Visit interaction
                interactions.append({
                    'interaction_id': f'int_{interaction_counter:06d}',
                    'user_id': user_id,
                    'user_name': user_name,
                    'poi_id': poi_id,
                    'poi_name': poi_data['name'],
                    'interaction_type': 'visit',
                    'value': 1,
                    'timestamp': visit_time.strftime('%Y-%m-%d %H:%M:%S'),
                    'current_lat': round(current_lat, 6),
                    'current_lon': round(current_lon, 6),
                    'current_area': area_name,
                    'distance_km': round(distance, 2),
                    'interest_match_score': round(interest_score, 2)
                })
                interaction_counter += 1
                session_interactions += 1
                
                # Generate rating (80% chance)
                if random.random() < 0.8:
                    base_rating = 3.0
                    interest_bonus = interest_score * 2.0
                    distance_penalty = min(distance / 10.0, 1.0)
                    
                    rating = base_rating + interest_bonus - distance_penalty
                    rating = max(1, min(5, int(round(rating))))
                    
                    if random.random() < 0.2:
                        rating = max(1, rating - 1) if random.random() < 0.5 else min(5, rating + 1)
                    
                    rating_time = visit_time + timedelta(minutes=random.randint(5, 120))
                    
                    interactions.append({
                        'interaction_id': f'int_{interaction_counter:06d}',
                        'user_id': user_id,
                        'user_name': user_name,
                        'poi_id': poi_id,
                        'poi_name': poi_data['name'],
                        'interaction_type': 'rating',
                        'value': rating,
                        'timestamp': rating_time.strftime('%Y-%m-%d %H:%M:%S'),
                        'current_lat': round(current_lat, 6),
                        'current_lon': round(current_lon, 6),
                        'current_area': area_name,
                        'distance_km': round(distance, 2),
                        'interest_match_score': round(interest_score, 2)
                    })
                    interaction_counter += 1
                
                # Generate search (30% chance, happens before visit)
                if random.random() < 0.3:
                    search_time = visit_time - timedelta(hours=random.randint(1, 48))
                    
                    interactions.append({
                        'interaction_id': f'int_{interaction_counter:06d}',
                        'user_id': user_id,
                        'user_name': user_name,
                        'poi_id': poi_id,
                        'poi_name': poi_data['name'],
                        'interaction_type': 'search',
                        'value': 1,
                        'timestamp': search_time.strftime('%Y-%m-%d %H:%M:%S'),
                        'current_lat': round(current_lat, 6),
                        'current_lon': round(current_lon, 6),
                        'current_area': area_name,
                        'distance_km': round(distance, 2),
                        'interest_match_score': round(interest_score, 2)
                    })
                    interaction_counter += 1
            
            print(f"  Generated {session_interactions} interaction sessions from {attempts} attempts")
        
        print("\n" + "=" * 60)
        print(f"Generated {len(interactions)} total interactions")
        print("=" * 60)
        
        # Convert to DataFrame and sort by timestamp
        interactions_df = pd.DataFrame(interactions)
        interactions_df = interactions_df.sort_values('timestamp')
        
        return interactions_df
    
    def generate_summary_stats(self, interactions_df: pd.DataFrame):
        """Print summary statistics of generated interactions"""
        print("\n" + "=" * 60)
        print("INTERACTION SUMMARY STATISTICS")
        print("=" * 60)
        
        print(f"\nTotal interactions: {len(interactions_df)}")
        print(f"Total users: {interactions_df['user_id'].nunique()}")
        print(f"Total POIs: {interactions_df['poi_id'].nunique()}")
        
        print("\nInteractions per user:")
        user_counts = interactions_df.groupby('user_id').size()
        print(f"  Mean: {user_counts.mean():.1f}")
        print(f"  Median: {user_counts.median():.1f}")
        print(f"  Min: {user_counts.min()}")
        print(f"  Max: {user_counts.max()}")
        
        print("\nInteraction types:")
        type_counts = interactions_df['interaction_type'].value_counts()
        for itype, count in type_counts.items():
            print(f"  {itype}: {count} ({count/len(interactions_df)*100:.1f}%)")
        
        print("\nRating distribution:")
        ratings = interactions_df[interactions_df['interaction_type'] == 'rating']['value']
        if len(ratings) > 0:
            print(f"  Mean rating: {ratings.mean():.2f}")
            print(f"  Rating counts:")
            for rating in sorted(ratings.unique()):
                count = (ratings == rating).sum()
                print(f"    {int(rating)} stars: {count} ({count/len(ratings)*100:.1f}%)")
        
        print("\nDistance statistics (from current location):")
        print(f"  Mean distance: {interactions_df['distance_km'].mean():.2f} km")
        print(f"  Median distance: {interactions_df['distance_km'].median():.2f} km")
        print(f"  Max distance: {interactions_df['distance_km'].max():.2f} km")
        
        print("\nCurrent location area distribution:")
        area_counts = interactions_df.groupby('current_area').size().sort_values(ascending=False).head(10)
        for area, count in area_counts.items():
            print(f"  {area}: {count} interactions ({count/len(interactions_df)*100:.1f}%)")
        
        print("\nInterest match statistics:")
        print(f"  Mean match score: {interactions_df['interest_match_score'].mean():.2f}")
        print(f"  Median match score: {interactions_df['interest_match_score'].median():.2f}")
        
        print("\nTop 10 most visited POIs:")
        top_pois = interactions_df[interactions_df['interaction_type'] == 'visit'].groupby('poi_name').size().sort_values(ascending=False).head(10)
        for poi, count in top_pois.items():
            print(f"  {poi}: {count} visits")
        
        print("\nMost active users:")
        top_users = interactions_df.groupby('user_name').size().sort_values(ascending=False).head(5)
        for user, count in top_users.items():
            print(f"  {user}: {count} interactions")

In [5]:
if __name__ == "__main__":
    # Seed the stdlib RNG so that a fresh "Restart & Run All" draws the same
    # locations, POIs and ratings. Timestamps are still computed relative to
    # the moment the cell runs.
    RANDOM_SEED = 42
    random.seed(RANDOM_SEED)

    # Build the generator from the POI tree and the user preference table
    generator = UserPOIInteractionGenerator(
        poi_tree_file='poi_tree_with_uuids.json',
        users_file='user_preferences.csv' 
    )
    
    # Generate interactions: 5-20 sessions per user over the last 90 days
    interactions_df = generator.generate_interactions(
        min_interactions_per_user=5,
        max_interactions_per_user=20,
        days_back=90
    )
    
    # Save to CSV
    # Full version with metadata (includes current location)
    interactions_df.to_csv('user_poi_interactions_full.csv', index=False)
    print("\nSaved full interactions to: user_poi_interactions_full.csv")
    
    # Minimal version (just the essential columns for training)
    interactions_minimal = interactions_df[[
        'user_id', 'poi_id', 'interaction_type', 'value', 'timestamp',
        'current_lat', 'current_lon', 'distance_km'
    ]]
    interactions_minimal.to_csv('user_poi_interactions.csv', index=False)
    print("Saved minimal interactions to: user_poi_interactions.csv")
    
    # Generate summary statistics
    generator.generate_summary_stats(interactions_df)
    
    # Show sample interactions
    print("\n" + "=" * 60)
    print("SAMPLE INTERACTIONS (first 10)")
    print("=" * 60)
    print(interactions_df.head(10).to_string(index=False))

Generating User-POI Interactions

Pipeline: Random Location -> Spatial Filtering -> Interest Matching -> Price Filtering -> Scoring

Processing user 1/21: Aiden
  Generated 5 interaction sessions from 5 attempts

Processing user 2/21: Chloe
  Generated 14 interaction sessions from 14 attempts

Processing user 3/21: Lucas
  Generated 14 interaction sessions from 15 attempts

Processing user 4/21: Ethan
  Generated 8 interaction sessions from 8 attempts

Processing user 5/21: Maya
  Generated 14 interaction sessions from 14 attempts

Processing user 6/21: Sophia
  Generated 11 interaction sessions from 11 attempts

Processing user 7/21: Maya
  Generated 6 interaction sessions from 6 attempts

Processing user 8/21: Kai
  Generated 8 interaction sessions from 8 attempts

Processing user 9/21: Isla
  Generated 11 interaction sessions from 11 attempts

Processing user 10/21: Noah
  Generated 20 interaction sessions from 21 attempts

Processing user 11/21: Zara
  Generated 15 interaction sess