In [2]:
import ast
import json
import math
import random
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

In [3]:
class UserPOIInteractionGenerator:
    """Generate synthetic user-POI interactions (visits, ratings, searches).

    For every user, level-0 POIs of the POI tree are filtered by distance from
    the user's residential area, by keyword overlap with the user's interests
    and by price sensitivity. The surviving candidates are scored and a
    score-weighted random subset becomes the user's interaction history.

    Randomness comes from the module-level ``random`` generator; call
    ``random.seed(...)`` beforehand for reproducible output.
    """

    # Keywords searched (as substrings) in a POI's text for each user interest.
    DEFAULT_INTEREST_MAPPING = {
        'food': ['dining', 'restaurant', 'food', 'hawker', 'cafe', 'eatery', 'cuisine'],
        'shopping': ['shopping', 'retail', 'fashion', 'mall', 'store', 'boutique'],
        'movies': ['cinema', 'movie', 'film', 'theatre', 'entertainment'],
        'cafes': ['cafe', 'coffee', 'tea', 'bakery', 'dessert'],
        'cycling': ['cycling', 'bike', 'sports', 'outdoor', 'park'],
        'photography': ['scenic', 'park', 'nature', 'attraction', 'view'],
        'museums': ['museum', 'gallery', 'art', 'culture', 'heritage', 'exhibition'],
        'books': ['book', 'library', 'store', 'reading'],
        'nightlife': ['bar', 'pub', 'club', 'nightlife', 'lounge'],
        'bars': ['bar', 'pub', 'lounge', 'drinks', 'alcohol'],
        'concerts': ['music', 'entertainment', 'venue', 'performance'],
        'family activities': ['family', 'kid', 'playground', 'park', 'entertainment'],
        'playgrounds': ['playground', 'park', 'family', 'children'],
        'malls': ['mall', 'shopping', 'retail'],
        'sports': ['sports', 'gym', 'fitness', 'athletic'],
        'gyms': ['gym', 'fitness', 'sports', 'workout'],
        'healthy eating': ['healthy', 'salad', 'organic', 'wellness', 'fresh'],
        'gaming': ['gaming', 'arcade', 'entertainment', 'game'],
        'arcades': ['arcade', 'game', 'entertainment'],
        'budget food': ['hawker', 'food court', 'budget', 'cheap', 'affordable'],
        'tech stores': ['tech', 'electronics', 'gadget', 'computer', 'mobile'],
        'coworking': ['coworking', 'cafe', 'workspace', 'work'],
        'local food': ['hawker', 'local', 'traditional', 'food court'],
        'parks': ['park', 'nature', 'outdoor', 'garden'],
        'community events': ['community', 'event', 'recreation', 'centre'],
    }

    # Maximum travel distance (km) per transportation mode.
    DEFAULT_TRANSPORT_DISTANCE = {
        'MRT': 15.0,
        'bus': 10.0,
        'car': 25.0,
        'walking': 2.0,
        'bicycle': 5.0,
        'ride-hailing': 20.0,
    }

    # Accepted average-price interval (inclusive) per price sensitivity level.
    DEFAULT_PRICE_RANGES = {
        'low': (0, 20),
        'medium': (10, 40),
        'high': (20, 100),
    }

    # Travel distance (km) assumed for unknown or missing transportation modes.
    DEFAULT_MAX_DISTANCE_KM = 10.0

    # Popularity assumed when a POI has no usable popularity value.
    DEFAULT_POPULARITY = 3.0

    # Column order of the DataFrame returned by generate_interactions().
    INTERACTION_COLUMNS = [
        'interaction_id', 'user_id', 'user_name', 'poi_id', 'poi_name',
        'interaction_type', 'value', 'timestamp', 'distance_km',
        'interest_match_score',
    ]

    # Text format of the 'timestamp' column (sorts chronologically as a string).
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, poi_tree_file: str, users_file: str):
        """
        Initialize interaction generator

        Args:
            poi_tree_file: Path to POI tree JSON file
            users_file: Path to user preferences CSV
        """
        # Load POI tree
        with open(poi_tree_file, 'r', encoding='utf-8') as f:
            self.poi_tree = json.load(f)

        # Load users
        self.users_df = pd.read_csv(users_file)

        # Get geocoded locations for user residences
        self.residence_locations = self._get_residence_coordinates()

        # Per-instance copies so callers may customise them without touching
        # the class-level defaults shared by other instances.
        self.interest_mapping = {
            interest: list(keywords)
            for interest, keywords in self.DEFAULT_INTEREST_MAPPING.items()
        }
        self.transport_distance = dict(self.DEFAULT_TRANSPORT_DISTANCE)
        self.price_ranges = dict(self.DEFAULT_PRICE_RANGES)

    def _get_residence_coordinates(self) -> Dict[str, Tuple[float, float]]:
        """
        Get approximate (latitude, longitude) coordinates for common
        Singapore residential areas, keyed by area name.
        """
        return {
            'Jurong East': (1.3329, 103.7436),
            'Yishun': (1.4304, 103.8354),
            'Bishan': (1.3526, 103.8352),
            'Bukit Timah': (1.3294, 103.8008),
            'Ang Mo Kio': (1.3691, 103.8454),
            'Clementi': (1.3152, 103.7649),
            'Bedok': (1.3236, 103.9273),
            'Toa Payoh': (1.3343, 103.8563),
            'Sengkang': (1.3868, 103.8914),
            'Punggol': (1.4043, 103.9021),
            'Serangoon': (1.3554, 103.8679),
            'Tampines': (1.3496, 103.9568),
            'Woodlands': (1.4382, 103.7891),
            'Hougang': (1.3612, 103.8863),
            'Pasir Ris': (1.3721, 103.9474),
        }

    def _haversine_distance(self, lat1: float, lon1: float,
                           lat2: float, lon2: float) -> float:
        """Calculate the great-circle distance between two coordinates in km.

        Inputs are in degrees; a mean Earth radius of 6371 km is used.
        """
        R = 6371
        lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
        # Floating-point rounding can push `a` marginally outside [0, 1],
        # which would make sqrt(1 - a) fail for near-antipodal points.
        a = min(1.0, max(0.0, a))
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
        return R * c

    def _get_max_travel_distance(self, transport_modes: str) -> float:
        """Get maximum travel distance (km) based on transportation modes.

        Args:
            transport_modes: ';'-separated mode names. Unknown modes count as
                DEFAULT_MAX_DISTANCE_KM; a missing value (NaN/None) yields
                DEFAULT_MAX_DISTANCE_KM as well.

        Returns:
            The largest distance among all listed modes.
        """
        if not isinstance(transport_modes, str):
            return self.DEFAULT_MAX_DISTANCE_KM
        modes = [m.strip() for m in transport_modes.split(';')]
        return max(
            self.transport_distance.get(mode, self.DEFAULT_MAX_DISTANCE_KM)
            for mode in modes
        )

    def _match_interest_score(self, user_interests: str, poi_text: str) -> float:
        """
        Calculate how well a POI matches user interests.

        Each known interest contributes its keyword list; a keyword matches
        when it occurs as a substring of the lower-cased POI text.

        Args:
            user_interests: ';'-separated interest names.
            poi_text: Free-text description of the POI.

        Returns:
            Score from 0 to 1. Matching 30% of all keywords already yields 1.
            Missing (non-string) inputs and unknown interests yield 0.
        """
        if not isinstance(user_interests, str) or not isinstance(poi_text, str):
            return 0.0

        interests = [i.strip().lower() for i in user_interests.split(';')]
        poi_text_lower = poi_text.lower()

        total_matches = 0
        total_keywords = 0

        for interest in interests:
            keywords = self.interest_mapping.get(interest)
            if keywords is None:
                continue
            total_keywords += len(keywords)
            total_matches += sum(1 for keyword in keywords if keyword in poi_text_lower)

        if total_keywords == 0:
            return 0.0

        return min(total_matches / (total_keywords * 0.3), 1.0)  # Normalize

    def _match_price_sensitivity(self, user_price_sens: str, poi_price: str) -> bool:
        """Check if POI price matches user's price sensitivity.

        Args:
            user_price_sens: Key of ``self.price_ranges`` (case-insensitive);
                unknown keys fall back to the range (0, 100).
            poi_price: A single price or a range such as "25.85 - 30.99"
                (the average of the two ends is used).

        Returns:
            True when the price lies inside the user's range. Missing or
            unparseable prices are treated as affordable (True).
        """
        if pd.isna(poi_price) or poi_price == '':
            return True  # Assume affordable if no price info

        try:
            # Parse price range (e.g., "25.85 - 30.99")
            if '-' in str(poi_price):
                prices = str(poi_price).split('-')
                avg_price = (float(prices[0].strip()) + float(prices[1].strip())) / 2
            else:
                avg_price = float(poi_price)

            price_range = self.price_ranges.get(user_price_sens.lower(), (0, 100))
            return price_range[0] <= avg_price <= price_range[1]
        except (ValueError, TypeError, AttributeError, IndexError):
            # Best effort: malformed price text or a missing sensitivity value
            # must not exclude the POI.
            return True

    @staticmethod
    def _parse_coordinates(spatial):
        """Return ``(lat, lon)`` as floats, or None when they cannot be parsed.

        Accepts a two-item sequence or its string form (e.g. "(1.35, 103.8)").
        Strings are parsed with ``ast.literal_eval`` rather than ``eval`` so
        that text from the POI file is never executed as code.
        """
        if isinstance(spatial, str):
            try:
                spatial = ast.literal_eval(spatial)
            except (ValueError, TypeError, SyntaxError):
                return None
        try:
            lat, lon = spatial
            return float(lat), float(lon)
        except (TypeError, ValueError):
            return None

    def _get_candidate_pois(self, user: pd.Series) -> List[Tuple[str, Dict, float, float, float]]:
        """
        Get candidate POIs for a user based on their preferences

        A POI qualifies when it is within the user's travel distance, has an
        interest match score of at least 0.1 and fits the price sensitivity.
        POIs whose coordinates cannot be parsed are skipped.

        Args:
            user: Row with 'area_of_residence', 'transportation_modes',
                'interests' and 'price_sensitivity'.

        Returns:
            List of (poi_id, poi_data, score, distance_km, interest_score)
            tuples sorted by score, best first. Empty when the residence
            area is unknown.
        """
        user_location = self.residence_locations.get(user['area_of_residence'])
        if not user_location:
            print(f"Warning: Location not found for {user['area_of_residence']}")
            return []

        max_distance = self._get_max_travel_distance(user['transportation_modes'])
        user_interests = user['interests']
        user_price_sens = user['price_sensitivity']

        candidates = []

        # Iterate through Level 0 POIs (individual POIs)
        for poi_id, poi_data in self.poi_tree['level_0'].items():
            coordinates = self._parse_coordinates(poi_data.get('spatial'))
            if coordinates is None:
                continue
            poi_lat, poi_lon = coordinates

            # Filter by distance
            distance = self._haversine_distance(
                user_location[0], user_location[1],
                poi_lat, poi_lon
            )
            if distance > max_distance:
                continue

            # Skip if very low interest match
            poi_text = poi_data.get('textual', '')
            interest_score = self._match_interest_score(user_interests, poi_text)
            if interest_score < 0.1:
                continue

            # Check price sensitivity
            poi_details = poi_data.get('data') or {}
            poi_price = poi_details.get('price', '')
            if not self._match_price_sensitivity(user_price_sens, poi_price):
                continue

            # Get popularity, falling back to the default for bad/missing values
            try:
                popularity = float(poi_details.get('popularity', self.DEFAULT_POPULARITY))
            except (TypeError, ValueError):
                popularity = self.DEFAULT_POPULARITY
            if math.isnan(popularity):
                # NaN would poison the score, the sort and the sampling weights.
                popularity = self.DEFAULT_POPULARITY

            # Formula: interest_match * 3 + popularity * 0.5 - distance * 0.2
            score = (interest_score * 3.0) + (popularity * 0.5) - (distance * 0.2)

            candidates.append((poi_id, poi_data, score, distance, interest_score))

        # Sort by score
        candidates.sort(key=lambda x: x[2], reverse=True)

        return candidates

    @staticmethod
    def _sample_by_score(candidates: List[Tuple], count: int) -> List[Tuple]:
        """Draw up to ``count`` distinct candidates, favouring high scores.

        Each draw picks a candidate with probability proportional to
        exp(score). Scores are shifted by the current maximum before
        exponentiation, which leaves the probabilities unchanged but cannot
        overflow for large scores.
        """
        pool = list(candidates)
        chosen = []
        for _ in range(count):
            if not pool:
                break
            top_score = max(c[2] for c in pool)
            weights = [math.exp(c[2] - top_score) for c in pool]
            picked_idx = random.choices(range(len(pool)), weights=weights)[0]
            chosen.append(pool.pop(picked_idx))
        return chosen

    @staticmethod
    def _random_visit_time(now: datetime, days_back: int) -> datetime:
        """Pick a visit time on one of the past ``days_back`` days, 8am-10pm.

        The hour is drawn from 8..22 and set on the chosen day. If that would
        land after ``now`` (only possible for "today"), the visit is moved one
        day back so that no visit lies in the future.
        """
        days_ago = random.randint(0, days_back)
        hour = random.randint(8, 22)  # Between 8am and 10pm
        minute = random.randint(0, 59)

        visit_day = now - timedelta(days=days_ago)
        visit_time = visit_day.replace(hour=hour, minute=minute, second=0, microsecond=0)
        if visit_time > now:
            visit_time -= timedelta(days=1)
        return visit_time

    @classmethod
    def _append_interaction(cls, interactions: List[Dict], common: Dict,
                            interaction_type: str, value, when: datetime) -> None:
        """Append one interaction record; its id is the running record count."""
        record = dict(common)
        record.update(
            interaction_id=f'int_{len(interactions):06d}',
            interaction_type=interaction_type,
            value=value,
            timestamp=when.strftime(cls.TIMESTAMP_FORMAT),
        )
        interactions.append(record)

    def generate_interactions(self,
                            min_interactions_per_user: int = 5,
                            max_interactions_per_user: int = 20,
                            days_back: int = 90) -> pd.DataFrame:
        """
        Generate synthetic user-POI interactions

        Every selected POI yields one 'visit', with an 80% chance a 'rating'
        shortly afterwards and with a 30% chance a 'search' 1-48 hours before
        the visit. Users with fewer candidates than
        ``min_interactions_per_user`` get as many visits as they have
        candidates.

        Args:
            min_interactions_per_user: Minimum number of visited POIs per user
            max_interactions_per_user: Maximum number of visited POIs per user
            days_back: Generate visits for past N days

        Returns:
            DataFrame sorted by timestamp with the columns listed in
            INTERACTION_COLUMNS (empty, but with those columns, when no
            interaction could be generated).
        """
        interactions: List[Dict] = []
        now = datetime.now()  # single reference point for all timestamps
        total_users = len(self.users_df)

        print("="*60)
        print("Generating User-POI Interactions")
        print("="*60)

        # enumerate() gives the row position even for a non-default index.
        for position, (_, user) in enumerate(self.users_df.iterrows(), start=1):
            # NOTE(review): the id column is spelled 'uudi' (sic); kept because
            # the captured run output shows it resolving - confirm with the CSV.
            user_id = user['uudi']
            user_name = user['name']

            print(f"\nProcessing user {position}/{total_users}: {user_name} ({user['area_of_residence']})")

            # Get candidate POIs
            candidates = self._get_candidate_pois(user)

            if not candidates:
                print(f"  No suitable POIs found for {user_name}")
                continue

            print(f"  Found {len(candidates)} candidate POIs")

            # Clamp the bounds so that randint never sees an empty range when
            # there are fewer candidates than the requested minimum.
            upper = max(0, min(max_interactions_per_user, len(candidates)))
            lower = max(0, min(min_interactions_per_user, upper))
            num_interactions = random.randint(lower, upper)

            # Select POIs using weighted random (higher score = higher probability)
            selected_pois = self._sample_by_score(candidates, num_interactions)

            print(f"  Generated {len(selected_pois)} interactions")

            # Generate interactions for selected POIs
            for poi_id, poi_data, _score, distance, interest_score in selected_pois:
                common = {
                    'user_id': user_id,
                    'user_name': user_name,
                    'poi_id': poi_id,
                    'poi_name': poi_data.get('name', poi_id),
                    'distance_km': round(distance, 2),
                    'interest_match_score': round(interest_score, 2),
                }

                visit_time = self._random_visit_time(now, days_back)
                self._append_interaction(interactions, common, 'visit', 1, visit_time)

                # Generate rating (80% chance)
                if random.random() < 0.8:
                    # Higher interest match and closer = better rating
                    base_rating = 3.0
                    interest_bonus = interest_score * 2.0  # 0 to 2
                    distance_penalty = min(distance / 10.0, 1.0)  # 0 to 1

                    rating = base_rating + interest_bonus - distance_penalty
                    rating = max(1, min(5, int(round(rating))))

                    # Add some randomness: 20% of ratings move one star
                    if random.random() < 0.2:
                        rating = max(1, rating - 1) if random.random() < 0.5 else min(5, rating + 1)

                    # Rating timestamp (shortly after visit, never in the future)
                    rating_time = visit_time + timedelta(minutes=random.randint(5, 120))
                    rating_time = min(rating_time, now)

                    self._append_interaction(interactions, common, 'rating', rating, rating_time)

                # Generate click/search (30% chance, happens before visit)
                if random.random() < 0.3:
                    click_time = visit_time - timedelta(hours=random.randint(1, 48))
                    self._append_interaction(interactions, common, 'search', 1, click_time)

        print("\n" + "="*60)
        print(f"Generated {len(interactions)} total interactions")
        print("="*60)

        # Explicit columns keep the schema (and sort key) even with no rows.
        interactions_df = pd.DataFrame(interactions, columns=self.INTERACTION_COLUMNS)
        interactions_df = interactions_df.sort_values('timestamp')

        return interactions_df

    def generate_summary_stats(self, interactions_df: pd.DataFrame):
        """Print summary statistics of generated interactions.

        Args:
            interactions_df: Frame as returned by generate_interactions()
                (full version, including the metadata columns).
        """
        print("\n" + "="*60)
        print("INTERACTION SUMMARY STATISTICS")
        print("="*60)

        print(f"\nTotal interactions: {len(interactions_df)}")
        if interactions_df.empty:
            # Means, percentages and maxima are undefined without rows.
            print("No interactions to summarize.")
            return

        print(f"Total users: {interactions_df['user_id'].nunique()}")
        print(f"Total POIs: {interactions_df['poi_id'].nunique()}")

        print("\nInteractions per user:")
        user_counts = interactions_df.groupby('user_id').size()
        print(f"  Mean: {user_counts.mean():.1f}")
        print(f"  Median: {user_counts.median():.1f}")
        print(f"  Min: {user_counts.min()}")
        print(f"  Max: {user_counts.max()}")

        print("\nInteraction types:")
        type_counts = interactions_df['interaction_type'].value_counts()
        for itype, count in type_counts.items():
            print(f"  {itype}: {count} ({count/len(interactions_df)*100:.1f}%)")

        print("\nRating distribution:")
        ratings = interactions_df[interactions_df['interaction_type'] == 'rating']['value']
        if len(ratings) > 0:
            print(f"  Mean rating: {ratings.mean():.2f}")
            print(f"  Rating counts:")
            for rating in sorted(ratings.unique()):
                count = (ratings == rating).sum()
                print(f"    {int(rating)} stars: {count} ({count/len(ratings)*100:.1f}%)")

        print("\nDistance statistics:")
        print(f"  Mean distance: {interactions_df['distance_km'].mean():.2f} km")
        print(f"  Median distance: {interactions_df['distance_km'].median():.2f} km")
        print(f"  Max distance: {interactions_df['distance_km'].max():.2f} km")

        print("\nInterest match statistics:")
        print(f"  Mean match score: {interactions_df['interest_match_score'].mean():.2f}")
        print(f"  Median match score: {interactions_df['interest_match_score'].median():.2f}")

        print("\nTop 10 most visited POIs:")
        visits = interactions_df[interactions_df['interaction_type'] == 'visit']
        top_pois = visits.groupby('poi_name').size().sort_values(ascending=False).head(10)
        for poi, count in top_pois.items():
            print(f"  {poi}: {count} visits")

        print("\nMost active users:")
        top_users = interactions_df.groupby('user_name').size().sort_values(ascending=False).head(5)
        for user, count in top_users.items():
            print(f"  {user}: {count} interactions")

In [6]:
if __name__ == "__main__":
    # Build the generator from the POI tree and the user preference table
    generator = UserPOIInteractionGenerator(
        'poi_tree_with_uuids.json',
        'user_preferences.csv',
    )

    # Produce the synthetic interaction history
    interactions_df = generator.generate_interactions(
        days_back=90,
        max_interactions_per_user=20,
        min_interactions_per_user=5,
    )

    # Persist every column, metadata included
    interactions_df.to_csv(path_or_buf='user_poi_interactions_full.csv', index=False)
    print("\nSaved full interactions to: user_poi_interactions_full.csv")

    # Persist only the columns a recommender needs
    interactions_minimal = interactions_df.loc[
        :, ['user_id', 'poi_id', 'interaction_type', 'value', 'timestamp']
    ]
    interactions_minimal.to_csv(path_or_buf='user_poi_interactions.csv', index=False)
    print("Saved minimal interactions to: user_poi_interactions.csv")

    # Report aggregate statistics of the generated data
    generator.generate_summary_stats(interactions_df)

    # Preview the ten earliest interactions
    print(f"\n{'=' * 60}")
    print("SAMPLE INTERACTIONS (first 10)")
    print("=" * 60)
    print(interactions_df.iloc[:10].to_string(index=False))

Generating User-POI Interactions

Processing user 1/21: Aiden (Jurong East)
  Found 463 candidate POIs
  Generated 8 interactions

Processing user 2/21: Chloe (Yishun)
  Found 126 candidate POIs
  Generated 5 interactions

Processing user 3/21: Lucas (Jurong East)
  Found 115 candidate POIs
  Generated 5 interactions

Processing user 4/21: Ethan (Yishun)
  Found 424 candidate POIs
  Generated 19 interactions

Processing user 5/21: Maya (Bishan)
  Found 579 candidate POIs
  Generated 18 interactions

Processing user 6/21: Sophia (Bukit Timah)
  Found 577 candidate POIs
  Generated 14 interactions

Processing user 7/21: Maya (Ang Mo Kio)
  Found 55 candidate POIs
  Generated 19 interactions

Processing user 8/21: Kai (Clementi)
  Found 497 candidate POIs
  Generated 5 interactions

Processing user 9/21: Isla (Bukit Timah)
  Found 93 candidate POIs
  Generated 17 interactions

Processing user 10/21: Noah (Bedok)
  Found 166 candidate POIs
  Generated 12 interactions

Processing user 11/21