# In [16]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import json
from faker import Faker
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Initialize Faker for Indian context
fake = Faker('en_IN')

# Seed every random source the generator draws from so that runs are
# reproducible: the stdlib `random` module, NumPy's global RNG, and the Faker
# instance. Faker draws from its own random generator, which the first two
# seeds do not touch, so it must be seeded separately.
random.seed(42)
np.random.seed(42)
fake.seed_instance(42)

class IndianHotelDataGenerator:
    """Enhanced synthetic data generator with realistic correlations"""
    
    def __init__(self):
        self.indian_cities = {
            'Mumbai': {'state': 'Maharashtra', 'region': 'West', 'tier': 1, 'avg_price': 6000, 'veg_pct': 0.40},
            'Delhi': {'state': 'Delhi', 'region': 'North', 'tier': 1, 'avg_price': 5500, 'veg_pct': 0.50},
            'Bangalore': {'state': 'Karnataka', 'region': 'South', 'tier': 1, 'avg_price': 4500, 'veg_pct': 0.30},
            'Chennai': {'state': 'Tamil Nadu', 'region': 'South', 'tier': 1, 'avg_price': 4000, 'veg_pct': 0.35},
            'Kolkata': {'state': 'West Bengal', 'region': 'East', 'tier': 1, 'avg_price': 3500, 'veg_pct': 0.25},
            'Hyderabad': {'state': 'Telangana', 'region': 'South', 'tier': 1, 'avg_price': 4200, 'veg_pct': 0.32},
            'Pune': {'state': 'Maharashtra', 'region': 'West', 'tier': 1, 'avg_price': 4800, 'veg_pct': 0.38},
            'Jaipur': {'state': 'Rajasthan', 'region': 'North', 'tier': 2, 'avg_price': 3800, 'veg_pct': 0.45},
            'Ahmedabad': {'state': 'Gujarat', 'region': 'West', 'tier': 2, 'avg_price': 3700, 'veg_pct': 0.55},
            'Lucknow': {'state': 'Uttar Pradesh', 'region': 'North', 'tier': 2, 'avg_price': 3200, 'veg_pct': 0.48},
            'Kochi': {'state': 'Kerala', 'region': 'South', 'tier': 2, 'avg_price': 3400, 'veg_pct': 0.35}
        }
        
        self.amenities_hierarchy = {
            'essential': ['free_wifi', 'air_conditioning', 'parking'],
            'comfort': ['room_service', '24_7_front_desk', 'restaurant'],
            'premium': ['gym', 'pool', 'spa'],
            'business': ['business_center', 'conference_hall', 'airport_shuttle', 'meeting_rooms'],
            'cultural': ['vegetarian_restaurant', 'multilingual_staff', 'local_cuisine']
        }
        
        self.review_templates = {
            'positive': {
                'business': [
                    "Excellent business facilities and {amenity}. Professional service throughout. {cultural_aspect}",
                    "Perfect for business travel with great {amenity}. Location is convenient. {cultural_aspect}",
                    "Productive stay with reliable {amenity}. Staff understood business needs. {cultural_aspect}"
                ],
                'family': [
                    "Wonderful family experience with {amenity}. Kids loved the facilities. {cultural_aspect}",
                    "Great for families - {amenity} was perfect. Safe and comfortable environment. {cultural_aspect}",
                    "Family-friendly staff and excellent {amenity}. Will return with family. {cultural_aspect}"
                ],
                'leisure': [
                    "Relaxing stay with beautiful {amenity}. Peaceful atmosphere. {cultural_aspect}",
                    "Perfect vacation spot with amazing {amenity}. Great value for money. {cultural_aspect}",
                    "Enjoyable stay with wonderful {amenity}. Exceeded expectations. {cultural_aspect}"
                ],
                'couple': [
                    "Romantic getaway with excellent {amenity}. Perfect for couples. {cultural_aspect}",
                    "Beautiful ambiance and wonderful {amenity}. Great couple experience. {cultural_aspect}",
                    "Intimate setting with amazing {amenity}. Highly recommend for couples. {cultural_aspect}"
                ]
            },
            'negative': {
                'business': [
                    "Poor business facilities, {amenity} not working properly. Unprofessional service. {cultural_issue}",
                    "Disappointing for business travel - {amenity} was inadequate. Location inconvenient. {cultural_issue}",
                    "Unproductive stay due to faulty {amenity}. Staff not helpful for business needs. {cultural_issue}"
                ],
                'family': [
                    "Not suitable for families - {amenity} was broken. Safety concerns. {cultural_issue}",
                    "Children were disappointed with {amenity}. Not family-oriented service. {cultural_issue}",
                    "Family vacation ruined by poor {amenity} and unhelpful staff. {cultural_issue}"
                ],
                'leisure': [
                    "Terrible vacation experience - {amenity} was awful. No relaxation possible. {cultural_issue}",
                    "Overpriced for quality - {amenity} not worth it. Poor maintenance. {cultural_issue}",
                    "Disappointing stay with substandard {amenity}. Will not recommend. {cultural_issue}"
                ],
                'couple': [
                    "Poor experience for couples - {amenity} was disappointing. Not romantic at all. {cultural_issue}",
                    "Not suitable for couples - {amenity} was inadequate. No privacy. {cultural_issue}",
                    "Couple getaway ruined by faulty {amenity} and poor service. {cultural_issue}"
                ]
            }
        }

    def generate_realistic_user_profiles(self, n_users=300):
        """Generate user profiles with realistic correlations"""
        users = []
        
        for i in range(n_users):
            # Start with age group and derive other attributes
            age_group = np.random.choice(['22-30', '30-40', '40-50', '50-60'], p=[0.35, 0.35, 0.20, 0.10])
            
            # Income correlates with age
            if age_group == '22-30':
                income = np.random.choice(['3-6L', '6-10L', '10-15L'], p=[0.60, 0.30, 0.10])
            elif age_group == '30-40':
                income = np.random.choice(['6-10L', '10-15L', '15L+'], p=[0.40, 0.40, 0.20])
            elif age_group == '40-50':
                income = np.random.choice(['10-15L', '15L+', '25L+'], p=[0.30, 0.50, 0.20])
            else:  # 50-60
                income = np.random.choice(['15L+', '25L+'], p=[0.60, 0.40])
            
            # Family type correlates with age
            if age_group == '22-30':
                family_type = np.random.choice(['Solo', 'Couple'], p=[0.60, 0.40])
            elif age_group == '30-40':
                family_type = np.random.choice(['Couple', 'Family_with_kids'], p=[0.50, 0.50])
            else:
                family_type = np.random.choice(['Couple', 'Family_with_kids'], p=[0.70, 0.30])
            
            # Budget based on income and family type
            budget_ranges = {
                '3-6L': (1500, 4000), '6-10L': (2500, 6000), 
                '10-15L': (3500, 8000), '15L+': (5000, 12000), '25L+': (8000, 20000)
            }
            base_min, base_max = budget_ranges[income]
            
            # Family multiplier
            family_multiplier = 1.0 if family_type == 'Solo' else (1.2 if family_type == 'Couple' else 1.5)
            budget_min = int(base_min * family_multiplier)
            budget_max = int(base_max * family_multiplier)
            
            # Business travel frequency correlates with income and age
            if income in ['15L+', '25L+'] and age_group in ['30-40', '40-50']:
                business_freq = np.random.beta(3, 2)  # Skewed toward higher values
            else:
                business_freq = np.random.beta(1, 3)  # Skewed toward lower values
            
            # City distribution (realistic population representation)
            home_city = np.random.choice(
                list(self.indian_cities.keys()),
                p=[0.18, 0.17, 0.14, 0.12, 0.10, 0.09, 0.08, 0.05, 0.03, 0.03, 0.01]
            )
            
            # Vegetarian preference correlates with region
            city_veg_pct = self.indian_cities[home_city]['veg_pct']
            is_vegetarian = random.random() < city_veg_pct
            
            user = {
                'user_id': i + 1,
                'age_group': age_group,
                'gender': random.choice(['Male', 'Female']),
                'home_city': home_city,
                'home_state': self.indian_cities[home_city]['state'],
                'income_bracket_inr': income,
                'family_type': family_type,
                'budget_min_inr': budget_min,
                'budget_max_inr': budget_max,
                'vegetarian_preference': is_vegetarian,
                'business_travel_frequency': round(business_freq, 2),
                'travel_frequency': np.random.choice(['Occasional', 'Regular', 'Frequent'], p=[0.50, 0.35, 0.15]),
                'location_preference': np.random.choice(['City_center', 'Transport_hub', 'Quiet_area'], p=[0.40, 0.35, 0.25]),
                'booking_advance_days': int(np.random.exponential(10)) + 1  # Realistic booking patterns
            }
            
            # Preferred amenities based on user profile
            if family_type == 'Family_with_kids':
                preferred = ['pool', 'family_rooms', 'parking']
            elif business_freq > 0.5:
                preferred = ['free_wifi', 'business_center', 'airport_shuttle', 'meeting_rooms']
            elif income in ['15L+', '25L+']:
                preferred = ['spa', 'gym', 'room_service']
            else:
                preferred = ['free_wifi', 'restaurant', 'parking']
            
            user['preferred_amenities'] = json.dumps(preferred)
            users.append(user)
        
        return pd.DataFrame(users)

    def _generate_realistic_star_rating(self, tier):
        """Generate integer star ratings with realistic distribution"""
        if tier == 1:
            # More 4-5 star hotels in tier 1 cities (only integers 3, 4, 5)
            return int(np.random.choice([3, 4, 5], p=[0.15, 0.45, 0.40]))
        else:
            # More 3-4 star hotels in tier 2 cities (only integers 3, 4, 5)
            return int(np.random.choice([3, 4, 5], p=[0.50, 0.40, 0.10]))

    def _generate_correlated_amenities(self, star_rating, city_info):
        """Generate amenities that correlate with star rating and city"""
        amenities = {}
        
        # Essential amenities (high probability)
        amenities['free_wifi'] = random.random() < 0.95 if star_rating >= 3 else random.random() < 0.7
        amenities['air_conditioning'] = random.random() < 0.98 if star_rating >= 3 else random.random() < 0.8
        amenities['parking'] = random.random() < 0.85
        
        # Comfort amenities (correlate with star rating)
        amenities['room_service'] = random.random() < (0.3 + (star_rating - 2.5) / 2.5 * 0.6)
        amenities['24_7_front_desk'] = random.random() < (0.4 + (star_rating - 2.5) / 2.5 * 0.5)
        amenities['restaurant'] = random.random() < (0.5 + (star_rating - 2.5) / 2.5 * 0.4)
        
        # Premium amenities (strong correlation with star rating)
        amenities['gym'] = random.random() < (0.1 + (star_rating - 2.5) / 2.5 * 0.8)
        amenities['pool'] = random.random() < (0.05 + (star_rating - 2.5) / 2.5 * 0.7)
        amenities['spa'] = random.random() < (0.02 + (star_rating - 2.5) / 2.5 * 0.6)
        
        # Business amenities (correlate with city tier)
        business_prob = 0.3 + (city_info['tier'] - 1) * 0.4
        amenities['business_center'] = random.random() < business_prob
        amenities['conference_hall'] = random.random() < (business_prob * 0.8)
        amenities['airport_shuttle'] = random.random() < (0.2 + (city_info['tier'] - 1) * 0.3)
        # Meeting rooms - most hotels with business centers have meeting rooms
        if amenities['business_center']:
            amenities['meeting_rooms'] = random.random() < 0.85  # 85% chance if business center exists
        else:
            amenities['meeting_rooms'] = random.random() < (business_prob * 0.4)  # Lower chance otherwise
        
        # Cultural amenities
        amenities['vegetarian_restaurant'] = random.random() < city_info['veg_pct']
        amenities['multilingual_staff'] = random.random() < (0.4 + (star_rating - 2.5) / 2.5 * 0.4)
        amenities['local_cuisine'] = random.random() < 0.7
        
        return amenities

    def _select_property_type(self, star_rating):
        """Select property type based on star rating"""
        if star_rating >= 4.5:
            return random.choice(['Luxury Hotel', '5-Star Resort', 'Business Hotel'])
        elif star_rating >= 4.0:
            return random.choice(['Hotel', 'Resort', 'Business Hotel'])
        elif star_rating >= 3.0:
            return random.choice(['Hotel', 'Boutique Hotel', 'Service Apartment'])
        else:
            return random.choice(['Budget Hotel', 'Guesthouse', 'Lodge'])

    def _create_hotel_placeholder(self, n_hotels):
        """Create more realistic hotel data with correlations"""
        hotels = []
        
        for i in range(n_hotels):
            city = random.choice(list(self.indian_cities.keys()))
            city_info = self.indian_cities[city]
            
            # Realistic star rating distribution
            star_rating = self._generate_realistic_star_rating(city_info['tier'])
            
            # Price based on city, star rating, and amenities
            base_price = city_info['avg_price']
            price_multiplier = 0.8 + (star_rating - 2.5) / 2.5 * 0.8  # 0.8-1.6 multiplier based on stars
            price = int(base_price * price_multiplier * random.uniform(0.9, 1.1))
            
            # Generate amenities with realistic correlations
            amenities = self._generate_correlated_amenities(star_rating, city_info)
            
            hotel = {
                'hotel_id': i + 1,
                'name': f"{random.choice(['Grand', 'Royal', 'Taj', 'Leela', 'ITC'])} {fake.company()} {city}",
                'city': city,
                'state': city_info['state'],
                'star_rating': star_rating,
                'price_per_night_inr': price,
                'property_type': self._select_property_type(star_rating),
                'total_rooms': random.randint(20, 300),
                'year_established': random.randint(1990, 2022),
                **amenities
            }
            
            hotels.append(hotel)
        
        return pd.DataFrame(hotels)

    def generate_realistic_interactions(self, hotels_df, users_df, target_interactions=1500):
        """Generate interactions with realistic user-hotel matching"""
        interactions = []
        
        # Ensure minimum interactions per user for collaborative filtering
        min_interactions_per_user = 3
        remaining_interactions = target_interactions - (len(users_df) * min_interactions_per_user)
        
        interaction_id = 1
        
        for _, user in users_df.iterrows():
            user_interactions = []
            
            # Filter hotels matching user preferences
            suitable_hotels = self._filter_suitable_hotels(hotels_df, user)
            
            # Guaranteed minimum interactions
            for _ in range(min_interactions_per_user):
                if len(suitable_hotels) > 0:
                    hotel = suitable_hotels.sample(1).iloc[0]
                    interaction = self._create_interaction(
                        interaction_id, user, hotel, 
                        datetime(2023, 1, 1) + timedelta(days=random.randint(0, 365))
                    )
                    interactions.append(interaction)
                    user_interactions.append(interaction)
                    interaction_id += 1
            
            # Additional interactions for active users
            if user['travel_frequency'] in ['Regular', 'Frequent']:
                extra_interactions = random.randint(1, 4)
                for _ in range(extra_interactions):
                    if remaining_interactions > 0 and len(suitable_hotels) > 0:
                        hotel = suitable_hotels.sample(1).iloc[0]
                        interaction = self._create_interaction(
                            interaction_id, user, hotel,
                            datetime(2023, 1, 1) + timedelta(days=random.randint(0, 365))
                        )
                        interactions.append(interaction)
                        interaction_id += 1
                        remaining_interactions -= 1
        
        return pd.DataFrame(interactions)
    
    def _filter_suitable_hotels(self, hotels_df, user):
        """Filter hotels that match user preferences"""
        filtered = hotels_df.copy()
        
        # Budget filtering (strict)
        filtered = filtered[
            (filtered['price_per_night_inr'] >= user['budget_min_inr']) &
            (filtered['price_per_night_inr'] <= user['budget_max_inr'])
        ]
        
        # Vegetarian preference (if user is vegetarian, prefer hotels with veg restaurants)
        if user['vegetarian_preference']:
            veg_hotels = filtered[filtered['vegetarian_restaurant'] == True]
            if len(veg_hotels) > 0:
                # 80% chance to book veg-friendly hotel
                if random.random() < 0.8:
                    filtered = veg_hotels
        
        # Location preference
        preferred_amenities = json.loads(user['preferred_amenities'])
        for amenity in preferred_amenities:
            if amenity in filtered.columns:
                amenity_hotels = filtered[filtered[amenity] == True]
                if len(amenity_hotels) > 0:
                    # 70% chance to prefer hotels with required amenities
                    if random.random() < 0.7:
                        filtered = amenity_hotels
        
        return filtered if len(filtered) > 0 else hotels_df.sample(min(10, len(hotels_df)))
    
    def _create_interaction(self, interaction_id, user, hotel, interaction_date):
        """Create single interaction with realistic rating"""
        # Base rating from hotel quality
        base_rating = hotel['star_rating']
        
        # Adjust rating based on user-hotel fit
        price_fit = 1.0 if (user['budget_min_inr'] <= hotel['price_per_night_inr'] <= user['budget_max_inr']) else 0.5
        
        # Vegetarian fit bonus
        veg_fit = 1.0
        if user['vegetarian_preference'] and hotel['vegetarian_restaurant']:
            veg_fit = 1.2
        elif user['vegetarian_preference'] and not hotel['vegetarian_restaurant']:
            veg_fit = 0.7
        
        # Amenity fit
        preferred_amenities = json.loads(user['preferred_amenities'])
        amenity_matches = sum([1 for amenity in preferred_amenities if hotel.get(amenity, False)])
        amenity_fit = 0.8 + (amenity_matches / len(preferred_amenities)) * 0.4
        
        # Calculate final rating
        adjusted_rating = base_rating * price_fit * veg_fit * amenity_fit
        # Add some randomness
        final_rating = max(1.0, min(5.0, adjusted_rating + random.uniform(-0.8, 0.8)))
        
        interaction = {
            'interaction_id': interaction_id,
            'user_id': user['user_id'],
            'hotel_id': hotel['hotel_id'],
            'interaction_type': np.random.choice(['Booked', 'Reviewed'], p=[0.65, 0.35]),
            'interaction_date': interaction_date.strftime('%Y-%m-%d'),
            'rating': round(final_rating, 1),
            'total_amount_inr': hotel['price_per_night_inr'] * random.randint(1, 5)
        }
        
        # Enhanced attributes
        if interaction['interaction_type'] == 'Booked':
            interaction['stay_duration'] = random.randint(1, 7)
            interaction['travel_purpose'] = self._determine_travel_purpose(user)
            interaction['advance_booking_days'] = max(0, int(np.random.exponential(user['booking_advance_days'])))
            interaction['cancellation_flag'] = random.random() < 0.08  # 8% cancellation
        
        return interaction
    
    def _determine_travel_purpose(self, user):
        """Determine travel purpose based on user profile"""
        business_prob = user['business_travel_frequency']
        if random.random() < business_prob:
            return 'Business'
        elif user['family_type'] == 'Family_with_kids':
            return 'Family_vacation'
        else:
            return 'Leisure'
    
    def generate_diverse_reviews(self, hotels_df, interactions_df, users_df, n_reviews=700):
        """Generate more diverse and realistic reviews"""
        reviews = []
        
        # Get review interactions only
        review_interactions = interactions_df[interactions_df['interaction_type'] == 'Reviewed'].copy()
        
        # Enhanced cultural context with more regional diversity
        cultural_contexts = {
            'positive': {
                'North': [
                    "Excellent vegetarian thali available", "Staff spoke fluent Hindi", 
                    "Rajasthani hospitality at its best", "Authentic Punjabi cuisine",
                    "Traditional North Indian welcome with garland"
                ],
                'South': [
                    "Great South Indian breakfast with fresh filter coffee", 
                    "Comfortable for vegetarian families", "Traditional South Indian welcome",
                    "Authentic Chettinad cuisine experience", "Beautiful temple architecture influence"
                ],
                'West': [
                    "Business-friendly atmosphere", "Excellent Gujarati/Marathi food options", 
                    "Efficient service like Mumbai", "Great for business travelers",
                    "Authentic Maharashtrian thali available"
                ],
                'East': [
                    "Bengali sweets available in restaurant", "Cultural sensitivity appreciated", 
                    "Fish curry was authentic Bengali style", "Traditional Durga Puja decorations",
                    "Excellent tea selection from Assam and Darjeeling"
                ]
            },
            'negative': {
                'North': [
                    "No proper North Indian food options", "Language communication issues with staff", 
                    "Not suitable for vegetarian guests", "Lacked authentic North Indian hospitality",
                    "No regional cuisine options available"
                ],
                'South': [
                    "Limited South Indian options in breakfast", "No Tamil/Telugu speaking staff", 
                    "Breakfast not region-appropriate", "Lacked South Indian cultural elements",
                    "No authentic filter coffee available"
                ],
                'West': [
                    "No Gujarati/Marathi food options", "Business facilities inadequate for Mumbai standards", 
                    "Cultural context missing", "Not suitable for business travelers",
                    "Service not up to Western India expectations"
                ],
                'East': [
                    "No Bengali cuisine options", "Fish preparations not available despite request", 
                    "Regional preferences ignored", "Lacked Eastern cultural sensitivity",
                    "No authentic Eastern Indian tea varieties"
                ]
            }
        }
        
        # Regional specific keywords
        regional_keywords = {
            'North': ['chole bhature', 'butter chicken', 'naan', 'tandoor', 'rajasthani', 'punjabi'],
            'South': ['dosa', 'idli', 'sambar', 'filter coffee', 'chettinad', 'hyderabadi'],
            'West': ['vada pav', 'dhokla', 'pav bhaji', 'gujarati', 'marathi', 'mumbai style'],
            'East': ['rosogolla', 'machher jhol', 'mishti doi', 'bengali', 'assamese', 'fish curry']
        }
        
        for _, interaction in review_interactions.iterrows():
            if len(reviews) >= n_reviews:
                break
            
            hotel = hotels_df[hotels_df['hotel_id'] == interaction['hotel_id']].iloc[0]
            user = users_df[users_df['user_id'] == interaction['user_id']].iloc[0]
            
            # Determine review sentiment
            is_positive = interaction['rating'] >= 3.5
            sentiment_category = 'positive' if is_positive else 'negative'
            
            # Select travel context
            travel_contexts = ['business', 'family', 'leisure', 'couple']
            if user['business_travel_frequency'] > 0.6:
                travel_context = 'business'
            elif user['family_type'] == 'Family_with_kids':
                travel_context = 'family'
            elif user['family_type'] == 'Couple' and random.random() < 0.7:
                travel_context = 'couple'
            else:
                travel_context = 'leisure'
            
            # Add couple context if needed
            if travel_context == 'couple' and 'couple' not in self.review_templates[sentiment_category]:
                travel_context = 'leisure'  # Fallback
            
            # Generate review text
            template = random.choice(self.review_templates[sentiment_category][travel_context])
            
            # Select amenity to mention
            available_amenities = [amenity for amenity in self.amenities_hierarchy['essential'] + 
                                 self.amenities_hierarchy['comfort'] + self.amenities_hierarchy['premium'] 
                                 if hotel.get(amenity, False)]
            amenity_mention = random.choice(available_amenities) if available_amenities else 'service'
            
            # Cultural context
            region = self.indian_cities[hotel['city']]['region']
            cultural_aspects = cultural_contexts[sentiment_category][region]
            cultural_mention = random.choice(cultural_aspects)
            
            # Add regional keywords
            regional_terms = regional_keywords[region]
            regional_mention = random.choice(regional_terms) if random.random() < 0.6 else ""
            
            review_text = template.format(
                amenity=amenity_mention.replace('_', ' '),
                cultural_aspect=cultural_mention if is_positive else '',
                cultural_issue=cultural_mention if not is_positive else ''
            )
            
            # Add regional mention if not already included
            if regional_mention and regional_mention not in review_text.lower():
                if is_positive:
                    review_text += f" The {regional_mention} was particularly excellent."
                else:
                    review_text += f" They didn't even have proper {regional_mention}."
            
            # Generate aspect ratings correlated with overall rating
            base_rating = interaction['rating']
            aspect_variance = 0.5
            
            review = {
                'review_id': len(reviews) + 1,
                'hotel_id': interaction['hotel_id'],
                'user_id': interaction['user_id'],
                'review_text': review_text,
                'overall_rating': interaction['rating'],
                'review_date': interaction['interaction_date'],
                'reviewer_type': travel_context.title(),
                'sentiment_score': round((interaction['rating'] - 3) / 2, 2),  # -1 to 1 scale
                'cleanliness_rating': round(max(1, min(5, base_rating + random.uniform(-aspect_variance, aspect_variance))), 1),
                'service_rating': round(max(1, min(5, base_rating + random.uniform(-aspect_variance, aspect_variance))), 1),
                'location_rating': round(max(1, min(5, base_rating + random.uniform(-0.3, 0.3))), 1),
                'value_rating': round(max(1, min(5, base_rating + random.uniform(-0.7, 0.3))), 1),
                'language': 'en'
            }
            
            # Extract keywords with more diversity
            positive_keywords = ['excellent', 'great', 'wonderful', 'perfect', 'amazing', 'outstanding', 'superb', 'fantastic', 'delightful']
            negative_keywords = ['poor', 'terrible', 'disappointing', 'awful', 'inadequate', 'horrible', 'unacceptable', 'substandard', 'lacking']
            
            review['positive_keywords'] = json.dumps([word for word in positive_keywords if word in review_text.lower()])
            review['negative_keywords'] = json.dumps([word for word in negative_keywords if word in review_text.lower()])
            review['cultural_mentions'] = json.dumps([cultural_mention])
            
            # Review length category
            word_count = len(review_text.split())
            if word_count < 20:
                review['review_length_category'] = 'Short'
            elif word_count < 50:
                review['review_length_category'] = 'Medium'
            else:
                review['review_length_category'] = 'Long'
            
            reviews.append(review)
        
        return pd.DataFrame(reviews)
    
    def generate_booking_trends(self):
        """Generate realistic booking trends"""
        trends = []
        
        # Indian travel seasons
        season_multipliers = {
            1: 1.2, 2: 1.3, 3: 1.4,  # Winter peak (Jan-Mar)
            4: 0.9, 5: 0.8, 6: 0.7,  # Summer decline (Apr-Jun) 
            7: 0.6, 8: 0.6, 9: 0.7,  # Monsoon low (Jul-Sep)
            10: 1.5, 11: 1.6, 12: 1.4  # Festival/Winter peak (Oct-Dec)
        }
        
        for city in self.indian_cities.keys():
            for month in range(1, 13):
                base_volume = 1.0
                seasonal_factor = season_multipliers[month]
                
                # City-specific adjustments
                if city in ['Mumbai', 'Delhi', 'Bangalore']:  # Business hubs
                    business_factor = 1.1
                else:
                    business_factor = 0.9
                
                final_volume = base_volume * seasonal_factor * business_factor
                
                trend = {
                    'city': city,
                    'month': month,
                    'booking_volume_index': round(final_volume, 2),
                    'price_premium_factor': round(1.0 + (final_volume - 1.0) * 0.3, 2),  # Price follows demand
                    'popular_traveler_type': 'Family' if month in [10, 11, 12, 1] else 'Business'
                }
                
                trends.append(trend)
        
        return pd.DataFrame(trends)
    
    def generate_sample_queries(self, users_df, hotels_df, n_queries=200):
        """Build a table of synthetic hotel-search queries, one row per query.

        Every query is attributed to a user drawn at random from ``users_df``.
        The user's profile selects the query type and the amenity pool the
        required amenities are sampled from:

        * ``business_travel_frequency`` above 0.6 -> ``'business'``
        * otherwise ``family_type == 'Family_with_kids'`` -> ``'family'``
        * anything else -> ``'leisure'``

        Args:
            users_df: User profiles. Columns read here: ``user_id``,
                ``home_city``, ``family_type``, ``business_travel_frequency``
                and ``budget_max_inr``.
            hotels_df: Not read by this method.
            n_queries: Number of query rows to produce.

        Returns:
            pandas.DataFrame with columns ``query_id``, ``user_id``,
            ``query_city``, ``check_in_date``, ``duration_nights``,
            ``num_guests``, ``min_star_rating``, ``max_price``,
            ``required_amenities`` (JSON-encoded list) and ``query_type``.
        """
        first_day = datetime(2023, 1, 1)
        rows = []

        for query_id in range(1, n_queries + 1):
            profile = users_df.sample(1).iloc[0]

            # Frequent business travellers win over family status.
            if profile['business_travel_frequency'] > 0.6:
                kind = 'business'
                amenity_pool = ['free_wifi', 'business_center', 'air_conditioning', 'meeting_rooms']
            elif profile['family_type'] == 'Family_with_kids':
                kind = 'family'
                amenity_pool = ['family_rooms', 'pool', 'restaurant']
            else:
                kind = 'leisure'
                amenity_pool = ['spa', 'gym', 'pool']

            # 70% of searches target the user's home city, the rest any
            # configured city (which may again be the home city).
            if random.random() < 0.7:
                destination = profile['home_city']
            else:
                destination = random.choice(list(self.indian_cities))

            # NOTE: the draws below are kept in this exact order so that a
            # seeded `random` module yields the same rows as before.
            check_in = first_day + timedelta(days=random.randint(0, 365))
            nights = random.randint(1, 7)

            household = profile['family_type']
            if household == 'Solo':
                guests = 1
            elif household == 'Couple':
                guests = 2
            else:
                guests = random.randint(3, 4)

            # Star floor ends up between 2.5 and 4.5.
            star_floor = max(2.5, random.uniform(2.5, 5.0) - 0.5)
            # Price ceiling is 90%-120% of the user's stated maximum budget.
            price_cap = profile['budget_max_inr'] * random.uniform(0.9, 1.2)
            wanted_count = random.randint(1, 3)
            wanted = random.sample(amenity_pool, wanted_count)

            rows.append({
                'query_id': query_id,
                'user_id': profile['user_id'],
                'query_city': destination,
                'check_in_date': check_in.strftime('%Y-%m-%d'),
                'duration_nights': nights,
                'num_guests': guests,
                'min_star_rating': star_floor,
                'max_price': price_cap,
                'required_amenities': json.dumps(wanted),
                'query_type': kind,
            })

        return pd.DataFrame(rows)
    
    def _generate_cities_context(self):
        """Generate Indian cities context data"""
        cities_data = []
        
        for city, info in self.indian_cities.items():
            major_languages = {
                'Mumbai': ['Hindi', 'Marathi', 'English'],
                'Delhi': ['Hindi', 'English', 'Punjabi'],
                'Bangalore': ['Kannada', 'English', 'Hindi'],
                'Chennai': ['Tamil', 'English', 'Hindi'],
                'Kolkata': ['Bengali', 'Hindi', 'English'],
                'Hyderabad': ['Telugu', 'Hindi', 'English'],
                'Pune': ['Marathi', 'Hindi', 'English'],
                'Jaipur': ['Hindi', 'Rajasthani', 'English'],
                'Ahmedabad': ['Gujarati', 'Hindi', 'English'],
                'Lucknow': ['Hindi', 'Urdu', 'English'],
                'Kochi': ['Malayalam', 'English', 'Hindi']
            }
            
            city_data = {
                'city_name': city,
                'state': info['state'],
                'region': info['region'],
                'tier': info['tier'],
                'vegetarian_percentage': info['veg_pct'],
                'major_languages': json.dumps(major_languages.get(city, ['Hindi', 'English'])),
                'business_hub_score': 0.9 if info['tier'] == 1 else 0.7,
                'average_hotel_price_inr': info['avg_price'],
                'transport_connectivity_score': random.uniform(0.7, 0.95)
            }
            
            cities_data.append(city_data)
        
        return pd.DataFrame(cities_data)
    
    def _validate_data_quality(self, users_df, hotels_df, interactions_df, reviews_df):
        """Enhanced data quality validation for ML algorithms"""
        print("\nEnhanced Data Quality Validation:")
        
        # Check interaction distribution
        interactions_per_user = interactions_df.groupby('user_id').size()
        print(f"- Interactions per user: min={interactions_per_user.min()}, max={interactions_per_user.max()}, avg={interactions_per_user.mean():.1f}")
        
        # Check rating distribution
        rating_dist = interactions_df['rating'].value_counts().sort_index()
        print(f"- Rating distribution: {dict(rating_dist)}")
        
        # Check sparsity
        n_users = len(users_df)
        n_hotels = len(hotels_df)
        n_interactions = len(interactions_df)
        sparsity = 1 - (n_interactions / (n_users * n_hotels))
        print(f"- Matrix sparsity: {sparsity:.3f} ({sparsity*100:.1f}% empty)")
        
        # Check budget alignment
        budget_violations = 0
        price_differences = []
        
        for _, interaction in interactions_df.iterrows():
            user = users_df[users_df['user_id'] == interaction['user_id']].iloc[0]
            hotel = hotels_df[hotels_df['hotel_id'] == interaction['hotel_id']].iloc[0]
            
            if not (user['budget_min_inr'] <= hotel['price_per_night_inr'] <= user['budget_max_inr']):
                budget_violations += 1
                price_differences.append(hotel['price_per_night_inr'] - user['budget_max_inr'] 
                                       if hotel['price_per_night_inr'] > user['budget_max_inr'] 
                                       else user['budget_min_inr'] - hotel['price_per_night_inr'])
        
        print(f"- Budget violations: {budget_violations}/{n_interactions} ({budget_violations/n_interactions*100:.1f}%)")
        if budget_violations > 0:
            print(f"  Average violation amount: ₹{sum(price_differences)/len(price_differences):.0f}")
        
        # Check review distribution
        reviews_per_hotel = reviews_df.groupby('hotel_id').size()
        print(f"- Reviews per hotel: min={reviews_per_hotel.min()}, max={reviews_per_hotel.max()}, avg={reviews_per_hotel.mean():.1f}")
        
        # Check sentiment distribution
        positive_reviews = len(reviews_df[reviews_df['sentiment_score'] > 0])
        negative_reviews = len(reviews_df[reviews_df['sentiment_score'] < 0])
        neutral_reviews = len(reviews_df[reviews_df['sentiment_score'] == 0])
        print(f"- Sentiment distribution: Positive={positive_reviews}, Negative={negative_reviews}, Neutral={neutral_reviews}")
        
        # Check feature correlations
        print("\nFeature Correlation Analysis:")
        
        # User budget vs hotel price correlation
        user_budget_avg = users_df['budget_max_inr'].mean()
        hotel_price_avg = hotels_df['price_per_night_inr'].mean()
        print(f"- Average user budget: ₹{user_budget_avg:.0f} vs Average hotel price: ₹{hotel_price_avg:.0f}")
        
        # Warn about collaborative filtering viability
        if sparsity > 0.99:
            print("⚠️  WARNING: Matrix too sparse for effective collaborative filtering!")
        if interactions_per_user.min() < 3:
            print("⚠️  WARNING: Some users have <3 interactions - cold start issues expected!")
        if reviews_per_hotel.min() < 2:
            print("⚠️  WARNING: Some hotels have <2 reviews - content-based filtering recommended!")
    
    def add_data_quality_metrics(self, datasets):
        """Attach per-user and per-hotel activity counts to the datasets.

        Three columns are added by left-merging group sizes onto the target
        frame, with rows that have no matching records set to 0:

        * ``users.interaction_count``: interactions per ``user_id``
        * ``hotels.popularity_score``: interactions per ``hotel_id``
        * ``hotels.review_count``: reviews per ``hotel_id``

        The filled column is assigned back explicitly. The earlier
        ``df[col].fillna(0, inplace=True)`` form is chained in-place
        assignment, which does not update the frame when pandas
        Copy-on-Write is active and would leave NaN counts behind.

        Args:
            datasets: Dict holding at least the ``'users'``, ``'hotels'``,
                ``'interactions'`` and ``'reviews'`` DataFrames. The
                ``'users'`` and ``'hotels'`` entries are replaced in place.

        Returns:
            The same ``datasets`` dict, for call chaining.
        """
        def attach_count(target_key, source_key, id_column, count_name):
            """Merge the size of each ``id_column`` group onto the target frame."""
            counts = (
                datasets[source_key]
                .groupby(id_column)
                .size()
                .reset_index(name=count_name)
            )
            merged = datasets[target_key].merge(counts, on=id_column, how='left')
            # Ids absent from the source frame come back as NaN from the left merge.
            merged[count_name] = merged[count_name].fillna(0)
            datasets[target_key] = merged

        # Add user activity level
        attach_count('users', 'interactions', 'user_id', 'interaction_count')
        # Add hotel popularity
        attach_count('hotels', 'interactions', 'hotel_id', 'popularity_score')
        # Add review counts
        attach_count('hotels', 'reviews', 'hotel_id', 'review_count')

        return datasets
    
    def generate_complete_dataset(self, n_users=300, n_hotels=400, n_interactions=1500,
                                  n_reviews=700, n_queries=200):
        """Generate all datasets with proper correlations.

        Runs the individual generators in a fixed order (later steps consume
        the frames produced by earlier ones), prints a progress line before
        each step, runs the data-quality report, and finally attaches the
        activity-count columns via ``add_data_quality_metrics``.

        The size arguments were previously hard-coded; the defaults reproduce
        the earlier behaviour exactly.

        Args:
            n_users: Size argument forwarded to
                ``generate_realistic_user_profiles``.
            n_hotels: Size argument forwarded to ``_create_hotel_placeholder``.
            n_interactions: Size argument forwarded to
                ``generate_realistic_interactions``.
                NOTE(review): the generator may treat this as a target rather
                than an exact row count - confirm against its definition.
            n_reviews: Size argument forwarded to ``generate_diverse_reviews``.
            n_queries: Number of search queries requested from
                ``generate_sample_queries``.

        Returns:
            Dict of DataFrames keyed ``'users'``, ``'hotels'``,
            ``'interactions'``, ``'reviews'``, ``'trends'``, ``'cities'`` and
            ``'queries'``, as returned by ``add_data_quality_metrics``.
        """
        print("Generating enhanced synthetic datasets...")

        # Generate datasets
        print("1. Creating user profiles with realistic correlations...")
        users_df = self.generate_realistic_user_profiles(n_users)

        print("2. Creating realistic hotel data with amenities correlations...")
        hotels_df = self._create_hotel_placeholder(n_hotels)

        print("3. Generating user-hotel interactions with preference matching...")
        interactions_df = self.generate_realistic_interactions(hotels_df, users_df, n_interactions)

        print("4. Creating diverse reviews with cultural context...")
        reviews_df = self.generate_diverse_reviews(hotels_df, interactions_df, users_df, n_reviews)

        print("5. Generating booking trends...")
        trends_df = self.generate_booking_trends()

        print("6. Creating Indian cities context...")
        cities_df = self._generate_cities_context()

        print("7. Generating sample search queries...")
        queries_df = self.generate_sample_queries(users_df, hotels_df, n_queries)

        # Data quality validation (prints a report; runs before the count
        # columns are merged onto the users/hotels frames)
        self._validate_data_quality(users_df, hotels_df, interactions_df, reviews_df)

        datasets = {
            'users': users_df,
            'hotels': hotels_df,
            'interactions': interactions_df,
            'reviews': reviews_df,
            'trends': trends_df,
            'cities': cities_df,
            'queries': queries_df,
        }

        # Add data quality metrics
        print("8. Adding data quality metrics...")
        datasets = self.add_data_quality_metrics(datasets)

        return datasets

def save_datasets(datasets, output_dir='data/processed/'):
    """Save all datasets to CSV files.

    Each DataFrame is written without its index into ``output_dir``, which
    is created if it does not exist. The well-known dataset keys map to fixed
    file names; any other key is written to ``<key>.csv`` rather than raising
    ``KeyError`` after some files have already been written.

    Args:
        datasets: Mapping of dataset name to DataFrame.
        output_dir: Destination directory.

    Returns:
        None. One confirmation line per file is printed to stdout.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    file_mapping = {
        'users': 'user_profiles.csv',
        'hotels': 'hotels_master.csv',
        'interactions': 'user_hotel_interactions.csv',
        'reviews': 'hotel_reviews.csv',
        'trends': 'booking_trends.csv',
        'cities': 'indian_cities_data.csv',
        'queries': 'search_queries.csv',
    }

    for dataset_name, df in datasets.items():
        # Keys without a dedicated file name fall back to "<key>.csv".
        filename = file_mapping.get(dataset_name, f"{dataset_name}.csv")
        filepath = os.path.join(output_dir, filename)
        df.to_csv(filepath, index=False)
        print(f"Saved {len(df)} records to {filepath}")

if __name__ == "__main__":
    # Build every dataset in memory, then write them out as CSV files.
    generator = IndianHotelDataGenerator()
    datasets = generator.generate_complete_dataset()
    save_datasets(datasets)

    # Completion banner followed by one record-count line per dataset.
    banner = "=" * 60
    print("\n" + banner)
    print("ENHANCED SYNTHETIC DATA GENERATION COMPLETED")
    print(banner)
    summary_rows = (
        ("Users", "users", "profiles"),
        ("Hotels", "hotels", "properties"),
        ("Interactions", "interactions", "records"),
        ("Reviews", "reviews", "reviews"),
        ("Cities", "cities", "contexts"),
        ("Trends", "trends", "records"),
        ("Queries", "queries", "search queries"),
    )
    for label, key, unit in summary_rows:
        print(f"{label}: {len(datasets[key])} {unit}")

    # Categorical breakdowns of the three main tables.
    user_table = datasets['users']
    hotel_table = datasets['hotels']
    interaction_table = datasets['interactions']

    print("\nQuick Data Overview:")
    print("User Demographics:")
    print(user_table['age_group'].value_counts())
    print("\nHotel Star Distribution:")
    print(hotel_table['star_rating'].value_counts().sort_index())
    print("\nInteraction Types:")
    print(interaction_table['interaction_type'].value_counts())

    # Share of user x hotel cells that have no interaction.
    total_users = len(user_table)
    filled_fraction = len(interaction_table) / (total_users * len(hotel_table))
    sparsity = 1 - filled_fraction
    print(f"\nMatrix Sparsity: {sparsity:.3f} ({sparsity*100:.1f}% empty cells)")

    # Users with at least five interactions; `interaction_count` is the
    # column attached by add_data_quality_metrics.
    active_users = int((user_table['interaction_count'] >= 5).sum())
    print(f"Active users (5+ interactions): {active_users}/{total_users} ({active_users/total_users*100:.1f}%)")

Generating enhanced synthetic datasets...
1. Creating user profiles with realistic correlations...
2. Creating realistic hotel data with amenities correlations...
3. Generating user-hotel interactions with preference matching...
4. Creating diverse reviews with cultural context...
5. Generating booking trends...
6. Creating Indian cities context...
7. Generating sample search queries...

Enhanced Data Quality Validation:
- Interactions per user: min=3, max=7, avg=4.3
- Rating distribution: {1.0: np.int64(6), 1.2: np.int64(3), 1.4: np.int64(2), 1.5: np.int64(2), 1.6: np.int64(1), 1.7: np.int64(4), 1.8: np.int64(2), 1.9: np.int64(2), 2.0: np.int64(4), 2.1: np.int64(2), 2.2: np.int64(3), 2.3: np.int64(6), 2.4: np.int64(4), 2.5: np.int64(8), 2.6: np.int64(9), 2.7: np.int64(6), 2.8: np.int64(11), 2.9: np.int64(14), 3.0: np.int64(23), 3.1: np.int64(15), 3.2: np.int64(17), 3.3: np.int64(17), 3.4: np.int64(20), 3.5: np.int64(26), 3.6: np.int64(41), 3.7: np.int64(35), 3.8: np.int64(29), 3.9: np