In [1]:
import pandas as pd
import numpy as np
import warnings
import logging
from sklearn.preprocessing import MultiLabelBinarizer
from math import radians, sin, cos, sqrt, atan2
from sklearn.metrics.pairwise import cosine_similarity
import random

In [2]:
# Configure the root logger once for the notebook: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Data Loading

In [3]:
# Load the places dataset from a path relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS alike;
# backslash separators are only understood on Windows.
egyptopia_places = pd.read_csv("../egyptopia_places_dataset_versions/egyptopia_places_v4.csv")

In [4]:
# Preview the loaded table; as the last expression in the cell it is rendered by the
# notebook (pandas elides the middle rows of a long frame with "...").
egyptopia_places

Unnamed: 0,place_id,Name,Category,Google Maps Link,City,Rate,Total Rates,Description,Tourism Type,city_id,Bayesian Average,Latitude,Longitude,Popularity
0,2044,Wadi El Natrun Monastery,Monastery,https://maps.app.goo.gl/4na9bVZqHNQfrwN87,Beheira,4.9,5685,"The Wadi El Natrun Monasteries, located in Beh...",Religious and Spiritual Attractions,19,4.850384,30.410208,30.154400,Moderate Popularity
1,2020,Monastery of Saint Paul,Monastery,https://maps.app.goo.gl/Mcjsncn5ZzTDTcVB9,Red Sea,4.9,3373,"Monastery of Saint Paul, Red Sea, Egypt The Mo...",Religious and Spiritual Attractions,13,4.822484,28.847306,32.550623,Moderate Popularity
2,2034,Syrian Monastery,Monastery,https://maps.app.goo.gl/y1V2K9Mth8UNZDRE6,Beheira,4.9,2324,The Syrian Monastery (Deir Al-Surian) in Behei...,Religious and Spiritual Attractions,19,4.795932,30.317832,30.202350,Moderate Popularity
3,4004,Luxor Temple,Temple,https://maps.app.goo.gl/QTDLBrKuWetZUHfEA,Luxor,4.8,30746,Luxor Temple is one of the most breathtaking a...,Cultural and Historical Attractions,18,4.792223,25.699908,32.636784,High Popularity
4,4003,Karnak Temple,Temple,https://maps.app.goo.gl/UuM5Nobqa81Kvs2b7,Luxor,4.8,26643,The Karnak Temple is one of the most magnifice...,Cultural and Historical Attractions,18,4.791058,25.718835,32.657270,High Popularity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,14,Fayoum Zoo,Zoo,https://maps.app.goo.gl/JKN2BQE9xJzXKHzJA,Fayoum,3.5,665,Fayoum Zoo is a charming destination for famil...,Entertainment and Modern Attractions,6,4.009577,29.319826,30.852000,Low Popularity
316,63,Hollywood Sharm El Sheikh,Theme Park,https://maps.app.goo.gl/JUufPMkmJnq4mMfV7,Sharm El Sheikh,3.9,5345,Hollywood Sharm El Sheikh is a unique theme pa...,Entertainment and Modern Attractions,2,3.969005,27.926390,34.344056,Moderate Popularity
317,56,Fustat Garden,Garden,https://maps.app.goo.gl/pmsBxtnDEG9T1gAy8,Cairo,3.8,5905,Fustat Garden is a serene and historically ric...,Entertainment and Modern Attractions,1,3.874304,30.009323,31.244704,Moderate Popularity
318,7,Wonderland Amusement Park,Theme Park,https://maps.app.goo.gl/BwdsXTJgwPHDFiAW7,Cairo,3.7,3145,Wonderland Amusement Park is a popular destina...,Entertainment and Modern Attractions,1,3.846129,30.048093,31.337622,Moderate Popularity


## Egyptopia Content-Based Recommendation System Development

In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the Haversine (great-circle) distance between two points in kilometers.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.

    Returns:
        float: Distance in kilometers on a sphere of radius 6371 km.
    """
    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Clamp with comparisons rather than min()/max() so that a NaN coordinate
    # still propagates as NaN instead of being silently turned into a distance.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [6]:
def get_similar_categories(category, all_categories):
    """Suggest up to three categories related to ``category``.

    Categories are organised into fixed thematic groups (natural, historical,
    cultural, religious, modern, medical). If ``category`` belongs to a group,
    the other members of that group that also appear in ``all_categories`` are
    the candidates. If it belongs to no group, every other entry of
    ``all_categories`` is a candidate.

    Args:
        category: The category to find relatives for.
        all_categories: Categories that actually exist in the dataset.

    Returns:
        The first three candidates, or ``all_categories`` itself (the same
        object, unsliced) when there are no candidates.
    """
    # Thematic groups, checked in this order; the first group containing
    # `category` decides the candidate pool.
    category_groups = (
        ('Beach', 'Island', 'Natural Reserve', 'Mountain', 'Aquarium', 'Garden'),    # natural
        ('Historical Site', 'Temple', 'Fortress', 'Palace', 'Tomb'),                 # historical
        ('Museum', 'Theater', 'Cultural Center', 'Library'),                         # cultural
        ('Mosque', 'Church', 'Monastery', 'Synagogue'),                              # religious
        ('Shopping', 'Theme Park', 'Water Park', 'Zoo', 'Tower'),                    # modern
        ('Hot Spring', 'Healing Oases & Sand Therapy', 'Rehabilitation & Wellness Center'),  # medical
    )

    for group in category_groups:
        if category in group:
            candidates = [name for name in group if name != category and name in all_categories]
            break
    else:
        # Unrecognised category: any other known category is a candidate.
        candidates = [name for name in all_categories if name != category]

    if not candidates:
        return all_categories
    return candidates[:3]

In [7]:
def get_relevant_tourism_type(category, df):
    """Return the tourism type most frequently paired with ``category`` in ``df``.

    Args:
        category: Value to look up in the ``'Category'`` column.
        df: DataFrame with ``'Category'`` and ``'Tourism Type'`` columns.

    Returns:
        The top entry of ``value_counts()`` over the matching rows'
        ``'Tourism Type'`` values, or ``'Unknown'`` when no row matches or the
        matching rows carry no countable tourism type.
    """
    rows_for_category = df.loc[df['Category'] == category]
    if rows_for_category.empty:
        return 'Unknown'

    # value_counts() orders by descending frequency, so position 0 is the winner.
    type_counts = rows_for_category['Tourism Type'].value_counts()
    if type_counts.empty:
        return 'Unknown'
    return type_counts.index[0]

In [8]:
def recommend_places_content_based(df, user_preferences, top_n=14, category_weight=2.5, tourism_type_weight=2.0, geo_weight=2.0, max_per_city=5, max_category_matches=12, min_low_popularity=2):
    """
    Recommend exactly top_n places based on user preferences with content-based filtering and diversity awareness.
    Supports 'City' or 'city_id' in user_preferences, mapping city names to city_ids if needed.
    Prioritizes user-selected categories, then tourism types, then city IDs, with fallbacks only when necessary.
    Ensures at least min_low_popularity Low Popularity places if available, prioritizing Moderate/High Popularity for others.
    If categories and tourism types don't match, includes 2-3 places from original tourism types.
    If only one city is provided, ensures 5-7 places from that city.
    If more than 3 tourism types are provided with no categories, balances the tourism types in recommendations.
    """
    df_copy = df.copy()
    logging.info(f"Processing dataset with {len(df_copy)} entries")
    logging.info(f"Available Categories: {df_copy['Category'].unique()}")
    logging.info(f"Available Tourism Types: {df_copy['Tourism Type'].unique()}")
    logging.info(f"Available Popularity Levels: {df_copy['Popularity'].unique()}")
    logging.info(f"Available Cities: {df_copy['City'].unique()}")

    # Remove null or Unknown Tourism Type
    initial_len = len(df_copy)
    df_copy = df_copy[df_copy["Tourism Type"].notna() & (df_copy["Tourism Type"] != "Unknown")]
    if len(df_copy) < initial_len:
        logging.warning(f"Removed {initial_len - len(df_copy)} entries with null or Unknown Tourism Type")

    # Data Preprocessing
    if len(df_copy) < top_n:
        warnings.warn(f"Dataset has only {len(df_copy)} items, less than requested {top_n}. Returning all available.")
        top_n = len(df_copy)

    df_copy["Category"] = df_copy["Category"].fillna("Unknown").astype(str)
    df_copy["Tourism Type"] = df_copy["Tourism Type"].fillna("Unknown").astype(str)
    df_copy["Popularity"] = df_copy["Popularity"].fillna("Unknown").astype(str)
    df_copy["Latitude"] = pd.to_numeric(df_copy["Latitude"], errors="coerce").fillna(0)
    df_copy["Longitude"] = pd.to_numeric(df_copy["Longitude"], errors="coerce").fillna(0)

    # Create city-to-city_id mapping
    city_to_id = df_copy[['City', 'city_id']].drop_duplicates().set_index('City')['city_id'].to_dict()

    # Validating user preferences
    valid_categories = list(df_copy["Category"].unique())
    valid_tourism_types = set(df_copy["Tourism Type"].unique())
    valid_city_ids = list(df_copy["city_id"].unique())
    
    user_category = user_preferences.get("category", [])
    if not isinstance(user_category, list):
        user_category = [user_category]
    original_user_category = user_category.copy()
    user_tourism = user_preferences.get("tourism_type", [])
    if not isinstance(user_tourism, list):
        user_tourism = [user_tourism]
    original_user_tourism = user_tourism.copy()  # Store original tourism types
    
    # Handle city preference: support both 'city_id' and 'City'
    original_city_ids = user_preferences.get("city_id", None)
    if original_city_ids is None and "City" in user_preferences:
        city_names = user_preferences["City"]
        if not isinstance(city_names, list):
            city_names = [city_names]
        original_city_ids = [city_to_id.get(city, -1) for city in city_names if city in city_to_id]
        if not original_city_ids or all(cid == -1 for cid in original_city_ids):
            original_city_ids = None
            warnings.warn(f"None of the city names {city_names} found in dataset. Ignoring city preference.")
        else:
            logging.info(f"Mapped city names {city_names} to city_ids {original_city_ids}")
    if original_city_ids is not None and not isinstance(original_city_ids, list):
        original_city_ids = [original_city_ids]
    
    only_category = False
    only_tourism = False
    only_city_preference = False
    has_mismatch = False
    single_city_preference = False
    balance_tourism_types = False
    selected_city = None
    
    if not user_category and not user_tourism and not original_city_ids:
        warnings.warn("No preferences provided. Using all places.")
        user_category = valid_categories
        user_tourism = list(valid_tourism_types)
        original_city_ids = valid_city_ids

    # Validate categories
    if user_category:
        matching_categories = [cat for cat in user_category if cat in valid_categories]
        if not matching_categories:
            warnings.warn(f"None of the categories {user_category} found in dataset. Using related categories.")
            user_category = []
            for cat in original_user_category:
                user_category.extend(get_similar_categories(cat, valid_categories))
            user_category = list(set(user_category)) or valid_categories
        else:
            user_category = matching_categories
        logging.info(f"Using valid categories: {user_category}")
    else:
        only_tourism = True
        user_category = list(df_copy[df_copy["Tourism Type"].isin(user_tourism)]["Category"].unique())
        logging.info(f"No categories provided. Using categories for tourism types {user_tourism}: {user_category}")
        if not user_category and user_tourism:
            warnings.warn(f"No places found for tourism types {user_tourism}. Using all categories.")
            user_category = valid_categories
        # Check for balancing tourism types
        if len(user_tourism) > 3:
            balance_tourism_types = True
            logging.info(f"More than 3 tourism types provided without categories: {user_tourism}. Will balance tourism types.")

    # Validate tourism types
    if user_tourism:
        matching_tourism_types = [t for t in user_tourism if t in valid_tourism_types]
        if not matching_tourism_types:
            warnings.warn(f"None of the tourism types {user_tourism} found in dataset. Using inferred tourism types.")
            user_tourism = []
            for cat in user_category:
                tourism_type = get_relevant_tourism_type(cat, df_copy)
                if tourism_type != 'Unknown':
                    user_tourism.append(tourism_type)
            user_tourism = list(set(user_tourism)) or list(valid_tourism_types)
        else:
            user_tourism = matching_tourism_types
        logging.info(f"Using valid tourism types: {user_tourism}")
    else:
        only_category = True
        user_tourism = []
        for cat in user_category:
            tourism_type = get_relevant_tourism_type(cat, df_copy)
            if tourism_type != 'Unknown':
                user_tourism.append(tourism_type)
        user_tourism = list(set(user_tourism)) or list(valid_tourism_types)
        logging.info(f"No tourism types provided. Inferred tourism types for categories {user_category}: {user_tourism}")

    # Validate city IDs
    if original_city_ids:
        matching_city_ids = [cid for cid in original_city_ids if cid in valid_city_ids]
        if not matching_city_ids:
            warnings.warn(f"None of the city IDs {original_city_ids} found in dataset. Ignoring city preference.")
            original_city_ids = None
        else:
            original_city_ids = matching_city_ids
            logging.info(f"Using valid city IDs: {original_city_ids}")
            if len(original_city_ids) == 1:
                single_city_preference = True
                city_id_to_name = df_copy[df_copy['city_id'].isin(original_city_ids)][['city_id', 'City']].drop_duplicates().set_index('city_id')['City'].to_dict()
                selected_city = city_id_to_name[original_city_ids[0]]
                logging.info(f"Single city preference detected: {selected_city}")
    else:
        only_city_preference = user_category == [] and user_tourism == []
        if only_city_preference:
            user_category = valid_categories
            user_tourism = list(valid_tourism_types)
            logging.info(f"Only city_id provided. Using all categories and tourism types")

    # Adjust max_per_city for single city preference
    if single_city_preference and only_city_preference:
        max_per_city = 7  # Allow up to 7 places from the selected city
        min_city_places = 5  # Ensure at least 5 places
        logging.info(f"Single city preference with no other inputs. Adjusted max_per_city to {max_per_city} and min_city_places to {min_city_places}")
    else:
        min_city_places = 0

    # Handle city_id filtering
    target_centroid = None
    use_distance = False
    df_selected_cities = pd.DataFrame()
    if original_city_ids:
        use_distance = True
        df_selected_cities = df_copy[df_copy["city_id"].isin(original_city_ids)].copy()
        if df_selected_cities.empty:
            warnings.warn(f"No places found for city_id {original_city_ids}. Using nearby cities.")
        else:
            logging.info(f"Filtered to {len(df_selected_cities)} entries in city_id {original_city_ids}")
            # Compute centroid
            target_centroid = df_selected_cities.groupby('city_id').agg({
                'Latitude': 'mean',
                'Longitude': 'mean',
                'City': 'first'
            }).reset_index()
            centroid_lat = target_centroid['Latitude'].mean()
            centroid_lon = target_centroid['Longitude'].mean()
            logging.info(f"Centroid for city_id {original_city_ids}: Lat={centroid_lat:.4f}, Lon={centroid_lon:.4f}")

    # Check for category-tourism type mismatch
    if user_category and user_tourism and original_user_category:
        matched_places = df_copy[(df_copy["Category"].isin(user_category)) & (df_copy["Tourism Type"].isin(user_tourism))]
        if matched_places.empty:
            has_mismatch = True
            warnings.warn(f"No places match both category {user_category} and tourism type {user_tourism}. Prioritizing category and including 2-3 places from tourism types {original_user_tourism}.")
            inferred_tourism = []
            for cat in user_category:
                tourism_type = get_relevant_tourism_type(cat, df_copy)
                if tourism_type != 'Unknown':
                    inferred_tourism.append(tourism_type)
            user_tourism = list(set(user_tourism + inferred_tourism))
            logging.info(f"Adjusted tourism types to include inferred types: {user_tourism}")
        else:
            logging.info(f"Found {len(matched_places)} places matching both categories {user_category} and tourism types {user_tourism}")

    # Filter by city_id
    if original_city_ids and not only_city_preference:
        df_filtered = df_copy[df_copy["Category"].isin(user_category) | df_copy["Tourism Type"].isin(user_tourism)]
        if df_filtered.empty:
            warnings.warn(f"No places found for city_id {original_city_ids} with categories {user_category} or tourism types {user_tourism}. Including nearby places.")
            df_copy['Distance_km'] = df_copy.apply(
                lambda x: 0 if x['city_id'] in original_city_ids else haversine_distance(
                    target_centroid['Latitude'].mean(),
                    target_centroid['Longitude'].mean(),
                    x['Latitude'],
                    x['Longitude']
                ), axis=1
            )
            df_copy = df_copy[(df_copy['city_id'].isin(original_city_ids)) | (df_copy['Distance_km'] <= 500)]
            max_distance = df_copy['Distance_km'].max() if not df_copy['Distance_km'].empty else 1
            df_copy['geo_similarity'] = 1 / (1 + df_copy['Distance_km'] / max_distance)
        else:
            df_copy = df_filtered
            df_copy['Distance_km'] = df_copy.apply(
                lambda x: 0 if x['city_id'] in original_city_ids else haversine_distance(
                    target_centroid['Latitude'].mean(),
                    target_centroid['Longitude'].mean(),
                    x['Latitude'],
                    x['Longitude']
                ), axis=1
            )
            max_distance = df_copy['Distance_km'].max() if not df_copy['Distance_km'].empty else 1
            df_copy['geo_similarity'] = 1 / (1 + df_copy['Distance_km'] / max_distance)
    else:
        df_copy['geo_similarity'] = 1.0
        df_copy['Distance_km'] = 0.0

    # Log available places
    category_counts = df_copy["Category"].value_counts()
    logging.info(f"Available places per category: {category_counts.to_dict()}")
    popularity_counts = df_copy["Popularity"].value_counts()
    logging.info(f"Available places per popularity: {popularity_counts.to_dict()}")
    city_counts = df_copy["City"].value_counts()
    logging.info(f"Available places per city: {city_counts.to_dict()}")

    # Tags Preparation
    df_copy["category_tag"] = df_copy["Category"].apply(lambda x: [x])
    df_copy["tourism_tag"] = df_copy["Tourism Type"].apply(lambda x: [x])

    # One-hot Encoding
    category_encoder = MultiLabelBinarizer()
    tourism_encoder = MultiLabelBinarizer()

    category_encoded = category_encoder.fit_transform(df_copy["category_tag"])
    tourism_encoded = tourism_encoder.fit_transform(df_copy["tourism_tag"])

    category_df = pd.DataFrame(category_encoded, columns=category_encoder.classes_, index=df_copy.index)
    tourism_df = pd.DataFrame(tourism_encoded, columns=tourism_encoder.classes_, index=df_copy.index)

    # User preference vectors
    user_category_vec = category_encoder.transform([user_category]) if user_category else np.ones((1, len(category_encoder.classes_)))
    user_tourism_vec = tourism_encoder.transform([user_tourism]) if user_tourism else np.ones((1, len(tourism_encoder.classes_)))

    # Calculating cosine similarity
    df_copy["category_similarity"] = cosine_similarity(category_df, user_category_vec).flatten()
    df_copy["tourism_similarity"] = cosine_similarity(tourism_df, user_tourism_vec).flatten()

    # Assign similarity scores
    df_copy["similarity"] = 0.2  # Default for fallback
    if original_user_category:
        df_copy.loc[df_copy["Category"].isin(original_user_category), "similarity"] = 0.833333
        df_copy.loc[(~df_copy["Category"].isin(original_user_category)) & (df_copy["Tourism Type"].isin(user_tourism)), "similarity"] = 0.416667
    else:
        df_copy.loc[df_copy["Category"].isin(user_category), "similarity"] = 0.833333
        df_copy.loc[(~df_copy["Category"].isin(user_category)) & (df_copy["Tourism Type"].isin(user_tourism)), "similarity"] = 0.416667

    # Combining similarities with weights
    df_copy["weighted_similarity"] = (
        df_copy["category_similarity"] * category_weight +
        df_copy["tourism_similarity"] * tourism_type_weight +
        df_copy["geo_similarity"] * geo_weight
    ) / (category_weight + tourism_type_weight + geo_weight)
    df_copy["weighted_similarity"] = df_copy["weighted_similarity"].fillna(0.5)

    # Apply ratings
    if "Rate" in df_copy.columns and "Total Rates" in df_copy.columns:
        df_copy["Rate"] = pd.to_numeric(df_copy["Rate"], errors="coerce").fillna(0)
        df_copy["Total Rates"] = pd.to_numeric(df_copy["Total Rates"], errors="coerce").fillna(0)
        max_rate = df_copy["Rate"].max() + 1
        max_total_rates = df_copy["Total Rates"].max() + 1
        df_copy["normalized_rating"] = (df_copy["Rate"] + 1) / max_rate * np.sqrt(df_copy["Total Rates"] + 1) / np.sqrt(max_total_rates)
    else:
        df_copy["normalized_rating"] = 1.0
        logging.info("Rate or Total Rates missing; using default normalized_rating=1.0")

    # Adding diversity bonus
    category_freq = df_copy["Category"].value_counts(normalize=True)
    tourism_freq = df_copy["Tourism Type"].value_counts(normalize=True)
    df_copy["diversity_bonus"] = df_copy["Category"].map(lambda x: 1 / (category_freq.get(x, 1) + 0.1)) + \
                                df_copy["Tourism Type"].map(lambda x: 1 / (tourism_freq.get(x, 1) + 0.1))
    
    df_copy["weighted_similarity"] = df_copy["similarity"] * df_copy["normalized_rating"] * (1 + 0.25 * df_copy["diversity_bonus"])
    logging.info(f"Calculated weighted similarity with diversity bonus")

    # Split into category, tourism type, and fallback recommendations
    category_df = df_copy[df_copy["Category"].isin(user_category)]
    tourism_df = df_copy[(~df_copy["Category"].isin(user_category)) & (df_copy["Tourism Type"].isin(user_tourism))]
    similar_category_df = df_copy[df_copy["Category"].isin(get_similar_categories(user_category[0] if user_category else valid_categories[0], valid_categories)) & ~df_copy["Name"].isin(category_df["Name"])] if user_category else pd.DataFrame()
    logging.info(f"Category items: {len(category_df)}, Tourism type items: {len(tourism_df)}, Similar category items: {len(similar_category_df)}")

    # Log availability per user-selected category and popularity
    for cat in user_category:
        cat_count = len(df_copy[df_copy["Category"] == cat])
        low_pop_count = len(df_copy[(df_copy["Category"] == cat) & (df_copy["Popularity"] == "Low Popularity")])
        med_high_pop_count = len(df_copy[(df_copy["Category"] == cat) & (df_copy["Popularity"].isin(["Moderate Popularity", "High Popularity"]))])
        logging.info(f"Available places for category '{cat}': {cat_count}, Low Popularity: {low_pop_count}, Moderate/High Popularity: {med_high_pop_count}")

    # Adjust max_category_matches for mismatch case
    available_category_matches = len(category_df)
    min_category_matches = 8 if original_user_category else 1
    max_category_matches_per_category = 5
    if has_mismatch:
        max_category_matches = min(10, available_category_matches)  # Reduce to ~10 to fit 2-3 tourism places
        logging.info(f"Mismatch detected. Adjusted max_category_matches to {max_category_matches} to include 2-3 tourism type places")
    else:
        if available_category_matches < min_category_matches:
            max_category_matches = available_category_matches
        else:
            max_category_matches = min(max_category_matches, available_category_matches)
    logging.info(f"Adjusted max_category_matches to {max_category_matches} based on {available_category_matches} available")

    # Diversity-aware ranking
    def diversify_recommendations(category_df, tourism_df, similar_category_df, n, max_category_matches, min_low_popularity, original_user_tourism, has_mismatch, balance_tourism_types, single_city_preference, selected_city, min_city_places, max_per_city, category_col="Category", tourism_col="Tourism Type"):
        selected = []
        city_count = {}
        selected_names = set()
        category_count = {cat: 0 for cat in user_category}
        low_popularity_count = 0
        total_category_selected = 0
        tourism_type_selected = 0
        tourism_type_count = {tt: 0 for tt in user_tourism} if balance_tourism_types else {}

        # Calculate target for balancing tourism types
        if balance_tourism_types:
            target_per_tourism_type = max(1, n // len(user_tourism))
            logging.info(f"Balancing tourism types: Targeting ~{target_per_tourism_type} places per tourism type for {user_tourism}")

        # Step 1: For single city preference, prioritize 5-7 places from the selected city
        if single_city_preference and selected_city:
            city_df = df_copy[df_copy["City"] == selected_city]
            if not city_df.empty:
                logging.info(f"Single city preference: Prioritizing 5-7 places from {selected_city}")
                sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in city_df.columns and original_city_ids is not None else ["weighted_similarity"]
                sort_ascending = [True, False] if "Distance_km" in city_df.columns and original_city_ids is not None else [False]
                city_remaining = city_df.sort_values(by=sort_columns, ascending=sort_ascending)
                for _, item in city_remaining.iterrows():
                    if (item["Name"] not in selected_names and 
                        city_count.get(item["City"], 0) < max_per_city and 
                        len(selected) < n and 
                        city_count.get(selected_city, 0) < max_per_city):
                        selected.append(item)
                        city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                        selected_names.add(item["Name"])
                        if item["Popularity"] == "Low Popularity":
                            low_popularity_count += 1
                        distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                        logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}) from {selected_city} (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                    if city_count.get(selected_city, 0) >= max_per_city or len(selected) >= n:
                        break
                if city_count.get(selected_city, 0) < min_city_places:
                    logging.warning(f"Only {city_count.get(selected_city, 0)} places found in {selected_city}. Minimum required is {min_city_places}.")
            else:
                logging.warning(f"No places found for city {selected_city}")

        # Step 2: Distribute selections across all user-selected categories, prioritizing Low Popularity for min requirement
        if user_category:
            target_per_category = max(1, max_category_matches // len(user_category))
            logging.info(f"Target selections per category: {target_per_category}")
            # First pass: Select up to min_low_popularity Low Popularity places
            for _ in range(max_category_matches_per_category):
                for cat in user_category:
                    if total_category_selected >= max_category_matches or len(selected) >= n or low_popularity_count >= min_low_popularity:
                        break
                    cat_df = category_df[(category_df[category_col] == cat) & (category_df["Popularity"] == "Low Popularity")]
                    if cat_df.empty:
                        continue
                    sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in cat_df.columns and original_city_ids is not None else ["weighted_similarity"]
                    sort_ascending = [True, False] if "Distance_km" in cat_df.columns and original_city_ids is not None else [False]
                    cat_remaining = cat_df.sort_values(by=sort_columns, ascending=sort_ascending)
                    for _, item in cat_remaining.iterrows():
                        if (item["Name"] not in selected_names and 
                            city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                            category_count[cat] < target_per_category and 
                            total_category_selected < max_category_matches and 
                            low_popularity_count < min_low_popularity):
                            selected.append(item)
                            city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                            selected_names.add(item["Name"])
                            category_count[cat] += 1
                            total_category_selected += 1
                            low_popularity_count += 1
                            distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                            logging.info(f"Added {item['Name']} ({cat}, Low Popularity) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                            break
                if total_category_selected >= max_category_matches or len(selected) >= n or low_popularity_count >= min_low_popularity:
                    break

            # Second pass: Fill with Moderate/High Popularity, balancing categories
            for _ in range(max_category_matches_per_category):
                for cat in user_category:
                    if total_category_selected >= max_category_matches or len(selected) >= n:
                        break
                    cat_df = category_df[(category_df[category_col] == cat) & (category_df["Popularity"].isin(["Moderate Popularity", "High Popularity"]))]
                    if cat_df.empty:
                        cat_df = category_df[category_df[category_col] == cat]
                    if cat_df.empty:
                        logging.warning(f"No places available for category '{cat}'")
                        continue
                    sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in cat_df.columns and original_city_ids is not None else ["weighted_similarity"]
                    sort_ascending = [True, False] if "Distance_km" in cat_df.columns and original_city_ids is not None else [False]
                    cat_remaining = cat_df.sort_values(by=sort_columns, ascending=sort_ascending)
                    for _, item in cat_remaining.iterrows():
                        if (item["Name"] not in selected_names and 
                            city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                            category_count[cat] < target_per_category and 
                            total_category_selected < max_category_matches):
                            selected.append(item)
                            city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                            selected_names.add(item["Name"])
                            category_count[cat] += 1
                            total_category_selected += 1
                            if item["Popularity"] == "Low Popularity":
                                low_popularity_count += 1
                            distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                            logging.info(f"Added {item['Name']} ({cat}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                            break
                if total_category_selected >= max_category_matches or len(selected) >= n:
                    break

        # Step 3: Ensure min_low_popularity by adding Low Popularity places if needed
        if total_category_selected < max_category_matches and len(selected) < n and low_popularity_count < min_low_popularity:
            low_pop_df = category_df[category_df["Popularity"] == "Low Popularity"]
            sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in low_pop_df.columns and original_city_ids is not None else ["weighted_similarity"]
            sort_ascending = [True, False] if "Distance_km" in low_pop_df.columns and original_city_ids is not None else [False]
            category_remaining = low_pop_df.sort_values(by=sort_columns, ascending=sort_ascending)
            for _, item in category_remaining.iterrows():
                cat = item[category_col]
                if (item["Name"] not in selected_names and 
                    city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                    total_category_selected < max_category_matches and 
                    category_count[cat] < max_category_matches_per_category and 
                    low_popularity_count < min_low_popularity):
                    selected.append(item)
                    city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                    selected_names.add(item["Name"])
                    category_count[cat] += 1
                    total_category_selected += 1
                    low_popularity_count += 1
                    distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                    logging.info(f"Added {item['Name']} ({cat}, Low Popularity) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                if total_category_selected >= max_category_matches or len(selected) >= n or low_popularity_count >= min_low_popularity:
                    break

        # Step 4: For mismatch case, add 2-3 places from original_user_tourism
        if has_mismatch and len(selected) < n:
            tourism_target = min(3, n - len(selected))
            tourism_target = max(2, tourism_target) if len(tourism_df) >= 2 else len(tourism_df)
            logging.info(f"Mismatch case: Targeting {tourism_target} places from original tourism types {original_user_tourism}")
            mismatch_tourism_df = tourism_df[tourism_df["Tourism Type"].isin(original_user_tourism)]
            if not mismatch_tourism_df.empty:
                sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in mismatch_tourism_df.columns and original_city_ids is not None else ["weighted_similarity"]
                sort_ascending = [True, False] if "Distance_km" in mismatch_tourism_df.columns and original_city_ids is not None else [False]
                tourism_remaining = mismatch_tourism_df.sort_values(by=sort_columns, ascending=sort_ascending)
                for _, item in tourism_remaining.iterrows():
                    if (item["Name"] not in selected_names and 
                        city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                        tourism_type_selected < tourism_target and 
                        len(selected) < n):
                        selected.append(item)
                        city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                        selected_names.add(item["Name"])
                        tourism_type_selected += 1
                        if item["Popularity"] == "Low Popularity":
                            low_popularity_count += 1
                        distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                        logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}, Tourism Type: {item[tourism_col]}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                    if tourism_type_selected >= tourism_target or len(selected) >= n:
                        break
                logging.info(f"Added {tourism_type_selected} places from original tourism types")
            else:
                logging.warning(f"No places found for original tourism types {original_user_tourism}")

        # Step 5: Balance tourism types if more than 3 are provided with no categories
        if balance_tourism_types and len(selected) < n:
            remaining_n = n - len(selected)
            logging.info(f"Balancing {len(user_tourism)} tourism types: {user_tourism}")
            for _ in range(remaining_n):
                for tt in user_tourism:
                    if len(selected) >= n:
                        break
                    if tourism_type_count[tt] >= target_per_tourism_type:
                        continue
                    tt_df = tourism_df[tourism_df[tourism_col] == tt]
                    if tt_df.empty:
                        continue
                    sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in tt_df.columns and original_city_ids is not None else ["weighted_similarity"]
                    sort_ascending = [True, False] if "Distance_km" in tt_df.columns and original_city_ids is not None else [False]
                    tt_remaining = tt_df.sort_values(by=sort_columns, ascending=sort_ascending)
                    for _, item in tt_remaining.iterrows():
                        if (item["Name"] not in selected_names and 
                            city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                            tourism_type_count[tt] < target_per_tourism_type and 
                            len(selected) < n):
                            selected.append(item)
                            city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                            selected_names.add(item["Name"])
                            tourism_type_count[tt] += 1
                            if item["Popularity"] == "Low Popularity":
                                low_popularity_count += 1
                            distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                            logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}, Tourism Type: {tt}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                            break
                    if len(selected) >= n:
                        break

        # Step 6: Fill with remaining user category places, prioritizing Moderate/High Popularity
        if total_category_selected < max_category_matches and len(selected) < n:
            sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in category_df.columns and original_city_ids is not None else ["weighted_similarity"]
            sort_ascending = [True, False] if "Distance_km" in category_df.columns and original_city_ids is not None else [False]
            category_remaining = category_df[category_df["Popularity"].isin(["Moderate Popularity", "High Popularity"])].sort_values(by=sort_columns, ascending=sort_ascending)
            if category_remaining.empty:
                category_remaining = category_df.sort_values(by=sort_columns, ascending=sort_ascending)
            for _, item in category_remaining.iterrows():
                cat = item[category_col]
                if (item["Name"] not in selected_names and 
                    city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and 
                    total_category_selected < max_category_matches and 
                    category_count[cat] < max_category_matches_per_category):
                    selected.append(item)
                    city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                    selected_names.add(item["Name"])
                    category_count[cat] += 1
                    total_category_selected += 1
                    if item["Popularity"] == "Low Popularity":
                        low_popularity_count += 1
                    distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                    logging.info(f"Added {item['Name']} ({cat}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                if total_category_selected >= max_category_matches or len(selected) >= n:
                    break

        # Step 7: Fill with similar categories, prioritizing Moderate/High Popularity
        remaining_n = n - len(selected)
        if remaining_n > 0 and not similar_category_df.empty:
            sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in similar_category_df.columns and original_city_ids is not None else ["weighted_similarity"]
            sort_ascending = [True, False] if "Distance_km" in similar_category_df.columns and original_city_ids is not None else [False]
            similar_remaining = similar_category_df[similar_category_df["Popularity"].isin(["Moderate Popularity", "High Popularity"])].sort_values(by=sort_columns, ascending=sort_ascending)
            if similar_remaining.empty:
                similar_remaining = similar_category_df.sort_values(by=sort_columns, ascending=sort_ascending)
            for _, item in similar_remaining.iterrows():
                if item["Name"] not in selected_names and city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and len(selected) < n:
                    selected.append(item)
                    city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                    selected_names.add(item["Name"])
                    if item["Popularity"] == "Low Popularity":
                        low_popularity_count += 1
                    distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                    logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                if len(selected) >= n:
                    break

        # Step 8: Fill with tourism_df, prioritizing Moderate/High Popularity
        remaining_n = n - len(selected)
        if remaining_n > 0:
            sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in tourism_df.columns and original_city_ids is not None else ["weighted_similarity"]
            sort_ascending = [True, False] if "Distance_km" in tourism_df.columns and original_city_ids is not None else [False]
            tourism_remaining = tourism_df[tourism_df["Popularity"].isin(["Moderate Popularity", "High Popularity"])].sort_values(by=sort_columns, ascending=sort_ascending)
            if tourism_remaining.empty:
                tourism_remaining = tourism_df.sort_values(by=sort_columns, ascending=sort_ascending)
            for _, item in tourism_remaining.iterrows():
                if item["Name"] not in selected_names and city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and len(selected) < n:
                    selected.append(item)
                    city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                    selected_names.add(item["Name"])
                    if item["Popularity"] == "Low Popularity":
                        low_popularity_count += 1
                    distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                    logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                if len(selected) >= n:
                    break

        # Step 9: Fill with all places if still short
        remaining_n = n - len(selected)
        if remaining_n > 0:
            additional_places = df_copy[~df_copy["Name"].isin(selected_names)].copy()
            additional_places['similarity'] = 0.2
            additional_places['weighted_similarity'] = additional_places['similarity'] * additional_places['normalized_rating'] * (1 + 0.25 * additional_places['diversity_bonus'])
            sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in additional_places.columns and original_city_ids is not None else ["weighted_similarity"]
            sort_ascending = [True, False] if "Distance_km" in additional_places.columns and original_city_ids is not None else [False]
            additional_remaining = additional_places[additional_places["Popularity"].isin(["Moderate Popularity", "High Popularity"])].sort_values(by=sort_columns, ascending=sort_ascending)
            if additional_remaining.empty:
                additional_remaining = additional_places.sort_values(by=sort_columns, ascending=sort_ascending)
            for _, item in additional_remaining.iterrows():
                if item["Name"] not in selected_names and city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and len(selected) < n:
                    selected.append(item)
                    city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                    selected_names.add(item["Name"])
                    if item["Popularity"] == "Low Popularity":
                        low_popularity_count += 1
                    distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                    logging.info(f"Added {item['Name']} ({item[category_col]}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
                if len(selected) >= n:
                    break

        # Log final counts
        logging.info(f"Final Low Popularity count: {low_popularity_count}")
        if low_popularity_count < min_low_popularity:
            logging.warning(f"Could not meet minimum Low Popularity requirement ({min_low_popularity}). Only {low_popularity_count} Low Popularity places found.")
        final_pop_counts = pd.DataFrame(selected)["Popularity"].value_counts()
        logging.info(f"Final popularity distribution: {final_pop_counts.to_dict()}")
        final_tourism_counts = pd.DataFrame(selected)["Tourism Type"].value_counts()
        logging.info(f"Final tourism type distribution: {final_tourism_counts.to_dict()}")
        final_city_counts = pd.DataFrame(selected)["City"].value_counts()
        logging.info(f"Final city distribution: {final_city_counts.to_dict()}")

        return pd.DataFrame(selected)

    # Get recommendations
    recommendations = diversify_recommendations(category_df, tourism_df, similar_category_df, top_n, max_category_matches, min_low_popularity, original_user_tourism, has_mismatch, balance_tourism_types, single_city_preference, selected_city, min_city_places, max_per_city)
    
    # Ensure exactly top_n recommendations
    if len(recommendations) < top_n:
        logging.warning(f"Only {len(recommendations)} recommendations found. Filling with top-rated places.")
        additional_places = df_copy[~df_copy["Name"].isin(recommendations["Name"])].copy()
        additional_places['similarity'] = 0.2
        additional_places['weighted_similarity'] = additional_places['similarity'] * additional_places['normalized_rating'] * (1 + 0.25 * additional_places['diversity_bonus'])
        sort_columns = ["Distance_km", "weighted_similarity"] if "Distance_km" in additional_places.columns and original_city_ids is not None else ["weighted_similarity"]
        sort_ascending = [True, False] if "Distance_km" in additional_places.columns and original_city_ids is not None else [False]
        additional_remaining = additional_places[additional_places["Popularity"].isin(["Moderate Popularity", "High Popularity"])].sort_values(by=sort_columns, ascending=sort_ascending)
        if additional_remaining.empty:
            additional_remaining = additional_places.sort_values(by=sort_columns, ascending=sort_ascending)
        city_count = recommendations.groupby("City").size().to_dict()
        selected_names = set(recommendations["Name"])
        for _, item in additional_remaining.iterrows():
            if item["Name"] not in selected_names and city_count.get(item["City"], 0) < (max_per_city if item["City"] != selected_city else 7) and len(recommendations) < top_n:
                recommendations = pd.concat([recommendations, item.to_frame().T])
                city_count[item["City"]] = city_count.get(item["City"], 0) + 1
                selected_names.add(item["Name"])
                distance_log = f"Distance: {item['Distance_km']:.2f} km, " if "Distance_km" in item and original_city_ids is not None else ""
                logging.info(f"Added {item['Name']} ({item['Category']}, {item['Popularity']}) (similarity: {item['similarity']:.4f}, {distance_log}weighted_similarity: {item['weighted_similarity']:.4f})")
            if len(recommendations) >= top_n:
                break

    # Sort recommendations
    if not recommendations.empty and "City" in recommendations.columns:
        sort_columns = ["Distance_km", "City", "weighted_similarity"] if "Distance_km" in recommendations.columns and original_city_ids is not None else ["City", "weighted_similarity"]
        sort_ascending = [True, True, False] if "Distance_km" in recommendations.columns and original_city_ids is not None else [True, False]
        recommendations = recommendations.sort_values(by=sort_columns, ascending=sort_ascending)
        logging.info(f"Recommendations sorted by {sort_columns}")

    # Return relevant columns
    output_columns = ['place_id', 'Name', 'Category', 'Google Maps Link', 'City', 'Rate',
       'Total Rates', 'Description', 'Tourism Type', 'city_id', 'Latitude',
       'Longitude', 'Popularity','weighted_similarity']
    if original_city_ids is not None:
        output_columns.append("Distance_km")
    if recommendations.empty:
        return pd.DataFrame(columns=output_columns)
    return recommendations[output_columns]

In [9]:
## Sample outputs from Egyptopia CBRS

In [10]:
# Sample request: two tourism types, three categories and three cities.
user_preferences = dict(
    tourism_type=[
        "Entertainment and Modern Attractions",
        "Cultural and Historical Attractions",
    ],
    category=['Church', 'Garden', 'Temple'],
    City=['Alexandria', 'Giza', 'Cairo'],
)

# Ask for 14 places, with max_per_city set to 5.
recommendations = recommend_places_content_based(
    egyptopia_places,
    user_preferences,
    top_n=14,
    max_per_city=5,
)

2025-06-29 15:07:50,807 - INFO - Processing dataset with 320 entries
2025-06-29 15:07:50,808 - INFO - Available Categories: ['Monastery' 'Temple' 'Historical Site' 'Mosque' 'Water Park' 'Church'
 'Museum' 'Cultural Center' 'Island' 'Mountain' 'Natural Reserve' 'Tomb'
 'Beach' 'Shopping' 'Fortress' 'Palace' 'Library' 'Tower' 'Hot Spring'
 'Garden' 'Theme Park' 'Rehabilitation & Wellness Center'
 'Healing Oases & Sand Therapy' 'Aquarium' 'Theater' 'Synagogue' 'Zoo']
2025-06-29 15:07:50,810 - INFO - Available Tourism Types: ['Religious and Spiritual Attractions'
 'Cultural and Historical Attractions'
 'Entertainment and Modern Attractions' 'Natural Attractions'
 'Medical Attractions']
2025-06-29 15:07:50,811 - INFO - Available Popularity Levels: ['Moderate Popularity' 'High Popularity' 'Low Popularity']
2025-06-29 15:07:50,812 - INFO - Available Cities: ['Beheira' 'Red Sea' 'Luxor' 'Aswan' 'Cairo' 'Sharm El Sheikh' 'Giza'
 'Qena' 'Alexandria' 'Assiut' 'Hurghada' 'Sinai' 'Marsa Matrouh' 'S

In [11]:
# Display the DataFrame returned by recommend_places_content_based for the sample preferences.
recommendations

Unnamed: 0,place_id,Name,Category,Google Maps Link,City,Rate,Total Rates,Description,Tourism Type,city_id,Latitude,Longitude,Popularity,weighted_similarity,Distance_km
113,39,San Stefano Grand Plaza,Shopping,https://maps.app.goo.gl/E8LbJmeF28kANVGb7,Alexandria,4.5,32635,San Stefano Grand Plaza is Alexandrias most pr...,Entertainment and Modern Attractions,5,31.245834,29.965938,High Popularity,0.704793,0.0
14,2011,Saint Mark's Coptic Orthodox Cathedral,Church,https://maps.app.goo.gl/z2hmD6FmoCiNdsPZA,Alexandria,4.8,3056,"Saint Mark's Coptic Orthodox Cathedral, Egypt....",Religious and Spiritual Attractions,5,31.198382,28.680083,Moderate Popularity,0.671446,0.0
295,55,Antoniades Garden,Garden,https://maps.app.goo.gl/RcY6uYzvme27KSzr7,Alexandria,4.2,5202,Antoniades Garden is one of Egypts oldest and ...,Entertainment and Modern Attractions,5,31.205676,29.946999,Moderate Popularity,0.660195,0.0
179,4010,Taposiris Magna Temple,Temple,https://maps.app.goo.gl/rCC44wN2ngGKQwnC9,Alexandria,4.5,104,"Taposiris Magna Temple, located near the moder...",Cultural and Historical Attractions,5,30.946167,29.518694,Low Popularity,0.082405,0.0
112,48,Al Azhar Park,Garden,https://maps.app.goo.gl/z8w2o2y27zEkuSzm7,Cairo,4.5,37053,Al Azhar Park is one of the city's most breath...,Entertainment and Modern Attractions,1,30.041426,31.265581,High Popularity,1.86347,0.0
312,52,International Garden,Garden,https://maps.app.goo.gl/fEAuBjT782zQrXC16,Cairo,4.1,27554,International Garden is a unique green space t...,Entertainment and Modern Attractions,1,30.049547,31.336493,High Popularity,1.49009,0.0
26,2013,The Hanging Church,Church,https://maps.app.goo.gl/NY9w6pyecsNiVeoUA,Cairo,4.7,6949,The Hanging Church (Saint Virgin Mary's Coptic...,Religious and Spiritual Attractions,1,30.005239,31.230169,Moderate Popularity,0.994953,0.0
15,2018,Church of the Holy Virgin (Maadi Church),Church,https://maps.app.goo.gl/smbiRVYTQboenKFg7,Cairo,4.8,2191,"Church of the Holy Virgin (Maadi Church), Cair...",Religious and Spiritual Attractions,1,29.952501,31.256114,Moderate Popularity,0.568569,0.0
77,2016,Church of (Mar Girgis),Church,https://maps.app.goo.gl/HHmi4oQkRFzYMqiB7,Cairo,4.7,538,"Church of St. George (Mar Girgis), Cairo, Egyp...",Religious and Spiritual Attractions,1,30.006621,31.227973,Low Popularity,0.27708,0.0
300,49,Orman Garden,Garden,https://maps.app.goo.gl/zZfkfxFRMPXNbraa8,Giza,4.2,13069,Orman Garden is one of Egypts most beautiful a...,Entertainment and Modern Attractions,0,30.02904,31.212974,High Popularity,1.046365,0.0


## Model Evaluation

In [12]:
def evaluate_recommendations(recommendations, user_preferences, df, k=14):
    """
    Evaluate the top-k recommendations against the user's stated preferences.

    Parameters:
    - recommendations: DataFrame of recommended places. When non-empty it must
      contain the 'Category', 'City' and 'Tourism Type' columns.
    - user_preferences: Dictionary with the optional keys 'category', 'City' and
      'tourism_type'. Each value may be a list, a single string, or None;
      duplicate entries are counted once.
    - df: Original dataset DataFrame (egyptopia_places). Not used by the
      computation; kept so existing callers keep working.
    - k: Number of recommendations to evaluate (default: 14)

    Returns:
    - Dictionary with metrics: Precision@k, Preference_Coverage@k, Category_Diversity, Tourism_Type_Diversity.
      All four are 0.0 when there is nothing to evaluate (empty recommendations or k <= 0).
    """
    metrics = {
        'Precision@k': 0.0,
        'Preference_Coverage@k': 0.0,
        'Category_Diversity': 0.0,
        'Tourism_Type_Diversity': 0.0
    }

    def _as_unique_list(values):
        # Normalise one preference entry: None -> [], "x" -> ["x"], and drop
        # duplicates (order kept) so they cannot inflate the denominators below.
        if values is None:
            return []
        if isinstance(values, str):
            return [values]
        return list(dict.fromkeys(values))

    # Getting user preferences
    categories = _as_unique_list(user_preferences.get('category'))
    cities = _as_unique_list(user_preferences.get('City'))
    tourism_types = _as_unique_list(user_preferences.get('tourism_type'))

    # Nothing to score: a bare pd.DataFrame() has no columns to index, and a
    # non-positive k would make head(k) slice from the wrong end.
    if recommendations is None or recommendations.empty or k <= 0:
        logging.warning("No recommendations to evaluate; returning zeroed metrics")
        return metrics

    # Slice once and reuse; every metric below is computed on the same top-k rows.
    top_k = recommendations.head(k)
    total_recommendations = len(top_k)

    # Precision@k: a place is relevant if it matches category, city, or tourism_type.
    # The denominator is k (not the number of rows returned), so a short list is penalised.
    relevant_mask = (
        top_k['Category'].isin(categories)
        | top_k['City'].isin(cities)
        | top_k['Tourism Type'].isin(tourism_types)
    )
    relevant = int(relevant_mask.sum())
    metrics['Precision@k'] = relevant / k
    logging.info(f"Precision@k: {metrics['Precision@k']:.4f} ({relevant}/{k} relevant)")

    # Preference_Coverage@k: proportion of requested categories, cities and tourism
    # types that appear at least once in the top-k rows.
    total_preferences = len(categories) + len(cities) + len(tourism_types)
    if total_preferences > 0:
        covered_preferences = (
            len(set(top_k['Category'].unique()) & set(categories))
            + len(set(top_k['City'].unique()) & set(cities))
            + len(set(top_k['Tourism Type'].unique()) & set(tourism_types))
        )
        metrics['Preference_Coverage@k'] = covered_preferences / total_preferences
        logging.info(f"Preference_Coverage@k: {metrics['Preference_Coverage@k']:.4f} ({covered_preferences}/{total_preferences} preferences covered)")
    else:
        logging.warning("No user preferences specified for Preference_Coverage@k calculation")

    def _balance(column, preferred):
        # 1 - (half the summed absolute gap between each preferred value's actual
        # share of the top-k rows and an even split across the preferred values).
        expected_counts = {value: total_recommendations / len(preferred) for value in preferred}
        actual_counts = top_k[column].value_counts().to_dict()
        deviation = sum(
            abs((actual_counts.get(value, 0) / total_recommendations) - (expected_counts[value] / total_recommendations))
            for value in preferred
        ) / 2
        return 1 - deviation, expected_counts, actual_counts

    # Category_Diversity: Using normalized absolute deviation
    if categories:
        metrics['Category_Diversity'], expected_counts, actual_counts = _balance('Category', categories)
        logging.info(f"Category_Diversity: {metrics['Category_Diversity']:.4f} (Expected: {expected_counts}, Actual: {actual_counts})")
    else:
        logging.warning("No recommendations or categories for Category_Diversity calculation")

    # Tourism_Type_Diversity: Using normalized absolute deviation
    if tourism_types:
        metrics['Tourism_Type_Diversity'], expected_counts, actual_counts = _balance('Tourism Type', tourism_types)
        logging.info(f"Tourism_Type_Diversity: {metrics['Tourism_Type_Diversity']:.4f} (Expected: {expected_counts}, Actual: {actual_counts})")
    else:
        logging.warning("No recommendations or tourism types for Tourism_Type_Diversity calculation")

    return metrics

## Evaluation of CBRS Sample Output 

In [13]:
# Score the sample recommendations against the same preferences used to produce them.
df = egyptopia_places
results = evaluate_recommendations(
    recommendations,
    user_preferences,
    df,
    k=14,
)

# One metric per line, four decimals.
print("Evaluation Results:")
for metric in results:
    value = results[metric]
    print(f"{metric}: {value:.4f}")

2025-06-29 15:07:50,975 - INFO - Precision@k: 1.0000 (14/14 relevant)
2025-06-29 15:07:50,980 - INFO - Preference_Coverage@k: 1.0000 (8/8 preferences covered)
2025-06-29 15:07:50,984 - INFO - Category_Diversity: 0.9286 (Expected: {'Church': 4.666666666666667, 'Garden': 4.666666666666667, 'Temple': 4.666666666666667}, Actual: {'Church': 4, 'Garden': 4, 'Temple': 4, 'Shopping': 1, 'Theme Park': 1})
2025-06-29 15:07:50,987 - INFO - Tourism_Type_Diversity: 0.8571 (Expected: {'Entertainment and Modern Attractions': 7.0, 'Cultural and Historical Attractions': 7.0}, Actual: {'Entertainment and Modern Attractions': 6, 'Religious and Spiritual Attractions': 4, 'Cultural and Historical Attractions': 4})


Evaluation Results:
Precision@k: 1.0000
Preference_Coverage@k: 1.0000
Category_Diversity: 0.9286
Tourism_Type_Diversity: 0.8571


## Simulating Multiple User Preference Scenarios to Evaluate Egyptopia Content-Based Recommendation System Performance

In [14]:
# Function to generate random user preferences

In [15]:
def generate_random_preferences(df, num_scenarios=50, seed=42):
    """
    Generate random user preferences for simulation.

    Each scenario draws, without replacement, 6-9 categories, 3-7 cities and
    3 tourism types from the unique non-null values of the dataset. If the
    dataset has fewer unique values than the drawn size, all of them are used.

    Parameters:
    - df: Original dataset DataFrame (egyptopia_places); must contain the
      'Category', 'City' and 'Tourism Type' columns
    - num_scenarios: Number of scenarios to generate (default: 50)
    - seed: Seed for the private random generator (default: 42). Pass None
      for a different set of scenarios on every call.

    Returns:
    - List of dictionaries with random user preferences
      (keys: 'category', 'City', 'tourism_type')
    """
    # Missing values are not meaningful preferences, so they are never sampled.
    categories = df['Category'].dropna().unique()
    cities = df['City'].dropna().unique()
    tourism_types = df['Tourism Type'].dropna().unique()

    # A private legacy generator yields the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but leaves the
    # notebook-wide NumPy random state untouched.
    rng = np.random.RandomState(seed)

    def _sample(pool, low, high):
        # Draw a size in [low, high) and clamp it to the pool so that
        # choice(..., replace=False) cannot fail on small datasets.
        wanted = rng.randint(low, high)
        size = min(wanted, len(pool))
        return rng.choice(pool, size=size, replace=False).tolist()

    scenarios = []
    for _ in range(num_scenarios):
        # Draw order (categories, cities, tourism types) is kept fixed so the
        # generated scenarios stay reproducible for a given seed.
        selected_categories = _sample(categories, 6, 10)
        selected_cities = _sample(cities, 3, 8)
        # NOTE(review): randint's upper bound is exclusive, so (3, 4) always
        # yields 3 tourism types - confirm whether 3-4 was intended.
        selected_tourism_types = _sample(tourism_types, 3, 4)

        scenarios.append({
            'category': selected_categories,
            'City': selected_cities,
            'tourism_type': selected_tourism_types
        })

    logging.info(f"Generated {len(scenarios)} random user preference scenarios")
    return scenarios

In [16]:
# Simulation function (Multiple Inputs)

In [17]:
def run_simulation(df, num_scenarios=100, k=14, output_csv='simulation_results_preference_coverage.csv'):
    """
    Run simulation with random user preferences and save results to CSV.

    Every generated scenario produces one result row. A scenario whose
    recommendation call raises, or returns no rows, is recorded with all
    metrics set to 0.0 (the same convention used by
    run_single_preferences_simulation) instead of being dropped, so failures
    remain visible in the saved results.

    Parameters:
    - df: Original dataset DataFrame (egyptopia_places)
    - num_scenarios: Number of scenarios to simulate (default: 100)
    - k: Number of recommendations to evaluate (default: 14)
    - output_csv: Path to save the results CSV (default: 'simulation_results_preference_coverage.csv').
      Missing parent directories are created.

    Returns:
    - DataFrame with simulation results
    """
    import os  # local import: only needed to prepare the output directory

    scenarios = generate_random_preferences(df, num_scenarios)
    results = []

    for idx, preferences in enumerate(scenarios, start=1):
        logging.info(f"Running scenario {idx}/{num_scenarios}: {preferences}")

        try:
            recommendations = recommend_places_content_based(
                df,
                preferences,
                top_n=k,
                max_per_city=5
            )
        except Exception as e:
            # Keep the run going, but fall through to the zero-metrics branch so
            # the failed scenario still shows up in the results.
            logging.error(f"Error generating recommendations for scenario {idx}: {e}")
            recommendations = pd.DataFrame()

        if recommendations.empty:
            logging.warning(f"No recommendations for scenario {idx}")
            metrics = {'Precision@k': 0.0, 'Preference_Coverage@k': 0.0, 'Category_Diversity': 0.0, 'Tourism_Type_Diversity': 0.0}
        else:
            metrics = evaluate_recommendations(recommendations, preferences, df, k=k)

        results.append({
            'Scenario': idx,
            # map(str, ...) keeps join from failing on non-string preference values.
            'Categories': ', '.join(map(str, preferences['category'])),
            'Cities': ', '.join(map(str, preferences['City'])),
            'Tourism_Types': ', '.join(map(str, preferences['tourism_type'])),
            'Precision@k': metrics['Precision@k'],
            'Preference_Coverage@k': metrics['Preference_Coverage@k'],
            'Category_Diversity': metrics['Category_Diversity'],
            'Tourism_Type_Diversity': metrics['Tourism_Type_Diversity']
        })

    results_df = pd.DataFrame(results)
    if not results_df.empty:
        # Create the target folder first so a missing directory cannot discard
        # the results of a finished run. Only applies to path-like targets.
        if isinstance(output_csv, (str, os.PathLike)):
            parent_dir = os.path.dirname(output_csv)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        results_df.to_csv(output_csv, index=False)
        logging.info(f"Saved simulation results to {output_csv}")
    else:
        logging.warning("No results to save")

    return results_df

In [25]:
# results_df = run_simulation(egyptopia_places, num_scenarios=50, k=14, output_csv='.\synthetic_data_experiments\experiment_9.csv')

## Generated Single User Input Preferences

In [None]:
# Simulation function for single preferences (Category, City, or Tourism Type)

In [21]:
# Metric columns reported for every simulated preference, in output order.
_SINGLE_PREFERENCE_METRICS = (
    'Precision@k',
    'Preference_Coverage@k',
    'Category_Diversity',
    'Tourism_Type_Diversity',
)


def _evaluate_single_preference(df, preference_key, value, k):
    """
    Recommend and score places for a user who selected exactly one preference value.

    Parameters:
    - df: Dataset DataFrame, passed through to the recommender and the evaluator
    - preference_key: Key of the preference dict to fill ('category', 'City' or 'tourism_type')
    - value: The single selected value for that key
    - k: Number of recommendations to request (top_n) and to evaluate

    Returns:
    - Dict holding the four metrics listed in _SINGLE_PREFERENCE_METRICS;
      every metric is 0.0 when no recommendations were produced
    """
    # Same dict shape as a multi-preference request, with only one key populated.
    user_preferences = {'category': [], 'City': [], 'tourism_type': []}
    user_preferences[preference_key] = [value]

    try:
        recommendations = recommend_places_content_based(
            df,
            user_preferences,
            top_n=k,
            max_per_city=5
        )
    except Exception as e:
        # Deliberate best effort: one failing preference must not abort the whole
        # sweep, so it is logged and scored as "no recommendations".
        logging.error(f"Error generating recommendations for {value}: {e}")
        recommendations = pd.DataFrame()

    if recommendations is None or recommendations.empty:
        logging.warning(f"No recommendations for {value}")
        return dict.fromkeys(_SINGLE_PREFERENCE_METRICS, 0.0)

    return evaluate_recommendations(recommendations, user_preferences, df, k=k)


def run_single_preferences_simulation(df, categories, cities, tourism_types, k=14, output_csv='single_preferences_results.csv'):
    """
    Run simulation for all single user preferences (one Category, City, or Tourism Type) and save results to CSV.

    Each value is evaluated on its own: first every category, then every city,
    then every tourism type. One result row is produced per value; the two
    preference columns that were not selected hold the string 'N/A'.

    Parameters:
    - df: Original dataset DataFrame (egyptopia_places)
    - categories: List of unique categories
    - cities: List of unique cities
    - tourism_types: List of unique tourism types
    - k: Number of recommendations to evaluate (default: 14)
    - output_csv: Path to save the results CSV (default: 'single_preferences_results.csv');
      a missing parent directory is created before writing

    Returns:
    - DataFrame with simulation results (empty, and nothing is written, when all
      three input lists are empty)
    """
    import os  # local import: only needed to prepare the output directory

    # (preference dict key, result column, label used in the log line, values to sweep)
    preference_groups = (
        ('category', 'Category', 'category', categories),
        ('City', 'City', 'city', cities),
        ('tourism_type', 'Tourism Type', 'tourism type', tourism_types),
    )

    results = []
    for preference_key, column, label, values in preference_groups:
        for value in values:
            logging.info(f"Running simulation for {label}: {value}")
            metrics = _evaluate_single_preference(df, preference_key, value, k)

            row = {'Category': 'N/A', 'City': 'N/A', 'Tourism Type': 'N/A'}
            row[column] = value
            # Copy only the reported metrics so extra evaluator keys never leak into the CSV.
            for metric_name in _SINGLE_PREFERENCE_METRICS:
                row[metric_name] = metrics[metric_name]
            results.append(row)

    results_df = pd.DataFrame(results)
    if not results_df.empty:
        # Create the target directory up front so a finished sweep is not lost
        # to an OSError from to_csv. Non-path targets (e.g. buffers) are left alone.
        if isinstance(output_csv, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(output_csv))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        results_df.to_csv(output_csv, index=False)
        logging.info(f"Saved simulation results to {output_csv}")
    else:
        logging.warning("No results to save")

    return results_df

In [26]:
# results_df = run_single_preferences_simulation(egyptopia_places, categories, cities, tourism_types, k=14,
#                                                output_csv=r'.\synthetic_data_experiments\single_preferences_results.csv')