# Demand prediction for theme parks (specific case of USS)

In [5]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import simpy
from datetime import datetime, timedelta
import os
os.chdir("C:/Users/THAI LOW Jin Yang/data-science-guest-experience/Scripts/Subgroup_B")

## Loading datasets
### Load survey data
Loads and preprocesses Universal Studios Singapore survey data, extracting useful features for demand modeling — especially when IoT data is unavailable.

Purpose:
- Prepares a cleaned dataset from raw survey responses by:
- Converting subjective text-based responses into numerical features
- Mapping ride wait times into numerical estimates
- Normalizing guest satisfaction scores
- Inferring seasonal context from survey answers
- Generating additional data rows for longer-than-expected wait experiences
- Filtering to focus only on major attractions

Returns a cleaned pd.DataFrame with relevant columns like:
- Attraction:	Ride name
- Wait_Time:	Numeric estimated wait time
- Guest_Satisfaction_Score:	Normalized score (0–1)
- Event:	'None' or 'Special Event'
-  Season:	One of four seasonal groupings
- Other optional demographic columns like: 'Did you purchase the Express Pass?', 'Who did you visit USS with?', etc.	


In [6]:
def load_survey_data(file_path="../../data/survey.csv"):
    """
    Loads and preprocesses survey data from survey.csv, extracting seasonal information
    and normalizing satisfaction scores. Timestamp is removed in favor of seasonal tagging.

    Args:
        file_path (str): Path to the survey CSV file.

    Returns:
        pd.DataFrame: Preprocessed survey data with 'Season' column added.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} not found. Please provide the survey dataset.")

    df = pd.read_csv(file_path)

    df = df.rename(columns={
        "On a scale of 1-5, how would you rate your overall experience at USS?": "Guest_Satisfaction_Score",
        "How long did you wait in line for rides on average during your visit?": "Wait_Time",
        "Which ride or attraction was your favourite?": "Attraction"
    })

    # Map wait times to numerical estimates
    wait_time_mapping = {
        "Less than 15 mins": 10,
        "15-30 mins": 22.5,
        "30-45 mins": 37.5,
        "45 mins - 1 hr": 52.5,
        "More than 1 hr": 75
    }
    df["Wait_Time"] = df["Wait_Time"].map(wait_time_mapping).fillna(37.5)

    # Generate synthetic Event column
    np.random.seed(42)
    df["Event"] = np.random.choice(['None', 'Special Event'], size=len(df), p=[0.8, 0.2])

    # Handle optional long wait experience data (as before)
    long_wait_df = pd.DataFrame()
    if 'Did you experience any rides with longer-than-expected wait times? If yes, which ride(s)?' in df.columns:
        long_wait_rides = df['Did you experience any rides with longer-than-expected wait times? If yes, which ride(s)?'].str.split(', ', expand=True).stack().reset_index()
        long_wait_rides.columns = ['original_index', 'split_index', 'Attraction']
        long_wait_rides = long_wait_rides[long_wait_rides['Attraction'].notna()]

        queue_worth_col = 'Did you feel that overall, the queuing time was worth the experience of the attraction? '
        unpleasant_col = 'What made your experience with this ride or attraction unpleasant? '

        wait_time_adjusted = []
        for idx in long_wait_rides['original_index']:
            base_wait = 75
            if queue_worth_col in df.columns and df[queue_worth_col].iloc[idx] == 'No':
                base_wait = 90
            if unpleasant_col in df.columns and pd.notna(df[unpleasant_col].iloc[idx]):
                if 'long wait' in str(df[unpleasant_col].iloc[idx]).lower():
                    base_wait += 15
            wait_time_adjusted.append(base_wait)

        long_wait_df = pd.DataFrame({
            'Attraction': long_wait_rides['Attraction'],
            'Event': df['Event'].iloc[long_wait_rides['original_index']].values,
            'Wait_Time': wait_time_adjusted,
            'Guest_Satisfaction_Score': df['Guest_Satisfaction_Score'].iloc[long_wait_rides['original_index']].values,
            'Which part of the year did you visit USS?': df['Which part of the year did you visit USS?'].iloc[long_wait_rides['original_index']].values,
            'Did you purchase the Express Pass?': df['Did you purchase the Express Pass?'].iloc[long_wait_rides['original_index']].values if 'Did you purchase the Express Pass?' in df.columns else [None] * len(long_wait_rides),
            'What was the main purpose of your visit?': df['What was the main purpose of your visit?'].iloc[long_wait_rides['original_index']].values if 'What was the main purpose of your visit?' in df.columns else [None] * len(long_wait_rides),
            'Who did you visit USS with?': df['Who did you visit USS with?'].iloc[long_wait_rides['original_index']].values if 'Who did you visit USS with?' in df.columns else [None] * len(long_wait_rides),
            'Which age group do you belong to?': df['Which age group do you belong to?'].iloc[long_wait_rides['original_index']].values if 'Which age group do you belong to?' in df.columns else [None] * len(long_wait_rides)
        })

    base_df = df[['Attraction', 'Wait_Time', 'Event', 'Guest_Satisfaction_Score',
                  'Which part of the year did you visit USS?', 'Did you purchase the Express Pass?',
                  'What was the main purpose of your visit?', 'Who did you visit USS with?',
                  'Which age group do you belong to?']].copy()

    df_combined = pd.concat([base_df, long_wait_df], ignore_index=True)

    # Normalize Guest Satisfaction
    df_combined["Guest_Satisfaction_Score"] = pd.to_numeric(df_combined["Guest_Satisfaction_Score"], errors="coerce")
    df_combined["Guest_Satisfaction_Score"] = (
        (df_combined["Guest_Satisfaction_Score"] - df_combined["Guest_Satisfaction_Score"].min()) /
        (df_combined["Guest_Satisfaction_Score"].max() - df_combined["Guest_Satisfaction_Score"].min())
    )

    # Assign season based on user input or synthetic if 'Can't recall'
    def assign_season(row):
        season = row['Which part of the year did you visit USS?']
        if season != "Can't recall / Not sure":
            return season
        return np.random.choice(
            ["July - September", "October - December", "January - March", "April - June"],
            p=[0.6, 0.3, 0.05, 0.05]
        )

    df_combined['Season'] = df_combined.apply(assign_season, axis=1)

    # Filter valid attractions
    valid_attractions = [
        "Revenge of the Mummy",
        "Battlestar Galactica: CYLON",
        "Transformers: The Ride",
        "Puss In Boots' Giant Journey",
        "Sesame Street Spaghetti Space Chase"
    ]
    df_combined = df_combined[df_combined['Attraction'].isin(valid_attractions)]

    return df_combined


df_survey = load_survey_data()
print(df_survey.head())

                             Attraction  Wait_Time          Event  \
4   Sesame Street Spaghetti Space Chase       37.5           None   
5                  Revenge of the Mummy       37.5           None   
6                  Revenge of the Mummy       37.5           None   
8   Sesame Street Spaghetti Space Chase       37.5           None   
11                 Revenge of the Mummy       37.5  Special Event   

    Guest_Satisfaction_Score Which part of the year did you visit USS?  \
4                       0.50                   Can't recall / Not sure   
5                       0.75                   Can't recall / Not sure   
6                       0.50                   Can't recall / Not sure   
8                       0.75                        October - December   
11                      0.25                   Can't recall / Not sure   

   Did you purchase the Express Pass?  \
4                                  No   
5                                  No   
6                

### Load IOT data (optional, to answer question 5)
Loads and processes synthetic IoT data representing guest behaviors at Universal Studios Singapore, used to enrich demand prediction models.

Prepares raw IoT-style activity logs by extracting temporal, contextual, and behavioral features to support queue/demand modeling for attractions and services.

Processing Steps:
1. File Existence Check: Warns and returns None if the file is not found (graceful fallback for non-IoT experiments).
2. Timestamp Parsing: Converts 'Timestamp' to proper datetime format
3. Feature Engineering

Output: Returns a cleaned pd.DataFrame with original fields plus:
Column	Description
- Timestamp:	Guest activity timestamp
- Attraction:	Ride name
- Age_Group, Gender, etc.:	Visitor demographics
- Check_In_Time / Check_Out_Time:	Check-in and check-out times from park
- Average_Queue_Time:	Reported average wait at the ride
- Number_of_People_in_Queue:	Target variable for demand modeling
- day_of_week:	Day name (e.g., Monday)
- is_weekend:	Boolean weekend indicator
- is_popular_attraction:	Boolean for high-demand rides

In [7]:
def load_iot_data(file_path="../../data/synthetic_iot_data.csv"):
    """
    Loads synthetic IoT data for demand prediction, adding day_of_week, is_weekend, and is_popular_attraction features.

    Args:
        file_path (str): Path to the synthetic IoT CSV file.

    Returns:
        pd.DataFrame: Preprocessed IoT data.
    """
    if not os.path.exists(file_path):
        print(f"Warning: IoT data file {file_path} not found. Skipping IoT data integration.")
        return None

    df_iot = pd.read_csv(file_path)

    # Convert Timestamp to datetime
    df_iot['Timestamp'] = pd.to_datetime(df_iot['Timestamp'])

    # Add day_of_week feature
    df_iot['day_of_week'] = df_iot['Timestamp'].dt.day_name()

    # Add is_weekend feature (True for Saturday and Sunday)
    df_iot['is_weekend'] = df_iot['day_of_week'].isin(["Saturday", "Sunday"])

    # Define popular attractions
    POPULAR_ATTRACTIONS = {"Revenge of the Mummy", "Battlestar Galactica: CYLON", "Transformers: The Ride"}

    # Add is_popular_attraction feature
    df_iot['is_popular_attraction'] = df_iot['Attraction'].isin(POPULAR_ATTRACTIONS)

    return df_iot

# Example usage
df_iot = load_iot_data()
print(df_iot.head())

            Timestamp                    Attraction    Age_Group  Gender  \
0 2024-12-02 17:13:25  Puss In Boots' Giant Journey       Senior  Female   
1 2024-12-01 13:26:11  Puss In Boots' Giant Journey        Adult    Male   
2 2024-11-29 07:52:28  Puss In Boots' Giant Journey        Child    Male   
3 2024-08-01 16:39:11  Puss In Boots' Giant Journey  Young Adult    Male   
4 2024-05-31 18:22:35          Revenge of the Mummy        Child  Female   

  Loyalty_Member  Check_In_Time  Check_Out_Time  Step_Count  \
0             No              9              14       12775   
1             No             13              16       14102   
2             No             12              17       13212   
3             No              9              16       13017   
4             No             12              17        9916   

   Transaction_Amount  Guest_Satisfaction_Score  Average_Queue_Time  \
0                 171                  3.945533                  51   
1                 217 

### Checking which features are important to predict demand for IOT data
Using random forest to find features that correlate to the number of people in queue, which are the weather conditions, guest satisfaction score, and average queue time. 

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

features = ['Guest_Satisfaction_Score', 'Average_Queue_Time', 'Check_In_Time', 
            'Check_Out_Time', 'Temperature', 'Rainfall', 'Humidity']
X = df_iot[features]
y = df_iot['Number_of_People_in_Queue']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

feature_importance = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
print(feature_importance)


Rainfall                    0.191108
Humidity                    0.181672
Temperature                 0.175049
Guest_Satisfaction_Score    0.172186
Average_Queue_Time          0.147287
Check_Out_Time              0.070059
Check_In_Time               0.062638
dtype: float64


## Load weather data
Fetches or loads monthly weather data from Singapore’s government open data API and aggregates it into seasonal averages to be used as input features in demand prediction models.

Purpose:
- Automates the process of downloading or loading historical monthly weather data for 2024.
- Maps each month to a seasonal category used in survey responses (e.g., "January - March").
- Outputs a clean dataset with average weather values per season.

In [9]:
def fetch_weather_data(file_path="../../data/singapore_seasonal_weather.csv"):
    """
    Fetches or loads seasonal weather data for all months of 2024,
    calculates seasonal averages, and saves the result for reuse.

    Returns:
        pd.DataFrame: Weather data averaged by season.
    """
    import os
    import requests
    import pandas as pd

    if os.path.exists(file_path):
        print(f"✅ Loaded existing weather data from: {file_path}")
        return pd.read_csv(file_path)

    print("📡 Fetching weather data from API...")

    base_url = "https://api.data.gov.sg/v1/environment/"
    weather_types = ["rainfall", "air-temperature", "relative-humidity", "wind-speed"]
    months = [f"2024-{str(m).zfill(2)}-15" for m in range(1, 13)]
    month_names = [datetime.strptime(m, "%Y-%m-%d").strftime("%B") for m in months]

    all_data = []

    for date_str, month_name in zip(months, month_names):
        print(f"Fetching data for: {date_str}")
        daily_data = {"month": month_name}

        for weather_type in weather_types:
            url = f"{base_url}{weather_type}"
            params = {"date": date_str}
            response = requests.get(url, params=params)

            if response.status_code == 200:
                try:
                    data = response.json()
                    readings = data["items"][0]["readings"]
                    avg_value = sum(d["value"] for d in readings) / len(readings)
                    daily_data[weather_type] = avg_value
                except (KeyError, IndexError):
                    print(f"⚠️ Missing data for {weather_type} on {date_str}")
                    daily_data[weather_type] = None
            else:
                print(f"❌ Error fetching {weather_type} for {date_str}: {response.status_code}")
                daily_data[weather_type] = None

        all_data.append(daily_data)

    df = pd.DataFrame(all_data)

    # Map months to seasons
    month_to_season = {
        "January": "January - March", "February": "January - March", "March": "January - March",
        "April": "April - June", "May": "April - June", "June": "April - June",
        "July": "July - September", "August": "July - September", "September": "July - September",
        "October": "October - December", "November": "October - December", "December": "October - December"
    }
    df["Season"] = df["month"].map(month_to_season)

    # Average by season
    df_seasonal = df.groupby("Season").agg({
        "rainfall": "mean",
        "air-temperature": "mean",
        "relative-humidity": "mean",
        "wind-speed": "mean"
    }).reset_index()

    df_seasonal.rename(columns={
        "air-temperature": "air_temperature",
        "relative-humidity": "relative_humidity",
        "wind-speed": "wind_speed"
    }, inplace=True)

    # Save to disk
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df_seasonal.to_csv(file_path, index=False)
    print(f"✅ Saved seasonal weather data to: {file_path}")

    return df_seasonal

In [10]:
df_weather = fetch_weather_data("../../data/singapore_seasonal_weather.csv")
print(df_weather.head())

✅ Loaded existing weather data from: ../../data/singapore_seasonal_weather.csv
               Season  rainfall  air_temperature  relative_humidity  \
0        April - June  0.000000        27.910354          86.727778   
1     January - March  0.001093        27.134091          81.434444   
2    July - September  0.000000        27.970971          78.282112   
3  October - December  0.000000        27.033810          83.870873   

   wind_speed  
0    2.330833  
1    4.568889  
2    2.646852  
3    2.230736  


## Merging datasets
### Merging survey and weather data (to analyse the absence of IOT data to feed into the model)
Combines survey and weather datasets on the 'Season' column. If iot_df is provided, it is also merged and appended. Since there's not much overlapping between the IOT dataframe columns and the other datasets, new columns are added instead and IOT additional data are added as appended rows.

🔍 Use Cases:
- When iot_df=None: Used to analyze survey-only data (e.g. when IoT data is not available).
- When iot_df is provided: Used to create a richer dataset by incorporating behavioral signals from guest activity logs (IoT).

Smart handling includes:
- Automatic season inference from Timestamp (for IoT)
- Outer join to preserve all relevant fields
Output:
- Returns a single DataFrame with the following structure:
Core columns from survey data: 'Season', 'Attraction', 'Wait_Time', 'Guest_Satisfaction_Score', 'Event', demographic info
Weather columns:'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed'
IoT columns (if applicable):'Check_In_Time', 'Check_Out_Time', 'Average_Queue_Time', 'Step_Count', 'Number_of_People_in_Queue', 'Transaction_Amount'

📌 When IoT data is included: The final dataset is longer (due to row appending). It contains additional columns populated only for IoT rows

In [11]:
def merge_survey_weather_iot(survey_df, weather_df, iot_df=None):
    """
    Merges survey data with seasonal weather data, and appends IoT data if provided.
    
    Args:
        survey_df (pd.DataFrame): Survey data with a 'Season' column.
        weather_df (pd.DataFrame): Seasonal weather data with 'Season' column.
        iot_df (pd.DataFrame, optional): IoT data (should contain 'Season' column).

    Returns:
        pd.DataFrame: Combined dataset (survey + weather [+ iot if provided]).
    """
    import pandas as pd
    import numpy as np

    # Merge survey with weather data
    merged_survey = pd.merge(survey_df, weather_df, on='Season', how='left')

    if iot_df is None:
        return merged_survey

    # Ensure 'Season' exists in IoT data
    if 'Season' not in iot_df.columns:
        print("⚠️ 'Season' column missing in IoT data. Assigning season synthetically...")
        if 'Timestamp' in iot_df.columns:
            iot_df['Timestamp'] = pd.to_datetime(iot_df['Timestamp'])
            month_to_season = {
                1: "January - March", 2: "January - March", 3: "January - March",
                4: "April - June", 5: "April - June", 6: "April - June",
                7: "July - September", 8: "July - September", 9: "July - September",
                10: "October - December", 11: "October - December", 12: "October - December"
            }
            iot_df['Season'] = iot_df['Timestamp'].dt.month.map(month_to_season)
        else:
            iot_df['Season'] = np.random.choice(
                ["January - March", "April - June", "July - September", "October - December"],
                size=len(iot_df),
                p=[0.1, 0.1, 0.4, 0.4]
            )

    # Merge IoT with weather
    merged_iot = pd.merge(iot_df, weather_df, on='Season', how='left')

    # Append both datasets (not inner join, preserve all columns)
    combined = pd.concat([merged_survey, merged_iot], ignore_index=True, join='outer')

    return combined


### Merged dataset without IOT data
To put into model later to check the effect of just survey and weather data on the accuracy of the model

In [12]:
df_combined = merge_survey_weather_iot(df_survey, df_weather)
print(df_combined.head())
print(df_combined.columns.tolist())

                            Attraction  Wait_Time          Event  \
0  Sesame Street Spaghetti Space Chase       37.5           None   
1                 Revenge of the Mummy       37.5           None   
2                 Revenge of the Mummy       37.5           None   
3  Sesame Street Spaghetti Space Chase       37.5           None   
4                 Revenge of the Mummy       37.5  Special Event   

   Guest_Satisfaction_Score Which part of the year did you visit USS?  \
0                      0.50                   Can't recall / Not sure   
1                      0.75                   Can't recall / Not sure   
2                      0.50                   Can't recall / Not sure   
3                      0.75                        October - December   
4                      0.25                   Can't recall / Not sure   

  Did you purchase the Express Pass? What was the main purpose of your visit?  \
0                                 No                            Family 

Merged dataset with IOT data

In [13]:
df_all_combined = merge_survey_weather_iot(df_survey, df_weather, df_iot)
print(df_all_combined.head())
print(df_all_combined.columns.tolist())

⚠️ 'Season' column missing in IoT data. Assigning season synthetically...
                            Attraction  Wait_Time          Event  \
0  Sesame Street Spaghetti Space Chase       37.5           None   
1                 Revenge of the Mummy       37.5           None   
2                 Revenge of the Mummy       37.5           None   
3  Sesame Street Spaghetti Space Chase       37.5           None   
4                 Revenge of the Mummy       37.5  Special Event   

   Guest_Satisfaction_Score Which part of the year did you visit USS?  \
0                      0.50                   Can't recall / Not sure   
1                      0.75                   Can't recall / Not sure   
2                      0.50                   Can't recall / Not sure   
3                      0.75                        October - December   
4                      0.25                   Can't recall / Not sure   

  Did you purchase the Express Pass? What was the main purpose of your visit? 

## Modelling with XGBoost
Trains an XGBoost regression model to predict attraction or park demand.

Key features:
- Automatically detects whether IoT data is present.
- Uses enhanced feature set when IoT data is available, and fallback set otherwise.
- Handles missing data and encodes categorical variables.
- Dynamically switches target to 'Wait_Time' if 'Number_of_People_in_Queue' is unavailable.

### Case 1: With IoT Data
When IoT data is available, we define demand as: Number_of_People_in_Queue at a specific attraction
This is the most direct and objective measure of real-time attraction load, derived from simulated IoT sensors.
Target Variable: Number_of_People_in_Queue
Model goal: Learn how weather, queue times, satisfaction, and activity patterns relate to crowd size.

🔁 Feature signals used: Season, Attraction, Wait_Time, Guest_Satisfaction_Score, rainfall, relative_humidity, air_temperature, Average_Queue_Time, Check_In_Time, Check_Out_Time

📈 Why this matters:
This gives us the most granular and accurate model for attraction-level forecasting — and can be expanded to restaurant and shop estimates.

### Case 2: Without IoT Data (Survey + Weather Only)
When IoT data is not available, we define demand using a proxy: Wait_Time reported by survey respondents
This is a subjective but meaningful proxy for demand — longer queues suggest higher demand at that attraction.
Fallback Target: Wait_Time
Model goal: Predict expected wait times based on satisfaction, event type, and weather conditions.

🔁 Feature signals used:Season, Attraction, Event, Guest_Satisfaction_Score, rainfall, relative_humidity, air_temperature, wind_speed
Can be influenced by perception, not just actual crowd size
Enables forecasting for scenarios or seasonal changes

In [14]:
def train_demand_model_flexible(df, target='Number_of_People_in_Queue', fallback_target='Wait_Time'):
    """
    Trains an XGBoost model using available features in the dataset.
    If IoT-specific features are available, they are included in the model.
    Otherwise, falls back to survey + weather-only features.

    Args:
        df (pd.DataFrame): Merged dataset (survey + weather [+ iot]).
        target (str): Primary target column (default: Number_of_People_in_Queue).
        fallback_target (str): Fallback target column if primary is unavailable (default: Wait_Time).

    Returns:
        model (XGBRegressor): Trained XGBoost model.
        metrics (dict): Evaluation scores on test data.
    """
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from xgboost import XGBRegressor
    from pandas.api.types import is_object_dtype

    # Determine actual target column
    if target in df.columns and df[target].notna().any():
        actual_target = target
    elif fallback_target in df.columns and df[fallback_target].notna().any():
        actual_target = fallback_target
        print(f"⚠️ Target '{target}' not found. Using fallback '{fallback_target}' instead.")
    else:
        raise ValueError("❌ No valid target column found.")

    # Define full IoT-aware feature set and reduced fallback set
    full_features = [
        'Season', 'Attraction', 'Wait_Time', 'Guest_Satisfaction_Score',
        'rainfall', 'relative_humidity', 'air_temperature',
        'Average_Queue_Time', 'Check_In_Time', 'Check_Out_Time'
    ]
    fallback_features = [
        'Season', 'Attraction', 'Event', 'Guest_Satisfaction_Score',
        'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed'
    ]

    # Decide which feature set to use based on available columns
    if all(col in df.columns for col in full_features):
        used_features = full_features
        print("📡 Detected IoT features. Using full feature set.")
    else:
        used_features = [col for col in fallback_features if col in df.columns]
        print("📋 Using survey-only feature set.")

    print(f"🧮 Features used: {used_features}")
    print(f"🎯 Target: {actual_target}")

    # Drop rows with missing target
    df = df[df[actual_target].notna()]
    if df.empty:
        raise ValueError("❌ No valid rows with target values.")

    df = df[used_features + [actual_target]].copy()

    # Fill missing values
    for col in used_features:
        if col not in df.columns:
            continue
        if is_object_dtype(df[col]):
            df[col] = df[col].fillna("Unknown")
        else:
            df[col] = df[col].fillna(df[col].mean())

    # Encode categorical columns
    for col in ['Season', 'Attraction', 'Event']:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

    # Prepare train/test data
    X = df[used_features]
    y = df[actual_target]

    if len(df) < 5:
        raise ValueError("❌ Not enough data to train.")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train XGBoost
    model = XGBRegressor(
        random_state=42,
        n_estimators=50,
        max_depth=4,
        learning_rate=0.1,
        verbosity=0
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation metrics
    metrics = {
        'R² Score': r2_score(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred)
    }

    print("✅ Model trained successfully.")
    print("📊 Evaluation:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    return model, metrics


### Training model without the IOT data
Evaluation of model training with the merged survey and weather data yields reasonable prediction with low RMSE and MAE, although the R^2 is on the lower side. The evaluation metrics are rather expected due to the subjectiveness of the survey data and no real-time signal. 

In [15]:
print(df_combined.columns.tolist())
model_1, metrics_1 = train_demand_model_flexible(df_combined)

['Attraction', 'Wait_Time', 'Event', 'Guest_Satisfaction_Score', 'Which part of the year did you visit USS?', 'Did you purchase the Express Pass?', 'What was the main purpose of your visit?', 'Who did you visit USS with?', 'Which age group do you belong to?', 'Season', 'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed']
⚠️ Target 'Number_of_People_in_Queue' not found. Using fallback 'Wait_Time' instead.
📋 Using survey-only feature set.
🧮 Features used: ['Season', 'Attraction', 'Event', 'Guest_Satisfaction_Score', 'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed']
🎯 Target: Wait_Time
✅ Model trained successfully.
📊 Evaluation:
R² Score: 0.0720
RMSE: 18.2159
MAE: 15.8256


### Training model with IOT data
Evaluation of model training with IOT data shows that model performs better when IOT data is not involved. This suggests that this model may perhaps be too simple to handle the IOT data, either that or that the synthetically-generated IOT data does not replicate the correlation between the features to the predicted demand enough to be helpful for the model. 

In [16]:
model_2, metrics_2 = train_demand_model_flexible(df_all_combined)

📡 Detected IoT features. Using full feature set.
🧮 Features used: ['Season', 'Attraction', 'Wait_Time', 'Guest_Satisfaction_Score', 'rainfall', 'relative_humidity', 'air_temperature', 'Average_Queue_Time', 'Check_In_Time', 'Check_Out_Time']
🎯 Target: Number_of_People_in_Queue
✅ Model trained successfully.
📊 Evaluation:
R² Score: 0.0297
RMSE: 85.4621
MAE: 74.6309


## Modelling with Gradient Boosting

In [23]:
def train_rf_demand_model_flexible(df, target='Number_of_People_in_Queue', fallback_target='Wait_Time'):
    """
    Trains a Random Forest Regressor for demand prediction.
    Automatically detects if IoT features are present and adjusts features accordingly.

    Args:
        df (pd.DataFrame): Merged dataset (survey + weather [+ iot])
        target (str): Main prediction target (default: 'Number_of_People_in_Queue')
        fallback_target (str): Backup target if main target is missing (default: 'Wait_Time')

    Returns:
        model (RandomForestRegressor): Trained model
        metrics (dict): R², RMSE, MAE
    """
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from sklearn.preprocessing import LabelEncoder
    from pandas.api.types import is_object_dtype

    # --- Determine which target to use ---
    if target in df.columns and df[target].notna().any():
        actual_target = target
    elif fallback_target in df.columns and df[fallback_target].notna().any():
        actual_target = fallback_target
        print(f"⚠️ '{target}' not found. Using fallback target '{fallback_target}'.")
    else:
        raise ValueError("❌ No valid target found in dataset.")

    # --- Define feature sets ---
    full_features = [
        'Season', 'Attraction', 'Wait_Time', 'Guest_Satisfaction_Score',
        'rainfall', 'relative_humidity', 'air_temperature',
        'Average_Queue_Time', 'Check_In_Time', 'Check_Out_Time'
    ]
    fallback_features = [
        'Season', 'Attraction', 'Event', 'Guest_Satisfaction_Score',
        'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed'
    ]

    if all(col in df.columns for col in full_features):
        used_features = full_features
        print("📡 IoT features detected. Using full feature set.")
    else:
        used_features = [col for col in fallback_features if col in df.columns]
        print("📋 Using reduced survey feature set.")

    print(f"🧮 Features used: {used_features}")
    print(f"🎯 Target: {actual_target}")

    # --- Drop rows with missing target ---
    df = df[df[actual_target].notna()]
    if df.empty:
        raise ValueError("❌ No rows with valid target values.")

    df = df[used_features + [actual_target]].copy()

    # --- Fill missing values ---
    for col in used_features:
        if col not in df.columns:
            continue
        if is_object_dtype(df[col]):
            df[col] = df[col].fillna("Unknown")
        else:
            df[col] = df[col].fillna(df[col].mean())

    # --- Encode categorical features ---
    for col in ['Season', 'Attraction', 'Event']:
        if col in df.columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

    # --- Train-test split ---
    X = df[used_features]
    y = df[actual_target]

    if len(df) < 5:
        raise ValueError("❌ Not enough data to train.")

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # --- Train Random Forest ---
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # --- Evaluation ---
    metrics = {
        'R² Score': r2_score(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred)
    }

    print("✅ Random Forest model trained successfully.")
    print("📊 Evaluation:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    return model, metrics


### Modelling with no IOT dataset

Yields better results than XGBoost in terms of R^2 square, albeit still low. 

In [25]:
model_gb_1, metrics_gb_1 = train_rf_demand_model_flexible(df_combined)

⚠️ 'Number_of_People_in_Queue' not found. Using fallback target 'Wait_Time'.
📋 Using reduced survey feature set.
🧮 Features used: ['Season', 'Attraction', 'Event', 'Guest_Satisfaction_Score', 'rainfall', 'air_temperature', 'relative_humidity', 'wind_speed']
🎯 Target: Wait_Time
✅ Random Forest model trained successfully.
📊 Evaluation:
R² Score: 0.1039
RMSE: 17.8996
MAE: 15.0964


### Modelling with IOT dataset
Results are around the same, indicating that instead of modelling issue, it's perhaps more of a data distribution issue that lacks accuracy in resembling real-life distribution. 

In [26]:
model_gb_2, metrics_gb_2 = train_rf_demand_model_flexible(df_all_combined)

📡 IoT features detected. Using full feature set.
🧮 Features used: ['Season', 'Attraction', 'Wait_Time', 'Guest_Satisfaction_Score', 'rainfall', 'relative_humidity', 'air_temperature', 'Average_Queue_Time', 'Check_In_Time', 'Check_Out_Time']
🎯 Target: Number_of_People_in_Queue
✅ Random Forest model trained successfully.
📊 Evaluation:
R² Score: 0.0173
RMSE: 86.0086
MAE: 74.6279


# Simulating some of the demand for attractions, restaurants, and merchandise shops
This function simulates daily visitor demand across different attractions and services at USS by predicting the number of people in queue based on future conditions (e.g., weather, season, ride popularity, etc.).

📤 Output:
Prints demand predictions in 3 segments:
🎢 Predicted Queue Size by Attraction: 
- Average number of people expected in queue per attraction per day.
Note: This represents the average instantaneous queue length at a given point in the day.

🍔 Estimated Restaurant Demand by Season:
- Uses average predicted queue size per attraction in a season as a proxy for restaurant demand.
Assumes restaurant traffic correlates with attraction traffic.

🎁 Estimated Merchandise Demand:
- Estimates the number of people likely to exit after 5PM (e.g., visiting shops near the exit).
- Based on the predicted queue sizes of attractions for people with Check_Out_Time >= 17.


In [27]:
def simulate_demand_predictions(model, features_df, label_encoders=None):
    """
    Generates and prints demand predictions using the trained model for attractions and services.

    Args:
        model: Trained XGBoost model.
        features_df (pd.DataFrame): Input features for prediction (should match training format).
        label_encoders (dict): Optional, if Season or Attraction were label-encoded.
    """

    df = features_df.copy()

    # Encode Season and Attraction if needed
    if label_encoders:
        for col in ['Season', 'Attraction']:
            if col in df.columns and col in label_encoders:
                df[col] = label_encoders[col].transform(df[col].astype(str))

    predictions = model.predict(df)
    features_df['Predicted_Queue'] = predictions

    # --- Summarize demand per attraction ---
    print("\n🎢 Predicted Queue Size by Attraction (avg. people in queue today):")
    print(features_df.groupby("Attraction")['Predicted_Queue'].mean().round(1).sort_values(ascending=False))

    # --- Estimate restaurant demand per season (proxy: mean attraction demand) ---
    print("\n🍔 Estimated Restaurant Demand by Season (avg. people per attraction per day):")
    restaurant_demand = features_df.groupby("Season")['Predicted_Queue'].mean().round(1)
    print(restaurant_demand)

    # --- Estimate merchandise demand (clustered at exit) ---
    print("\n🎁 Estimated Merchandise Demand (avg. people exiting after 5PM per day):")
    exit_demand = features_df[features_df['Check_Out_Time'] >= 17]  # Assume late checkout = near exit
    merch_demand = exit_demand['Predicted_Queue'].mean().round(1) if not exit_demand.empty else 0
    print(f"Expected merchandise shop crowding (near exit): {merch_demand} people (avg)")


Example of simulation with synthetic data.

In [29]:
from sklearn.preprocessing import LabelEncoder
# Sample input for next 3 days
df_future = pd.DataFrame({
    'Season': ['July - September'] * 5,
    'Attraction': [
        "Revenge of the Mummy",
        "Transformers: The Ride",
        "Puss In Boots' Giant Journey",
        "Sesame Street Spaghetti Space Chase",
        "Battlestar Galactica: CYLON"
    ],
    'Wait_Time': [35, 45, 30, 25, 40],
    'Guest_Satisfaction_Score': [0.7, 0.8, 0.6, 0.9, 0.75],
    'rainfall': [1.2] * 5,
    'relative_humidity': [82] * 5,
    'air_temperature': [30] * 5,
    'Average_Queue_Time': [45, 50, 40, 30, 55],
    'Check_In_Time': [10, 11, 9, 10, 12],
    'Check_Out_Time': [18, 17, 16, 19, 18]
})

# Encode categorical fields if needed
label_encoders = {}
for col in ['Season', 'Attraction']:
    le = LabelEncoder()
    le.fit(df_all_combined[col].astype(str))  # Fit on the full dataset
    label_encoders[col] = le

simulate_demand_predictions(model_2, df_future, label_encoders)



🎢 Predicted Queue Size by Attraction (avg. people in queue today):
Attraction
Sesame Street Spaghetti Space Chase    172.800003
Transformers: The Ride                 124.099998
Battlestar Galactica: CYLON            120.400002
Revenge of the Mummy                   102.900002
Puss In Boots' Giant Journey            96.599998
Name: Predicted_Queue, dtype: float32

🍔 Estimated Restaurant Demand by Season (avg. people per attraction per day):
Season
July - September    123.400002
Name: Predicted_Queue, dtype: float32

🎁 Estimated Merchandise Demand (avg. people exiting after 5PM per day):
Expected merchandise shop crowding (near exit): 130.10000610351562 people (avg)
