# Final IoT data generation with check-in and check-out times for attractions

First, change the working directory to the `Scripts/Subgroup_B` folder, so that the relative output path used when saving the CSV (`../../data/...`) resolves correctly.

In [3]:
import os

# Change the working directory.
# The default is the original author's local checkout, which does not exist on
# other machines. Set the SUBGROUP_B_SCRIPTS_DIR environment variable to the
# Scripts/Subgroup_B folder of your own checkout to run this cell unmodified.
new_path = os.environ.get(
    "SUBGROUP_B_SCRIPTS_DIR",
    r"C:\Users\parma\data-science-guest-experience\data-science-guest-experience\Scripts\Subgroup_B",
)
os.chdir(new_path)  # raises FileNotFoundError if the folder is missing

# Print the current working directory
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\parma\data-science-guest-experience\data-science-guest-experience\Scripts\Subgroup_B


In [6]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker

# Initialize Faker instance
# Shared module-level generator: generate_synthetic_data() uses it for
# Visitor_ID (fake.uuid4()) and Age (fake.random_int()). No seed is passed
# at construction.
fake = Faker()

# Theme zones of the park. The order matters: generate_synthetic_data() pairs
# this list positionally with its sampling-probability vector.
THEME_ZONES = [
    "Hollywood",
    "New York",
    "Sci-Fi City",
    "Ancient Egypt",
    "The Lost World",
    "Far Far Away",
]

# Attractions available in each zone (a zone may have none).
ATTRACTIONS = {
    "Ancient Egypt": ["Revenge of the Mummy"],
    "Sci-Fi City": [
        "Battlestar Galactica: CYLON",
        "Transformers: The Ride",
        "Battlestar Galactica",
    ],
    "New York": ["Sesame Street Spaghetti Space Chase"],
    "Hollywood": [],
    "The Lost World": ["Canopy Flyer"],
    "Far Far Away": ["Puss In Boots' Giant Journey"],
}

# Flat list of every attraction across all zones, in ATTRACTIONS order.
POPULAR_ATTRACTIONS = [
    attraction
    for zone_attractions in ATTRACTIONS.values()
    for attraction in zone_attractions
]

# Function to generate synthetic IoT data
def generate_synthetic_data(num_samples=5000, seed=42):
    """Generate a synthetic per-visitor IoT dataset for the theme park.

    Each row simulates one visit on a random day of 2024: a visitor with
    demographic attributes enters one theme zone, stays a few hours, visits
    attractions in that zone, queues, and optionally spends money.

    Parameters
    ----------
    num_samples : int, default 5000
        Number of visitor rows to generate.
    seed : int or None, default 42
        Seed applied to BOTH random sources used here: NumPy's global RNG and
        the module-level ``fake`` Faker instance. Pass ``None`` to get
        non-deterministic output. Note that this reseeds NumPy's global state.

    Returns
    -------
    pandas.DataFrame
        One row per visitor with the columns Date, Visitor_ID, Loyalty_Member,
        Age, Gender, Theme_Zone, Check_In_Time, Check_Out_Time,
        Attractions_Visited, Average_Queue_Time, Restaurant_Spending,
        Merchandise_Spending and Total_Spending. The columns are present even
        when ``num_samples`` is 0.
    """
    columns = [
        "Date", "Visitor_ID", "Loyalty_Member", "Age", "Gender", "Theme_Zone",
        "Check_In_Time", "Check_Out_Time", "Attractions_Visited",
        "Average_Queue_Time", "Restaurant_Spending", "Merchandise_Spending",
        "Total_Spending",
    ]

    # Seed both generators. Seeding NumPy alone left the Faker-produced
    # Visitor_ID and Age columns different on every run.
    np.random.seed(seed)
    fake.seed_instance(seed)

    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    date_range = pd.date_range(start=start_date, end=end_date)

    data = []
    for _ in range(num_samples):
        visit_date = np.random.choice(date_range)
        # Probabilities are positional: they follow the order of THEME_ZONES.
        theme_zone = np.random.choice(THEME_ZONES, p=[0.1, 0.15, 0.2, 0.2, 0.15, 0.2])

        # Generate visitor ID and loyalty status
        visitor_id = fake.uuid4()
        loyalty_member = np.random.choice(["Yes", "No"], p=[0.2, 0.8])

        # Generate demographic data
        age = fake.random_int(min=5, max=70)  # Random age from 5 to 70
        gender = np.random.choice(["Male", "Female"], p=[0.5, 0.5])

        # Check-in hour is 9..14 (randint's upper bound is exclusive). Stay is
        # 2-3 hours for Sci-Fi City / Ancient Egypt, otherwise 1-2 hours;
        # check-out is capped at 17.
        check_in_time = np.random.randint(9, 15)
        if theme_zone in ["Sci-Fi City", "Ancient Egypt"]:
            stay_duration = np.random.randint(2, 4)
        else:
            stay_duration = np.random.randint(1, 3)
        check_out_time = min(check_in_time + stay_duration, 17)

        # Assign an attraction visit count (higher for full-day visitors).
        # NOTE(review): a stay never exceeds 3 hours, so the `< 5` branch is
        # always taken, and randint(3, 4) can only return 3. Since no zone has
        # more than 3 attractions, every visitor ends up with all attractions
        # of their zone. Left unchanged so the NumPy-drawn values stay the same
        # for a given seed; revisit if per-visitor variation is wanted.
        num_attractions_visited = np.random.randint(3, 4) if check_out_time - check_in_time < 5 else np.random.randint(5, 7)

        zone_attractions = ATTRACTIONS.get(theme_zone, [])
        attractions_visited = np.random.choice(
            zone_attractions,
            size=min(num_attractions_visited, len(zone_attractions)),
            replace=False,
        ).tolist()

        # Assign queue times based on attraction popularity.
        # NOTE(review): POPULAR_ATTRACTIONS contains every attraction, so this
        # check is true whenever at least one attraction was visited; only
        # zones without attractions get the 30-59 minute range.
        base_wait = np.random.randint(60, 90) if any(attraction in POPULAR_ATTRACTIONS for attraction in attractions_visited) else np.random.randint(30, 60)

        # Spending behavior: 60% of visitors spend 10-15 on food, 40% spend
        # 30-50 on merchandise; everyone else spends 0 in that category.
        restaurant_spending = np.random.randint(10, 16) if np.random.rand() < 0.6 else 0
        merchandise_spending = np.random.randint(30, 51) if np.random.rand() < 0.4 else 0
        total_spending = restaurant_spending + merchandise_spending

        # Append data
        data.append({
            "Date": pd.Timestamp(visit_date).strftime("%Y-%m-%d"),
            "Visitor_ID": visitor_id,
            "Loyalty_Member": loyalty_member,
            "Age": age,
            "Gender": gender,
            "Theme_Zone": theme_zone,
            "Check_In_Time": check_in_time,
            "Check_Out_Time": check_out_time,
            "Attractions_Visited": ", ".join(attractions_visited),
            "Average_Queue_Time": base_wait,
            "Restaurant_Spending": restaurant_spending,
            "Merchandise_Spending": merchandise_spending,
            "Total_Spending": total_spending
        })

    # Passing `columns` keeps the schema stable even when no rows were generated.
    df = pd.DataFrame(data, columns=columns)
    return df

# Build the 5,000-row synthetic dataset and preview its first rows
data_df = generate_synthetic_data(num_samples=5000)
print(data_df.head())

# Persist the dataset as CSV (path is relative to the current working
# directory); the row index is not written
data_df.to_csv(path_or_buf="../../data/synthetic_iot_data_v2.csv", index=False)


         Date                            Visitor_ID Loyalty_Member  Age  \
0  2024-04-12  51ee8898-7899-4eeb-9c8f-f3ee456abc67            Yes   53   
1  2024-04-09  ffc105c8-7ed0-4adc-bea0-c49b6373d262             No   22   
2  2024-11-09  bbf747e7-ecbf-45c4-aafa-c822ae797ef9             No   16   
3  2024-02-20  33959313-7d6e-420d-9e02-cecd9e97315f             No   45   
4  2024-01-14  18114ed9-fe3b-4880-a121-c30fd4ad2a45             No   39   

   Gender      Theme_Zone  Check_In_Time  Check_Out_Time  \
0  Female  The Lost World             13              14   
1    Male        New York             12              14   
2    Male   Ancient Egypt             11              13   
3    Male     Sci-Fi City             13              15   
4    Male    Far Far Away             12              14   

                                 Attractions_Visited  Average_Queue_Time  \
0                                       Canopy Flyer                  85   
1                Sesame Street Spagh