<div style="background-color: rgb(235, 218, 30); color: rgb(157, 156, 156); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Data science at Disney World
</div>

<div style="background-color: rgb(235, 218, 30); color: rgb(157, 156, 156); font-size: 24px; font-weight: bold; padding: 10px; border-radius: 15px;">
    Research Question 1: Compare predicted and actual waiting time
</div>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import ipywidgets as widgets
from ipywidgets import interact

### 🎢 Compare Predicted and Actual Waiting Times

Create a visualization to compare **predicted (posted)** and **actual** waiting times.  
You can follow these steps:

1. 🔄 For each actual waiting time, find the **previous** and **next** posted waiting times.
2. 📈 Use **linear interpolation** to estimate the posted waiting time that corresponds to the actual time.
3. ➖ Calculate the **difference** between actual and posted waiting times.
4. 📊 Create a **histogram** of all these differences.
   - Optionally, use a **scatterplot** or a **2D histogram / heatmap** for more insights.
5. 🔍 Investigate specific **dates** and **attractions** with lots of actual waiting time data.
   - Plot **actual vs. posted waiting times** together in a single visualization for those cases.

In [2]:
df = pd.read_csv('C:/Disney_Waiting_Times/disney-waiting-times/all_waiting_times_extracted/all_waiting_times.csv')
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
df['datetime'] = pd.to_datetime(df['datetime'])  # # dtype adjusted 
# df.info()

In [3]:
list_attractions = df['attraction'].unique().tolist()

In [4]:
list_attractions = df['attraction'].unique().tolist()
list_years = ['2018']

df_selection= df.query( ' date.dt.year==2018 and attraction in @list_attractions' ).copy()
# df_selection.info()
# df_selection

In [5]:
# show only df_SACTMIN
# condition2= df_selection['SACTMIN'].notna()
# df_selection[condition2]

In [6]:
interpolated_dfs = {}

for (attraction, date), group in tqdm( df_selection.groupby(['attraction', 'date']) ):
    group.set_index('datetime', inplace=True)
    group['SPOSTMIN'] = group['SPOSTMIN'].interpolate(method='time')
    interpolated_dfs[(attraction, date)] = group

interpolated_dfs  # `interpolated_dfs` contains a separate DataFrame for each attraction with interpolated 'SPOSTMIN'
# dumbo_1_df = interpolated_dfs[('dumbo', pd.to_datetime('01/01/2018') )]
# dumbo_1_df[dumbo_1_df['SACTMIN'].notna()]
# dwarfs_2_df = interpolated_dfs[('7_dwarfs_train', pd.to_datetime('01/01/2018') )]

df_final = pd.concat(interpolated_dfs.values())

100%|████████████████████████████████████████████████████████████████████████████| 15534/15534 [00:37<00:00, 418.26it/s]


In [7]:
# show only df_SACTMIN
df_interpolated = df_final[df_final['SACTMIN'].notna()]

In [8]:
# Calculate the difference between actual and estimated waiting times.
df_interpolated['diff']= (
        df_interpolated['SPOSTMIN'] - df_interpolated['SACTMIN'])
# Clean the dataset
df_interpolated = ( df_interpolated[ 
                           ( df_interpolated['diff'].notna() ) & 
                           ( df_interpolated['SPOSTMIN']>=0 ) & 
                           (df_interpolated['diff']>=-100 ) & 
                           (df_interpolated['diff']<=100 ) ] )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_interpolated['diff']= (


In [9]:
# df_interpolated  # 22026 rows × 5 columns

In [13]:
# Visualize differences between actual and estimated waiting times for attractions by choise.

# Widget for selecting attractions
attraction_selector = widgets.SelectMultiple(
    options=list_attractions,
    value=list_attractions[:1],  # Default to first attraction
    description='Attractions',
    disabled=False  )

# Define function to update plots based on selected attractions
def update_plots(selected_attractions):
    # Filter the dataframe based on selected attractions
    df_query = df_interpolated[df_interpolated['attraction'].isin(selected_attractions)]
    
    # Create a 2x2 grid for the subplots
    fig, axs = plt.subplots(2, 2, figsize=(8, 8))
    
    # Plot 1: Histogram of 'diff'
    sns.histplot(data=df_query['diff'], bins=30, ax=axs[0, 0])
    axs[0, 0].axvline(color='r')
    axs[0, 0].axvline(df_query['diff'].mean(), color='b')
    axs[0, 0].set_title('Histogram of Differences between Interpolated SPOSTMIN and SACTMIN', wrap=True)
    
    # Plot 2: Scatter plot with regression line
    sns.regplot(data=df_query, x='SACTMIN', y='SPOSTMIN', scatter=True, line_kws={'color':'r'}, ax=axs[0, 1])
    axs[0, 1].set_title('Regression Plot', wrap=True)
    
    # Plot 3: 2D Histogram
    c = axs[1, 0].hist2d(df_query['SACTMIN'], df_query['SPOSTMIN'], bins=10, cmap='Blues')
    plt.colorbar(c[3], ax=axs[1, 0], label='Frequency')  # Corrected colorbar usage
    axs[1, 0].set_xlabel('SACTMIN')
    axs[1, 0].set_ylabel('SPOSTMIN')
    axs[1, 0].set_title('2D Histogram of Differences', wrap=True)
    
    # Plot 4: Correlation Heatmap
    correlation_matrix = df_query[['SACTMIN', 'SPOSTMIN']].corr()
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', ax=axs[1, 1])
    axs[1, 1].set_title('Correlation Heatmap', wrap=True)
    
    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()

# Make the widget interactive
interact(update_plots, selected_attractions=attraction_selector)
None

interactive(children=(SelectMultiple(description='Attractions', index=(0,), options=('7_dwarfs_train', 'alien_…

In [11]:
# Compare actual waiting times with calculated waiting times for 1 specific attraction and for 1 specific day of choise.

list_obs_att=['pirates_of_caribbean', 'soarin', 'big_thunder_mtn', 'haunted_mansion', 'toy_story_mania']
list_obs_date=['2018-09-08', '2018-12-02', '2018-11-27', '2018-08-24', '2018-09-09']

# Widget for selecting date
date_selector = widgets.Dropdown(
    options=list_obs_date,
    value='2018-09-08',  # Default value
    description='Date:',
    disabled=False  )

# Widget for selecting attractions (can select multiple attractions)
attraction_selector = widgets.SelectMultiple(
    options=list_obs_att,
    value=['pirates_of_caribbean', 'soarin'],  # Default values
    description='Attractions:',
    disabled=False  )

# Define function to update plots based on selected date and attractions
def update_plots(selected_date, selected_attractions):
    # Convert selected date to datetime object
    temp_date = pd.to_datetime(selected_date, format='%Y-%m-%d')
    
    # Ensure two attractions are selected
    if len(selected_attractions) != 2:
        print("Please select exactly two attractions.")
        return
    
    temp_attraction1, temp_attraction2 = selected_attractions

    # Filter the dataframe for the selected date and attractions
    df_temp1 = df_interpolated.query('attraction==@temp_attraction1 and date==@temp_date')
    df_temp2 = df_interpolated.query('attraction==@temp_attraction2 and date==@temp_date')

    # Create plots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot for the first attraction
    df_temp1.plot(y=['SPOSTMIN', 'SACTMIN'], marker='o', ax=ax1)
    ax1.set_title(f'Actual vs Posted Waiting Time for {temp_attraction1} on {temp_date.date()}', wrap=True)
    ax1.set_xlabel('Datetime')
    ax1.set_ylabel('Waiting Minutes')

    # Plot for the second attraction
    df_temp2.plot(y=['SPOSTMIN', 'SACTMIN'], marker='o', ax=ax2)
    ax2.set_title(f'Actual vs Posted Waiting Time for {temp_attraction2} on {temp_date.date()}', wrap=True)
    ax2.set_xlabel('Datetime')
    ax2.set_ylabel('Waiting Minutes')

    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()

# Make the widget interactive
interact(update_plots, selected_date=date_selector, selected_attractions=attraction_selector)
None

interactive(children=(Dropdown(description='Date:', options=('2018-09-08', '2018-12-02', '2018-11-27', '2018-0…

In [59]:
# Dataframe selector
# q_date= pd.to_datetime('2015-09-12', format='%Y-%m-%d')
# q_attraction= 'astro_orbiter'
# df_interpolated.query( ' date==@q_date and attraction==@q_attractions' )

Unnamed: 0_level_0,date,SACTMIN,SPOSTMIN,attraction,diff
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-09-12 10:32:48,2015-09-12,13.0,10.0,astro_orbiter,-3.0
2015-09-12 16:31:31,2015-09-12,6.0,-55.978448,astro_orbiter,-61.978448
