# Date and Attraction Based Exploration of Waiting Times

The purpose of this notebook is to analyse how the waiting time depends on the time 
(e.g. the current month) as well as on the specific attraction. It also checks how many 
attractions are open at which time of the year. The main finding is that it is not
possible to analyse all attractions together, as they have vastly different waiting time
profiles.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import calplot
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load the processed waiting times: "id" becomes the index and "date" is parsed as datetime.
waiting_times_df = pd.read_csv(
    "../../data/processed/waiting_times.csv",
    index_col="id",
    parse_dates=["date"],
)

In [None]:
# Report how many rows carry a non-negative waiting time.
is_nonnegative = waiting_times_df.waiting_time >= 0
print("datapoints with nonnegative waiting time:", len(waiting_times_df[is_nonnegative]))

## Analyse mean waiting time grouped by date across all attractions

In [None]:
# Average per calendar day over all attractions, using only non-negative waiting times.
# numeric_only=True keeps the string column "attraction" out of the mean: pandas >= 2.0
# raises a TypeError for such columns instead of silently dropping them as older
# versions did.
mean_waiting_times_df = (
    waiting_times_df[waiting_times_df.waiting_time >= 0]
    .groupby("date")
    .mean(numeric_only=True)
)

In [None]:
# Distribution of the daily means, with a kernel density estimate overlaid.
hist_ax = sns.histplot(data=mean_waiting_times_df, kde=True)
hist_ax.set_title("histogram of mean daily waiting time across all attractions");

In [None]:
# Derive calendar features directly from the date index; this avoids the inplace
# reset_index/set_index round trip and leaves the frame indexed by date as before.
daily_dates = mean_waiting_times_df.index
mean_waiting_times_df["weekday"] = daily_dates.dayofweek  # Monday=0 ... Sunday=6
# Vectorised comparison instead of a per-row lambda: Saturday (5) and Sunday (6).
mean_waiting_times_df["weekend"] = mean_waiting_times_df["weekday"] >= 5
mean_waiting_times_df["month"] = daily_dates.month

In [None]:
# Histogram of the daily means, coloured by the "weekend" flag.
weekend_ax = sns.histplot(
    data=mean_waiting_times_df,
    x="waiting_time",
    hue="weekend",
    kde=True,
)
weekend_ax.set_title("histogram of mean daily waiting time across all attractions, separated by weekend or not");

In [None]:
# Same split as the previous histogram, but in percent and with each "weekend" group
# normalised on its own (common_norm=False), so the two shapes can be compared.
weekend_pct_ax = sns.histplot(
    data=mean_waiting_times_df,
    x="waiting_time",
    hue="weekend",
    kde=True,
    stat="percent",
    common_norm=False,
)
weekend_pct_ax.set_title("histogram of mean daily waiting time across all attractions, separated by weekend or not");

In [None]:
# Mean of the daily means per calendar month, drawn as a bar chart.
monthly_mean = mean_waiting_times_df.groupby("month")["waiting_time"].mean()
monthly_mean.plot(kind="bar", ylabel="mean waiting time (min)");

## Analyse mean waiting time grouped by date and attraction

In [None]:
# One row per (date, attraction) holding the mean over the non-negative waiting times of
# that day, plus the calendar month derived from the date.
valid_waits = waiting_times_df.loc[waiting_times_df.waiting_time >= 0]
mean_date_attraction_df = (
    valid_waits
    .groupby(["date", "attraction"])
    .agg("mean")
    .reset_index()
    .assign(month=lambda frame: frame.date.dt.month)
)

In [None]:
# Preview the first rows of the per-(date, attraction) means with the derived month column.
mean_date_attraction_df.head()

In [None]:
# Summarise the date column (count, min, max, quantiles) to see the covered timespan.
# `datetime_is_numeric` was removed in pandas 2.0, where datetimes are always summarised
# numerically and the keyword raises a TypeError; older versions still need it.
try:
    date_summary = mean_date_attraction_df.date.describe(datetime_is_numeric=True)
except TypeError:
    date_summary = mean_date_attraction_df.date.describe()
date_summary

In [None]:
# Box plot per month; every point is one attraction's mean waiting time on one day.
month_box_fig = px.box(
    mean_date_attraction_df,
    x="month",
    y="waiting_time",
    title="each data point is the mean waiting time of one attraction on one day, grouped by month<br>timespan: 2019/07-2021/10",
    width=1000,
    height=600,
)
month_box_fig.show()

In [None]:
# Box plot per attraction; every point is that attraction's mean waiting time on one day.
attraction_box_fig = px.box(
    mean_date_attraction_df,
    x="attraction",
    y="waiting_time",
    title="each data point is the mean waiting time of one attraction on one day, grouped by attraction<br>timespan: 2019/07-2021/10",
    width=1200,
    height=600,
)
attraction_box_fig.show()

In [None]:
# Mean waiting time per (month, attraction), laid out as a month x attraction table
# for the heatmap.
heatmap_df = (
    mean_date_attraction_df
    .groupby(["month", "attraction"])["waiting_time"]
    .mean()
    .unstack("attraction")
    # Show all twelve months in calendar order: months without data become NaN rows,
    # while months that do have data are never overwritten. The previous hard-coded
    # `range(2, 5)` loop blanked February-April unconditionally and would not have
    # added any other missing month.
    .reindex(pd.Index(range(1, 13), name="month"))
)

# Use string labels for the month axis (presumably so the heatmap treats it as
# categorical -- TODO confirm).
heatmap_df.index = heatmap_df.index.map(str)

In [None]:
# Display the month x attraction table of mean waiting times that feeds the heatmap below.
heatmap_df

In [None]:
# Render the month x attraction table as a heatmap.
heatmap_fig = px.imshow(
    heatmap_df,
    title="mean waiting time per month and attraction (timespan: 2019/07-2021/10)",
    width=1200,
    height=600,
)
heatmap_fig.show()


## Comparison of opening hours of different attractions over the year

In [None]:
# The daily maximum (rather than the mean) is used here because the only question is
# whether an attraction was open on a given day at all.
date_attraction_df = (
    waiting_times_df[["date", "attraction", "waiting_time"]]
    .groupby(by=["attraction", "date"])
    .max()
    .reset_index()
)

# An attraction counts as open on a day if its maximum waiting time is non-negative.
was_open = date_attraction_df.waiting_time >= 0
open_date_attraction_df = date_attraction_df[was_open]

# Number of open attractions per date; the count is stored in the "attraction" column.
date2open_count_df = open_date_attraction_df.groupby("date")[["attraction"]].count()

In [None]:
# Distribution of the number of open attractions per day.
date2open_count_df.plot.hist()
plt.title("histogram of the number of attractions opened on a given date")

In [None]:
# Calendar heatmap of the number of open attractions per day.
# Fix: the parenthesis opened in the title was never closed.
calplot.calplot(
    date2open_count_df.attraction,
    dropzero=True,
    suptitle="number of attractions opened at this date (i.e. having any non-negative waiting time)",
);

In [None]:
# Open days of "Chiapas DIE Wasserbahn", indexed by date.
is_chiapas = open_date_attraction_df.attraction == "Chiapas DIE Wasserbahn"
chiapas_df = open_date_attraction_df[is_chiapas].set_index("date")

In [None]:
# Calendar heatmap of the per-day waiting time stored for Chiapas DIE Wasserbahn.
chiapas_daily_max = chiapas_df.waiting_time
calplot.calplot(
    chiapas_daily_max,
    dropzero=True,
    suptitle="maximum waiting time (min) per day of Chiapas DIE Wasserbahn",
);

In [None]:
# Open days of "River Quest", indexed by date.
is_riverquest = open_date_attraction_df.attraction == "River Quest"
riverquest_df = open_date_attraction_df[is_riverquest].set_index("date")

In [None]:
# Calendar heatmap of the per-day waiting time stored for River Quest.
riverquest_daily_max = riverquest_df.waiting_time
calplot.calplot(
    riverquest_daily_max,
    dropzero=True,
    suptitle="maximum waiting time (min) per day of River Quest",
);