# EDA: Rewe simulation

## Importing

In [97]:
import pandas as pd
import plotly.express as px
import os
import numpy as np

In [98]:
# Load the simulation output. The first CSV column becomes the index and is
# parsed as dates; the remaining columns get explicit names.
sim_csv_path = "../output/sim2.csv"
df = pd.read_csv(
    sim_csv_path,
    sep=",",
    names=["customer name", "location"],
    index_col=0,
    parse_dates=True,
)
df


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0,customer name,location
2023-06-30 07:01:00,Gregory Decker,dairy
2023-06-30 07:01:00,Diane Williams,fruit
2023-06-30 07:01:00,Lindsay Henderson,drinks
2023-06-30 07:02:00,Gregory Decker,spices
2023-06-30 07:02:00,Lindsay Henderson,fruit
...,...,...
2023-06-30 21:58:00,Nathan Simmons,fruit
2023-06-30 21:59:00,Jenna Garcia,dairy
2023-06-30 21:59:00,Marissa Hunter,checkout
2023-06-30 21:59:00,John Cisneros,fruit


Assign each customer a numeric ID (`customer_no`) that stays unique across all days.

In [99]:
# Label every distinct customer name with one integer; all rows that share a
# name receive the same number.
# NOTE(review): two different people with an identical name would be merged
# into a single customer_no.
customer_groups = df.groupby("customer name")
df["customer_no"] = customer_groups.ngroup()

In [100]:
# Display the frame to inspect its current columns.
df

Unnamed: 0,customer name,location,customer_no
2023-06-30 07:01:00,Gregory Decker,dairy,563
2023-06-30 07:01:00,Diane Williams,fruit,452
2023-06-30 07:01:00,Lindsay Henderson,drinks,941
2023-06-30 07:02:00,Gregory Decker,spices,563
2023-06-30 07:02:00,Lindsay Henderson,fruit,941
...,...,...,...
2023-06-30 21:58:00,Nathan Simmons,fruit,1159
2023-06-30 21:59:00,Jenna Garcia,dairy,659
2023-06-30 21:59:00,Marissa Hunter,checkout,991
2023-06-30 21:59:00,John Cisneros,fruit,719


Create time-related features for easier filtering.

In [101]:
# Derive calendar fields from the datetime index so later cells can group and
# filter on them. Columns are added in this order: day, day_name, hour, min.
timestamps = df.index
time_features = {
    "day": timestamps.day,
    "day_name": timestamps.day_name(),
    "hour": timestamps.hour,
    "min": timestamps.minute,
}
for feature_name, feature_values in time_features.items():
    df[feature_name] = feature_values
df

Unnamed: 0,customer name,location,customer_no,day,day_name,hour,min
2023-06-30 07:01:00,Gregory Decker,dairy,563,30,Friday,7,1
2023-06-30 07:01:00,Diane Williams,fruit,452,30,Friday,7,1
2023-06-30 07:01:00,Lindsay Henderson,drinks,941,30,Friday,7,1
2023-06-30 07:02:00,Gregory Decker,spices,563,30,Friday,7,2
2023-06-30 07:02:00,Lindsay Henderson,fruit,941,30,Friday,7,2
...,...,...,...,...,...,...,...
2023-06-30 21:58:00,Nathan Simmons,fruit,1159,30,Friday,21,58
2023-06-30 21:59:00,Jenna Garcia,dairy,659,30,Friday,21,59
2023-06-30 21:59:00,Marissa Hunter,checkout,991,30,Friday,21,59
2023-06-30 21:59:00,John Cisneros,fruit,719,30,Friday,21,59


In [102]:
sim_df = df.copy()
%store sim_df 

Stored 'sim_df' (DataFrame)


## Q1: What is the total number of customers in each section?

In [80]:
# Number of distinct customers seen in each section.
# Select `customer_no` before aggregating so only that column is counted,
# instead of computing nunique() for every column and discarding the rest.
df.groupby("location")["customer_no"].nunique()

location
checkout    1554
dairy        779
drinks       707
fruit        917
spices       643
Name: customer_no, dtype: int64

## Q2, Q3: total number of customers in each section over time and total number of customers in checkout over time

In [81]:
# Distinct customers per (day, weekday, hour, section), flattened into a tidy
# frame for plotting. Grouping by column names and selecting `customer_no`
# up front avoids aggregating every other column only to throw it away.
plotdf = (
    df.groupby(["day", "day_name", "hour", "location"])["customer_no"]
    .nunique()
    .reset_index()
)
plotdf

Unnamed: 0,day,day_name,hour,location,customer_no
0,30,Friday,7,checkout,90
1,30,Friday,7,dairy,57
2,30,Friday,7,drinks,40
3,30,Friday,7,fruit,51
4,30,Friday,7,spices,33
...,...,...,...,...,...
70,30,Friday,21,checkout,101
71,30,Friday,21,dairy,55
72,30,Friday,21,drinks,42
73,30,Friday,21,fruit,62


In [82]:
# Hourly distinct customers, one line per section; the animation frame steps
# through the weekday names present in plotdf.
line_labels = {
    "customer_no": "total # of customers",
    "day_name": "weekday",
}
fig = px.line(
    plotdf,
    x="hour",
    y="customer_no",
    color="location",
    animation_frame="day_name",
    markers=True,
    labels=line_labels,
    template="plotly_dark",
)
fig.update_layout(autosize=False, width=1400, height=700)
fig.show()

# Save an interactive copy. The export is skipped when the file already
# exists, so a previous export is never overwritten by re-running this cell.
plots_dir = "../plots"
html_path = "../plots/customers.html"
if not os.path.exists(plots_dir):
    os.mkdir(plots_dir)
if not os.path.exists(html_path):
    fig.write_html(html_path)

In [83]:
# Build the working frame for the time-spent analysis: remove the helper time
# columns and move the timestamp index into a regular column (named "index"
# by reset_index because the index itself is unnamed).
# The frame is derived from the `sim_df` snapshot rather than from `df`
# itself, so re-running this cell gives the same result instead of raising a
# KeyError on columns that were already dropped.
df = sim_df.drop(columns=["day", "day_name", "hour", "min"]).reset_index()


In [84]:
# Minutes each customer spent in the market: last sighting minus first
# sighting per customer_no, floored to whole minutes.
# Only the timestamp column is aggregated, once, instead of running max() and
# min() over every column of the frame.
visit_bounds = df.groupby("customer_no")["index"].agg(["min", "max"])

# total_seconds() covers the full duration. Timedelta.seconds is only the
# sub-day component and would silently drop whole days for any customer_no
# that appears on more than one date.
visit_minutes = (visit_bounds["max"] - visit_bounds["min"]).dt.total_seconds() // 60

# Keep only visits longer than zero minutes (this also discards undefined
# spans), then restore the integer dtype and the series name "index" that
# the plotting cell received before.
plotser = visit_minutes[visit_minutes > 0].astype("int64").rename("index")


In [85]:
# Distribution of visit lengths (values of plotser) as a histogram.
hist_labels = {"value": "minutes spent in market"}
fig = px.histogram(plotser, labels=hist_labels, template="plotly_dark")
fig.update_layout(
    showlegend=False,
    autosize=False,
    width=1400,
    height=700,
    yaxis_title="# of customers",
)
fig.show()

# Save a static copy. The export is skipped when the file already exists, so
# a previous export is never overwritten by re-running this cell.
plots_dir = "../plots"
svg_path = "../plots/time_spent.svg"
if not os.path.exists(plots_dir):
    os.mkdir(plots_dir)
if not os.path.exists(svg_path):
    fig.write_image(svg_path)


In [86]:
# Mean of the per-customer visit lengths held in plotser.
plotser.mean()

8.9781631342325

In [89]:
# Checkout statistics outside hours 8 and 19: mean and standard deviation of
# the hourly distinct-customer counts, each divided by 60.
at_checkout = plotdf["location"] == "checkout"
in_peak_hour = plotdf["hour"].isin([8, 19])
offpeak_counts = plotdf.loc[at_checkout & ~in_peak_hour, "customer_no"]
mu = offpeak_counts.mean() / 60
sigma = offpeak_counts.std() / 60
mu, sigma

(1.6589743589743589, 0.07625960310365552)

In [90]:
# Checkout statistics for hours 8 and 19 only: mean and standard deviation of
# the hourly distinct-customer counts, each divided by 60.
at_checkout = plotdf["location"] == "checkout"
in_peak_hour = plotdf["hour"].isin([8, 19])
peak_counts = plotdf.loc[at_checkout & in_peak_hour, "customer_no"]
mu_peak = peak_counts.mean() / 60
sigma_peak = peak_counts.std() / 60
mu_peak, sigma_peak

(2.35, 0.0)