# Project: Monte Carlo Markov Chain Simulation

## Business goals:  

1. understand customer behavior  
2. explain customer behavior to non-data staff  
3. optimize staffing so that the queues do not get unnecessarily long  

## Supermarket Area

We are using the following model supermarket with six areas: entrance, fruit, spices, dairy, drinks and checkout.

The customers can move between these areas freely. Sooner or later, they will enter the checkout area. Once they do, they are considered to have left the shop.

![Supermarket layout](./project/supermarket.png)

## 8.1. Data Analysis

### Load data

In [None]:
import numpy as np
import pandas as pd

In [None]:
from os import listdir
from os.path import isfile, join

# def load_file(day):
#     path = './project/data/'

#     df = pd.read_csv(os.path.join(path, day + '.csv'), sep=';', parse_dates=['timestamp'])
    
#     # individual dataframes with new column added to represent the day
#     df['day'] = day

#     return df

# df = load_file('monday').append(
#         load_file('tuesday').append(
#             load_file('wednesday').append(
#                 load_file('thursday').append(
#                     load_file('friday')
#                 )
#             )
#         )
# )
# df
def load_data(day, path='./project/data/'):
    """Load one day's customer log into a timestamp-indexed DataFrame.

    Parameters
    ----------
    day : str
        File stem of the CSV to read; the file ``<path>/<day>.csv`` is loaded.
    path : str, optional
        Directory that holds the daily CSV files.

    Returns
    -------
    pandas.DataFrame
        The rows of the file, read with ``;`` as separator and indexed by the
        parsed ``timestamp`` column.
    """
    # Use the `join` name that is imported via `from os.path import join`:
    # the bare `os` module is never imported, so `os.path.join` raised
    # a NameError here.
    return pd.read_csv(
        join(path, day + '.csv'),
        sep=';',
        parse_dates=['timestamp'],
        index_col=['timestamp'],
    )

# files = [f for f in listdir(path) if isfile(join(path, f))]

# Start from Monday's log, then stack the remaining weekdays underneath it.
df = load_data('monday')

for file in ['tuesday', 'wednesday', 'thursday', 'friday']:
    df_next = load_data(file)

    # Shift this day's customer numbers by the largest number collected so far,
    # presumably so that numbers from different days do not collide.
    df_next['customer_no'] = df_next['customer_no'] + df['customer_no'].max()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and keeps the timestamp index of both frames.
    df = pd.concat([df, df_next])

# df.reset_index(inplace=True, drop=True)
# df

In [None]:
# Rows whose timestamp lies strictly between the two bounds, ordered by timestamp.
after_lower_bound = df.index > '2019-09-02 21:48:00'
before_upper_bound = df.index < '2019-09-03 07:07:00'
df[after_lower_bound & before_upper_bound].sort_values(by='timestamp')

In [None]:
# Count of non-null customer_no entries per location. A customer is counted
# once per recorded row, so these are not unique customers.
by_location = df.groupby('location')
by_location['customer_no'].count()

## Fill in missing checkout times

In [None]:
# When the shop closes, the remaining customers are rushed through the checkout. 
# Their checkout is not recorded, so it may look as if they stay in the market forever.

# TODO: fill out missing counter rows

In [None]:
# All recorded rows that belong to customer number 6.
df.loc[df['customer_no'] == 6]

In [None]:
# Calculate the total number of customers in each section over time

# Display the number of customers at checkout over time

In [None]:
# The time each customer spent in the market
# visits = df.groupby(by='customer_no').index.agg(['min', 'max']) # TODO: find out how to aggregate by index value
# visits['duration'] = visits['max'] - visits['min']
# visits.sort_values(by='duration', ascending=False)

In [None]:
# Calculate the total number of customers in the supermarket over time.

In [None]:
# # Our business managers think that the first section customers visit follows a different pattern than the following ones. Plot the distribution of customers of their first visited section versus following sections (treat all sections visited after the first as “following”).

# df.groupby(['customer_no']).agg({'location': [' -> '.join, 'count']})

# df.groupby(['customer_no'])['location'].describe().sort_values(by='freq', ascending=False)

In [None]:
# df.groupby(['customer_no'])['timestamp'].describe()
# # .sort_values(by='freq', ascending=False)

In [None]:
### Probabilities plot

In [None]:
# # initial_state = np.array([0.4, 0.6])  # e.g. cold, hot
# initial_state = []
# for column in crosstab.columns:
#     initial_state.append(int(column == 'entrance'))

In [None]:
# state = initial_state
# lines = pd.DataFrame([state], columns=crosstab.columns)
# for i in range(0,20):
#     state = np.dot(state, crosstab.values)
#     lines = lines.append(pd.DataFrame([state], columns=crosstab.columns))
# lines = lines.reset_index()
# del lines['index']
# lines = lines.transpose()
# lines

In [None]:
# for i in range(len(lines.columns)):
#     print('i: %d;  %0.4f' % (i, lines[i].sum()))

In [None]:
# tmp = lines.transpose()
# for column in tmp.columns:
#     print('Column: %s, Sum: %0.2f' % (column, tmp[column].sum()))

In [None]:
# tmp = lines.melt(value_vars=range(0, len(lines.columns)), var_name='step', ignore_index=False).reset_index().rename(columns={'index': 'section'})
# tmp

In [None]:
# fig = px.line(tmp, x="step", y="value", color='section').show()

### Revenue Estimate

Estimate the total revenue for a customer using the following table:

| section | revenue per minute |
|---------|:--------------------:|
| fruit   | 4€                 |
| spices | 3€|
| dairy | 5€ |
| drinks | 6€ |

Which is the most profitable section according to your data?

## 8.2. Markov Chains

### Transition Probabilities

In [None]:
# # find customers who visited several different sections
# tmp = df.groupby('customer_no')['customer_no'].count()
# tmp[(tmp > 3) & (tmp <6)].sample(n=3)
# tmp = df[(df['customer_no'] == 3532) | (df['customer_no'] == 3685)].copy()
# tmp = df
# # tmp

In [None]:
# Expand every customer's log to one row per minute, carrying the last known
# location forward between recorded moves.
# - `Resampler.pad()` was removed in pandas 2.0; `ffill()` is the same operation.
# - '1min' replaces the deprecated '1T' offset alias.
# - Whether the resampled frame still contains the grouping column depends on
#   the pandas version, so the drop tolerates its absence.
transitions = (
    df.groupby(by=['customer_no'])
    .resample('1min')
    .ffill()
    .drop(columns=['customer_no'], errors='ignore')
    .reset_index()
)

# Previous location within the same customer; a customer's first row has no
# predecessor and is treated as coming from the entrance.
transitions['location_before'] = transitions.groupby(by=['customer_no'])['location'].shift(fill_value='entrance')
transitions

In [None]:
# When the shop closes, the remaining customers are rushed through the checkout.
# Their checkout is not recorded, so it may look as if they stay in the market forever.
# Add one synthetic transition into 'checkout' for every such customer.

last_locations = transitions.groupby(by='customer_no')[['timestamp', 'location']].last()
missing_checkouts = last_locations[last_locations['location'] != 'checkout'].copy()

# The synthetic checkout is placed one minute after the customer's last row and
# starts from the location the customer was last seen in.
missing_checkouts['timestamp'] = missing_checkouts['timestamp'] + pd.Timedelta(minutes=1)
missing_checkouts['location_before'] = missing_checkouts['location']
missing_checkouts['location'] = 'checkout'
missing_checkouts.reset_index(inplace=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
transitions = pd.concat([transitions, missing_checkouts])

In [None]:
# Row-normalised transition frequencies: each row holds the share of moves from
# `location_before` (rows) into `location` (columns).
crosstab = pd.crosstab(transitions['location_before'], transitions['location'], normalize='index')

# A Markov transition matrix needs the same states, in the same order, on both
# axes. A state that only occurs as a source (a row) or only as a target
# (a column) would otherwise leave the matrix non-square, and multiplying a
# state vector by it would mix up unrelated states. Missing rows/columns are
# added with probability 0.
states = sorted(set(crosstab.index) | set(crosstab.columns))
crosstab = crosstab.reindex(index=states, columns=states, fill_value=0.0)

# A state that was never observed as a source has an all-zero row. Treat it as
# absorbing (it stays where it is) so that every row sums to 1.
for label in states:
    if crosstab.loc[label].sum() == 0:
        crosstab.loc[label, label] = 1.0

crosstab

In [None]:
# pd.DataFrame(crosstab.to_dict())

In [None]:
# Persist the transition matrix as a semicolon-separated CSV file.
matrix_path = './output/transition_matrix.csv'
crosstab.to_csv(matrix_path, sep=';')

In [None]:
# Scratch: view the 24 consecutive integers 0..23 as a matrix with 6 rows
# (the column count is inferred from -1). `ar` itself stays one-dimensional.
ar = np.arange(24)
np.reshape(ar, (6, -1))

### How long do users spend in the store?

In [None]:
# Display the current contents of `df`.
df

In [None]:
# Time each customer spent in the shop: the timestamp of the customer's last
# row minus the timestamp of the first row. The resulting `timestamp` column
# holds the difference of two timestamps (a duration), one row per customer.
g = df.reset_index().groupby(['customer_no'])[['timestamp']]
first_seen = g.first()
last_seen = g.last()
time_in_market = last_seen - first_seen
time_in_market

In [None]:
# Helper column of ones so that counting per duration has a column to count.
time_in_market = time_in_market.assign(counter=1)

# How many customers share each distinct visit duration (first few rows).
duration_counts = time_in_market.groupby('timestamp').count()
duration_counts.head()

In [None]:
# Plot how many customers share each visit duration.
duration_counts = time_in_market.groupby('timestamp').count()
duration_counts.plot()

In [None]:
# df[(df['timestamp'] > '2019-09-02 21:48:00') & (df['timestamp'] < '2019-09-03 07:07:00')].sort_values(by='timestamp')

In [None]:
# Last record per (timestamp, customer_no) pair, listed customer by customer.
per_minute = df.groupby(by=['timestamp', 'customer_no']).last()
per_minute.reset_index().sort_values(by=['customer_no'])

In [None]:
# Previous location of each row within the same customer. Rows without a
# previous value (NaN after the shift) are dropped.
previous_location = df.groupby(by=['customer_no'])['location'].shift(1)
previous_location.dropna()

In [None]:
# Last location per (timestamp, customer_no) pair, restricted to rows later
# than the given timestamp.
latest = df.groupby(by=['timestamp', 'customer_no'])['location'].last().reset_index()
is_late = latest['timestamp'] > '2019-09-02 21:49:00'
latest[is_late]

In [None]:
# # build a transition for an every minute
# transitions = df.groupby(by=['timestamp', 'customer_no'])['location'].last().reset_index()
# transitions['before'] = transitions['location'].shift(1)
# transitions['before'].fillna('entrance', inplace=True)
# transitions['after'] = transitions['location']
# del transitions['location']
# transitions
# # transitions[transitions['before'] == 'entrance']

# # transitions = pd.DataFrame()
# # transitions['customer_no'] = tmp['customer_no']
# # transitions['before'] = tmp['location'].shift(1)
# # transitions['after'] = tmp['location']
# # transitions['before'].fillna('entrance', inplace=True)
# # transitions

In [None]:
# transitions[transitions['location_before'] == 'checkout']

### Probabilities plot

In [None]:
# Start vector over the crosstab columns: 1 for the 'entrance' column, 0 for
# every other column.
initial_state = [int(column == 'entrance') for column in crosstab.columns]

In [None]:
# Propagate the state vector through the transition matrix for 20 steps and
# keep every intermediate vector (the start vector is step 0).
state = initial_state
history = [state]
for i in range(0, 20):
    state = np.dot(state, crosstab.values)
    history.append(state)

# Build the frame once from all collected vectors instead of growing it with
# DataFrame.append inside the loop (removed in pandas 2.0). After transposing,
# rows are the crosstab columns and columns are the step numbers 0..20.
lines = pd.DataFrame(np.vstack(history), columns=crosstab.columns).transpose()
lines

In [None]:
# Sanity check: for every step (a column of `lines`) print the sum over all rows.
for i, step_values in lines.items():
    print('i: %d;  %0.4f' % (i, step_values.sum()))

In [None]:
# With steps as rows, print the sum of each column over all steps.
tmp = lines.T
for column in tmp:
    column_total = tmp[column].sum()
    print('Column: %s, Sum: %0.2f' % (column, column_total))

In [None]:
# Long format for plotting: one row per (section, step) pair with the
# probability in `value`.
# The index of `lines` can carry a name (it is built from `crosstab.columns`).
# In that case reset_index() creates a column with that name rather than
# 'index', and renaming 'index' silently does nothing. Naming the axis
# explicitly always yields the 'section' column.
tmp = (
    lines.melt(value_vars=range(0, len(lines.columns)), var_name='step', ignore_index=False)
    .rename_axis('section')
    .reset_index()
)
tmp

In [None]:
# One line per section: value over step.
# NOTE(review): `px` is not imported in any visible cell; presumably plotly.express. Confirm.
fig = px.line(tmp, x="step", y="value", color='section')
fig.show()

### MC-Simulation

In [None]:
from customer import Customer
from supermarket import Supermarket
# from clock import Clock
import datetime

In [None]:
# NOTE(review): `Clock` is not imported in the visible cells (its import is
# commented out); confirm where it comes from.
clock = Clock(current_time='2021-11-18 09:00:00')

# The supermarket has to be created before the customers: each customer is
# constructed with the supermarket's entrance section, so referencing
# `supermarket` first raised a NameError.
supermarket = Supermarket(clock)

customer1 = Customer(1, supermarket.get_entrance_section(), transitions)
customer2 = Customer(2, supermarket.get_entrance_section(), transitions)
customer3 = Customer(3, supermarket.get_entrance_section(), transitions)

supermarket.add_new_customers((customer1, customer2, customer3))

# Call next_minute() ten times, then write the result to a CSV file.
for i in range(10):
    supermarket.next_minute()

supermarket.output('./output/transitions.csv')
    
    


In [None]:
# Scratch: `+` on two lists produces a new list with the elements of both.
["aaaaa"] + ["bbbb", "cccc"]

In [None]:
# Show the transition matrix as a nested dict (default `to_dict` orientation).
crosstab.to_dict()

In [None]:
# Read the transition matrix back. The file is written with sep=';', so the
# same separator must be used here; with the default ',' every line would be
# parsed as a single column.
_ = pd.read_csv('./output/transition_matrix.csv', sep=';', index_col=0)

# One entry per row label, each mapping column label -> value.
_dict = _.to_dict(orient='index')
list(_dict.keys())



In [None]:
# Scratch example: round-trip a small DataFrame through an index-oriented dict.
data = [
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
]

# Create a DataFrame
_ = pd.DataFrame(data, index=("R1", "R2", "R3"), columns=("C1", "C2", "C3"))

print("Contents of the DataFrame:")
print(_)

# Convert the DataFrame to a dict keyed by row label: {row: {column: value}}
_dict = _.to_dict(orient='index')

print("DataFrame as a dictionary:")
print(_dict)

# An index-oriented dict has no 'data'/'columns' keys (those belong to
# orient='split'), so looking them up raised a KeyError. Rebuild the frame
# with the matching `from_dict` orientation instead.
restored = pd.DataFrame.from_dict(_dict, orient='index')
restored

In [None]:
# # def load_file(day):
# #     path = './project/data/'

# #     df = pd.read_csv(os.path.join(path, day + '.csv'), sep=';', parse_dates=['timestamp'])
    
# #     # individual dataframes with new column added to represent the day
# # #     df['day'] = day

# #     return df

# # # dfs appended into one big df
# # total = load_file('monday').append(load_file('tuesday').append(load_file('wednesday').append(load_file('thursday').append(load_file('friday')))))

# # # new df with non-datetime index
# # time_ = total.reset_index()

# # new df with column timestamp representing differences in first and last timestamp in the grouped table, ie. time spent in shop in minutes
# # this is the time spent in the shop for each customer:

# g = df.reset_index().groupby(['customer_no'])[['timestamp']]
# time_in_market = g.last() - g.first()
# time_in_market

In [None]:
# Visit duration per customer: timestamp of the customer's last row minus the
# timestamp of the first row.
g = df.reset_index().groupby(['customer_no'])[['timestamp']]
time_in_market = g.last().sub(g.first())

# Helper column of ones so that counting per duration has a column to count.
time_in_market.loc[:, 'counter'] = 1

# Number of customers per distinct visit duration (first few rows).
time_in_market.groupby('timestamp').count().head()

In [None]:
# Plot the number of customers per visit duration, with the duration
# expressed in minutes.
# NOTE(review): `px` is not imported in any visible cell; presumably plotly.express. Confirm.
duration_counts = time_in_market.groupby('timestamp').count().reset_index()
duration_counts['minutes'] = duration_counts['timestamp'].dt.total_seconds().div(60)
fig = px.line(duration_counts, x="minutes", y="counter")
fig.show()

In [None]:
import numpy as np

mylist = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]


def my_function(list_of_lists):
    """Return a new list of lists with every element incremented by one.

    Parameters
    ----------
    list_of_lists : iterable of iterables of numbers
        The nested values to increment; the input is not modified.

    Returns
    -------
    list of list
        Same nesting as the input, each element increased by 1. An empty
        input gives an empty list.
    """
    # Return a real list, as the usage note below describes. The previous
    # version was a generator whose trailing `return []` was never visible
    # to callers.
    return [[item + 1 for item in row] for row in list_of_lists]


# my_function(mylist) returns [[2, 3, 4], [5, 6, 7], [8, 9, 10], [11, 12, 13]]

# You can time it with the IPython magic function:
# %timeit my_function(mylist)
my_function(mylist)