In [1]:
import sys

import pandas as pd
from datetime import datetime, date, timedelta, time
import seaborn as sns
import pickle
import warnings
import random
from faker import Faker

f = Faker()
name = f.name()

warnings.filterwarnings('ignore')

In [3]:
with open('./data/full_df.bin', 'rb') as f:
    df = pickle.load(f)

In [4]:
df.head()

Unnamed: 0,customer_no,timestamp,before,shopping_duration,after,first_or_following,hour
0,mon_1,2019-09-02 07:03:00,dairy,120,dairy,first,7
1,mon_1,2019-09-02 07:04:00,dairy,120,checkout,following,7
2,mon_1,2019-09-02 07:05:00,checkout,120,,following,7
3,mon_2,2019-09-02 07:03:00,dairy,180,dairy,first,7
4,mon_2,2019-09-02 07:04:00,dairy,180,dairy,following,7


### 1. Calculate the transition probabilities

In [5]:
probabilities = pd.crosstab(df['after'], df['before'], normalize=0)
probabilities

In [6]:
probabilities

before,dairy,drinks,fruit,spices
after,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
checkout,0.247011,0.2818,0.344124,0.127065
dairy,0.837651,0.006781,0.077976,0.077592
drinks,0.121286,0.678201,0.081207,0.119306
fruit,0.089513,0.086381,0.766417,0.057688
spices,0.185305,0.171707,0.130708,0.512279


In [7]:
# Save Transition Probabilities
with open('./data/probabilities.bin', 'wb') as f:
    pickle.dump(probabilities, f)

In [8]:
# Add totals to row and
total = probabilities.append(pd.Series(probabilities.sum(numeric_only=True), index=probabilities.columns, name='Total'))
total['Total'] = total.sum(axis=1)
total

before,dairy,drinks,fruit,spices,Total
after,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,0.247011,0.2818,0.344124,0.127065,1.0
dairy,0.837651,0.006781,0.077976,0.077592,1.0
drinks,0.121286,0.678201,0.081207,0.119306,1.0
fruit,0.089513,0.086381,0.766417,0.057688,1.0
spices,0.185305,0.171707,0.130708,0.512279,1.0
Total,1.480767,1.224869,1.400432,0.893931,5.0


### 2. Initial State Probablities

In [25]:
df_first = df[df["first_or_following"] == "first"]

initial_state = df_first["before"].value_counts()

initial_state_probabilities = dict()

for i, j in zip(initial_state, initial_state.index):
    initial_state_probabilities[j] = round(i / initial_state.sum(), 2)

initial_state_probabilities

{'fruit': 0.38, 'dairy': 0.29, 'spices': 0.18, 'drinks': 0.15}

In [38]:
# Save Initial State Probabilities
with open('./data/initial_state.bin', 'wb') as f:
    pickle.dump(initial_state_probabilities, f)

### 3. Writing a Customer class

In [39]:
class Customer:
    """
    a single customer that moves through the supermarket
    in a MCMC simulation
    """
    def __init__(self, name, initial_state_probs, transition_probs, budget=100):
        self.name = name
        self.state = self._initial_state(initial_state_probs)
        self.budget = budget
        self.transition_probs = transition_probs

    def __repr__(self):
        return f'<Customer {self.name} in {self.state}>'

    def _initial_state(self, initial_state_probs):
        areas = [i for i in initial_state_probs.keys()]
        probs = [i for i in initial_state_probs.values()]

        return random.choices(areas, weights=probs, k=1)[0]

    def next_state(self):
        '''
        Propagates the customer to the next state.
        Returns nothing.
        '''
        if self.state != "checkout":
            # Slice the column with the probabilities for the next area
            column = self.transition_probs.loc[:,self.state]
            # Remove the row with probability 0
            column = column[column.iloc[:] > 0]

            list_state = list()
            list_prop = list()

            # Extract the name of the area and the probability and add the two values to different lists
            for i, j in zip(column.index, column.iloc[:]):
                list_state.append(i)
                list_prop.append(round(j, 2))

            # Pass the list to random.choices and generate a new state
            self.state = random.choices(list_state, weights=list_prop, k=1)[0]
        else:
            sys.exit("The customer has already left the supermarket")


    def is_active(self):
        """Returns True if the customer has not reached the checkout yet."""
        return self.state != "checkout"

In [37]:
def test_probability_matrix(no_of_customers, initial_state_probs, probability_matrix):

    sum_of_areas_visited = 0

    for _ in range(no_of_customers):

        cust = Customer("Mrs. X", initial_state_probs, probability_matrix)

        while cust.is_active():
            cust.next_state()
            sum_of_areas_visited += 1

    print(f"With this probability matrix visits the average customer {round(sum_of_areas_visited/no_of_customers, 1)} areas before checkout.\n"
          f"In the original data average customers visit 3.3 areas before checkout.")

test_probability_matrix(1000, initial_state_probabilities, probabilities)

With this probability matrix visits the average customer 5.1 areas before checkout.
In the original data average customers visit 3.3 areas before checkout.


In [50]:
cust1 = Customer("Jake", initial_state_probabilities, probabilities, 50)

In [None]:
cust2 = Customer("Margaret", "spices", probabilities)

for i in range(50):
    print(cust2)
    cust2.next_state()