# Project Supermarket - Markov


    

In [2]:
import pandas as pd
import seaborn as sns
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# set some custom plotting options
plt.style.use('seaborn-white')

custom_theme = {
    'axes.titlelocation': 'left',
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.titlesize': 14,
    'xaxis.labellocation': 'left',
    'yaxis.labellocation': 'bottom'    
}

for option, value in custom_theme.items():
    plt.rcParams[option] = value

---
### Read in the data

In [4]:
df = pd.read_csv('wednesday.csv', sep=";", parse_dates=["timestamp"], index_col=0)
df.location.unique()

array(['fruit', 'dairy', 'drinks', 'spices', 'checkout'], dtype=object)

---
### Make it a *regular* time series by sample and forward filling the missing entries

In [5]:
df = df.groupby('customer_no').resample('1min').ffill()
df = df.drop('customer_no', axis=1).reset_index('customer_no')
df['next_location']=df['location'].shift(-1)
df['next_location'][-1]=df['location'][-1]
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['next_location'][-1]=df['location'][-1]


Unnamed: 0_level_0,customer_no,location,next_location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-04 07:00:00,1,fruit,fruit
2019-09-04 07:01:00,1,fruit,checkout
2019-09-04 07:02:00,1,checkout,dairy
2019-09-04 07:00:00,2,dairy,dairy
2019-09-04 07:01:00,2,dairy,dairy
...,...,...,...
2019-09-04 21:49:00,1530,fruit,drinks
2019-09-04 21:46:00,1531,drinks,drinks
2019-09-04 21:47:00,1531,drinks,drinks
2019-09-04 21:48:00,1531,drinks,checkout


# Detection of "thiefs" (customers that did not check out)

In [6]:
checkout=df[df.location=='checkout']
checkout.customer_no[0:1422]

timestamp
2019-09-04 07:02:00       1
2019-09-04 07:06:00       2
2019-09-04 07:11:00       3
2019-09-04 07:03:00       4
2019-09-04 07:01:00       5
                       ... 
2019-09-04 20:15:00    1418
2019-09-04 20:16:00    1419
2019-09-04 20:27:00    1420
2019-09-04 20:16:00    1421
2019-09-04 20:29:00    1422
Name: customer_no, Length: 1422, dtype: int64

In [7]:
df.tail(30)

Unnamed: 0_level_0,customer_no,location,next_location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-04 21:45:00,1524,checkout,spices
2019-09-04 21:43:00,1525,spices,drinks
2019-09-04 21:44:00,1525,drinks,checkout
2019-09-04 21:45:00,1525,checkout,drinks
2019-09-04 21:43:00,1526,drinks,drinks
2019-09-04 21:44:00,1526,drinks,drinks
2019-09-04 21:45:00,1526,drinks,drinks
2019-09-04 21:46:00,1526,drinks,checkout
2019-09-04 21:47:00,1526,checkout,dairy
2019-09-04 21:44:00,1527,dairy,dairy


In [8]:
checkout=df[df.location=='checkout']
checkout

Unnamed: 0_level_0,customer_no,location,next_location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-04 07:02:00,1,checkout,dairy
2019-09-04 07:06:00,2,checkout,fruit
2019-09-04 07:11:00,3,checkout,fruit
2019-09-04 07:03:00,4,checkout,dairy
2019-09-04 07:01:00,5,checkout,drinks
...,...,...,...
2019-09-04 21:43:00,1523,checkout,dairy
2019-09-04 21:45:00,1524,checkout,spices
2019-09-04 21:45:00,1525,checkout,drinks
2019-09-04 21:47:00,1526,checkout,dairy


In [9]:
df.customer_no.nunique()

1531

In [10]:
# pd.crosstab(df.groupby('customer_no')['location'], df.groupby('customer_no')['next_location'])
P = pd.crosstab(df['location'], df['next_location'],normalize=0) #normalize with respect to rows.
P.loc['checkout']=[1, 0, 0, 0, 0]
P

next_location,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,1.0,0.0,0.0,0.0,0.0
dairy,0.108761,0.732216,0.057402,0.047789,0.053831
drinks,0.207885,0.012289,0.603175,0.094214,0.082437
fruit,0.215267,0.102585,0.05412,0.578756,0.049273
spices,0.154656,0.191093,0.157895,0.08664,0.409717


In [11]:
P.to_csv('crosstab_wednesday')


In [12]:
df

Unnamed: 0_level_0,customer_no,location,next_location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-04 07:00:00,1,fruit,fruit
2019-09-04 07:01:00,1,fruit,checkout
2019-09-04 07:02:00,1,checkout,dairy
2019-09-04 07:00:00,2,dairy,dairy
2019-09-04 07:01:00,2,dairy,dairy
...,...,...,...
2019-09-04 21:49:00,1530,fruit,drinks
2019-09-04 21:46:00,1531,drinks,drinks
2019-09-04 21:47:00,1531,drinks,drinks
2019-09-04 21:48:00,1531,drinks,checkout


In [14]:
df.to_csv('wednesday_processed')

In [13]:
#df_dum = pd.get_dummies(df['location'])
#df_dum['customer_no']=df['customer_no']
#df_dum

In [11]:
#df_dum_next = pd.get_dummies(df['next_location'])
#df_dum_next['customer_no']=df['customer_no']
#df_dum_next