In [54]:
# imports
import pandas as pd
import numpy as np

## Indicators

In [2]:
# first of all we deserialize our dataframe
df = pd.read_csv("../dataset/customer_supermarket_sale_qta_merge.csv")
print(df.head())

   Unnamed: 0  BasketID           BasketDate  ProdID  Qta  Sale  CustomerID  \
0           0    536365  2010-01-12 08:26:00   21730    6  4.25       17850   
1           1    536365  2010-01-12 08:26:00   22752    2  7.65       17850   
2           2    536365  2010-01-12 08:26:00   71053    6  3.39       17850   
3           3    536365  2010-01-12 08:26:00  84029E    6  3.39       17850   
4           4    536365  2010-01-12 08:26:00  84029G    6  3.39       17850   

  CustomerCountry                            ProdDescr  
0  United Kingdom    GLASS STAR FROSTED T-LIGHT HOLDER  
1  United Kingdom         SET 7 BABUSHKA NESTING BOXES  
2  United Kingdom                  WHITE METAL LANTERN  
3  United Kingdom       RED WOOLLY HOTTIE WHITE HEART.  
4  United Kingdom  KNITTED UNION FLAG HOT WATER BOTTLE  


# I - indicator
the total number of items purchased by a customer during the period of
observation.

In [66]:
df_i = df.groupby("CustomerID").sum().reset_index()
df_i = df_I[["CustomerID", "Qta"]]
df_i

Unnamed: 0,CustomerID,Qta
0,12346,0
1,12347,2458
2,12348,2341
3,12349,631
4,12350,197
...,...,...
4367,18280,45
4368,18281,54
4369,18282,98
4370,18283,1357


In [6]:
#!! look at this, it might be an outlier
print(df[df.CustomerID == 12346])

       Unnamed: 0  BasketID           BasketDate ProdID    Qta  Sale  \
37034       37034    541431  2011-01-18 10:01:00  23166  74215  1.04   
37039       37039    541433  2011-01-18 10:17:00  23166 -74215  1.04   

       CustomerID CustomerCountry                       ProdDescr  
37034       12346  United Kingdom  MEDIUM CERAMIC TOP STORAGE JAR  
37039       12346  United Kingdom  MEDIUM CERAMIC TOP STORAGE JAR  


# Iu - indicator
the number of distinct items bought by a customer in the period of
observation.

In [74]:
df_iu = df.groupby('CustomerID')['ProdID'].nunique().reset_index()
df_iu

Unnamed: 0,CustomerID,ProdID
0,12346,1
1,12347,103
2,12348,22
3,12349,73
4,12350,17
...,...,...
4367,18280,10
4368,18281,7
4369,18282,12
4370,18283,263


# Imax - indicator
the maximum number of items purchased by a customer during a
shopping session

In [73]:
df_imax = df.groupby(["CustomerID", "BasketID"]).Qta.sum()
df_imax = df_imax.groupby(level=0).head(1).reset_index()

#df_imax = df_imax.max(level=0)

df_imax

Unnamed: 0,CustomerID,BasketID,Qta
0,12346,541431,74215
1,12347,537626,319
2,12348,539318,1254
3,12349,577609,631
4,12350,543037,197
...,...,...,...
4367,18280,545712,45
4368,18281,556464,54
4369,18282,562525,75
4370,18283,540350,61


# E - indicator
the Shannon entropy on the purchasing behaviour of the customer

In [72]:
df_temp = df.groupby(["CustomerID", "BasketID"]).Qta.sum().reset_index()
values = df_temp["Qta"]
df_temp['Entropy'] = -(values*np.log(values))
df_entropy = df_temp.groupby('CustomerID')['Entropy'].sum().reset_index()
# to remove nan values caused by logs
df_entropy['Entropy'] = df_entropy['Entropy'].fillna(0)


df_entropy

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,CustomerID,Entropy
0,12346,-832300.560951
1,12347,-14642.853340
2,12348,-15464.124169
3,12349,-4068.249999
4,12350,-1040.791135
...,...,...
4367,18280,-171.299812
4368,18281,-215.405139
4369,18282,-417.113335
4370,18283,-6354.286026


## Putting together all indicators

In [83]:
frames = [df_entropy, df_imax.Qta, df_iu.ProdID, df_i.Qta]
indicators = pd.concat(frames, join='outer', axis=1)
indicators.columns = ("CustomerID", "Entropy", "imax", "iu", "i")
indicators

indicators.to_csv("../dataset/indicators.csv")