In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
import math

## Data cleaning

The following cells clean the DataFrame, which contains inconsistencies, missing values and outliers, based on the considerations made during the data understanding phase.

In [2]:
# Load the dataset; the first CSV column becomes the row index and "," is the decimal mark.
df = pd.read_csv(
    "../dataset/customer_supermarket_understanding.csv",
    index_col=0,
    parse_dates=["BasketDate"],
    decimal=",",
)
# Force the price column to float.
df["Sale"] = df["Sale"].astype(float)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


For every canceled basket ('C' + BasketID), check whether at least one counterpart purchase exists.

In [None]:
# Sanity check: look for a canceled row (BasketID contains 'C', ProdID is not 'D')
# that has no row with the same customer, the same product and the opposite quantity.
# Prints True and stops at the first such row; prints nothing if every row has a match.
canceled_mask = df['BasketID'].str.contains('C') & (df['ProdID'] != 'D')
df_check = df.loc[canceled_mask, ['CustomerID', 'Qta', 'ProdID']]
for index, col in df_check.iterrows():
    # Access the row by column label: positional access such as col[0] on a
    # label-indexed Series is deprecated in recent pandas releases.
    has_counterpart = (
        (df['CustomerID'] == col['CustomerID'])
        & (df['Qta'] == -col['Qta'])
        & (df['ProdID'] == col['ProdID'])
    ).any()
    if not has_counterpart:
        print(True)
        break

Remove from the dataset the rows of canceled baskets together with their counterparts, if any (if there is more than one counterpart, only the first one is deleted).

In [9]:
# Canceled rows: BasketID contains 'C' and the product is not 'D'.
df_canceled_basket = df[(df['BasketID'].str.contains('C')) & (df['ProdID'] != 'D')]

# Index *labels* of the rows to remove.
rows_with_counterparts = []
rows_without_counterparts = []

for index, col in df_canceled_basket.iterrows():
    # A counterpart has the same customer and product, and the opposite quantity.
    df_temp = df[(df['CustomerID'] == col['CustomerID']) & (df['Qta'] == -col['Qta']) & (df['ProdID'] == col['ProdID'])]

    if df_temp.empty:
        rows_without_counterparts.append(index)
    else:
        # Drop the cancellation together with its first counterpart.
        # NOTE(review): the same counterpart can be selected by several
        # cancellations, in which case only one purchase row is removed — confirm
        # whether each cancellation should consume a distinct counterpart.
        rows_with_counterparts.append(index)
        rows_with_counterparts.append(df_temp.index[0])

rows_to_be_dropped = rows_with_counterparts + rows_without_counterparts

# The collected values are index labels, so they must be dropped by label.
# Going through df.index[...] would treat them as positions and remove the wrong
# rows whenever the index is not exactly 0..n-1.
df_canceled_basket = df.drop(index=rows_to_be_dropped)

print("Total number of dropped rows: ", len(df)-len(df_canceled_basket))

Total number of dropped rows:  11775


In [None]:
# Continue with the frame from which the canceled-basket rows were dropped.
# NOTE(review): rebinding `df` means re-running the previous cell operates on the
# already-filtered frame — run the notebook top to bottom.
df = df_canceled_basket

In [None]:
# inconsistency resolution

def inconsistency_resolver(path, col1, col2, frame=None):
    """Give every listed ``col1`` key a single ``col2`` value: its most frequent one.

    Parameters
    ----------
    path : str
        JSON file holding the list of ``col1`` values to fix.
    col1 : str
        Name of the key column.
    col2 : str
        Name of the value column to harmonise.
    frame : pandas.DataFrame, optional
        Frame to fix in place. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = df if frame is None else frame

    with open(path, 'r') as f:
        inconsistent_list = json.load(f)

    df_inconsistent = target[target[col1].isin(inconsistent_list)]

    # Number of rows for every (key, value) pair among the listed keys.
    pair_counts = df_inconsistent.groupby([col1, col2]).size().reset_index(name="_n_rows")

    # Keep the most frequent value per key. A stable sort makes the tie-break
    # deterministic (ties keep the groupby order).
    winners = pair_counts.sort_values("_n_rows", ascending=False, kind="mergesort").drop_duplicates(col1)
    mydict = dict(zip(winners[col1], winners[col2]))
    if not mydict:
        return

    # Rewrite all affected rows in one vectorised pass instead of one full-frame
    # assignment per key.
    affected = target[col1].isin(list(mydict))
    target.loc[affected, col2] = target.loc[affected, col1].map(mydict)

# Resolve both known inconsistencies: (JSON list of inconsistent keys, key column, value column).
for json_path, key_col, value_col in (
    ("../dataset/inconsistent_CustomerID_CustomerCountry.json", "CustomerID", "CustomerCountry"),
    ("../dataset/inconsistent_ProdID_ProdDescr.json", "ProdID", "ProdDescr"),
):
    inconsistency_resolver(json_path, key_col, value_col)

In [None]:
# NOTE(review): this whole cell is a single string literal, so the consistency
# check below is never executed. The disabled code filters the frame once per
# unique key and collects the keys whose values in column V are not all equal.
''' check if inconsistency has been solved
def inconsistent_set(K,V):
    
    inconsistentset = list()

    for key in df[K].unique().tolist():
        temp_df = df[df[K] == key]
        valueslist = temp_df[V].tolist()
        for value in valueslist:
            if(valueslist[0] != value):
                inconsistentset.append(key)
                break;
    return inconsistentset


# 1
ProdID_ProdDescr_IS = inconsistent_set("ProdID","ProdDescr")
            
print("Number of not consistent ProdDescr:", len(ProdID_ProdDescr_IS))

#3 
CustomerID_CustomerCountry_IS = inconsistent_set("CustomerID","CustomerCountry")
            
print("Number of not consistent CustomerCountry:", len(CustomerID_CustomerCountry_IS))
'''

Serialization of the DataFrame after inconsistency resolution

In [None]:
# Persist the frame as tab-separated text with "," as the decimal mark;
# the Indicators section reads this file back with the same settings.
df.to_csv("../dataset/customer_supermarket_no_inconsistency.csv", sep="\t", decimal=",")

## Indicators

In [3]:
# Reload the serialized frame (tab-separated, "," as decimal mark).
df = pd.read_csv("../dataset/customer_supermarket_no_inconsistency.csv", sep="\t", index_col=0, parse_dates=["BasketDate"], decimal=",")
# Keep only the rows that are not flagged as outliers.
df = df[df['Outlier'] == False]
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

   BasketID          BasketDate  ProdID  Qta  Sale  CustomerID  \
0    536365 2010-01-12 08:26:00   21730    6  4.25       17850   
1    536365 2010-01-12 08:26:00   22752    2  7.65       17850   
2    536365 2010-01-12 08:26:00   71053    6  3.39       17850   
3    536365 2010-01-12 08:26:00  84029E    6  3.39       17850   
4    536365 2010-01-12 08:26:00  84029G    6  3.39       17850   

  CustomerCountry                            ProdDescr  Outlier  
0  United Kingdom    GLASS STAR FROSTED T-LIGHT HOLDER    False  
1  United Kingdom         SET 7 BABUSHKA NESTING BOXES    False  
2  United Kingdom                  WHITE METAL LANTERN    False  
3  United Kingdom       RED WOOLLY HOTTIE WHITE HEART.    False  
4  United Kingdom  KNITTED UNION FLAG HOT WATER BOTTLE    False  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 387641 entries, 0 to 387880
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   ----

# I - indicator
the total number of items purchased by a customer during the period of
observation.

In [None]:
# Total quantity per customer.
# Select Qta *before* aggregating: summing every column also "sums" the
# non-numeric ones (string concatenation) and fails on the datetime column in
# recent pandas releases.
df_i = df.groupby("CustomerID")["Qta"].sum().reset_index()
df_i

In [None]:
# Customers ranked by total quantity, largest first (display only; df_i is not modified).
df_i.sort_values(by='Qta', ascending=False)

In [None]:
# Inspect all rows of customer 14646: it might be an outlier.
# NOTE(review): presumably the top customer of the ranking above — confirm.
print(df[df.CustomerID == 14646])

# Iu - indicator
the number of distinct items bought by a customer in the period of
observation.

In [None]:
# Number of distinct ProdID values per customer, as a two-column frame.
df_iu = (
    df.groupby('CustomerID')['ProdID']
    .agg('nunique')
    .reset_index()
)
df_iu

# Imax - indicator
the maximum number of items purchased by a customer during a
shopping session

In [None]:
# Total quantity of every basket, one row per (CustomerID, BasketID).
basket_totals = df.groupby(["CustomerID", "BasketID"]).Qta.sum().reset_index()

# For each customer keep the basket with the largest total. Taking head(1) per
# customer would return the first basket, not the maximum.
largest_basket_rows = basket_totals.groupby("CustomerID")["Qta"].idxmax()
df_imax = basket_totals.loc[largest_basket_rows].reset_index(drop=True)

df_imax

# E - indicator
the Shannon entropy on the purchasing behaviour of the customer

In [27]:
# Entropy helper function 
import numpy as np
from scipy.stats import entropy
from math import log, e
import pandas as pd   

""" Usage: pandas_entropy(df['column1']) """

def pandas_entropy(column, base=None):
    """Shannon entropy of the empirical distribution of the values in ``column``.

    ``base`` is the logarithm base; ``None`` selects the natural logarithm.
    """
    if base is None:
        base = e
    # Relative frequency of every distinct value.
    probabilities = pd.Series(column).value_counts(normalize=True, sort=False)
    weighted_logs = probabilities * np.log(probabilities)
    rescaled = weighted_logs / np.log(base)
    return -rescaled.sum()

In [None]:
# Total quantity of every basket, one row per (CustomerID, BasketID).
df_temp = df.groupby(["CustomerID", "BasketID"]).Qta.sum().reset_index()

# Entropy of each customer's basket totals, stored as a (CustomerID, Entropy)
# frame. Printing the values one by one would leave `df_entropy`, which the
# "Putting together all indicators" cell needs, undefined.
df_entropy = (
    df_temp.groupby("CustomerID")["Qta"]
    .apply(pandas_entropy)
    .rename("Entropy")
    .reset_index()
)
df_entropy.head()

### Putting together all indicators

In [None]:
# Place the per-customer indicators side by side (aligned on the row index),
# then label the columns in the same order as the frames below.
indicator_columns = ["CustomerID", "Entropy", "imax", "iu", "i"]
frames = [df_entropy, df_imax["Qta"], df_iu["ProdID"], df_i["Qta"]]
indicators = pd.concat(frames, axis=1, join="outer")
indicators.columns = indicator_columns
print(indicators.head())

indicators.to_csv("../dataset/indicators.csv")

# Additional indicators

### Customer Spending Profile 
we categorize each customer as either low, medium, or high spending according to their average expense

In [None]:
# Sum of Sale and Qta per customer (string aggregator names instead of the
# built-in `sum`, which recent pandas releases warn about).
# NOTE(review): this is a total, while the text above speaks of an "average"
# expense; in the sample output `Sale` also looks like a unit price — confirm
# that summing it is the intended measure.
df_profile = df.groupby(["CustomerID"]).agg({"Sale": "sum", "Qta": "sum"})

binwidth = 50
bins=range(0, 1000 + binwidth, binwidth)
print(bins)
n, bins, patches = plt.hist(df_profile.Sale, bins=bins, facecolor='blue', alpha=0.5)
plt.show()

# Open-ended last bin: using the data maximum as the last edge breaks pd.cut
# whenever that maximum is not above 300 (bin edges must increase).
spending_profile = pd.cut(df_profile['Sale'], bins=[0, 100, 300, np.inf], include_lowest=True, labels=["low", "medium", "high"])
spending_profile

### The average cost of a basket

In [None]:
# Sum of Sale for every (customer, basket) pair.
df_customer_basket_groupby_sum = df.groupby(["CustomerID",'BasketID'], as_index=False).agg({"Sale": "sum"})

# Mean basket total per customer.
series_customer_basket_groupby_mean = df_customer_basket_groupby_sum.groupby('CustomerID')['Sale'].mean()
# Build the frame from the Series computed above; referring to the frame itself
# here would raise a NameError on a fresh kernel.
df_customer_basket_groupby_mean = pd.DataFrame(series_customer_basket_groupby_mean)

binwidth = 50
bins=range(0, 400 + binwidth, binwidth)
print(bins)
n, bins, patches = plt.hist(df_customer_basket_groupby_mean.Sale, bins=bins, facecolor='blue', alpha=0.5)
plt.savefig('../output/total_receipt_price_distribution.png')
plt.show()

# Open-ended last bin: using the data maximum as the last edge breaks pd.cut
# whenever that maximum is not above 200 (bin edges must increase).
basket_cost_profile = pd.cut(df_customer_basket_groupby_mean['Sale'], bins=[0, 50, 200, np.inf], include_lowest=True, labels=["low", "medium", "high"])
basket_cost_profile

### Sales per country
(is this even an indicator?)

In [None]:
# Sum of Sale for every customer country, as a two-column frame.
df_sales_per_country = (
    df.groupby("CustomerCountry")["Sale"]
    .sum()
    .reset_index()
)
print(df_sales_per_country)

In [None]:
# The pie chart is hard to read because England dominates it (it probably
# contains some outliers).
fig, ax = plt.subplots()
ax.pie(
    df_sales_per_country["Sale"],
    labels=df_sales_per_country["CustomerCountry"],
    autopct='%1.1f%%',
)
plt.show()

### Most bought items

In [None]:
# Total quantity (and Sale) per product, most bought first.
# Only the numeric columns of interest are aggregated: summing every column
# would also "sum" CustomerID and the Outlier flag, and fails on the datetime
# column in recent pandas releases.
df_most_bought = (
    df.groupby(["ProdID", "ProdDescr"])[["Qta", "Sale"]]
    .sum()
    .reset_index()
    .sort_values(by="Qta", ascending=False)
)
df_most_bought

In [None]:
# Pie chart of the first ten rows of df_most_bought, also saved to disk.
top_items = df_most_bought.head(10)
plt.pie(top_items["Qta"], labels=top_items["ProdDescr"], autopct='%1.1f%%')
plt.savefig("../output/most_bought_item_piechart.png")
plt.show()

### Most bought item per country


In [None]:
# Total quantity of every product in every country. Only Qta is aggregated:
# summing every column would also "sum" CustomerID and the Outlier flag, and
# fails on the datetime column in recent pandas releases.
df_mb_country = (
    df.groupby(["ProdID", "ProdDescr", "CustomerCountry"])["Qta"]
    .sum()
    .reset_index()
)

# Keep, for each country, the product(s) whose total equals the country maximum
# (ties are all kept). drop=True avoids carrying the old row positions along as
# a stray "index" column.
is_country_max = df_mb_country["Qta"] == df_mb_country.groupby(["CustomerCountry"])["Qta"].transform("max")
df_mb_country = df_mb_country[is_country_max].reset_index(drop=True)

print(df_mb_country)


### Linear Graphs

In [None]:
# Start and end datetimes: the earliest and latest BasketDate in the current frame.
print(df.BasketDate.min(), df.BasketDate.max())

In [None]:
# Index the rows by BasketDate so they can be grouped by calendar period.
weekly = df.set_index("BasketDate").copy()
# NOTE(review): despite the name `weekly`, freq='M' groups the rows by month —
# confirm which granularity is intended.
weekly = weekly.groupby(pd.Grouper(freq='M'))["Qta"].sum()

#print(weekly)

# Line plot of the summed Qta of every period.
plt.figure(figsize=(16,5))
plt.plot(weekly.index, weekly, color='tab:blue', marker="o")