In [1]:
import pandas as pd
import numpy as np

#calling libraries for Market Basket Analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the cleaned bakery sales CSV and discard its 'Unnamed: 0' column
# (presumably an index that was saved along with the data).
bakery_data = pd.read_csv('../data/clean_data/bakery_data.csv').drop(columns='Unnamed: 0')

In [3]:
# Render the loaded sales lines: one row per article sold on a ticket.
display(bakery_data)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price,article_total
0,150040.0,2021-01-02,08:38,baguette,1.0,1.00,1.00
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.25,3.75
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.25,2.50
3,150041.0,2021-01-02,09:14,pain,1.0,1.15,1.15
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.20,6.00
...,...,...,...,...,...,...,...
208072,288911.0,2022-09-30,18:52,campagne,2.0,1.90,3.80
208073,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.20,6.00
208074,288911.0,2022-09-30,18:52,boule_200g,1.0,1.10,1.10
208075,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.20,1.20


In [4]:
# Count how many sales lines each article has on each ticket. The result is a Series
# indexed by (ticket_number, article), so every pair appears exactly once.
ticket_article_counts = bakery_data.groupby(["ticket_number", "article"]).size()

# Long-format copy of the same counts (columns: ticket_number, article, count).
grouped_bakery_data = ticket_article_counts.reset_index(name="count")

# Spread the counts to wide format: one row per ticket, one column per article, with 0
# for articles absent from a ticket. Because each (ticket, article) pair is already
# unique, unstacking is enough; no second aggregation pass is needed.
pivot_bakery_data = ticket_article_counts.unstack(fill_value=0)

# show the ticket x article count matrix
display(pivot_bakery_data)

article,12_macaron,armoricain,baguette,baguette_apero,baguette_graine,banette,banettine,boisson_33cl,bottereau,boule_200g,...,tartelette_cocktail,tartelette_fraise,traditional_baguette,triangles,trois_chocolat,tropezienne,tropezienne_framboise,tulipe,viennoise,vik_bread
ticket_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
150040.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150041.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150042.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
150043.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150044.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288908.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
288910.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
288911.0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
288912.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# Encode a per-ticket article count as a 0/1 presence flag, the input format the
# Apriori algorithm works on: 1 when the count is at least 1, 0 for anything else.

def encode_units(x):
    """Return 1 if ``x`` is at least 1, otherwise 0.

    Parameters
    ----------
    x : number
        Count of an article on a ticket.

    Returns
    -------
    int
        1 when ``x >= 1``; 0 in every other case. Values strictly between 0 and 1,
        and NaN, also map to 0 rather than falling through and returning ``None``.
    """
    # Any comparison with NaN is False, so NaN lands in the 0 branch as well.
    return 1 if x >= 1 else 0

# Vectorised 0/1 encoding of the whole pivot table: 1 where the count is >= 1, else 0.
# For the integer counts in the pivot table this yields the same flags as applying
# encode_units to every cell, but avoids DataFrame.applymap (deprecated since
# pandas 2.1) and one Python function call per cell.
basket_sets = pivot_bakery_data.ge(1).astype("int64")

In [6]:
# Render the 0/1 encoded basket matrix (tickets as rows, articles as columns).
display(basket_sets)

article,12_macaron,armoricain,baguette,baguette_apero,baguette_graine,banette,banettine,boisson_33cl,bottereau,boule_200g,...,tartelette_cocktail,tartelette_fraise,traditional_baguette,triangles,trois_chocolat,tropezienne,tropezienne_framboise,tulipe,viennoise,vik_bread
ticket_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
150040.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150041.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150042.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
150043.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150044.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288908.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
288910.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
288911.0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
288912.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# Mine frequent itemsets with a minimum support of 0.004. The support is kept low to
# find more relationships, and possibly more interesting ones.
# The 0/1 flags are cast to bool at the call site: mlxtend emits a DeprecationWarning
# and takes a slower path for non-boolean DataFrames. The itemsets found are the same.
frequent_itemsets = apriori(basket_sets.astype(bool), min_support=0.004, use_colnames=True)



In [8]:
# add a 'length' column holding the number of items in each itemset

frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

In [9]:
# show only the itemsets whose length is below 2, i.e. single articles

frequent_itemsets.loc[frequent_itemsets['length'].lt(2)]

Unnamed: 0,support,itemsets,length
0,0.11274,(baguette),1
1,0.011061,(baguette_graine),1
2,0.111183,(banette),1
3,0.020788,(banettine),1
4,0.010772,(boisson_33cl),1
5,0.019817,(boule_200g),1
6,0.030092,(boule_400g),1
7,0.012218,(brioche),1
8,0.010527,(cafe_ou_eau),1
9,0.02878,(campagne),1


In [10]:
# show only the itemsets whose length is 2 or more, i.e. article combinations

frequent_itemsets.loc[frequent_itemsets['length'].ge(2)]

Unnamed: 0,support,itemsets,length
42,0.006583,"(baguette, banette)",2
43,0.010898,"(croissant, baguette)",2
44,0.008859,"(baguette, pain_au_chocolat)",2
45,0.015042,"(traditional_baguette, baguette)",2
46,0.007087,"(croissant, banette)",2
47,0.005405,"(banette, pain_au_chocolat)",2
48,0.00883,"(traditional_baguette, banette)",2
49,0.00516,"(traditional_baguette, banettine)",2
50,0.004181,"(traditional_baguette, boule_200g)",2
51,0.007132,"(boule_400g, traditional_baguette)",2


In [11]:
# Build association rules from the frequent itemsets, keeping only rules with a lift of
# at least 3 so the list stays manageable and shows the more interesting ones.
# The displayed table is ordered by ascending lift; `rules` itself keeps the order
# returned by association_rules, because the sorted frame is not assigned back.

rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=3,
)
rules.sort_values(by="lift", ascending=True, ignore_index=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(pain_aux_raisins),(croissant),0.014672,0.084419,0.005612,0.382516,4.531142,0.004374,1.482761,0.790909
1,(croissant),(pain_aux_raisins),0.084419,0.014672,0.005612,0.066479,4.531142,0.004374,1.055497,0.851159
2,(pain_au_chocolat),(pain_aux_raisins),0.077688,0.014672,0.005849,0.075293,5.131901,0.00471,1.065558,0.872959
3,(pain_aux_raisins),(pain_au_chocolat),0.014672,0.077688,0.005849,0.398686,5.131901,0.00471,1.533828,0.817129
4,(pain_au_chocolat),"(croissant, traditional_baguette)",0.077688,0.036327,0.016755,0.215669,5.936895,0.013933,1.228657,0.901606
5,"(croissant, traditional_baguette)",(pain_au_chocolat),0.036327,0.077688,0.016755,0.461224,5.936895,0.013933,1.711867,0.862909
6,(croissant),(pain_au_chocolat),0.084419,0.077688,0.039797,0.471415,6.068065,0.033238,1.744869,0.912211
7,(pain_au_chocolat),(croissant),0.077688,0.084419,0.039797,0.512263,6.068065,0.033238,1.8772,0.905553
8,(pain_au_chocolat),"(croissant, baguette)",0.077688,0.010898,0.005241,0.067468,6.190833,0.004395,1.060663,0.909097
9,"(croissant, baguette)",(pain_au_chocolat),0.010898,0.077688,0.005241,0.480952,6.190833,0.004395,1.776932,0.847709
