In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Read the cleaned sales lines and discard the "Unnamed: 0" column in one
# chained expression, so the frame is never mutated in place.
# NOTE(review): "Unnamed: 0" is presumably an index written out by an earlier
# to_csv call -- confirm against the cleaning step.
bakery_data = (
    pd.read_csv('../data/clean_data/bakery_data.csv')
    .drop(columns='Unnamed: 0')
)

In [3]:
# Preview the loaded sales lines (one row per article on a ticket).
display(bakery_data)

Unnamed: 0,ticket_number,date,time,article,quantity,unit_price,article_total
0,150040.0,2021-01-02,08:38,baguette,1.0,1.00,1.00
1,150040.0,2021-01-02,08:38,pain_au_chocolat,3.0,1.20,3.60
2,150041.0,2021-01-02,09:14,pain_au_chocolat,2.0,1.20,2.40
3,150041.0,2021-01-02,09:14,pain,1.0,1.15,1.15
4,150042.0,2021-01-02,09:25,traditional_baguette,5.0,1.25,6.25
...,...,...,...,...,...,...,...
208573,288911.0,2022-09-30,18:52,campagne,2.0,1.90,3.80
208574,288911.0,2022-09-30,18:52,traditional_baguette,5.0,1.25,6.25
208575,288911.0,2022-09-30,18:52,boule_200g,1.0,1.20,1.20
208576,288912.0,2022-09-30,18:55,traditional_baguette,1.0,1.25,1.25


In [4]:
# Count the sales lines for every (ticket_number, article) pair. `.size()`
# counts rows, so the `quantity` column is deliberately not involved here.
grouped_bakery_data = (
    bakery_data
    .groupby(["ticket_number", "article"])
    .size()
    .reset_index(name="count")
)

# Reshape to a ticket x article matrix: one row per ticket, one column per
# article, with 0 where the article does not appear on the ticket.
pivot_bakery_data = pd.pivot_table(
    grouped_bakery_data,
    index="ticket_number",
    columns="article",
    values="count",
    aggfunc="sum",
    fill_value=0,
)

# Show the resulting ticket x article count matrix.
display(pivot_bakery_data)

article,12_macaron,armoricain,baguette,baguette_apero,baguette_graine,banette,banettine,boisson_33cl,bottereau,boule_200g,...,tartelette_cocktail,tartelette_fraise,traditional_baguette,triangles,trois_chocolat,tropezienne,tropezienne_framboise,tulipe,viennoise,vik_bread
ticket_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
150040.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150041.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150042.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
150043.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150044.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288908.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
288910.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
288911.0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
288912.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
def encode_units(x):
    """Collapse a count into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        The count to encode.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.
    """
    # A single comparison gives every input a result. The previous pair of
    # independent `if` checks fell through for values strictly between 0 and 1
    # (and for NaN) and handed the caller None instead of a flag.
    return 1 if x >= 1 else 0

# Vectorised equivalent of running `encode_units` over every cell: a count of
# at least 1 becomes 1, anything else becomes 0. This avoids
# `DataFrame.applymap`, which pandas deprecated in 2.1, and a Python-level
# function call per cell.
basket_sets = pivot_bakery_data.ge(1).astype("int64")

In [6]:
# Preview the 0/1 basket matrix derived from the per-ticket article counts.
display(basket_sets)

article,12_macaron,armoricain,baguette,baguette_apero,baguette_graine,banette,banettine,boisson_33cl,bottereau,boule_200g,...,tartelette_cocktail,tartelette_fraise,traditional_baguette,triangles,trois_chocolat,tropezienne,tropezienne_framboise,tulipe,viennoise,vik_bread
ticket_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
150040.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150041.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150042.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
150043.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150044.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288908.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
288910.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
288911.0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
288912.0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# Mine itemsets whose support is at least 0.005 (0.5% of tickets).
# `use_colnames=True` reports itemsets by article name rather than by
# column position.
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.005,
    use_colnames=True,
)



In [8]:
# Derive association rules from the frequent itemsets, keeping only those
# with a lift of at least 2.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=2,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(pain_au_chocolat),(croissant),0.077638,0.084365,0.039771,0.512263,6.071979,0.033221,1.877311,0.905619
1,(croissant),(pain_au_chocolat),0.084365,0.077638,0.039771,0.471415,6.071979,0.033221,1.744964,0.912273
2,(pain_aux_raisins),(croissant),0.014662,0.084365,0.005609,0.382516,4.534064,0.004372,1.482849,0.791046
3,(croissant),(pain_aux_raisins),0.084365,0.014662,0.005609,0.066479,4.534064,0.004372,1.055507,0.851264
4,(pain_au_chocolat),(pain_aux_raisins),0.077638,0.014662,0.005846,0.075293,5.135211,0.004707,1.065568,0.873047
5,(pain_aux_raisins),(pain_au_chocolat),0.014662,0.077638,0.005846,0.398686,5.135211,0.004707,1.533912,0.817249
6,"(pain_au_chocolat, baguette)",(croissant),0.008854,0.084365,0.005238,0.591632,7.012762,0.004491,2.24218,0.865062
7,"(baguette, croissant)",(pain_au_chocolat),0.010891,0.077638,0.005238,0.480952,6.194826,0.004393,1.777028,0.847809
8,(pain_au_chocolat),"(baguette, croissant)",0.077638,0.010891,0.005238,0.067468,6.194826,0.004393,1.060671,0.90916
9,(croissant),"(pain_au_chocolat, baguette)",0.084365,0.008854,0.005238,0.062088,7.012762,0.004491,1.056759,0.936402
