In [18]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [19]:
df = pd.read_csv('Groceries_dataset.csv') #Read data dari csv file
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [20]:
df.shape # Mencari jumlah baris dan kolom dari data frame

(38765, 3)

In [21]:
df.columns # Mencari label dari tiap kolom

Index(['Member_number', 'Date', 'itemDescription'], dtype='object')

In [22]:
df['Date'] = pd.to_datetime(df['Date']) # format tanggal pada label "Date"
df['Member_number'].nunique() # format unik pada label "Member_number"
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,tropical fruit
1,2552,2015-05-01,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-01-02,whole milk


In [23]:
df['itemDescription'] = df['itemDescription'].str.strip() # Mengilangkan spasi pada awal atau akhir "itemDescription" jika ada
df['itemDescription'] = df['itemDescription'].str.lower() # Mengubah "itemDescription" menjadi lowercase
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,tropical fruit
1,2552,2015-05-01,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-01-02,whole milk


In [24]:
# Mapping "itemDescription" menggunakan "Member_number" dan "Date"
# hasil ini akan menampilkan "itemDescription" pada satu baris berdasarkan "Member_numbera" dan "Date"

df['itemDescription'] = df.groupby(['Member_number', 'Date'])['itemDescription'].transform(lambda x: ','.join(x))
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,"tropical fruit,rolls/buns,candy"
1,2552,2015-05-01,"whole milk,tropical fruit,chocolate"
2,2300,2015-09-19,"pip fruit,other vegetables,flour"
3,1187,2015-12-12,"other vegetables,onions,shopping bags"
4,3037,2015-01-02,"whole milk,other vegetables,white bread"
...,...,...,...
38760,4471,2014-08-10,"whole milk,yogurt,sliced cheese"
38761,2022,2014-02-23,"cat food,yogurt,candy"
38762,1097,2014-04-16,"sausage,whole milk,cake bar"
38763,1510,2014-03-12,"beef,canned beer,fruit/vegetable juice"


In [25]:
# Menggunakan perulangan untuk memisahkan setiap item berdasarkan karakter ','
lst=[]
for i in range(0,len(df)-1):    
    data = df['itemDescription'][i].split(',')
    lst.append(data)

In [26]:
# Menggunakan dungsi TransactionEncoder untuk mengubah data_frame menggunakan list sebelumnya
# kemudian mengubahnya lagi menjadi data_frame
te = TransactionEncoder()
te_ary = te.fit(lst).transform(lst)
df_new = pd.DataFrame(te_ary, columns=te.columns_)
df_new

Unnamed: 0,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,...,uht-milk,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38759,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
38760,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
38761,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
38762,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [27]:
# Mencari nilai support dengan standar 0.02

frq_items = apriori(df_new, min_support=0.02,use_colnames=True)
frq_items

Unnamed: 0,support,itemsets
0,0.039624,(beef)
1,0.024739,(berries)
2,0.054742,(bottled beer)
3,0.069497,(bottled water)
4,0.044165,(brown bread)
5,0.041895,(butter)
6,0.020586,(butter milk)
7,0.055954,(canned beer)
8,0.033072,(chicken)
9,0.028093,(chocolate)


In [28]:
rules = association_rules(frq_items, metric = "confidence", min_threshold = 0.1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(other vegetables),(whole milk),0.137679,0.183753,0.02234,0.162263,0.883052,-0.002959,0.974348
1,(whole milk),(other vegetables),0.183753,0.137679,0.02234,0.121578,0.883052,-0.002959,0.98167
2,(whole milk),(rolls/buns),0.183753,0.127954,0.022237,0.121016,0.945782,-0.001275,0.992108
3,(rolls/buns),(whole milk),0.127954,0.183753,0.022237,0.17379,0.945782,-0.001275,0.987942
