In [31]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules, apriori
from mlxtend.preprocessing import TransactionEncoder
import pyECLAT as pe
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [32]:
# Directory that holds the course data files (machine-specific absolute path).
folder = r'C:\Users\JOinme\Downloads\Data Mining'

Read data


In [33]:
# Load the book-purchase table; each column is a book category.
books = pd.read_csv(f'{folder}\\Books.csv')

# Apriori

In [34]:
# mlxtend's frequent-pattern miners expect boolean columns, so convert the
# purchase matrix to True where a cell equals 1 and False everywhere else.
# A vectorized comparison replaces the element-wise `applymap` (deprecated in
# recent pandas) and already yields bool dtype, so no extra cast is needed.
apriori_data = books.eq(1)
print(apriori_data.head())

   Child  Youth   Cook  Science  Music    Art   Geog  Sport  Tourism  \
0  False   True  False     True  False  False   True  False    False   
1   True  False  False    False  False  False  False  False    False   
2  False  False  False    False  False  False  False  False    False   
3   True   True   True    False   True  False   True  False    False   
4  False  False   True    False  False  False   True  False    False   

   Business     IT  
0     False  False  
1     False  False  
2     False  False  
3     False  False  
4     False  False  


In [35]:
# Mine the frequent itemsets with Apriori, keeping those whose support is at
# least 20% (min_support=0.2); itemsets are reported by column name.
frequent_itemsets = apriori(apriori_data, use_colnames=True, min_support=0.2)

print("Các tập phổ biến:", frequent_itemsets, sep="\n")

Các tập phổ biến:
   support       itemsets
0   0.4230        (Child)
1   0.2475        (Youth)
2   0.4310         (Cook)
3   0.2820      (Science)
4   0.2145        (Music)
5   0.2410          (Art)
6   0.2760         (Geog)
7   0.2560  (Child, Cook)


In [36]:
# `num_itemsets` is documented by mlxtend as the number of transactions in the
# original input data, so it is taken from the row count of the encoded
# purchase matrix rather than from the number of frequent itemsets found.
num_itemsets_apriori = len(apriori_data)

# Derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.2.
rules = association_rules(
    frequent_itemsets,
    num_itemsets=num_itemsets_apriori,
    metric="confidence",
    min_threshold=0.2,
)

print("Luật kết hợp:")
print(rules)

Luật kết hợp:
  antecedents consequents  antecedent support  consequent support  support  \
0     (Child)      (Cook)               0.423               0.431    0.256   
1      (Cook)     (Child)               0.431               0.423    0.256   

   confidence      lift  representativity  leverage  conviction  \
0    0.605201  1.404179               1.0  0.073687    1.441240   
1    0.593968  1.404179               1.0  0.073687    1.421069   

   zhangs_metric   jaccard  certainty  kulczynski  
0       0.498856  0.428094   0.306153    0.599584  
1       0.505870  0.428094   0.296304    0.599584  


# FP-growth

In [37]:
# FP-Growth: mine frequent itemsets (support >= 20%) from the purchase matrix.
# mlxtend expects boolean columns, so the matrix is converted to True where a
# cell equals 1 instead of passing the raw integer frame.
fp_data = books.eq(1)
frequent_itemsets_fp = fpgrowth(fp_data, min_support=0.2, use_colnames=True)

# `num_itemsets` is documented by mlxtend as the number of transactions in the
# original input data, not the number of frequent itemsets.
num_itemsets_fp = len(fp_data)
rules_fp = association_rules(
    frequent_itemsets_fp,
    num_itemsets=num_itemsets_fp,
    metric="confidence",
    min_threshold=0.2,
)

# Show the frequent itemsets, highest support first.
print("\n=== Frequent Itemsets (FP-Growth) ===")
fp_results = frequent_itemsets_fp[['itemsets', 'support']].sort_values(by='support', ascending=False)
print(fp_results)

# Show the key metrics of each rule, highest confidence first.
print("\n=== Strong Association Rules (FP-Growth) ===")
fp_rules_results = rules_fp[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='confidence', ascending=False)
print(fp_rules_results)



=== Frequent Itemsets (FP-Growth) ===
        itemsets  support
4         (Cook)   0.4310
3        (Child)   0.4230
0      (Science)   0.2820
1         (Geog)   0.2760
7  (Child, Cook)   0.2560
2        (Youth)   0.2475
6          (Art)   0.2410
5        (Music)   0.2145

=== Strong Association Rules (FP-Growth) ===
  antecedents consequents  support  confidence      lift
0     (Child)      (Cook)    0.256    0.605201  1.404179
1      (Cook)     (Child)    0.256    0.593968  1.404179


# ECLAT


In [38]:
def _row_to_padded_items(row):
    """List the columns whose value is 1 in `row`, then pad with None.

    The padding length is the column count minus the sum of the row's values.
    """
    purchased = [name for name, flag in row.items() if flag == 1]
    padding = [None] * (len(books.columns) - sum(row))
    return purchased + padding


# Convert the purchase matrix to a horizontal layout: one transaction per row,
# item names first, None in the unused trailing slots.
result = books.apply(_row_to_padded_items, axis=1)

# Build a new frame from the padded lists, with columns numbered 0, 1, 2, ...
result_df = pd.DataFrame(result.tolist(), columns=range(len(books.columns)))

# Display the result
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Youth,Science,Geog,,,,,,,,
1,Child,,,,,,,,,,
2,,,,,,,,,,,
3,Child,Youth,Cook,Music,Geog,,,,,,
4,Cook,Geog,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1995,Cook,Art,Geog,Sport,Business,IT,,,,,
1996,,,,,,,,,,,
1997,,,,,,,,,,,
1998,Cook,,,,,,,,,,


In [39]:
# Replace None with NaN
result_df = result_df.replace({None: np.nan})

# Hiển thị kết quả
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Youth,Science,Geog,,,,,,,,
1,Child,,,,,,,,,,
2,,,,,,,,,,,
3,Child,Youth,Cook,Music,Geog,,,,,,
4,Cook,Geog,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1995,Cook,Art,Geog,Sport,Business,IT,,,,,
1996,,,,,,,,,,,
1997,,,,,,,,,,,
1998,Cook,,,,,,,,,,


In [42]:
eclat_instance = pe.ECLAT(data=result_df, verbose=True)
# Minimum support: an itemset must appear in at least 20% of transactions.
min_support = 0.2
# Smallest itemset size to evaluate (1 = single items are included).
min_combination = 1
# Largest itemset size to evaluate: the number of items in the biggest
# transaction, i.e. the maximum count of non-missing cells in any row.
max_combination = result_df.notna().sum(axis=1).max()
rule_indices, rule_supports = eclat_instance.fit(
    min_support=min_support,
    min_combination=min_combination,
    max_combination=max_combination,
    separator=' & ',
    verbose=True,
)

100%|██████████| 11/11 [00:00<00:00, 275.16it/s]
100%|██████████| 11/11 [00:00<00:00, 11008.67it/s]
100%|██████████| 11/11 [00:00<00:00, 1830.70it/s]


Combination 1 by 1


7it [00:00, 170.91it/s]


Combination 2 by 2


21it [00:00, 296.00it/s]


Combination 3 by 3


35it [00:00, 294.44it/s]


Combination 4 by 4


35it [00:00, 240.88it/s]


Combination 5 by 5


21it [00:00, 266.10it/s]


Combination 6 by 6


7it [00:00, 226.04it/s]


Combination 7 by 7


1it [00:00, 166.92it/s]


Combination 8 by 8


0it [00:00, ?it/s]


Combination 9 by 9


0it [00:00, ?it/s]


Combination 10 by 10


0it [00:00, ?it/s]


Combination 11 by 11


0it [00:00, ?it/s]


In [43]:
# Tabulate the ECLAT supports (one row per itemset) and show them with the
# highest support first.
result = pd.DataFrame(
    {
        'Item': list(rule_supports.keys()),
        'Support': list(rule_supports.values()),
    }
)
result.sort_values(by='Support', ascending=False)

Unnamed: 0,Item,Support
0,Cook,0.431
2,Child,0.423
6,Science,0.282
3,Geog,0.276
7,Cook & Child,0.256
1,Youth,0.2475
4,Art,0.241
5,Music,0.2145
