In [2]:
# Imports: pandas, numpy, and the apriori function from the apyori package
import pandas as pd
import numpy as np
from apyori import apriori

In [3]:
# Load the retail data from the Excel workbook into the DataFrame `df`.
# The path is relative, so the file is looked up in the notebook's working directory.
df = pd.read_excel('Online Retail.xlsx')

In [4]:
# Keep only the first 5000 rows so the demo runs quickly.
# .copy() makes `df` an independent frame rather than a slice of the full
# data: a later cell assigns new values to its columns, which on a slice can
# trigger pandas' SettingWithCopyWarning, and an independent copy also lets
# the full workbook data be released.
df = df.head(5000).copy()

In [6]:
# Preview the first 10 rows of the data.
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [7]:
# Inspect the dtype of every column.
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [8]:
# Basic descriptive statistics (count, mean, min, quartiles, max, std).
# The result covers the numeric columns and also the InvoiceDate datetime column.
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,5000.0,5000,5000.0,3795.0
mean,9.1858,2010-12-01 22:54:10.812000,3.792314,15906.28195
min,-9360.0,2010-12-01 08:26:00,0.0,12431.0
25%,1.0,2010-12-01 13:24:00,1.25,14606.0
50%,3.0,2010-12-01 17:06:00,2.51,15862.0
75%,10.0,2010-12-02 12:10:00,4.21,17841.0
max,2880.0,2010-12-02 18:08:00,607.49,18239.0
std,144.940788,,13.21172,1732.068892


In [9]:
# Count the number of distinct product codes (StockCode).
df['StockCode'].nunique()

1595

In [10]:
# Total quantity per StockCode, ordered from largest to smallest;
# keep the 10 best sellers and print them.
df_top_sold = (
    df.groupby('StockCode')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print(df_top_sold)

StockCode
84077     2976
84950     1842
21915     1549
85123A     761
84029E     686
21212      676
22616      656
84879      623
21137      613
85099B     604
Name: Quantity, dtype: int64


In [11]:
# Cast InvoiceNo / StockCode to strings and give each value exactly one
# trailing space. Stripping trailing whitespace first makes the cell
# idempotent: without it every re-run would append one more space, and
# lookups against literals such as '84077 ' would silently stop matching.
# NOTE(review): the reason for the trailing space is not evident from the
# notebook; it is kept because a later lookup cell uses codes written with it.
df['InvoiceNo'] = df['InvoiceNo'].astype('str').str.rstrip() + ' '
df['StockCode'] = df['StockCode'].astype('str').str.rstrip() + ' '

In [12]:
# Start with an empty list of transactions and collect the distinct invoice numbers.
transactions = []
invoices = df['InvoiceNo'].unique()

In [13]:
# Print the number of distinct invoices, i.e. the number of transactions.
print(len(invoices))

300


In [14]:
# Build one transaction per invoice: the list of StockCodes on that invoice.
# A single groupby pass replaces filtering the whole frame once per invoice.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot duplicate transactions.
# sort=False keeps the invoices in order of first appearance, the same order
# as df['InvoiceNo'].unique().
transactions = df.groupby('InvoiceNo', sort=False)['StockCode'].apply(list).tolist()

print('Total transactions: ', len(transactions))

Total transactions:  300


In [16]:
# Print the first 5 transactions as a sanity check.
for t in transactions[:5]:
    print(t)

['85123A ', '71053 ', '84406B ', '84029G ', '84029E ', '22752 ', '21730 ']
['22633 ', '22632 ']
['84879 ', '22745 ', '22748 ', '22749 ', '22310 ', '84969 ', '22623 ', '22622 ', '21754 ', '21755 ', '21777 ', '48187 ']
['22960 ', '22913 ', '22912 ', '22914 ']
['21756 ']


In [17]:
# Run Apriori over the transaction lists with these thresholds:
# min_support=0.02, min_confidence=0.3, min_lift=3, max_length=2.
# The result is converted to a list in a later cell before it is indexed.
apriori_rules = apriori(transactions, min_support=0.02, min_confidence=0.3, min_lift=3, max_length=2)

In [23]:
# Convert the Apriori result to a list and print how many records were found.
apriori_rules = list(apriori_rules)
print(len(apriori_rules))

211


In [30]:
# Print the first 5 records: the itemset, its support, and for each ordered
# rule derived from it the base -> added items, the confidence and the lift.
# Both counters start at 1 so that a sub-rule label ("Rule 1.2") carries the
# same number as its header ("Rule 1:"); the sub-label previously used the
# 0-based index and printed "Rule 0.2" under "Rule 1:".
for i, rule in enumerate(apriori_rules[:5], start=1):
    print(f"Rule {i}:")
    print('Items: ', list(rule.items))
    print('Support: ', rule.support)
    for j, sub_rule in enumerate(rule.ordered_statistics, start=1):
        print(f'\tRule {i}.{j}:')
        print(f'\t\t{list(sub_rule.items_base)} -> {list(sub_rule.items_add)}')
        print('\t\tConfidence: ', sub_rule.confidence)
        print('\t\tLift: ', sub_rule.lift)

Rule 1:
Items:  ['15056BL ', '20679 ']
Support:  0.02
	Rule 0.1:
		['15056BL '] -> ['20679 ']
		Confidence:  0.8571428571428571
		Lift:  23.376623376623375
	Rule 0.2:
		['20679 '] -> ['15056BL ']
		Confidence:  0.5454545454545454
		Lift:  23.376623376623375
Rule 2:
Items:  ['21068 ', '20679 ']
Support:  0.023333333333333334
	Rule 1.1:
		['20679 '] -> ['21068 ']
		Confidence:  0.6363636363636364
		Lift:  11.229946524064172
	Rule 1.2:
		['21068 '] -> ['20679 ']
		Confidence:  0.411764705882353
		Lift:  11.229946524064172
Rule 3:
Items:  ['21071 ', '20679 ']
Support:  0.023333333333333334
	Rule 2.1:
		['20679 '] -> ['21071 ']
		Confidence:  0.6363636363636364
		Lift:  9.545454545454545
	Rule 2.2:
		['21071 '] -> ['20679 ']
		Confidence:  0.35000000000000003
		Lift:  9.545454545454547
Rule 4:
Items:  ['21730 ', '20679 ']
Support:  0.023333333333333334
	Rule 3.1:
		['20679 '] -> ['21730 ']
		Confidence:  0.6363636363636364
		Lift:  11.229946524064172
	Rule 3.2:
		['21730 '] -> ['20679 ']
		

In [29]:
# Sort the records by support, highest first, and print the top 5: the
# itemset, its support, and for each ordered rule derived from it the
# base -> added items, the confidence and the lift.
# Both counters start at 1 so that a sub-rule label ("Rule 1.2") carries the
# same number as its header ("Rule 1:"); the sub-label previously used the
# 0-based index and printed "Rule 0.2" under "Rule 1:".
apriori_rules_sorted = sorted(apriori_rules, key=lambda x: x.support, reverse=True)
for i, rule in enumerate(apriori_rules_sorted[:5], start=1):
    print(f"Rule {i}:")
    print('Items: ', list(rule.items))
    print('Support: ', rule.support)
    for j, sub_rule in enumerate(rule.ordered_statistics, start=1):
        print(f'\tRule {i}.{j}:')
        print(f'\t\t{list(sub_rule.items_base)} -> {list(sub_rule.items_add)}')
        print('\t\tConfidence: ', sub_rule.confidence)
        print('\t\tLift: ', sub_rule.lift)

Rule 1:
Items:  ['22633 ', '22632 ']
Support:  0.08666666666666667
	Rule 0.1:
		['22632 '] -> ['22633 ']
		Confidence:  0.6842105263157895
		Lift:  6.414473684210526
	Rule 0.2:
		['22633 '] -> ['22632 ']
		Confidence:  0.8125
		Lift:  6.414473684210526
Rule 2:
Items:  ['84029E ', '84029G ']
Support:  0.07333333333333333
	Rule 1.1:
		['84029E '] -> ['84029G ']
		Confidence:  0.7333333333333333
		Lift:  9.166666666666666
	Rule 1.2:
		['84029G '] -> ['84029E ']
		Confidence:  0.9166666666666666
		Lift:  9.166666666666666
Rule 3:
Items:  ['84029E ', '85123A ']
Support:  0.07
	Rule 2.1:
		['84029E '] -> ['85123A ']
		Confidence:  0.7000000000000001
		Lift:  6.000000000000001
	Rule 2.2:
		['85123A '] -> ['84029E ']
		Confidence:  0.6000000000000001
		Lift:  6.000000000000001
Rule 4:
Items:  ['84029G ', '85123A ']
Support:  0.07
	Rule 3.1:
		['84029G '] -> ['85123A ']
		Confidence:  0.8750000000000001
		Lift:  7.500000000000001
	Rule 3.2:
		['85123A '] -> ['84029G ']
		Confidence:  0.60000000

In [34]:
# Look up the records whose itemset contains both of the given product codes.
# The codes are written with a trailing space to match the item labels used
# in the transactions.
code1 = '84077 ' # '84029G '
code2 = '84950 '

for rule in apriori_rules:
    # Keep only itemsets that include every requested code.
    if all(code in rule.items for code in (code1, code2)):
        print('Items: ', list(rule.items))
        print('Support: ', rule.support)
        for j, sub_rule in enumerate(rule.ordered_statistics):
            print(f'\tRule {j+1}:')
            print(f'\t\t{list(sub_rule.items_base)} -> {list(sub_rule.items_add)}')
            print('\t\tConfidence: ', sub_rule.confidence)
            print('\t\tLift: ', sub_rule.lift)