In [1]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
from time import time
import pandas as pd
from  os.path import join
import pickle as pkl
import json

# Directory the result files are written to.
output_path = './output/'
# Directory the input JSON / pickle files are read from.
data_path = './data/'
def load_category(path):
    """Build a product lookup table from a product catalog JSON file.

    Args:
        path: Path to a UTF-8 JSON file whose top-level ``products`` entry is
            a sequence of objects carrying ``id``, ``category`` and ``price``.

    Returns:
        dict mapping each product ``id`` to
        ``{'category': ..., 'price': ...}``. If an id appears more than once,
        the last occurrence wins.

    Raises:
        KeyError: If ``products`` or one of the per-product keys is missing.
    """
    with open(path, 'r', encoding='utf-8') as catalog_file:
        catalog = json.load(catalog_file)
    return {
        item['id']: {'category': item['category'], 'price': item['price']}
        for item in catalog['products']
    }

def load_all_category(path):
    """Parse a UTF-8 encoded JSON file and return the decoded content as-is.

    Args:
        path: Path to the JSON file.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
# Lookup table: product id -> {category, price}, built from the product catalog.
price_path = join(data_path, 'product_catalog.json')
category_single = load_category(price_path)

# Parsed content of category_all.json, kept exactly as loaded.
categrory_path = join(data_path, 'category_all.json')
category_all = load_all_category(categrory_path)


In [2]:
# Load the two pickled sequences. Each file is opened in a context manager so
# its handle is closed as soon as the load finishes instead of lingering in
# the kernel.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced locally / by a trusted pipeline.
with open(join(data_path, 'good-refund_10G.pkl'), 'rb') as pay_file:
    pay_sta = pkl.load(pay_file)
with open(join(data_path, 'good-good_10G.pkl'), 'rb') as category_file:
    category = pkl.load(category_file)

# Preview the first five entries of each sequence.
pay_sta[:5],category[:5]

(['已支付', '已退款', '已支付', '已支付', '已支付'],
 [['电子产品'], ['服装'], ['服装'], ['服装'], ['办公', '家居', '办公']])

In [3]:
# Payment statuses treated as refunds ("refunded" / "partially refunded").
refund_statuses = {"已退款", "部分退款"}

# zip() stops silently at the shorter input, which would drop or misalign
# records without any error, so verify the two sequences line up first.
if len(category) != len(pay_sta):
    raise ValueError(
        "category and pay_sta must have the same length, "
        f"got {len(category)} and {len(pay_sta)}"
    )

# Keep the category list of every record whose payment status is a refund.
transactions_status = [
    txn for txn, pay in zip(category, pay_sta) if pay in refund_statuses
]

# Show how many records were kept and preview the first five.
len(transactions_status),transactions_status[:5]

(29996203, [['服装'], ['食品', '运动户外'], ['食品', '电子产品'], ['电子产品', '电子产品'], ['食品']])

In [4]:

# Step 2: encode the transactions as a sparse boolean matrix
te = TransactionEncoder()
te.fit(transactions_status)
te_result = te.transform(transactions_status, sparse=True)
df = pd.DataFrame.sparse.from_spmatrix(te_result, columns=te.columns_)

# Step 3: mine frequent itemsets (minimum support 0.005)
frequent_itemsets = fpgrowth(df, min_support=0.005, use_colnames=True)

# Export the support / itemsets columns as a comma-separated UTF-8 file.
# NOTE(review): the itemsets column is written with its default string form,
# not as a joined list of names — confirm downstream readers expect that.
frequent_csv = join(output_path, 'frequent_status_10G.csv')
frequent_itemsets.to_csv(
    frequent_csv,
    columns=['support', 'itemsets'],
    sep=',',
    index=False,
    encoding='utf-8',
)

  df = pd.DataFrame.sparse.from_spmatrix(te_result, columns=te.columns_)


In [7]:
# Step 4: derive association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)

print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Flatten each itemset into a comma-separated string for export.
# The items are sorted so the exported text is reproducible: the iteration
# order of a set of strings can differ from one interpreter run to the next,
# so joining the raw set would yield e.g. "a, b" in one run and "b, a" in another.
rules['antecedents'] = rules['antecedents'].apply(lambda items: ', '.join(sorted(items)))
rules['consequents'] = rules['consequents'].apply(lambda items: ', '.join(sorted(items)))

# Save as a text file using a tab separator.
rules.to_csv(join(output_path,'association_refund_rules_10G.txt'), 
                      columns=['antecedents', 'consequents', 'support', 'confidence', 'lift'],
                      sep='\t',  # can be changed to ',' for easier import into Excel
                      index=False,
                      encoding='utf-8')
print('done')


    antecedents consequents   support  confidence      lift
0          (食品)        (服装)  0.222669    0.459554  0.942927
1          (服装)        (食品)  0.222669    0.456878  0.942927
2        (运动户外)        (食品)  0.059807    0.449651  0.928012
3        (运动户外)        (服装)  0.060165    0.452348  0.928141
4        (运动户外)      (电子产品)  0.059764    0.449334  0.928073
..          ...         ...       ...         ...       ...
103    (母婴, 服装)      (电子产品)  0.026116    0.428158  0.884335
104  (母婴, 电子产品)        (服装)  0.026116    0.431196  0.884740
105    (母婴, 办公)        (服装)  0.006967    0.416669  0.854934
106    (母婴, 办公)      (电子产品)  0.006891    0.412133  0.851236
107    (母婴, 办公)        (食品)  0.006890    0.412047  0.850403

[108 rows x 5 columns]
done
