In [123]:
def evaluate_association_rules(association_rules,test_datas):
    """
    Parameters：
        association_rules：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_data：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的评分（百分比）
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in association_rules:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
    print(miss,' rules miss')
    return point_sum / len(association_rules)
    

### 函数测试

In [125]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict

In [126]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)

将测试数据集 划分为training_data与evaluate_data两部分，占比可调

In [127]:
training_data = data.sample(n=None, frac=0.9, replace=False, weights=None, random_state=None, axis=None)

In [128]:
print(len(training_data))
training_data.head()

9305


Unnamed: 0,db,key,size,ttl,type,value
2746,7,60058,6,-1,set,[139332]
2513,7,60289211,139,-1,set,"[1700, 2069, 2543, 2576, 2649, 3042, 3863, 505..."
6144,7,233059693,5,-1,set,[23432]
1153,7,3800384,16,-1,set,"[5473, 5978, 6433, 8752]"
3359,7,144956830,48,-1,set,"[1733, 2572, 3365, 6360, 7452, 8752, 25732, 10..."


In [129]:
evaluate_data = data.drop([x for x in training_data.index])

In [130]:
print(len(evaluate_data))
evaluate_data.head()

1034


Unnamed: 0,db,key,size,ttl,type,value
5,7,175812420,33,-1,set,"[1407, 5739, 6352, 6360, 6652, 12872, 11651971]"
7,7,3801571,427,-1,set,"[297, 311, 522, 616, 643, 685, 761, 762, 763, ..."
9,7,2265619,304,-1,set,"[53, 207, 297, 969, 1382, 1407, 1463, 1523, 16..."
11,7,36562891,20,-1,set,"[1586, 2543, 3287, 3461, 5997]"
24,7,38115290,409,-1,set,"[2580, 2760, 5786, 5849, 5856, 6360, 6434, 657..."


用training_data进行训练，导出关联规则（推荐规则）

注：这里只是用置信度进行了排名，因为只是对评估函数进行测试，所以没有加入更多的排名方法。

In [131]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.05 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                           fluent_patterns   support
0                                   [5852]  0.195271
1                           [102392, 5852]  0.094143
2                           [139252, 5852]  0.079097
3                   [102392, 139252, 5852]  0.050296
4                          [5267730, 5852]  0.072327
5                  [139252, 5267730, 5852]  0.052767
6                             [3450, 5852]  0.069855
7                           [130412, 5852]  0.070930
8                             [5997, 5852]  0.069425
9                             [3461, 5852]  0.066953
10                            [8892, 5852]  0.087157
11                    [102392, 8892, 5852]  0.055132
12                    [139252, 8892, 5852]  0.050833
13                         [4312482, 5852]  0.060183
14                                  [2543]  0.195056
15                            [5852, 2543]  0.081032
16                          [139252, 2543]  

In [135]:
# 将规则转换为(rule_a,rule_b)的格式，根据rules的实际情况更改
new_rule = []
for a,b,c in rules:
    new_rule.append([a,b])

In [136]:
new_rule[:5]

[[frozenset({'139252', '21986963', '4316382'}), frozenset({'5267730'})],
 [frozenset({'139252', '4316382', '4316442'}), frozenset({'5267730'})],
 [frozenset({'134912', '4316382', '4316482'}), frozenset({'5267730'})],
 [frozenset({'4316382', '4762714'}), frozenset({'5267730'})],
 [frozenset({'4316382', '4316482', '4762754'}), frozenset({'5267730'})]]

In [145]:
# 计算前100条规则的得分
result = evaluate_association_rules(new_rule[:100],evaluate_data)
result

0  rules miss


0.877963539130011

In [148]:
# 计算第100到200条规则的得分
result = evaluate_association_rules(new_rule[100:200],evaluate_data)
result

0  rules miss


0.8364170452763535

In [149]:
# 计算第200到300条规则的得分
result = evaluate_association_rules(new_rule[200:300],evaluate_data)
result

0  rules miss


0.8073542457545337

In [144]:
# 计算全部规则的得分
result = evaluate_association_rules(new_rule,evaluate_data)
result

0  rules miss


0.7075517635501979

分别对前100条规则、第100到200条规则、第200到300条、全部规则进行评估，发现使用靠近前面的规则 平均评价较高，侧面证明了使用关联规则的置信度对规则进行排名有一定的优化效果。