In [180]:
def evaluate_association_rules(association_rules,test_datas):
    """
    Parameters：
        association_rules：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_datas：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的平均准确率
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in association_rules:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
    print(miss,' rules miss')
    return point_sum / len(association_rules)
    

### 函数测试

In [125]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict

In [126]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)

将测试数据集 划分为training_data与evaluate_data两部分，占比可调

In [163]:
training_data = data.sample(n=9000, frac=None, replace=False, weights=None, random_state=None, axis=None)

In [164]:
print(len(training_data))
training_data.head()

9000


Unnamed: 0,db,key,size,ttl,type,value
7762,7,77246224,462,-1,set,"[1177, 1512, 1586, 1587, 1689, 1699, 1733, 254..."
9196,7,299275542,253,-1,set,"[530, 1588, 1666, 2576, 2743, 2760, 3023, 3494..."
4560,7,70399334,39,-1,set,"[835, 1089, 2074, 2581, 2960, 5550, 5761, 6652..."
6545,7,1320581,63,-1,set,"[685, 1201, 2585, 8892, 102112, 102632, 140552..."
5456,7,14546601,509,-1,set,"[53, 499, 697, 1071, 1072, 1177, 1178, 1407, 1..."


In [165]:
evaluate_data = data.drop([x for x in training_data.index])[:1000]

In [166]:
print(len(evaluate_data))
evaluate_data.head()

1000


Unnamed: 0,db,key,size,ttl,type,value
10,7,344488725,4,-1,set,[5744]
19,7,28872792,203,-1,set,"[207, 427, 877, 963, 1089, 1733, 2520, 2572, 2..."
34,7,82151097,4,-1,set,[5998]
36,7,6994862,262,-1,set,"[3461, 4026, 5027, 5043, 5058, 5070, 5513, 553..."
45,7,11362620,136,-1,set,"[123, 296, 528, 767, 954, 959, 1548, 1563, 187..."


用training_data进行训练，导出关联规则（推荐规则）

注：这里只是用置信度进行了排名，因为只是对评估函数进行测试，所以没有加入更多的排名方法。

In [167]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.07 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                 fluent_patterns   support
0                       [102392]  0.315444
1                         [5997]  0.207778
2                 [102392, 5997]  0.118333
3                 [139252, 5997]  0.101889
4                [5267730, 5997]  0.108333
5        [139252, 5267730, 5997]  0.074222
6        [102392, 5267730, 5997]  0.073667
7                   [8892, 5997]  0.103778
8           [102392, 8892, 5997]  0.070222
9                 [130412, 5997]  0.084444
10                        [5852]  0.195000
11                [102392, 5852]  0.093667
12                [139252, 5852]  0.079333
13                [130412, 5852]  0.072111
14               [5267730, 5852]  0.072667
15                  [8892, 5852]  0.085222
16                  [3450, 5852]  0.071000
17                        [2543]  0.193889
18                [102392, 2543]  0.091889
19                  [5852, 2543]  0.078778
20                [139252, 2543]  0.085333

In [168]:
# 将规则转换为(rule_a,rule_b)的格式，根据rules的实际情况更改
new_rule = []
for a,b,c in rules:
    new_rule.append([a,b])

In [169]:
new_rule[:5]

[[frozenset({'139252', '4316382', '4316482'}), frozenset({'5267730'})],
 [frozenset({'4316382', '4762754'}), frozenset({'5267730'})],
 [frozenset({'139252', '4316382', '8892'}), frozenset({'5267730'})],
 [frozenset({'4312482', '4316382'}), frozenset({'5267730'})],
 [frozenset({'130412', '4316482', '5267730'}), frozenset({'139252'})]]

In [174]:
# 计算前100条规则的平均准确率
result = evaluate_association_rules(new_rule[:100],evaluate_data)
result

0  rules miss


0.7794543136868225

In [175]:
# 计算第100到200条规则的平均准确率
result = evaluate_association_rules(new_rule[100:200],evaluate_data)
result

0  rules miss


0.6724204487956956

In [176]:
# 计算第200到300条规则的平均准确率
result = evaluate_association_rules(new_rule[200:300],evaluate_data)
result

0  rules miss


0.6094908336235078

In [177]:
# 计算全部规则的平均准确率
result = evaluate_association_rules(new_rule,evaluate_data)
result

0  rules miss


0.6973750205048295

分别对前100条规则、第100到200条规则、第200到300条规则、全部规则进行评估，
发现使用靠近前面的规则进行推荐，平均准确率较高，证明了使用关联规则的置信度对规则进行排名有一定的优化效果。
