In [10]:
def evaluate_association_rules(association_rules,test_datas):
    """
    Evaluate association (recommendation) rules against a test data set.

    For each rule ``(rule_a, rule_b)`` the accuracy is the fraction of test
    transactions containing every item of ``rule_a`` that also contain every
    item of ``rule_b``. A rule whose ``rule_a`` never occurs in the test data
    is counted as a "miss": it adds 0 to the sum but still counts in the
    denominator of the average.

    Parameters:
        association_rules: sized collection of ``(rule_a, rule_b)`` pairs,
            each side being an iterable of items
        test_datas: DataFrame whose ``value`` column holds one iterable of
            items per row

    Returns:
        Mean accuracy of the rules on the test set, or ``0.0`` when
        ``association_rules`` is empty (previously a ZeroDivisionError).
    """
    rule_count = len(association_rules)
    if rule_count == 0:
        # Nothing to average, e.g. an out-of-range slice such as rules[200:300].
        print(0,' rules miss')
        return 0.0

    # Build each transaction's set once instead of once per rule.
    test_sets = [set(test_data) for test_data in test_datas["value"]]

    point_sum = 0
    miss = 0
    for rule_a, rule_b in association_rules:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)

        num_a = 0  # transactions containing the antecedent
        num_b = 0  # ... that also contain the consequent
        for set_test_data in test_sets:
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1

        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
    print(miss,' rules miss')
    return point_sum / rule_count
    

### 函数测试

In [11]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict
import association_rules 

10339
<class 'generator'>
-------------挖掘频繁项集---------------
                fluent_patterns   support
0                      [102392]  0.315795
1                        [8892]  0.269272
2                [102392, 8892]  0.142083
3                [139252, 8892]  0.141213
4        [102392, 139252, 8892]  0.086952
5               [5267730, 8892]  0.144405
6       [102392, 5267730, 8892]  0.086179
7       [139252, 5267730, 8892]  0.103008
8                     [4312482]  0.205726
9             [102392, 4312482]  0.099139
10              [8892, 4312482]  0.103105
11      [139252, 8892, 4312482]  0.075249
12     [5267730, 8892, 4312482]  0.075152
13            [139252, 4312482]  0.123803
14           [5267730, 4312482]  0.120708
15   [139252, 5267730, 4312482]  0.091208
16            [130412, 4312482]  0.097011
17    [139252, 130412, 4312482]  0.076700
18   [5267730, 130412, 4312482]  0.072831
19                     [135652]  0.194893
20             [102392, 135652]  0.088306
21             

                         rules_a     rules_b  confidence     score
0     (4316382, 4316482, 139252)   (5267730)    0.899207  1.550524
1             (4316382, 5267750)   (5267730)    0.898389  1.550115
2             (4316382, 4762754)   (5267730)    0.893303  1.547572
3        (4316382, 8892, 139252)   (5267730)    0.885057  1.543449
4     (130412, 4316482, 5267730)    (139252)    0.878021  1.555975
5             (4316382, 4312482)   (5267730)    0.872222  1.537032
6      (4316382, 130412, 139252)   (5267730)    0.872043  1.536942
7             (4316382, 4316482)   (5267730)    0.871673  1.536757
8    (4316382, 4316482, 5267730)    (139252)    0.865867  1.549898
9              (4316382, 134912)   (5267730)    0.861111  1.531476
10             (134912, 4316482)    (139252)    0.859954  1.546941
11             (4316442, 139252)   (5267730)    0.856187  1.529014
12             (102252, 4316482)    (139252)    0.855803  1.544866
13               (4316382, 8892)   (5267730)    0.854167  1.52

In [12]:
# Load the test data set: a JSON-lines file plus the user-info CSV
filepath = './test_data/user_following_animation.json'
data = pd.read_json(filepath, lines=True)

# The CSV is read with explicitly supplied column names
user_info = pd.read_csv(
    "test_data/bilibili_crawler_user_info.csv",
    names=[
        'id', 'mid', 'name', 'sex', 'sign', 'the_rank', 'level', 'jointime', 'moral',
        'silence', 'birthday', 'coins', 'fans_badge', 'role', 'title', 'desc',
        'vip_type', 'vip_status',
    ],
)

len(user_info)


51432

In [13]:
# Drop every row where vip_type, the_rank or level is NaN
user_info = user_info.dropna(subset=['vip_type', 'the_rank', 'level'])
len(user_info)

51424

In [14]:
# Weight dictionary: for each user attribute, map the attribute value to its weight.
# NOTE(review): the 'level' keys are strings while 'the_rank' and 'vip_type' keys are
# numbers — confirm this matches the dtypes of the corresponding user_info columns.
add_rules = {
    'the_rank': {10000: 1, 20000: 2, 25000: 3, 30000: 4},
    'level': {'3': 1, '4': 2, '5': 3, '6': 4},
    'vip_type': {0: 0, 1: 1, 2: 2},
}


In [15]:
# Remove abnormal records: a usable record's key must parse as an integer
# (user_power later compares int(key) with user_info.mid).
invalid_index = []
for index, row in data.iterrows():
    try:
        int(row["key"])
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mark a record as abnormal; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        print(row)
        invalid_index.append(index)

# Drop after the loop so the frame is not mutated while it is being iterated.
data = data.drop(index=invalid_index)

db                                                       7
key                                         finished_users
size                                                507576
ttl                                                     -1
type                                                   set
value    [330817737, 74775, 259640193, 24774761, 540994...
Name: 4504, dtype: object


In [16]:
def user_power(data,rules,user_info):
    """
    Expand the data set by repeating each row according to its user's weight.

    params:
        data: DataFrame of users' followed IDs; the ``key`` column must be
            convertible with ``int()`` and is matched against ``user_info.mid``
        rules: weight rules, a dict with the keys 'the_rank', 'level' and
            'vip_type', each mapping an attribute value to an integer weight
        user_info: DataFrame of user information with the columns ``mid``,
            ``the_rank``, ``level`` and ``vip_type``
    return:
        New DataFrame with the same columns as ``data`` and a fresh 0..n-1
        index, in which each row appears ``weight`` times consecutively.
        The weight is the sum of the three rule weights; users missing from
        ``user_info`` get weight 1.
    raises:
        KeyError: when a user's attribute value has no entry in ``rules``.
    """
    # One pass over user_info instead of a full scan per data row.
    # setdefault keeps the first row for a given mid, matching the earlier
    # behaviour of using the first matching record.
    profile_by_mid = {}
    for mid, the_rank, level, vip_type in zip(
        user_info["mid"], user_info["the_rank"], user_info["level"], user_info["vip_type"]
    ):
        profile_by_mid.setdefault(mid, (the_rank, level, vip_type))

    powers = []
    for key in data["key"]:
        profile = profile_by_mid.get(int(key))
        if profile is None:
            # No information about this user in user_info
            powers.append(1)
        else:
            the_rank, level, vip_type = profile
            powers.append(
                rules['the_rank'][the_rank] + rules['level'][level] + rules['vip_type'][vip_type]
            )

    # Repeat row positions in one vectorised step. DataFrame.append was removed
    # in pandas 2.0 and growing a frame row by row is quadratic.
    positions = np.repeat(np.arange(len(data)), np.asarray(powers, dtype=np.intp))
    return data.iloc[positions].reset_index(drop=True)

    

In [17]:
# Expand the data set according to the weight rules
new_data = user_power(data=data, rules=add_rules, user_info=user_info)
len(new_data)

38829

将测试数据集划分为 training_data 与 evaluate_data 两部分，占比可调

In [18]:
# Sample a fraction of the rows (without replacement) as the training set.
# A fixed seed makes the split, and every number derived from it, reproducible
# across kernel restarts; change TRAIN_FRACTION to adjust the split ratio.
# NOTE(review): new_data contains repeated copies of each user's row (see
# user_power), so copies of one user can land in both splits — confirm that
# this overlap between training and evaluation data is intended.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 42
training_data = new_data.sample(frac=TRAIN_FRACTION, replace=False, random_state=SPLIT_SEED)

In [19]:
# Print the number of training rows, then preview the first few
print(training_data.shape[0])
training_data.head()

34946


Unnamed: 0,db,key,size,ttl,type,value
37987,7,2741213,233,-1,set,"[442, 476, 519, 790, 791, 844, 1064, 1065, 108..."
1454,7,2615621,75,-1,set,"[1321, 1715, 1875, 2012, 2040, 2321, 5038, 506..."
28681,7,693131,245,-1,set,"[687, 709, 842, 1192, 1193, 1576, 1577, 1586, ..."
38490,7,3021765,12,-1,set,"[1270, 2580, 5761]"
4378,7,108528523,151,-1,set,"[468, 703, 863, 1778, 1995, 2014, 2167, 2300, ..."


In [20]:
# The evaluation set is every row of new_data whose index label was not sampled for training
evaluate_data = new_data.drop(index=training_data.index)

In [21]:
# Print the number of evaluation rows, then preview the first few
print(evaluate_data.shape[0])
evaluate_data.head()

3883


Unnamed: 0,db,key,size,ttl,type,value
29,7,344488725,4,-1,set,[5744]
52,7,2598372,47,-1,set,"[1559, 1699, 1733, 5559, 5626, 5852, 6402, 875..."
53,7,392602869,76,-1,set,"[349, 2576, 6175, 6176, 6360, 8792, 77072, 118..."
79,7,38115290,409,-1,set,"[2580, 2760, 5786, 5849, 5856, 6360, 6434, 657..."
86,7,8784405,183,-1,set,"[425, 963, 1178, 1463, 1660, 3151, 3756, 4378,..."


用training_data进行训练，导出关联规则（推荐规则）

注：这里只是用置信度进行了排名，因为只是对评估函数进行测试，所以没有加入更多的排名方法。

In [28]:

# Mine frequent itemsets from the training transactions
data_list = training_data["value"].tolist()
frequent_itemsets = fpg.find_frequent_itemsets(
    data_list,
    minimum_support=0.07 * len(data_list),
    include_support=True,
)
print(type(frequent_itemsets))  # print type

# Store the generator's results in a list, dividing each support by the number of transactions
result = [(itemset, support / len(data_list)) for itemset, support in frequent_itemsets]

result_patterns = [entry[0] for entry in result]
result_support = [entry[1] for entry in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})

# Lookup table used by generate_rules: frozenset of items -> support
patterns = {frozenset(sorted(itemset)): support for itemset, support in result}
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    """
    Derive association rules from a table of frequent patterns.

    For every pattern with at least two items, each non-empty proper subset is
    tried as the rule's base (antecedent) and the remaining items as the
    prediction (consequent). The confidence is
    ``support(pattern) / support(base)``.

    Parameters:
        patterns: dict mapping a frozenset of items to its support
        min_confidence: a rule is kept only when its confidence is strictly
            greater than this value

    Returns:
        defaultdict(set) mapping each base frozenset to a set of
        ``(predict_set, confidence)`` tuples
    """
    from itertools import combinations  # local import keeps this cell self-contained

    raw_rules = defaultdict(set)
    for pattern, support in patterns.items():
        length = len(pattern)
        if length < 2:
            # A single item cannot be split into base and prediction
            continue
        item_list = list(pattern)
        # Enumerate every non-empty proper subset. Slicing the item list only
        # reaches contiguous runs of an arbitrary set ordering, which misses
        # some rules for patterns of four or more items.
        for base_size in range(1, length):
            for base_items in combinations(item_list, base_size):
                base_set = frozenset(base_items)
                base_support = patterns.get(base_set)
                if not base_support:
                    # Base is absent from the table (or has zero support):
                    # no confidence can be computed, so skip it.
                    continue
                confidence = support / base_support
                if confidence > min_confidence:
                    raw_rules[base_set].add((frozenset(pattern - base_set), confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    """
    Bucket the patterns by their number of items.

    Parameters:
        patterns: dict mapping a pattern (sized collection of items) to its support

    Returns:
        defaultdict(list) mapping a length to the list of ``(pattern, support)``
        tuples of that length, in the iteration order of ``patterns``
    """
    grouped = defaultdict(list)
    for entry in patterns.items():
        grouped[len(entry[0])].append(entry)
    return grouped

def transform(raw_rules):
    """
    Flatten the rule mapping into a list of triples.

    Parameters:
        raw_rules: mapping of base set -> iterable of ``(predict_set, confidence)``

    Returns:
        list of ``(base_set, predict_set, confidence)`` tuples
    """
    return [
        (base_set, predict_set, confidence)
        for base_set, predictions in raw_rules.items()
        for predict_set, confidence in predictions
    ]
# Generate the rules whose confidence exceeds 0.6 and rank them by confidence, highest first
raw_rules = generate_rules(patterns, 0.6)
rules = sorted(transform(raw_rules), key=lambda rule: rule[2], reverse=True)

# Split the (base, prediction, confidence) triples into columns for display
rules_a = [base for base, _, _ in rules]
rules_b = [predicted for _, predicted, _ in rules]
confidence = [value for _, _, value in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                        fluent_patterns   support
0                                [8892]  0.306473
1                        [102392, 8892]  0.160648
2                              [139252]  0.305242
3                        [8892, 139252]  0.166772
4                [102392, 8892, 139252]  0.100956
5                      [102392, 139252]  0.158359
6                             [4312482]  0.226263
7                       [8892, 4312482]  0.120071
8               [102392, 8892, 4312482]  0.070366
9                     [139252, 4312482]  0.139444
10              [8892, 139252, 4312482]  0.089281
11            [102392, 139252, 4312482]  0.078893
12                    [102392, 4312482]  0.111372
13                   [5267730, 4312482]  0.136353
14           [102392, 5267730, 4312482]  0.074858
15           [139252, 5267730, 4312482]  0.104218
16     [8892, 139252, 5267730, 4312482]  0.072941
17             [8892, 5267730, 4312482]  0.08

In [30]:
# Extend the association rules by combining confidence with anime ratings,
# voice-actor/director information and similar signals.
# NOTE(review): the recorded output of this cell is
# "ValueError: Length of values does not match length of index" — confirm which
# of the two association_rules calls raises it before relying on rules_df.
rules_weight = {"confidence": 0.5, "score": 0.1, "play": 0.1, "follow": 0.1, "voice": 0.1, "staff": 0.1}
rules_df = association_rules.unfold_rules(rules_df)
association_rules.add_score(rules_df, rules_weight)
rules_df

ValueError: Length of values does not match length of index

In [None]:
# Convert the rules to [rule_a, rule_b] pairs; adjust this to the actual structure of `rules`
new_rule = [[rule_a, rule_b] for rule_a, rule_b, _ in rules]

In [None]:
# Preview the first five [rule_a, rule_b] pairs
new_rule[:5]

In [None]:
# Mean accuracy of the first 100 rules
result = evaluate_association_rules(association_rules=new_rule[:100], test_datas=evaluate_data)
result

In [None]:
# Mean accuracy of rules 100 to 200
result = evaluate_association_rules(association_rules=new_rule[100:200], test_datas=evaluate_data)
result

In [None]:
# Mean accuracy of rules 200 to 300
result = evaluate_association_rules(association_rules=new_rule[200:300], test_datas=evaluate_data)
result

In [None]:
# Mean accuracy over all rules
result = evaluate_association_rules(association_rules=new_rule, test_datas=evaluate_data)
result

分别对前100条规则、第100到200条规则、第200到300条规则、全部规则进行评估，
发现使用靠近前面的规则进行推荐，平均准确率较高，证明了使用关联规则的置信度对规则进行排名有一定的优化效果。
