In [142]:
def evaluate_association_rules(rules_to_evaluate,test_datas):
    """
    Parameters：
        rules_to_evaluate：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_datas：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的平均准确率
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in rules_to_evaluate:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)  
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
    print(miss,' rules miss')
    return point_sum / len(rules_to_evaluate)
    

### 函数测试

In [3]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict
import association_rules 

10339
<class 'generator'>
-------------挖掘频繁项集---------------
                fluent_patterns   support
0                      [102392]  0.315795
1                        [8892]  0.269272
2                [102392, 8892]  0.142083
3                [139252, 8892]  0.141213
4        [102392, 139252, 8892]  0.086952
5               [5267730, 8892]  0.144405
6       [102392, 5267730, 8892]  0.086179
7       [139252, 5267730, 8892]  0.103008
8                     [4312482]  0.205726
9             [102392, 4312482]  0.099139
10              [8892, 4312482]  0.103105
11      [139252, 8892, 4312482]  0.075249
12     [5267730, 8892, 4312482]  0.075152
13            [139252, 4312482]  0.123803
14           [5267730, 4312482]  0.120708
15   [139252, 5267730, 4312482]  0.091208
16            [130412, 4312482]  0.097011
17    [139252, 130412, 4312482]  0.076700
18   [5267730, 130412, 4312482]  0.072831
19                     [135652]  0.194893
20             [102392, 135652]  0.088306
21             

In [5]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)
user_info = pd.read_csv("test_data/bilibili_crawler_user_info.csv",names = ['id','mid','name','sex','sign','the_rank','level','jointime','moral','silence','birthday','coins','fans_badge','role','title','desc','vip_type','vip_status'])

len(user_info)


51432

In [6]:
user_info.drop(user_info[user_info.vip_type.isna() | user_info.the_rank.isna() | user_info.level.isna()].index.tolist(),inplace=True) # 将vip_type、the_rank、level有NaN的行去掉
len(user_info)

51424

In [7]:
# 设置权重字典
add_rules={'the_rank':{10000:1,20000:2,25000:3,30000:4},'level':{'3':1,'4':2,'5':3,'6':4},'vip_type':{0:0,1:1,2:2}}


In [8]:
# 去除一下异常数据
for index,row in data.iterrows():
    try:
        int(row.key)
    except:
        print(row)
        data.drop(index,inplace=True)

db                                                       7
key                                         finished_users
size                                                507576
ttl                                                     -1
type                                                   set
value    [330817737, 74775, 259640193, 24774761, 540994...
Name: 4504, dtype: object


In [9]:
def user_power(data,rules,user_info):
    """
    params:
        data:用户收藏ID数据集
        rules:权重规则
        user_info:用户信息数据集
    return:新数据集
        
    """
    new_data = pd.DataFrame(columns=data.columns)
    for index,row in data.iterrows():
        info = user_info[user_info.mid == int(row.key)]

        if len(info):
            the_power = rules['the_rank'][info.the_rank.values[0]] + rules['level'][info.level.values[0]] + rules['vip_type'][info.vip_type.values[0]]
        else:  # len(info)==0 说明在user_info中没有这个用户的相关数据
            the_power = 1
#         print(the_power)
        for i in range(the_power):
            new_data = new_data.append(row,ignore_index=True)
    return new_data

    

In [10]:
# 根据权重规则对数据进行扩充
new_data= user_power(data,add_rules,user_info)
len(new_data)

38829

将测试数据集 划分为training_data与evaluate_data两部分，占比可调

In [11]:
training_data = new_data.sample(n=None, frac=0.9, replace=False, weights=None, random_state=None, axis=None)

In [12]:
print(len(training_data))
training_data.head()

34946


Unnamed: 0,db,key,size,ttl,type,value
17638,7,3069809,301,-1,set,"[497, 2572, 2576, 2584, 2591, 2722, 2724, 2732..."
29182,7,306805,98,-1,set,"[1512, 1539, 1540, 3461, 6440, 6446, 8892, 234..."
4477,7,7063838,243,-1,set,"[1056, 1064, 1073, 1177, 1178, 1574, 1576, 157..."
32518,7,266481257,9,-1,set,"[5852, 86272]"
22614,7,333084550,12,-1,set,"[1699, 2543, 6260]"


In [77]:
int(training_data.value[0][0])

132112

In [13]:
evaluate_data = new_data.drop([x for x in training_data.index])

In [14]:
print(len(evaluate_data))
evaluate_data.head()

3883


Unnamed: 0,db,key,size,ttl,type,value
9,7,24774761,12,-1,set,"[5550, 5849, 5852]"
14,7,37868878,1246,-1,set,"[53, 110, 249, 282, 333, 334, 419, 470, 471, 5..."
26,7,5667082,526,-1,set,"[282, 289, 290, 311, 572, 687, 713, 723, 735, ..."
41,7,19662966,4,-1,set,[6474]
50,7,2598372,47,-1,set,"[1559, 1699, 1733, 5559, 5626, 5852, 6402, 875..."


用training_data进行训练，导出关联规则（推荐规则）

注：这里只是用置信度进行了排名，因为只是对评估函数进行测试，所以没有加入更多的排名方法。

In [161]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.07 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                     fluent_patterns   support
0                           [102392]  0.330996
1                           [139252]  0.305099
2                   [102392, 139252]  0.157844
3                     [8892, 139252]  0.166772
4             [102392, 8892, 139252]  0.100984
5                          [5267730]  0.300063
6                  [102392, 5267730]  0.155268
7                  [139252, 5267730]  0.192211
8          [102392, 139252, 5267730]  0.106650
9            [8892, 139252, 5267730]  0.123619
10   [102392, 8892, 139252, 5267730]  0.075602
11                   [8892, 5267730]  0.168918
12           [102392, 8892, 5267730]  0.099468
13                          [130412]  0.253133
14                  [102392, 130412]  0.121616
15                  [139252, 130412]  0.169748
16          [102392, 139252, 130412]  0.089796
17            [8892, 139252, 130412]  0.112803
18                 [5267730, 130412]  0.155039
19   

In [155]:
new_rules1 = []
for a,b,c in rules:
    new_rules1.append([a,b])
new_rules1[:5]



[[frozenset({'130412', '139252', '4316382', '4316482'}),
  frozenset({'5267730'})],
 [frozenset({'130412', '4316382', '4316482', '5267730'}),
  frozenset({'139252'})],
 [frozenset({'139252', '4316382', '4762754'}), frozenset({'5267730'})],
 [frozenset({'134912', '4316482', '5267730'}), frozenset({'139252'})],
 [frozenset({'4316382', '4316442'}), frozenset({'5267730'})]]

In [156]:
result = evaluate_association_rules(new_rules1[:100],evaluate_data)
result

0  rules miss


0.8515642946205972

In [157]:
result = evaluate_association_rules(new_rules1[100:200],evaluate_data)
result

0  rules miss


0.7758942803178905

In [158]:
result = evaluate_association_rules(new_rules1[200:300],evaluate_data)
result

0  rules miss


0.7193738042259892

In [159]:
result = evaluate_association_rules(new_rules1,evaluate_data)
result

0  rules miss


0.7081574353078712

In [162]:
#import association_rules
import imp
imp.reload(association_rules)
# 读取animation 和 animation_feature
animation = pd.read_json("./test_data/bilibili_crawler_animation.json", encoding="utf-8")
animation["score"] = animation["score"].fillna('%.1f' % animation["score"].mean())
animation[["follow", "play"]] = animation[["follow", "play"]].applymap(association_rules.trans)
animation_feature = pd.read_json("./test_data/bilibili_crawler_animation_feature.json",dtype={"character_voice_list": str})
animation_feature[["tag_list", "character_voice_list", "character_staff_list"]] = animation_feature[["tag_list", "character_voice_list", "character_staff_list"]].applymap(json.loads)

# #将关联规则的后项展开
rules_df = association_rules.unfold_rules(rules_df)
# 对关联规则进行扩展，综合考虑置信度、番剧评分、声优导演等信息
rules_weight = {"confidence": 0.5, "score": 0.5, "play": 0.1, "follow": 0.1, "voice": 0.1, "staff": 0.1}
rules_df = association_rules.add_score(rules_df, rules_weight, animation, animation_feature)
for index,row in rules_df.iterrows():
    print(row["rules_a"], row["rules_b"], row["confidence"], row["score"])

10339
<class 'generator'>
-------------挖掘频繁项集---------------
                fluent_patterns   support
0                      [102392]  0.315795
1                        [8892]  0.269272
2                [102392, 8892]  0.142083
3                [139252, 8892]  0.141213
4        [102392, 139252, 8892]  0.086952
5               [5267730, 8892]  0.144405
6       [102392, 5267730, 8892]  0.086179
7       [139252, 5267730, 8892]  0.103008
8                     [4312482]  0.205726
9             [102392, 4312482]  0.099139
10              [8892, 4312482]  0.103105
11      [139252, 8892, 4312482]  0.075249
12     [5267730, 8892, 4312482]  0.075152
13            [139252, 4312482]  0.123803
14           [5267730, 4312482]  0.120708
15   [139252, 5267730, 4312482]  0.091208
16            [130412, 4312482]  0.097011
17    [139252, 130412, 4312482]  0.076700
18   [5267730, 130412, 4312482]  0.072831
19                     [135652]  0.194893
20             [102392, 135652]  0.088306
21             

In [164]:
rules_df.sort_values(by='score',inplace=True,ascending=False)
rules_df

Unnamed: 0,rules_a,rules_b,confidence,score
15,{140552},135652,0.888640,5.495017
0,"{130412, 4316482, 139252, 4316382}",5267730,0.916201,5.479021
2,"{4762754, 139252, 4316382}",5267730,0.912545,5.477193
4,"{4316442, 4316382}",5267730,0.900451,5.471146
5,"{4316482, 130412, 4316382}",5267730,0.899230,5.470536
7,"{4316482, 139252, 4316382}",5267730,0.898513,5.470177
8,"{4762754, 4316382}",5267730,0.898070,5.469956
10,"{21986963, 4316382}",5267730,0.896299,5.469070
13,"{5267750, 4316382}",5267730,0.890783,5.466312
16,"{4312482, 139252, 4316382}",5267730,0.888423,5.465132


In [170]:
rules_df.columns

Index(['rules_a', 'rules_b', 'confidence', 'score'], dtype='object')

In [165]:
# 将规则转换为(rule_a,rule_b)的格式，根据rules的实际情况更改
new_rule = []
for index,row in rules_df.iterrows():
    b=[]
    for item in row[0]:
        b.append(str(item))
    new_rule.append([b,[str(row[1])]])
new_rule[:5]

[[['140552'], ['135652']],
 [['130412', '4316482', '139252', '4316382'], ['5267730']],
 [['4762754', '139252', '4316382'], ['5267730']],
 [['4316442', '4316382'], ['5267730']],
 [['4316482', '130412', '4316382'], ['5267730']]]

In [166]:
# 计算前100条规则的平均准确率
result = evaluate_association_rules(new_rule[:100],evaluate_data)
result

0  rules miss


0.8006241357525562

In [167]:
# 计算第100到200条规则的平均准确率
result = evaluate_association_rules(new_rule[100:200],evaluate_data)
result

0  rules miss


0.6984146866930883

In [168]:
# 计算第200到300条规则的平均准确率
result = evaluate_association_rules(new_rule[200:300],evaluate_data)
result

0  rules miss


0.7426660868893857

In [169]:
# 计算全部规则的平均准确率
result = evaluate_association_rules(new_rule,evaluate_data)
result

0  rules miss


0.7166957931575683

分别对前100条规则、第100到200条规则、第200到300条规则、全部规则进行评估，
发现使用靠近前面的规则进行推荐，平均准确率较高，证明了使用关联规则的置信度对规则进行排名有一定的优化效果。


In [100]:
for rule_a,rule_b in new_rule[:5]:
    set_rule_a = set(rule_a)
    set_rule_b = set(rule_b)
    
    print(set_rule_a,set_rule_b)

{'1587'} {'1', '6', '5', '8'}
{'4316482', '4316382', '5267730', '130412'} {'5', '1', '9', '3', '2'}
{'4316482', '5267730', '134912'} {'5', '1', '9', '3', '2'}
{'4316482', '4316382', '130412'} {'5', '1', '9', '3', '2'}
{'4316482', '134912', '130412'} {'5', '1', '9', '3', '2'}
