In [180]:
def evaluate_association_rules(association_rules,test_datas):
    """
    Parameters：
        association_rules：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_datas：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的平均准确率
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in association_rules:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
    print(miss,' rules miss')
    return point_sum / len(association_rules)
    

### 函数测试

In [125]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict

In [218]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)
user_info = pd.read_csv("test_data/bilibili_crawler_user_info.csv",names = ['id','mid','name','sex','sign','the_rank','level','jointime','moral','silence','birthday','coins','fans_badge','role','title','desc','vip_type','vip_status'])

len(user_info)


51432

In [219]:
user_info.drop(user_info[user_info.vip_type.isna() | user_info.the_rank.isna() | user_info.level.isna()].index.tolist(),inplace=True) # 将vip_type、the_rank、level有NaN的行去掉
len(user_info)

51424

In [220]:
# 设置权重字典
add_rules={'the_rank':{10000:1,20000:2,25000:3,30000:4},'level':{'3':1,'4':2,'5':3,'6':4},'vip_type':{0:0,1:1,2:2}}


In [221]:
# 去除一下异常数据
for index,row in data.iterrows():
    try:
        int(row.key)
    except:
        print(row)
        data.drop(index,inplace=True)

db                                                       7
key                                         finished_users
size                                                507576
ttl                                                     -1
type                                                   set
value    [330817737, 74775, 259640193, 24774761, 540994...
Name: 4504, dtype: object


In [201]:
def user_power(data,rules,user_info):
    """
    params:
        data:用户收藏ID数据集
        rules:权重规则
        user_info:用户信息数据集
    return:新数据集
        
    """
    new_data = pd.DataFrame(columns=data.columns)
    for index,row in data.iterrows():
        info = user_info[user_info.mid == int(row.key)]

        if len(info):
            the_power = rules['the_rank'][info.the_rank.values[0]] + rules['level'][info.level.values[0]] + rules['vip_type'][info.vip_type.values[0]]
        else:  # len(info)==0 说明在user_info中没有这个用户的相关数据
            the_power = 1
#         print(the_power)
        for i in range(the_power):
            new_data = new_data.append(row,ignore_index=True)
    
    return new_data

In [202]:
# 根据权重规则对数据进行扩充
new_data= user_power(data,add_rules,user_info)
len(new_data)

38829

将测试数据集 划分为training_data与evaluate_data两部分，占比可调

In [205]:
training_data = new_data.sample(n=None, frac=0.9, replace=False, weights=None, random_state=None, axis=None)

In [206]:
print(len(training_data))
training_data.head()

34946


Unnamed: 0,db,key,size,ttl,type,value
25630,7,231642237,24,-1,set,"[6339, 139352, 4316442, 5268038]"
15916,7,2969676,70,-1,set,"[240, 844, 1553, 1635, 1687, 2271, 2602, 2876,..."
16400,7,33633146,10,-1,set,"[3462, 102792]"
25899,7,867942,262,-1,set,"[177, 184, 191, 856, 1512, 1552, 1559, 2338, 2..."
3319,7,312907,83,-1,set,"[184, 5998, 6301, 6312, 6339, 6420, 6434, 6439..."


In [207]:
evaluate_data = new_data.drop([x for x in training_data.index])

In [208]:
print(len(evaluate_data))
evaluate_data.head()

3883


Unnamed: 0,db,key,size,ttl,type,value
11,7,37868878,1246,-1,set,"[53, 110, 249, 282, 333, 334, 419, 470, 471, 5..."
14,7,37868878,1246,-1,set,"[53, 110, 249, 282, 333, 334, 419, 470, 471, 5..."
71,7,7862266,484,-1,set,"[480, 688, 814, 981, 994, 1064, 1078, 1083, 10..."
79,7,38115290,409,-1,set,"[2580, 2760, 5786, 5849, 5856, 6360, 6434, 657..."
86,7,8784405,183,-1,set,"[425, 963, 1178, 1463, 1660, 3151, 3756, 4378,..."


用training_data进行训练，导出关联规则（推荐规则）

注：这里只是用置信度进行了排名，因为只是对评估函数进行测试，所以没有加入更多的排名方法。

In [209]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.07 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                 fluent_patterns   support
0                         [6339]  0.170377
1                 [102392, 6339]  0.088107
2                 [139252, 6339]  0.090454
3                   [8892, 6339]  0.093001
4                 [130412, 6339]  0.081583
5                   [5997, 6339]  0.096950
6                   [3461, 6339]  0.083042
7                [4316382, 6339]  0.070251
8                   [8792, 6339]  0.070824
9                   [5800, 6339]  0.077377
10                  [6446, 6339]  0.073685
11               [5267730, 6339]  0.092800
12                [134932, 6339]  0.079609
13                     [4316442]  0.148887
14             [139252, 4316442]  0.101099
15               [8892, 4316442]  0.087592
16            [5267730, 4316442]  0.114491
17    [139252, 5267730, 4316442]  0.086419
18      [8892, 5267730, 4316442]  0.073113
19             [130412, 4316442]  0.078206
20            [4312482, 4316442]  0.070909

In [210]:
# 将规则转换为(rule_a,rule_b)的格式，根据rules的实际情况更改
new_rule = []
for a,b,c in rules:
    new_rule.append([a,b])

In [211]:
new_rule[:5]

[[frozenset({'130412', '4316382', '4316482', '5267730'}),
  frozenset({'139252'})],
 [frozenset({'134912', '4316482', '5267730'}), frozenset({'139252'})],
 [frozenset({'139252', '4316382', '4762754'}), frozenset({'5267730'})],
 [frozenset({'130412', '139252', '4316382', '4316482'}),
  frozenset({'5267730'})],
 [frozenset({'130412', '4316382', '4316482'}), frozenset({'139252'})]]

In [212]:
# 计算前100条规则的平均准确率
result = evaluate_association_rules(new_rule[:100],evaluate_data)
result

0  rules miss


0.8567302328664756

In [213]:
# 计算第100到200条规则的平均准确率
result = evaluate_association_rules(new_rule[100:200],evaluate_data)
result

0  rules miss


0.7807110490250396

In [214]:
# 计算第200到300条规则的平均准确率
result = evaluate_association_rules(new_rule[200:300],evaluate_data)
result

0  rules miss


0.73296587114293

In [215]:
# 计算全部规则的平均准确率
result = evaluate_association_rules(new_rule,evaluate_data)
result

0  rules miss


0.7149250788211877

分别对前100条规则、第100到200条规则、第200到300条规则、全部规则进行评估，
发现使用靠近前面的规则进行推荐，平均准确率较高，证明了使用关联规则的置信度对规则进行排名有一定的优化效果。
