In [3]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict


In [18]:
def evaluate_association_rules(rules_to_evaluate,test_datas):
    """
    Parameters：
        rules_to_evaluate：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_datas：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的平均准确率
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in rules_to_evaluate:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)  
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
#     print(miss,' rules miss')
    return point_sum / len(rules_to_evaluate)
    

In [5]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)
user_info = pd.read_csv("test_data/bilibili_crawler_user_info.csv",names = ['id','mid','name','sex','sign','the_rank','level','jointime','moral','silence','birthday','coins','fans_badge','role','title','desc','vip_type','vip_status'])

user_info.drop(user_info[user_info.vip_type.isna() | user_info.the_rank.isna() | user_info.level.isna()].index.tolist(),inplace=True) # 将vip_type、the_rank、level有NaN的行去掉


In [6]:
add_rules={'the_rank':{10000:1,20000:2,25000:3,30000:4},'level':{'3':1,'4':2,'5':3,'6':4},'vip_type':{0:0,1:1,2:2}}


In [7]:
for index,row in data.iterrows():
    try:
        int(row.key)
    except:
        print(row)
        data.drop(index,inplace=True)

db                                                       7
key                                         finished_users
size                                                507576
ttl                                                     -1
type                                                   set
value    [330817737, 74775, 259640193, 24774761, 540994...
Name: 4504, dtype: object


In [8]:
def user_power(data,rules,user_info):
    """
    params:
        data:用户收藏ID数据集
        rules:权重规则
        user_info:用户信息数据集
    return:新数据集
        
    """
    new_data = pd.DataFrame(columns=data.columns)
    for index,row in data.iterrows():
        info = user_info[user_info.mid == int(row.key)]

        if len(info):
            the_power = rules['the_rank'][info.the_rank.values[0]] + rules['level'][info.level.values[0]] + rules['vip_type'][info.vip_type.values[0]]
        else:  # len(info)==0 说明在user_info中没有这个用户的相关数据
            the_power = 1
#         print(the_power)
        for i in range(the_power):
            new_data = new_data.append(row,ignore_index=True)
    return new_data

In [9]:
new_data= user_power(data,add_rules,user_info)
training_data = new_data.sample(n=None, frac=0.9, replace=False, weights=None, random_state=None, axis=None)
evaluate_data = new_data.drop([x for x in training_data.index])


In [10]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.07 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                      fluent_patterns   support
0                            [102392]  0.329480
1                              [8892]  0.305385
2                      [102392, 8892]  0.159475
3                            [139252]  0.304842
4                    [102392, 139252]  0.157700
5                      [8892, 139252]  0.165598
6              [102392, 8892, 139252]  0.100355
7                           [5267730]  0.299748
8                   [102392, 5267730]  0.155096
9                     [8892, 5267730]  0.168088
10            [102392, 8892, 5267730]  0.099010
11                  [139252, 5267730]  0.191553
12          [102392, 139252, 5267730]  0.106135
13            [8892, 139252, 5267730]  0.122246
14    [102392, 8892, 139252, 5267730]  0.074629
15                          [4312482]  0.225891
16                  [102392, 4312482]  0.110971
17                    [8892, 4312482]  0.119985
18            [102392, 8892, 4312

In [11]:
import association_rules
animation = pd.read_json("./test_data/bilibili_crawler_animation.json", encoding="utf-8")
animation["score"] = animation["score"].fillna('%.1f' % animation["score"].mean())
animation[["follow", "play"]] = animation[["follow", "play"]].applymap(association_rules.trans)
animation_feature = pd.read_json("./test_data/bilibili_crawler_animation_feature.json",dtype={"character_voice_list": str})
animation_feature[["tag_list", "character_voice_list", "character_staff_list"]] = animation_feature[["tag_list", "character_voice_list", "character_staff_list"]].applymap(json.loads)

# #将关联规则的后项展开
rules_df = association_rules.unfold_rules(rules_df)


In [28]:
def find_good_para(rules,animation, animation_feature,confidence=0.5,score=0.1,play=0.1,follow=0.1,voice=0.1,staff=0.1):
    rules_weight = {"confidence": confidence, "score": score, "play": play, "follow": follow, "voice": voice, "staff": staff}
    rules_df = association_rules.add_score(rules, rules_weight, animation, animation_feature)
    rules_df.sort_values(by='score',inplace=True,ascending=False)
    
    new_rule = []
    for index,row in rules_df.iterrows():
        b=[]
        for item in row[0]:
            b.append(str(item))
        new_rule.append([b,[str(row[1])]])
    
    print(rules_weight)
    print('\t000-100 : ',evaluate_association_rules(new_rule[:100],evaluate_data))
#     print('\t100-200 : ',evaluate_association_rules(new_rule[100:200],evaluate_data))
#     print('\t200-300 : ',evaluate_association_rules(new_rule[200:300],evaluate_data))
#     print('\tall_data: ',evaluate_association_rules(new_rule,evaluate_data))

In [47]:
config = {
    "confidence": 0.5, 
    "score": 0, 
    "play": 0, 
    "follow": 0, 
    "voice": 0, 
    "staff": 0.3
}


find_good_para(rules_df,animation, animation_feature,**config)


{'confidence': 0.5, 'score': 0, 'play': 0, 'follow': 0, 'voice': 0, 'staff': 0.3}
	000-100 :  0.8503290244248339
