In [1]:
# 导入所需模块
import pandas as pd
import numpy as np
import json
import fp_growth_py3 as fpg
import matplotlib.pyplot as plt
import pylab as pl
from collections import defaultdict


In [2]:
def evaluate_association_rules(rules_to_evaluate,test_datas):
    """
    Parameters：
        rules_to_evaluate：要评估的关联规则（推荐规则） list((rule_a,rule_b))
        test_datas：测试用的数据集 dataframe

    Returns：
        关联规则（推荐规则）在测试集上的平均准确率
    """
    point_sum = 0
    miss = 0
    for rule_a,rule_b in rules_to_evaluate:
        set_rule_a = set(rule_a)
        set_rule_b = set(rule_b)  
        
        num_a = 0
        num_b = 0
        
        for test_data in test_datas.value:
            set_test_data = set(test_data)
            if set_rule_a.issubset(set_test_data):
                num_a += 1
                if set_rule_b.issubset(set_test_data):
                    num_b += 1
        if num_a:
            point_sum += num_b / num_a
        else:
            miss += 1
#     print(miss,' rules miss')
    return point_sum / len(rules_to_evaluate)
    

In [3]:
# 读入测试数据集
filepath='./test_data/user_following_animation.json'
data=pd.read_json(filepath,lines=True)
user_info = pd.read_csv("test_data/bilibili_crawler_user_info.csv",names = ['id','mid','name','sex','sign','the_rank','level','jointime','moral','silence','birthday','coins','fans_badge','role','title','desc','vip_type','vip_status'])

user_info.drop(user_info[user_info.vip_type.isna() | user_info.the_rank.isna() | user_info.level.isna()].index.tolist(),inplace=True) # 将vip_type、the_rank、level有NaN的行去掉


In [4]:
add_rules={'the_rank':{10000:1,20000:2,25000:3,30000:4},'level':{'3':1,'4':2,'5':3,'6':4},'vip_type':{0:0,1:1,2:2}}


In [5]:
for index,row in data.iterrows():
    try:
        int(row.key)
    except:
        print(row)
        data.drop(index,inplace=True)

db                                                       7
key                                         finished_users
size                                                507576
ttl                                                     -1
type                                                   set
value    [330817737, 74775, 259640193, 24774761, 540994...
Name: 4504, dtype: object


In [6]:
def user_power(data,rules,user_info):
    """
    params:
        data:用户收藏ID数据集
        rules:权重规则
        user_info:用户信息数据集
    return:新数据集
        
    """
    new_data = pd.DataFrame(columns=data.columns)
    for index,row in data.iterrows():
        info = user_info[user_info.mid == int(row.key)]

        if len(info):
            the_power = rules['the_rank'][info.the_rank.values[0]] + rules['level'][info.level.values[0]] + rules['vip_type'][info.vip_type.values[0]]
        else:  # len(info)==0 说明在user_info中没有这个用户的相关数据
            the_power = 1
#         print(the_power)
        for i in range(the_power):
            new_data = new_data.append(row,ignore_index=True)
    return new_data

In [7]:
new_data= user_power(data,add_rules,user_info)
training_data = new_data.sample(n=None, frac=0.9, replace=False, weights=None, random_state=None, axis=None)
evaluate_data = new_data.drop([x for x in training_data.index])


In [8]:

data_list = list(training_data["value"])
frequent_itemsets = fpg.find_frequent_itemsets(data_list, minimum_support=0.07 * len(data_list), include_support=True)
print(type(frequent_itemsets))  # print type
result = []
for itemset, support in frequent_itemsets:  # 将generator结果存入list
    result.append((itemset, support / len(data_list)))

result_patterns = [i[0] for i in result]
result_support = [i[1] for i in result]
patterns_df = pd.DataFrame({"fluent_patterns": result_patterns, "support": result_support})
patterns = {}
for i in result:
    patterns[frozenset(sorted(i[0]))] = i[1]
print("-------------挖掘频繁项集---------------")
print(patterns_df)

def generate_rules(patterns, min_confidence):
    patterns_group = group_patterns_by_length(patterns)
    raw_rules = defaultdict(set)
    for length, pattern_list in patterns_group.items():
        if length == 1:
            continue
        for pattern, support in pattern_list:
            item_list = list(pattern)
            for window_size in range(1, length):
                for i in range(0, length - window_size):
                    for j in range(i + window_size, length):
                        base_set = frozenset(item_list[i:j])
                        predict_set = frozenset(pattern - base_set)
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))

                        base_set, predict_set = predict_set, base_set
                        confidence = support / patterns.get(base_set)
                        if confidence > min_confidence:
                            raw_rules[base_set].add((predict_set, confidence))
    return raw_rules

def group_patterns_by_length(patterns):
    result = defaultdict(list)
    for pattern, support in patterns.items():
        result[len(pattern)].append((pattern, support))
    return result

def transform(raw_rules):
    result = list()
    for base_set, predict_set_list in raw_rules.items():
        for predict_set, confidence in predict_set_list:
            result.append((base_set, predict_set, confidence))

    return result
raw_rules = generate_rules(patterns, 0.6)
rules = transform(raw_rules)
rules.sort(key=lambda x: x[2], reverse=True)
rules_a = [i[0] for i in rules]
rules_b = [i[1] for i in rules]
confidence = [i[2] for i in rules]
rules_df = pd.DataFrame({"rules_a": rules_a, "rules_b": rules_b, "confidence": confidence})
print(rules_df)

<class 'generator'>
-------------挖掘频繁项集---------------
                     fluent_patterns   support
0                             [8892]  0.306416
1                     [102392, 8892]  0.160047
2                           [139252]  0.304985
3                     [8892, 139252]  0.166800
4             [102392, 8892, 139252]  0.100641
5                   [102392, 139252]  0.157042
6                          [5267730]  0.300578
7                    [8892, 5267730]  0.168717
8            [102392, 8892, 5267730]  0.099010
9                  [139252, 5267730]  0.191925
10           [8892, 139252, 5267730]  0.123419
11   [102392, 8892, 139252, 5267730]  0.075001
12         [102392, 139252, 5267730]  0.106107
13                 [102392, 5267730]  0.155125
14                          [130412]  0.252046
15                    [8892, 130412]  0.149259
16            [102392, 8892, 130412]  0.082899
17                  [139252, 130412]  0.169204
18            [8892, 139252, 130412]  0.112402
19   

In [9]:
import association_rules
animation = pd.read_json("./test_data/bilibili_crawler_animation.json", encoding="utf-8")
animation["score"] = animation["score"].fillna('%.1f' % animation["score"].mean())
animation[["follow", "play"]] = animation[["follow", "play"]].applymap(association_rules.trans)
animation_feature = pd.read_json("./test_data/bilibili_crawler_animation_feature.json",dtype={"character_voice_list": str})
animation_feature[["tag_list", "character_voice_list", "character_staff_list"]] = animation_feature[["tag_list", "character_voice_list", "character_staff_list"]].applymap(json.loads)

# #将关联规则的后项展开
rules_df = association_rules.unfold_rules(rules_df)


In [10]:
def find_good_para(rules,animation, animation_feature,confidence=0.5,score=0.1,play=0.1,follow=0.1,voice=0.1,staff=0.1):
    rules_weight = {"confidence": confidence, "score": score, "play": play, "follow": follow, "voice": voice, "staff": staff}
    rules_df = association_rules.add_score(rules, rules_weight, animation, animation_feature)
    rules_df.sort_values(by='score',inplace=True,ascending=False)
    
    new_rule = []
    for index,row in rules_df.iterrows():
        b=[]
        for item in row[0]:
            b.append(str(item))
        new_rule.append([b,[str(row[1])]])
    
    print(rules_weight)
    print('\t000-100 : ',evaluate_association_rules(new_rule[:100],evaluate_data))
    return new_rule
#     print('\t100-200 : ',evaluate_association_rules(new_rule[100:200],evaluate_data))
#     print('\t200-300 : ',evaluate_association_rules(new_rule[200:300],evaluate_data))
#     print('\tall_data: ',evaluate_association_rules(new_rule,evaluate_data))

In [11]:
config = {
    "confidence": 0.5, 
    "score": 0.1, 
    "play": 0.1, 
    "follow": 0.1, 
    "voice": 0.1, 
    "staff": 0.1
}


final_rule = find_good_para(rules_df,animation, animation_feature,**config)


{'confidence': 0.5, 'score': 0.1, 'play': 0.1, 'follow': 0.1, 'voice': 0.1, 'staff': 0.1}
	000-100 :  0.8448568719800765


In [12]:
def recommend(rules,user_data):
    """
    Params:
        rules:推荐规则    格式：[[rules_a][rules_b]]
        user_data:用户收藏信息  格式：Dataframe 包含value
        
    Returns:
        推荐影片ID列表（10个以内）
    """
    recommend_list = []
    for rule_a,rule_b in rules:
        rule_a_set = set(rule_a)
        user_data_set = set(user_data.value) 

        if rule_a_set.issubset(user_data_set):
            if rule_b[0] not in recommend_list: # rule_b都是只有一个影片，所以用rule_b[0]提取出来就行
                recommend_list.append(rule_b[0])           
    return recommend_list[:10]

In [13]:
training_data.iloc[0]

db                                                       7
key                                                2328126
size                                                   252
ttl                                                     -1
type                                                   set
value    [53, 519, 779, 963, 964, 965, 1512, 1553, 1586...
Name: 13502, dtype: object

In [14]:
recommend(final_rule,training_data.iloc[0])

['1586',
 '139252',
 '5267730',
 '135652',
 '8892',
 '1587',
 '102392',
 '4312482',
 '130412',
 '4316382']

In [16]:
len(final_rule)

675

In [17]:
len(training_data)

34946

In [18]:
len(evaluate_data)

3883

In [19]:
len(new_data)

38829

In [36]:
def get_support(rule,data):
    set_a = set(rule[0])
    set_b = set(rule[1])
    count = 0
    
    for index,row in data.iterrows():
        set_row = set(row.value)
        if set_a.issubset(set_row) or set_b.issubset(set_row):
            count +=1 
    return count/len(data)

In [40]:
def get_confidence(rule,data):
    set_a = set(rule[0])
    set_b = set(rule[1])
    
    num_x = 0
    num_xy = 0
    for index,row in data.iterrows():
        set_row = set(row.value)
        if set_a.issubset(set_row):
            num_x += 1
            if set_b.issubset(set_row):
                num_xy+=1
    return num_xy/num_x
    

In [42]:
get_support(final_rule[0],evaluate_data)

0.15117177440123616

In [43]:
get_confidence(final_rule[0],evaluate_data)

0.8848167539267016

In [59]:
def get_lift(rule,data):
    set_a = set(rule[0])
    set_b = set(rule[1])
    
    num_x = 0
    num_y = 0
    num_xy = 0
    for index,row in data.iterrows():
        set_row = set(row.value)
        if set_a.issubset(set_row):
            num_x += 1
            if set_b.issubset(set_row):
                num_xy+=1
        if set_b.issubset(set_row):
            num_y +=1
    confi = num_xy/num_x
    l = len(data)
    return confi/num_y*l
    

In [61]:
get_lift(final_rule[0],evaluate_data)

6.327336013807334

In [53]:
def all_max_conf_kulc_cosine(rule,data):
    set_x = set(rule[0])
    set_y = set(rule[1])
    
    num_x = 0
    num_y = 0
    num_xy = 0
    p_x_y = 0
    p_y_x = 0
    for index,row in data.iterrows():
        set_row = set(row.value)
        if set_x.issubset(set_row):
            num_x += 1
            if set_y.issubset(set_row):
                num_xy+=1
        if set_y.issubset(set_row):
            num_y +=1
            
    p_x_y = num_xy / num_y
    p_y_x = num_xy / num_x
    
    
    return (min(p_x_y,p_y_x),max(p_x_y,p_y_x),(p_x_y+p_y_x)/2,pow((p_x_y*p_y_x),0.5))
    

In [67]:
print('all_conf,\t\tmax_conf,\t\tKulc,\t\t\tcosine')
for rule in final_rule[:10]:
    print(all_max_conf_kulc_cosine(rule,evaluate_data))

all_conf,		max_conf,		Kulc,			cosine
(0.6224677716390423, 0.8848167539267016, 0.7536422627828719, 0.7421387425041525)
(0.2270042194092827, 0.8996655518394648, 0.5633348856243737, 0.4519157845492225)
(0.23291139240506328, 0.8961038961038961, 0.5645076442544796, 0.4568509671448235)
(0.249789029535865, 0.8835820895522388, 0.5666855595440519, 0.46979688447724455)
(0.22784810126582278, 0.8794788273615635, 0.5536634643136931, 0.44764671440526016)
(0.25316455696202533, 0.8746355685131195, 0.5639000627375724, 0.47056001339452236)
(0.25569620253164554, 0.8706896551724138, 0.5631929288520297, 0.4718389962806955)
(0.28354430379746837, 0.8682170542635659, 0.5758806790305171, 0.49616327977416025)
(0.23050556983718937, 0.9087837837837838, 0.5696446768104866, 0.4576895497385517)
(0.2919831223628692, 0.8671679197994987, 0.5795755210811839, 0.5031882320126055)


In [56]:
pow(4,0.5)


2.0

In [62]:
for rule in final_rule:
    print(get_lift(rule,evaluate_data))

6.327336013807334
2.948018006576069
2.9363471971066906
2.8953158259336234
2.881870284088566
2.865999926191091
2.853069983995344
2.8449677820298955
3.023828134046643
2.8415299853008045
3.10235509732349
2.779052502269935
2.9797030272800527
2.7431440626883665
2.8295668737610327
2.9697706838557854
3.024014448910414
2.982533487841372
3.0258289587163723
2.8684701026591477
2.609482769676683
2.8043721363671663
2.805934394991153
2.977089252694719
2.83264958568451
2.7639038708493855
2.7785179868632826
2.726147551464007
2.979860184612715
2.930813253948169
2.743692394995286
2.7171205728167753
2.7165920611863754
2.9985863748082244
2.961118925589993
2.8058768538871623
2.7198679141441935
2.969006657438534
2.7367630766329585
2.54358812970604
2.7282551720292196
2.6922841829170943
2.7256136081798825
3.060753307448402
2.8805214836577306
2.9599636696860836
2.707769271583415
2.959488961517097
2.6526421539079768
2.7135944092827002
2.7564092094104344
2.732281611138239
2.629283205740418
2.8130009737098347
2.7

In [24]:
set(final_rule[0][1])

{'1586'}

In [26]:
training_data.iloc[0].value

['53',
 '519',
 '779',
 '963',
 '964',
 '965',
 '1512',
 '1553',
 '1586',
 '1587',
 '1588',
 '1650',
 '1732',
 '2660',
 '2843',
 '3120',
 '3448',
 '3449',
 '3467',
 '4340',
 '4349',
 '6075',
 '6301',
 '6439',
 '8752',
 '8892',
 '10352',
 '11712',
 '11912',
 '120732',
 '130412',
 '134912',
 '134932',
 '135652',
 '138832',
 '139252',
 '140552',
 '4312482',
 '4313642',
 '4316382',
 '4316422',
 '4316442',
 '4316462',
 '4316482',
 '4762714',
 '4762734',
 '4762754',
 '5267730',
 '5267750',
 '21971025']