In [8]:
#Apriori
def find_itemset(data, min_support): # used together with data cleaning
    '''Mine a reference table of frequent item combinations with Apriori.

    data: raw DataFrame of consumption records (e.g. first-visit data). It
        must contain the columns '关联键', '证件号（id）', 'date' and
        '消费项目'; rows sharing the first three columns form one record
        (transaction).
    min_support: an itemset is kept when its support is >= this threshold.

    Returns a dict mapping itemset size k to a dict of
    {string form of the itemset: support}. Size-1 keys look like "{'item'}";
    keys of larger itemsets are the ``repr`` of the corresponding ``set``.

    NOTE: size-1 support is counted over the raw rows of ``data``, whereas
    the support of larger itemsets is the number of grouped records that
    contain the whole itemset.

    Relies on the module-level names ``pd`` (pandas) and ``np`` (numpy).
    '''
    from collections import Counter, defaultdict
    from itertools import combinations

    dict_itemset = {}  # final output, keyed by itemset size
    data_itemset = data.groupby(['关联键','证件号（id）','date'])['消费项目'].unique().reset_index(drop = False)
    print("一共存在{}个初/复诊消费组合记录".format(len(data_itemset)))

    # One set of distinct items per grouped record.
    transactions = [set(items) for items in data_itemset['消费项目']]

    # c1: raw row count of every single item, in order of first appearance.
    # A single Counter pass replaces one full list scan per unique item.
    row_counts = Counter(data['消费项目'])
    c1 = {item: row_counts[item] for item in data['消费项目'].unique()}

    forlower = pd.DataFrame(np.array(list(c1.values()))).describe()
    print("单一项目作为itemset时，所有support的分布为{}，\n可作为取min_support参考".format(forlower))

    # Items below the threshold can never be part of a frequent itemset; the
    # survivors are the building blocks of all longer itemsets.
    c1 = {key: value for key, value in c1.items() if value >= min_support}

    # The size-1 level is also stored so the exported table is complete.
    formatedc1 = defaultdict(int)
    for key, value in c1.items():
        formatedc1["{'" + key + "'}"] = value
    dict_itemset[1] = formatedc1

    units = list(c1)  # items still able to appear in a frequent itemset
    previous_frequent = {frozenset([item]) for item in units}
    size = 2

    while True:
        counts = defaultdict(int)
        current_frequent = set()

        for combo in combinations(units, size):
            # Apriori pruning: every (size-1)-subset of a frequent itemset is
            # itself frequent, so a candidate with an infrequent subset is
            # skipped before the expensive scan. Real sets are compared here
            # instead of parsing the repr string, which broke on item names
            # containing commas or quotes.
            if any(frozenset(sub) not in previous_frequent
                   for sub in combinations(combo, size - 1)):
                continue

            candidate = set(combo)
            support = sum(1 for record in transactions if candidate.issubset(record))
            if support >= min_support:
                counts["{}".format(candidate)] = support
                current_frequent.add(frozenset(candidate))

        if not counts:
            break

        dict_itemset[size] = counts

        # Only items that occur in some frequent itemset of this size can
        # occur in a larger one; keeping the original order keeps the
        # candidate order stable.
        alive = set().union(*current_frequent)
        units = [item for item in units if item in alive]
        previous_frequent = current_frequent
        size += 1

    return dict_itemset

In [10]:
# two_way_sorting
def two_way(data): # used together with set_to_cell
    '''Greedily pick one package from a consumption string: compare
    cardinality first, then frequency.

    data: raw consumption string, items separated by ",".

    Reads the module-level names ``item_keys`` and ``item_values``.
    NOTE(review): presumably ``item_keys`` is a list of sets (the reference
    table of frequent itemsets) and ``item_values`` holds their frequencies
    in the same order -- confirm against where they are built.

    Returns one of two shapes:
      * a list of sets, when the split is finished in this call (the list
        may be empty if nothing is in the reference table);
      * a tuple ``([chosen_set], remainder)`` where ``remainder`` is the
        ","-joined string of the items not yet packed; the caller is
        expected to call this function again on ``remainder``.
    '''
    from itertools import combinations

    itemset = set(data.split(","))

    # Cardinality has top priority: if the full combination is itself in the
    # reference table it is taken as a single package.
    if itemset in item_keys:
        return [itemset]

    # A single item that is not in the table is low-frequency: drop it.
    if len(itemset) == 1:
        return []

    # Two items whose pair is not in the table: keep each item only if it is
    # in the table on its own.
    if len(itemset) == 2:
        return [{item} for item in list(itemset) if {item} in item_keys]

    # Three or more items: look up every proper subset, largest first.
    frequencies = list(item_values)  # materialised once instead of per match
    matched = []            # (subset, frequency) pairs found in the table
    unmatched_singles = []  # single items that are not in the table
    for size in range(len(itemset) - 1, 0, -1):  # never build size-0 subsets
        for combo in combinations(itemset, size):
            subset = set(combo)
            if subset in item_keys:
                matched.append((subset, frequencies[item_keys.index(subset)]))
            elif size == 1:
                unmatched_singles.append(subset)

    # Nothing is in the table: every item goes out as its own package.
    if not matched:
        return unmatched_singles

    # Subsets were visited largest first, so the first match has the
    # greatest cardinality. The real sets are kept, so no repr-string
    # parsing is needed to recover sizes or members.
    top_cardinality = len(matched[0][0])

    if top_cardinality == 1:
        # Only single items matched: singles in the table followed by the
        # singles that are not.
        return [subset for subset, _ in matched] + unmatched_singles

    # Among the largest matches take the most frequent one; max() returns
    # the first of several equally frequent candidates.
    largest = [pair for pair in matched if len(pair[0]) == top_cardinality]
    superevent, _ = max(largest, key=lambda pair: pair[1])

    # The chosen package is removed; the rest is handed back as a string for
    # the next round.
    itemset -= superevent
    return [superevent], ",".join(list(itemset))

In [11]:
def set_to_cell(dat): # used together with two_way
    '''Split one raw consumption string into packages by applying two_way
    repeatedly until every item has been handled.

    dat: raw consumption string, items separated by ",".

    Returns a list of sets, in the order the packages were produced.
    '''
    tocell = []

    result = two_way(dat)
    # two_way returns a tuple ([package], remainder_string) while items are
    # still left to split, and a plain list of sets once it is done. Every
    # tuple result is consumed here, so the package found in each round is
    # kept (previously the package of every other round was dropped).
    while isinstance(result, tuple):
        packed, remainder = result
        tocell += packed
        result = two_way(remainder)

    # Final round: the remaining packages (possibly none).
    tocell += result
    return tocell

            