## 1. 生成候选项集
- 构建大小为1的所有候选项集的集合C1。
- 扫描数据集，统计 C1 中每个候选项的支持度，保留满足最小支持度要求的候选项，构成频繁项集 L1

In [2]:
"""
加载数据
"""
def loadDataSet():
    """Return the toy transaction database used by the examples.

    Returns:
        A new list of four transactions; each transaction is a list of
        integer item ids.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions

"""
求第一次扫描数据库后的候选集
param:
    dataSet: 初始数据集
"""
def createC1(dataSet):
    """Build C1, the set of candidate 1-itemsets.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A set holding one single-element frozenset per distinct item found
        in any transaction.
    """
    return {
        frozenset([item])
        for transaction in dataSet
        for item in transaction
    }

In [14]:
"""
从第k次候选集中选出频繁项
param:
    D: 数据集
    Ck: 第k次候选集
    minSupport: 最小支持度
return:
    Lk -> List[list]: 频繁项
    supportData -> Dict{tuple: int}: key为频繁项value为支持度
"""
def generate_lk_by_ck(D, Ck, minSupport):
    """Select the frequent itemsets among the candidates in ``Ck``.

    Args:
        D: The data set, a sized collection of transactions (each an
            iterable of hashable items).
        Ck: Collection of candidate itemsets (lists, tuples or frozensets).
        minSupport: Minimum support expressed as a fraction of ``len(D)``.

    Returns:
        A pair ``(Lk, supportData)``. ``Lk`` is a list of the frequent
        itemsets, each as a list. ``supportData`` maps ``tuple(itemset)`` to
        the number of transactions containing it, for frequent itemsets only.
    """
    # Convert the relative threshold into an absolute transaction count.
    min_count = minSupport * len(D)

    # Convert each candidate and each transaction once instead of once per
    # (transaction, candidate) pair.
    candidates = [(tuple(ci), set(ci)) for ci in Ck]
    transactions = [set(transaction) for transaction in D]

    # Occurrence count of every candidate that appears at least once.
    candidateData = {}
    # Count against the D argument, not a module-level global.
    for transaction in transactions:
        for key, items in candidates:
            if items.issubset(transaction):
                candidateData[key] = candidateData.get(key, 0) + 1

    Lk = []
    supportData = {}
    for key, count in candidateData.items():
        if count >= min_count:
            Lk.append(list(key))
            supportData[key] = count

    return Lk, supportData
            

In [15]:
# Load the toy transactions and display them.
dataSet = loadDataSet()
dataSet

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [16]:
# Build the candidate 1-itemsets from the transactions and display them.
C1 = createC1(dataSet)
C1

{frozenset({2}),
 frozenset({3}),
 frozenset({1}),
 frozenset({5}),
 frozenset({4})}

In [17]:
# Keep the 1-item candidates found in at least 50% of the transactions.
L1, supportData = generate_lk_by_ck(dataSet, C1, 0.5)

## 2. 生成所有频繁项集
- 从 Lk-1 生成 Ck
- 检查每个候选项的所有 (k-1) 项子集是否都在频繁项集 Lk-1 中（不满足则剪枝）

In [18]:
"""
通过频繁项集Lk-1创建Ck候选项集
param:
    Lk-1: 第k-1次频繁项集
    k -> int: 第k次遍历
return:
    Ck: 第k次候选集
"""
def _ordered_items(itemset):
    """Return the items of ``itemset`` as a list, sorted when comparable."""
    try:
        return sorted(itemset)
    except TypeError:
        # Items of mixed, non-comparable types: keep the set's own order.
        return list(itemset)


def createCk(Lk_1, k):
    """Create the candidate k-itemsets Ck from the frequent (k-1)-itemsets.

    Two itemsets are joined when their union has exactly ``k`` items. A
    candidate is kept only if every one of its (k-1)-item subsets is in
    ``Lk_1`` (the Apriori pruning step). The result does not depend on the
    order of items inside the input itemsets.

    Args:
        Lk_1: Sequence of frequent (k-1)-itemsets (lists, tuples or
            frozensets of hashable items).
        k: Size of the candidates to generate.

    Returns:
        A list of candidate itemsets without duplicates, each as a list
        (sorted when its items are mutually comparable), in order of first
        discovery.
    """
    frequent = [frozenset(itemset) for itemset in Lk_1]
    frequent_lookup = set(frequent)

    Ck = []
    # Unions already examined, so each candidate is evaluated only once.
    seen = set()
    for i, left in enumerate(frequent):
        for right in frequent[i + 1:]:
            candidate = left | right
            if len(candidate) != k or candidate in seen:
                continue
            seen.add(candidate)
            # Pruning: every (k-1)-subset must itself be frequent.
            if all(candidate - {item} in frequent_lookup for item in candidate):
                Ck.append(_ordered_items(candidate))
    return Ck

"""
检查候选项Ck的子集是否都在Lk-1中
param:
    Ck: 第k次候选集
    Lk-1: 第k-1次频繁项集
"""
def has_infrequent_subset(Ck, Lk_1):
    """Placeholder subset check that always returns True.

    Both arguments are ignored and no subset test is performed.

    NOTE(review): the function name suggests that True means "an infrequent
    subset exists", while the original comment treats True as "all subsets
    are in Lk-1" - confirm the intended meaning before relying on it.

    Args:
        Ck: Candidate k-itemsets (unused).
        Lk_1: Frequent (k-1)-itemsets (unused).

    Returns:
        Always True.
    """
    # TODO: implement a real check; for now the pruning is assumed to have
    # been done when the candidates were created.
    return True

In [20]:
"""
生成所有频繁项集
param:
    D: 数据集
    minSupport: 最小支持度
return:
    L: 所有频繁项集
    supportData: 存放所有频繁项集支持度的字典
"""
def generate_L(dataSet, minSupport=0.5):
    """Generate the frequent itemsets of every size, level by level.

    Args:
        dataSet: The transactions to mine.
        minSupport: Minimum support passed on to ``generate_lk_by_ck``.

    Returns:
        A pair ``(L, supportData)``. ``L[i]`` is the list of frequent
        itemsets found at level ``i + 1``. ``supportData`` merges the
        support dictionaries returned for every level.
    """
    first_level, supportData = generate_lk_by_ck(
        dataSet, createC1(dataSet), minSupport
    )
    L = [first_level]
    previous_level = first_level.copy()
    size = 2
    while True:
        candidates = createCk(previous_level, size)
        current_level, level_support = generate_lk_by_ck(
            dataSet, candidates, minSupport
        )
        supportData.update(level_support)
        # Stop as soon as a level yields no frequent itemset.
        if not current_level:
            break
        previous_level = current_level.copy()
        L.append(previous_level)
        size += 1
    return L, supportData

In [21]:
# Mine the frequent itemsets of every size with a minimum support of 0.5.
L, supportData = generate_L(dataSet, 0.5)

In [22]:
# Candidate 2-itemsets built from the frequent 1-itemsets.
createCk(L[0], 2)

[[1, 3], [2, 3], [3, 5], [1, 2], [1, 5], [2, 5]]

In [24]:
# Re-run the full mining and print the frequent itemsets and their support counts.
L, supportData = generate_L(dataSet, minSupport=0.5)
print(L)
print(supportData)

[[[3], [1], [2], [5]], [[1, 3], [2, 3], [3, 5], [2, 5]], [[2, 3, 5]]]
{(3,): 3, (1,): 2, (2,): 3, (5,): 3, (1, 3): 2, (2, 3): 2, (3, 5): 2, (2, 5): 3, (2, 3, 5): 2}


## 3. 从频繁项集中挖掘关联规则
- 计算规则置信度
- 如果某条规则不满足最小置信度要求，那么由同一频繁项集生成、且后件是该规则后件超集的所有规则也不会满足（据此可以剪枝）

In [None]:
def generateRules(dataset, minSupport, minConf=0.7):
    """Mine association rules from the frequent itemsets of ``dataset``.

    Args:
        dataset: The transactions to mine.
        minSupport: Minimum support passed on to ``generate_L``.
        minConf: Minimum confidence a rule must reach to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    L, supportData = generate_L(dataset, minSupport)
    # The support counts are keyed by tuple, whereas calcConf looks itemsets
    # up with set objects (freqSet and freqSet - conseq); re-key by frozenset
    # so those lookups succeed regardless of item order.
    support = {frozenset(itemset): count for itemset, count in supportData.items()}

    big_rule = []
    # Single items cannot form a rule, so start at the 2-itemsets.
    for level in L[1:]:
        for itemset in level:
            freqSet = frozenset(itemset)
            H1 = [frozenset([item]) for item in itemset]
            # Always evaluate the 1-item consequents, whatever the itemset size.
            confident = calcConf(freqSet, H1, support, big_rule, minConf)
            # Larger consequents only exist for itemsets of 3+ items and are
            # built from the consequents that passed the confidence test.
            if len(freqSet) > 2 and len(confident) > 1:
                rulesFromConseq(freqSet, confident, support, big_rule, minConf)
    return big_rule

# Generate new candidate rules with larger consequents.
def rulesFromConseq(freqSet, H, supportData, big_rule, minConf):
    """Grow the consequents in ``H`` by one item and evaluate the new rules.

    Builds the (m+1)-item consequents from the m-item consequents in ``H``,
    passes them to ``calcConf`` and recurses on those that are returned while
    more than one remains.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of consequents, all of the same size m.
        supportData: Support lookup forwarded to ``calcConf``.
        big_rule: Output list forwarded to ``calcConf``, which appends to it.
        minConf: Minimum confidence forwarded to ``calcConf``.

    Returns:
        None.
    """
    if not H:
        # Nothing to extend; also avoids indexing H[0] on an empty list.
        return
    m = len(H[0])
    if len(freqSet) > m + 1:
        # createCk returns plain lists; calcConf subtracts each consequent
        # from freqSet, so they must be converted to sets first.
        Hmp1 = [frozenset(candidate) for candidate in createCk(H, m + 1)]
        Hmp1 = calcConf(freqSet, Hmp1, supportData, big_rule, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, big_rule, minConf)

def calcConf(freqSet, H, supportData, big_rule, minConf):
    """Evaluate the rules ``freqSet - conseq -> conseq`` for every consequent.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: Iterable of candidate consequents (subsets of ``freqSet``).
        supportData: Mapping from frozenset itemsets to their support; it is
            indexed with ``freqSet`` and with each antecedent.
        big_rule: List that receives one
            ``(antecedent, consequent, confidence)`` tuple per accepted rule.
        minConf: Minimum confidence for a rule to be accepted.

    Returns:
        The consequents (as frozensets) whose rule reached ``minConf``.
    """
    freqSet = frozenset(freqSet)
    prunedH = []
    for conseq in H:
        conseq = frozenset(conseq)
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        # The test must run for every consequent, not only the last one.
        if conf >= minConf:
            big_rule.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH

蛮力方式，遍历所有子集

In [32]:
def generate_R(dataset, minSupport, minConf):
    """Mine association rules by brute force over all frequent itemsets.

    Every frequent itemset is compared with all frequent itemsets processed
    before it; each proper subset found becomes the consequent of a rule.

    Args:
        dataset: The transactions to mine.
        minSupport: Minimum support passed on to ``generate_L``.
        minConf: Minimum confidence a rule must reach to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, where
        antecedent and consequent are sets, sorted by decreasing confidence.
    """
    L, supportData = generate_L(dataset, minSupport)
    # Look supports up by frozenset: building a tuple from a set difference
    # is not guaranteed to reproduce the item order of the stored tuple key.
    support = {frozenset(itemset): count for itemset, count in supportData.items()}

    # Rules that satisfy the confidence threshold.
    rule_list = []
    # Frequent itemsets already processed (all smaller or same-size sets).
    sub_set_list = []
    for level in L:
        for freq_set in level:
            items = frozenset(freq_set)
            for sub_set in sub_set_list:
                # Proper subsets only, so the antecedent is never empty.
                if sub_set < items:
                    antecedent = items - sub_set
                    conf = support[items] / support[antecedent]
                    big_rule = (set(antecedent), set(sub_set), conf)
                    if conf >= minConf and big_rule not in rule_list:
                        rule_list.append(big_rule)
            sub_set_list.append(items)
    # Stable sort: rules with equal confidence keep their discovery order.
    rule_list.sort(key=lambda rule: rule[2], reverse=True)
    return rule_list

In [34]:
# Brute-force rule mining with minimum support 0.5 and minimum confidence 0.5.
rules = generate_R(dataSet, 0.5, 0.5)
rules

[({1}, {3}, 1.0),
 ({5}, {2}, 1.0),
 ({2}, {5}, 1.0),
 ({3, 5}, {2}, 1.0),
 ({2, 3}, {5}, 1.0),
 ({3}, {1}, 0.6666666666666666),
 ({2}, {3}, 0.6666666666666666),
 ({3}, {2}, 0.6666666666666666),
 ({5}, {3}, 0.6666666666666666),
 ({3}, {5}, 0.6666666666666666),
 ({2, 5}, {3}, 0.6666666666666666),
 ({5}, {2, 3}, 0.6666666666666666),
 ({2}, {3, 5}, 0.6666666666666666),
 ({3}, {2, 5}, 0.6666666666666666)]

In [None]:
# Rule mining through generateRules with minimum support 0.5 and minimum confidence 0.5.
rules = generateRules(dataSet, 0.5, 0.5)