In [73]:
import numpy as np
import pandas as pd

In [74]:
def loadDataSet():
    return [[1,3,4,5,6],
            [2,3,5,6],
            [1,2,3,5],
            [1,2,5,6],
            [2,3,4,5,6],
            [1,2,4,5],
            [2,3,4,6]]
dataset = loadDataSet()
dataset

[[1, 3, 4, 5, 6],
 [2, 3, 5, 6],
 [1, 2, 3, 5],
 [1, 2, 5, 6],
 [2, 3, 4, 5, 6],
 [1, 2, 4, 5],
 [2, 3, 4, 6]]

In [75]:
def createC1(dataset):
    C1 = []
    for transaction in dataset:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset,C1))
C1 = createC1(dataset)
C1

[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5}),
 frozenset({6})]

In [76]:
def scanData(dataset, Ck, min_support=0.3):
    """
    Ck: k项集 -- 如C1 [{1},{2},{3},...]
    """
    support_data = {}
    sup_denominator = len(dataset)
    for transaction in dataset:
        for item in Ck:
            if item.issubset(set(transaction)):
                if not item in support_data:  # item 不能是哈希的（no set, frozeset can.） - 字典的键不能是哈希的
                    support_data[item] = 1
                else:
                    support_data[item] += 1
    
    retlist = []
    for item in support_data:
        support_data[item] /= sup_denominator
        if support_data[item] >= min_support:
            retlist.append(item)
    
    return retlist, support_data

C1_scan, support_data = scanData(dataset, C1)
C1_scan, support_data

([frozenset({1}),
  frozenset({3}),
  frozenset({4}),
  frozenset({5}),
  frozenset({6}),
  frozenset({2})],
 {frozenset({1}): 0.5714285714285714,
  frozenset({3}): 0.7142857142857143,
  frozenset({4}): 0.5714285714285714,
  frozenset({5}): 0.8571428571428571,
  frozenset({6}): 0.7142857142857143,
  frozenset({2}): 0.8571428571428571})

In [77]:
def aprioriGen(C, k):
    """
    拼接 - 一项集拼成二项集，以此类推
    C 是 k-1 项集，k是目标为k的项集数
    """
    retlist = []
    lenc = len(C)
    for i in range(lenc):
        for j in range(lenc):
            item = C[i].union(C[j]) # 返回的是拷贝
            if len(item)==k and item not in retlist: # 第一个判断是去除重复
                retlist.append(item)
    return retlist
            
C2 = aprioriGen(C1_scan, 2)
C2_scan = scanData(dataset, C2)
C2_scan

([frozenset({1, 5}),
  frozenset({3, 4}),
  frozenset({3, 5}),
  frozenset({3, 6}),
  frozenset({4, 5}),
  frozenset({4, 6}),
  frozenset({5, 6}),
  frozenset({2, 3}),
  frozenset({2, 5}),
  frozenset({2, 6}),
  frozenset({1, 2}),
  frozenset({2, 4})],
 {frozenset({1, 3}): 0.2857142857142857,
  frozenset({1, 4}): 0.2857142857142857,
  frozenset({1, 5}): 0.5714285714285714,
  frozenset({1, 6}): 0.2857142857142857,
  frozenset({3, 4}): 0.42857142857142855,
  frozenset({3, 5}): 0.5714285714285714,
  frozenset({3, 6}): 0.5714285714285714,
  frozenset({4, 5}): 0.42857142857142855,
  frozenset({4, 6}): 0.42857142857142855,
  frozenset({5, 6}): 0.5714285714285714,
  frozenset({2, 3}): 0.5714285714285714,
  frozenset({2, 5}): 0.7142857142857143,
  frozenset({2, 6}): 0.5714285714285714,
  frozenset({1, 2}): 0.42857142857142855,
  frozenset({2, 4}): 0.42857142857142855})

In [78]:
# 得到频繁项集和每个项集的支持度
def apriori(dataset, min_support=0.3):
    support_data = dict()
    C_total = []
    Ck = createC1(dataset) # C1

    k = 2
    while True:
        Ck_scan, support_data_son = scanData(dataset, Ck, min_support=min_support)
        if not Ck_scan: break
        C_total.append(Ck_scan)
        Ck = aprioriGen(Ck_scan, k)
        support_data.update(support_data_son)
        k += 1
    
    return C_total, support_data
    
fren_item,support_data = apriori(dataset)
fren_item, support_data

([[frozenset({1}),
   frozenset({3}),
   frozenset({4}),
   frozenset({5}),
   frozenset({6}),
   frozenset({2})],
  [frozenset({1, 5}),
   frozenset({3, 4}),
   frozenset({3, 5}),
   frozenset({3, 6}),
   frozenset({4, 5}),
   frozenset({4, 6}),
   frozenset({5, 6}),
   frozenset({2, 3}),
   frozenset({2, 5}),
   frozenset({2, 6}),
   frozenset({1, 2}),
   frozenset({2, 4})],
  [frozenset({3, 4, 6}),
   frozenset({3, 5, 6}),
   frozenset({2, 3, 5}),
   frozenset({2, 3, 6}),
   frozenset({2, 5, 6}),
   frozenset({1, 2, 5})]],
 {frozenset({1}): 0.5714285714285714,
  frozenset({3}): 0.7142857142857143,
  frozenset({4}): 0.5714285714285714,
  frozenset({5}): 0.8571428571428571,
  frozenset({6}): 0.7142857142857143,
  frozenset({2}): 0.8571428571428571,
  frozenset({1, 3}): 0.2857142857142857,
  frozenset({1, 4}): 0.2857142857142857,
  frozenset({1, 5}): 0.5714285714285714,
  frozenset({1, 6}): 0.2857142857142857,
  frozenset({3, 4}): 0.42857142857142855,
  frozenset({3, 5}): 0.57142857142

In [105]:
def rules(fren_item, support_data, min_conf=0.5, min_lift=0.5):
    transaction = []
    conf = []
    lift = []
    antecedent_support = []
    consequent_support = []
    for i in range(1,len(fren_item)): # i项集
        for j in range(0,i): # i --> j
            for item_i in fren_item[i]:
                for item_j in fren_item[j]:
                    if item_j.issubset(item_i):
                        conf_ = support_data[item_i] / support_data[item_i-item_j]
                        lift_ = conf_ / support_data[item_j]
                        transaction.append(f"{set(item_i-item_j)}-->{set(item_j)}")
                        conf.append(conf_)
                        lift.append(lift_)
                        antecedent_support.append(support_data[item_i])
                        consequent_support.append(support_data[item_j])
    ret = pd.DataFrame({"transaction":transaction,
                    "antecedent_support":antecedent_support,
                    "consequent_support":consequent_support,
                    "confidence":conf,
                    "lift":lift})
    return ret[(ret["confidence"]>min_conf) & (ret["lift"]>min_lift)]

rules(fren_item, support_data)

Unnamed: 0,transaction,antecedent_support,consequent_support,confidence,lift
0,{'Romance'}-->{'Comedy'},0.090741,0.385547,0.553885,1.43662
5,{'Romance'}-->{'Drama'},0.095874,0.447649,0.585213,1.307302


In [106]:
data = pd.read_csv("datasets/ml-latest-small/movies.csv")
data_preprocess = data.genres.tolist()
data_preprocess

['Adventure|Animation|Children|Comedy|Fantasy',
 'Adventure|Children|Fantasy',
 'Comedy|Romance',
 'Comedy|Drama|Romance',
 'Comedy',
 'Action|Crime|Thriller',
 'Comedy|Romance',
 'Adventure|Children',
 'Action',
 'Action|Adventure|Thriller',
 'Comedy|Drama|Romance',
 'Comedy|Horror',
 'Adventure|Animation|Children',
 'Drama',
 'Action|Adventure|Romance',
 'Crime|Drama',
 'Drama|Romance',
 'Comedy',
 'Comedy',
 'Action|Comedy|Crime|Drama|Thriller',
 'Comedy|Crime|Thriller',
 'Crime|Drama|Horror|Mystery|Thriller',
 'Action|Crime|Thriller',
 'Drama|Sci-Fi',
 'Drama|Romance',
 'Drama',
 'Children|Drama',
 'Drama|Romance',
 'Adventure|Drama|Fantasy|Mystery|Sci-Fi',
 'Crime|Drama',
 'Drama',
 'Mystery|Sci-Fi|Thriller',
 'Children|Drama',
 'Crime|Drama',
 'Children|Comedy',
 'Comedy|Romance',
 'Drama',
 'Drama|War',
 'Action|Crime|Drama',
 'Drama',
 'Action|Adventure|Fantasy',
 'Comedy|Drama|Thriller',
 'Drama|Romance',
 'Mystery|Thriller',
 'Animation|Children|Drama|Musical|Romance',
 'Dram

In [107]:
dataset = list(map(lambda x: x.split("|"),data_preprocess))
dataset

[['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
 ['Adventure', 'Children', 'Fantasy'],
 ['Comedy', 'Romance'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy'],
 ['Action', 'Crime', 'Thriller'],
 ['Comedy', 'Romance'],
 ['Adventure', 'Children'],
 ['Action'],
 ['Action', 'Adventure', 'Thriller'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy', 'Horror'],
 ['Adventure', 'Animation', 'Children'],
 ['Drama'],
 ['Action', 'Adventure', 'Romance'],
 ['Crime', 'Drama'],
 ['Drama', 'Romance'],
 ['Comedy'],
 ['Comedy'],
 ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'],
 ['Comedy', 'Crime', 'Thriller'],
 ['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'],
 ['Action', 'Crime', 'Thriller'],
 ['Drama', 'Sci-Fi'],
 ['Drama', 'Romance'],
 ['Drama'],
 ['Children', 'Drama'],
 ['Drama', 'Romance'],
 ['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi'],
 ['Crime', 'Drama'],
 ['Drama'],
 ['Mystery', 'Sci-Fi', 'Thriller'],
 ['Children', 'Drama'],
 ['Crime', 'Drama'],
 ['Children', 'Comedy'],
 

In [113]:
fren_item,support_data = apriori(dataset,min_support=0.025)
fren_item

[[frozenset({'Adventure'}),
  frozenset({'Animation'}),
  frozenset({'Children'}),
  frozenset({'Comedy'}),
  frozenset({'Fantasy'}),
  frozenset({'Romance'}),
  frozenset({'Drama'}),
  frozenset({'Action'}),
  frozenset({'Crime'}),
  frozenset({'Thriller'}),
  frozenset({'Horror'}),
  frozenset({'Mystery'}),
  frozenset({'Sci-Fi'}),
  frozenset({'War'}),
  frozenset({'Musical'}),
  frozenset({'Documentary'})],
 [frozenset({'Adventure', 'Animation'}),
  frozenset({'Adventure', 'Children'}),
  frozenset({'Adventure', 'Comedy'}),
  frozenset({'Adventure', 'Fantasy'}),
  frozenset({'Animation', 'Children'}),
  frozenset({'Animation', 'Comedy'}),
  frozenset({'Children', 'Comedy'}),
  frozenset({'Comedy', 'Fantasy'}),
  frozenset({'Comedy', 'Romance'}),
  frozenset({'Comedy', 'Drama'}),
  frozenset({'Drama', 'Romance'}),
  frozenset({'Action', 'Crime'}),
  frozenset({'Action', 'Thriller'}),
  frozenset({'Crime', 'Thriller'}),
  frozenset({'Action', 'Adventure'}),
  frozenset({'Crime', 'Dra

In [114]:
for items in fren_item:
    for item in items:
        print(f"item:{set(item)}, support:{support_data[item]}")

item:{'Adventure'}, support:0.12964483678916033
item:{'Animation'}, support:0.06271812769451858
item:{'Children'}, support:0.06815848901662903
item:{'Comedy'}, support:0.385547115582016
item:{'Fantasy'}, support:0.07996304660234038
item:{'Romance'}, support:0.1638267296243071
item:{'Drama'}, support:0.44764935331554095
item:{'Action'}, support:0.18764114144939437
item:{'Crime'}, support:0.1230753438718949
item:{'Thriller'}, support:0.19441593102032437
item:{'Horror'}, support:0.10039006364196264
item:{'Mystery'}, support:0.05881749127489222
item:{'Sci-Fi'}, support:0.10059536029562718
item:{'War'}, support:0.039211660849928144
item:{'Musical'}, support:0.03428454116197906
item:{'Documentary'}, support:0.04516526380619996
item:{'Animation', 'Adventure'}, support:0.02535413672757134
item:{'Children', 'Adventure'}, support:0.032026277971669063
item:{'Comedy', 'Adventure'}, support:0.04095668240607678
item:{'Adventure', 'Fantasy'}, support:0.03428454116197906
item:{'Animation', 'Children'}

In [116]:
rules(fren_item, support_data, min_conf=0, min_lift=4)

Unnamed: 0,transaction,antecedent_support,consequent_support,confidence,lift
8,{'Children'}-->{'Animation'},0.031,0.062718,0.454819,7.251799
9,{'Animation'}-->{'Children'},0.031,0.068158,0.494272,7.251799


In [118]:
pd.read_csv("datasets/ml-10M100K/movies.dat",sep="::",)

  pd.read_csv("datasets/ml-10M100K/movies.dat",sep="::")


Unnamed: 0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
0,2,Jumanji (1995),Adventure|Children|Fantasy
1,3,Grumpier Old Men (1995),Comedy|Romance
2,4,Waiting to Exhale (1995),Comedy|Drama|Romance
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller
...,...,...,...
10675,65088,Bedtime Stories (2008),Adventure|Children|Comedy
10676,65091,Manhattan Melodrama (1934),Crime|Drama|Romance
10677,65126,Choke (2008),Comedy|Drama
10678,65130,Revolutionary Road (2008),Drama|Romance
