In [32]:
from google.colab import drive
drive.mount('/content/drive')
!ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Small" "/content/"
!ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Large" "/content/"
!pip3 install line_profiler
%load_ext line_profiler

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ln: failed to create symbolic link '/content/Dataset.Small': File exists
ln: failed to create symbolic link '/content/Dataset.Large': File exists
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [33]:
from itertools import combinations
from collections import Counter
import numpy as np
import csv
import pandas as pd


In [34]:
def read_from_file(file_name) :
  csvfile = open(file = file_name, mode = 'r') # open file
  print(type(csvfile))

  csvreader = csv.reader(csvfile, skipinitialspace = True) # csv reader object
  rows = [] # empty list
  for row in csvreader:
    rows.append(row)

  csvfile.close() #' close file
  return rows

def write_to_file(file_name, data) :
  file = open(file = file_name, mode = 'w') # file object

  csvwriter = csv.writer(file) # csv reader object
  for row in data:
    csvwriter.writerow(row)

  file.close() # close file

In [35]:
transactions = read_from_file('Dataset.Small/GroceryStore.csv') # read transactions or database
write_to_file('Transactions.csv', transactions) # write transactions or database into another file

<class '_io.TextIOWrapper'>


**Brute-Force Approach(Frequent itemset mining)**

$
Number\ of\ unique\ items\ =\ d\\
Number\ of\ transactions\ =\ N\\
Average\ width\ of\ a\ transaction=\ w\\ \\
Number\ of\ all\ possible\ combinations\ =\
\displaystyle\sum_{r=1} ^{d} c_{r}\ =\ 2^d - 1\\
Complexity\ =\ O(Nw2^{d})
$

In [36]:
def search_database(database, itemset) :
  """
    Search occurance of given itemset in the transaction database
    Inputs
      database : Transactions : List of lists 
      itemset : Itemset that should be searched : List
    Outputs
      occurance : Number of occurance : integer
  """
  frequency = np.count_nonzero([all(item in transaction for item in itemset) for transaction in database])
  return frequency

def calc_candidate_sup_cnt(database, c_k, k) : 
  """
  Calculate support count for all candidate k-itemsets and generate dataframe
  Inputs
    database : Transactions : List of lists
    c_k : Candidate k-itemsets : List of tuples
  Output : 
    c_k_df : Candidate k-itemsets dataframe with support counts
  """
  itemset_df = pd.DataFrame(columns=['key', 'item_cnt','itemset','support_cnt'])
  for itemset in c_k: # itemset : tuple of items
      support_count = search_database(database, itemset)
      data_entry = {
        "key" : frozenset(itemset), # immutable set
        "item_cnt": k,
        "itemset": itemset,
        "support_cnt": support_count
      }
      itemset_df = itemset_df.append(data_entry, ignore_index = True)
  return itemset_df

def select_freq_itemsets(c_k, min_sup_cnt) :
  """
  Select frequent k-itemsets from candidate k-itemsets(Candidate elimination)
  Inputs
    c_k : Candidate k-itemsets : DataFrame(itemset,support_cnt)
    min_sup_cnt : minimum support count(hyper parameter) : integer
  Outputs
    f_k : Frequent k-itemsets : DataFrame(itemset,support_cnt)
  """
  f_k = c_k[c_k['support_cnt'] >= min_sup_cnt] 
  return f_k

def brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups):
  """ 
  Brute Force Method 
  Inputs
    database : Transaction database : List of lists
    unique_items : Unique items in the database : List of items
    min_sup_cnt : Minimum support count : intereger
    itemset_groups : Required item groups(k values)
  Outputs
    all_freq_itemsets = Frequent itemsets for given groups = DataFrame
  """
  all_candidate_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for k in itemset_groups: # loop number of itemset groups
    c_k = combinations(unique_items, k) # create combinations for each itemset group
    c_k_df = calc_candidate_sup_cnt(database, c_k, k)
    all_candidate_df = all_candidate_df.append(c_k_df, ignore_index = True)

  all_freq_itemsets = select_freq_itemsets(all_candidate_df, min_sup_cnt)
  return all_freq_itemsets

In [37]:
"""Small dataset"""
transaction_df = pd.read_csv('Dataset.Small/GroceryStore.csv', header=None)
transaction_df.drop_duplicates(subset =0,
                     keep = False, inplace = True)
database = list(transaction_df[0].apply(lambda x:x.split(",") ))
print(f'Size of transactions : {len(database)}')

unique_items = set()
for transaction in database:
  for item in transaction :
    unique_items.add(item)
# print(unique_items)


""" Large dataset """
transaction_df_large = pd.read_csv('Dataset.Large/OnlineRetail.csv', header=None)
print(f'Size of large dataset : {transaction_df_large.shape}')

min_sup_cnt = 2
itemset_groups = [1, 2, 3, 4, 5]
freq_itemsets = brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups)
print(f'\n\nShape of frequent itemsets : {freq_itemsets.shape}')
print(f'Frequent itemsets from brute-force approach \n{freq_itemsets.head()}')
print(f'Frequent itemsets from brute-force approach \n{freq_itemsets.tail()}')

Size of transactions : 14


  exec(code_obj, self.user_global_ns, self.user_ns)


Size of large dataset : (541910, 8)


Shape of frequent itemsets : (30, 4)
Frequent itemsets from brute-force approach 
  item_cnt     itemset support_cnt        key
0        1      (JAM,)           2      (JAM)
1        1   (COFFEE,)           4   (COFFEE)
2        1     (MILK,)           5     (MILK)
3        1  (BISCUIT,)           5  (BISCUIT)
4        1    (BREAD,)           9    (BREAD)
Frequent itemsets from brute-force approach 
    item_cnt                 itemset support_cnt                     key
64         2      (SUGER, BOURNVITA)           2      (SUGER, BOURNVITA)
91         3     (JAM, BREAD, MAGGI)           2     (JAM, MAGGI, BREAD)
147        3  (MILK, BISCUIT, BREAD)           2  (BISCUIT, BREAD, MILK)
181        3   (BISCUIT, TEA, MAGGI)           2   (BISCUIT, MAGGI, TEA)
196        3     (BREAD, TEA, MAGGI)           2     (MAGGI, BREAD, TEA)


**Apriori Algorithm(Frequent itemset mining)**

* Superset is frequent >>> Subset is frequent (Bottom Up)

* Sebset is infrequent >>> Superset is infrequent (Top down)

* This algorithm has **anti-monotone** propety >>> support of itemset can not exceed support of its subset


**Main Steps**

* Candidate Generation ($F_{k-1} >> C_{k}$)

* Candidate Pruning

* Support Counting

* Candidate Elimination ($C_{k} >> F_{k}$)

**Candidate Pruning ($F_{k-1}×F_{k-1}$)**

$Itemset\ is\ not\ frequent\ if\ one\ of\ its\ sub\ itemset\ is\ not\ frequent$

* Number of **(k-1)-size subsets** for a **k-itemset** = $C^{K}_{K-1}$ = $K$
* Number of **already verified (k-1)-size subsets** at the candidate generation = $2$
* Number of subsets for frequency verification stage per each **k-itemset** = $K-2$
* Total number of subsets for frequency verification for **all k-itemsets** = $L_{k}\times(K-2)$

In [38]:
def generate_candidates(f_prv, k):
  """
  Generate candidate set for k-itemsets
  Input : 
    f_prv : F(K-1) 
  Output : 
    c_k : List of candidate k-itemsets : List of lists
  """
  c_k = [] # candidate k-itemset
  f_prv_pairs = combinations(f_prv, 2) # select pair of frequent (k-1)-itemsets
  for f_prv_pair in f_prv_pairs:
    if(len(f_prv_pair[0]) == 1) : ### K=2 ###
      two_itemset = [*f_prv_pair[0],*f_prv_pair[1]]
      two_itemset.sort()         
      c_k.append(two_itemset)
    elif(f_prv_pair[0][0:k-2] == f_prv_pair[1][0:k-2]) :### K>=2 and first (k-2) items are same in both itemsets ###
      generated_itemset = f_prv_pair[0][0:k-2]
      delta_items = [f_prv_pair[0][k-2],f_prv_pair[1][k-2]]
      delta_items.sort()
      generated_itemset = [*generated_itemset,*delta_items]
      c_k.append(generated_itemset)
  return c_k

def prune_candidates(c_k, f_prv, k) :
  """
  Prune candidate set from k-itemsets
  Inputs
    c_k : candidate k-itemsets : List of lists
    f_prv : frequent (k-1)-itemsets 
  Output
    c_k_prune : pruned candidate k-itemsets : List of lists
  """
  infrq_itemsets = []
  ### search candidate k-itemsets to check frequency of its (k-1)-sized subsets
  for itemset in c_k :
    subsets = combinations(itemset, k-1)
    for subset in subsets:
      frequency = search_database(f_prv, subset)
      if(frequency==0):
        infrq_itemsets.append(itemset)
        break
  ### remove infrequent itemsets from candidate k-itemsets ###
  for itemset in infrq_itemsets :
    c_k.remove(itemset)

  return c_k


In [39]:
def apriori_algorithm(min_sup_cnt) :
  f_itemsets_store = {}
  all_frq_itemsets_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for k in itemset_groups:
    if(k>1) :
      f_prv = f_itemsets_store[k-1]['itemset']
      c_k = generate_candidates(f_prv, k) # generate candidates
      c_k = prune_candidates(c_k, f_prv, k) # prune candidates
      c_k = calc_candidate_sup_cnt(database, c_k, k)
      f_k = select_freq_itemsets(c_k, min_sup_cnt)
      all_frq_itemsets_df = all_frq_itemsets_df.append(f_k, ignore_index=True)
      f_itemsets_store[k] = f_k
    else :
      c_1 = [[item] for item in unique_items] # candidate 1-itemset
      c_1 = calc_candidate_sup_cnt(database, c_1, 1)
      f_1 = select_freq_itemsets(c_1, min_sup_cnt)
      all_frq_itemsets_df = all_frq_itemsets_df.append(f_1, ignore_index=True)
      f_itemsets_store[1] = f_1
  return all_frq_itemsets_df, f_itemsets_store

# all_itemsets_df = %lprun -f apriori_algorithm apriori_algorithm() # execute line profiler
all_frq_itemsets_df, f_itemsets_store = apriori_algorithm(2) # execute line profiler
print(f'Shape of frequent itemsets : {all_frq_itemsets_df.shape}')
print(f'Frequent itemsets from apriori approach \n{all_frq_itemsets_df.head()}')
print(f'Frequent itemsets from apriori approach \n{all_frq_itemsets_df.tail()}')


Shape of frequent itemsets : (30, 4)
Frequent itemsets from apriori approach 
  item_cnt    itemset support_cnt        key
0        1      [JAM]           2      (JAM)
1        1   [COFFEE]           4   (COFFEE)
2        1     [MILK]           5     (MILK)
3        1  [BISCUIT]           5  (BISCUIT)
4        1    [BREAD]           9    (BREAD)
Frequent itemsets from apriori approach 
   item_cnt                 itemset support_cnt                     key
25        2      [BOURNVITA, SUGER]           2      (SUGER, BOURNVITA)
26        3     [BREAD, JAM, MAGGI]           2     (JAM, MAGGI, BREAD)
27        3  [BISCUIT, BREAD, MILK]           2  (BISCUIT, BREAD, MILK)
28        3   [BISCUIT, MAGGI, TEA]           2   (BISCUIT, MAGGI, TEA)
29        3     [BREAD, MAGGI, TEA]           2     (MAGGI, BREAD, TEA)


**Rule Generation**

Antecedent -> Consequent

Ex : {X,Y} -> {P,Q}

In [40]:
def generate_rules(f_k, h_m, f_itemsets_store, min_conf) :
  """
  Generate rules from frequent itemsets
  Inputs:
    f_k : Frequent k-itemsets (X,Y,P,Q) : Tuple
    h_m : Consequent (P,Q) : Tuple
    f_itemsets_store : Frequent itemset store(key = k, value = support counts) : Dictionary(Key = Integer, value = DataFrame)
  """
  conf = 0
  if(len(h_m) > 0) :
    k = len(f_k)
    m = len(h_m[0])
  else :
    return pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
  
  if(k > m+1) : #### Check whether an item can be passed from antecedent to consequent >>> Terminating condition of the recursive function###
    h_next = generate_candidates(h_m, m+1) # Generate (m+1) candidates for consequent
    h_next = prune_candidates(h_next, h_m, m+1) # Prune (m+1) candidates
    h_next_invalid = []
    rule_df = pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
    for h_next_i in h_next : # Iterate candidate consequents
      consequent = h_next_i
      antecedent = list(set(f_k) - set(h_next_i))
      rule_sup_cnt = f_itemsets_store[k].loc[f_itemsets_store[k]['key']==frozenset(f_k)]
      antecedent_sup_cnt = f_itemsets_store[len(antecedent)].loc[f_itemsets_store[len(antecedent)]['key']==frozenset(antecedent)]

      if len(rule_sup_cnt)>0 and len(antecedent_sup_cnt)>0 :
        conf = rule_sup_cnt['support_cnt'].values[0]/antecedent_sup_cnt['support_cnt'].values[0]

      if(conf > min_conf) : 
        data_entry = {
          'antecedent': antecedent,
          'consequent': consequent,
          'confidence': conf
        }
        rule_df = rule_df.append(data_entry, ignore_index = True)
      else : 
        h_next_invalid.append(h_next_i)

    for h_next_invalid_i in h_next_invalid : # Remove invalid consequents 
      h_next.remove(h_next_invalid_i)

    ### recursively generate rules for subgraphs 
    return generate_rules(f_k, h_next, f_itemsets_store, min_conf).append(rule_df, ignore_index=True)
  else :
    return pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])

In [41]:
all_frq_itemsets_df, f_itemsets_store = apriori_algorithm(2)
min_conf = 0.5
print(all_frq_itemsets_df.shape)

rule_df = pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
for f_k in all_frq_itemsets_df['itemset'] :
  h_1_invalid = []
  for item in f_k :
    if len(f_k)==1 :
      continue
    consequent = []
    consequent.append(item)
    antecedent = list(set(f_k)-set(consequent))

    rule_sup_cnt = f_itemsets_store[len(f_k)].loc[f_itemsets_store[len(f_k)]['key']==frozenset(f_k)]
    antecedent_sup_cnt = f_itemsets_store[len(antecedent)].loc[f_itemsets_store[len(antecedent)]['key']==frozenset(antecedent)]

    if len(rule_sup_cnt)>0 and len(antecedent_sup_cnt)>0 :
      conf = rule_sup_cnt['support_cnt'].values[0]/antecedent_sup_cnt['support_cnt'].values[0]

    if(conf >= min_conf) : 
      data_entry = {
        'antecedent': antecedent,
        'consequent': consequent,
        'confidence': conf
      }
      rule_df = rule_df.append(data_entry, ignore_index = True)
    else : 
      h_1_invalid.append(consequent)

  h_1 = [[item] for item in f_k] # create consequents for single items
  if len(h_1_invalid)>0 : # remove low confidence items from consequents
    for h_1_invalid_i in h_1_invalid : # Remove invalid consequents 
      h_1.remove(h_1_invalid_i)
  if len(h_1)==0 :
    continue
    
  rule_df = rule_df.append(generate_rules(f_k, h_1, f_itemsets_store, min_conf), ignore_index=True)

print(rule_df)
print(f'\n\nShape of association rules : {rule_df.shape}')
print(f'Association rules(Head) \n{rule_df.head()}')
print(f'Association rules(Tail) \n{rule_df.tail()}')

(30, 4)
          antecedent      consequent  confidence
0              [JAM]         [BREAD]    1.000000
1              [JAM]         [MAGGI]    1.000000
2            [SUGER]        [COFFEE]    0.500000
3           [COFFEE]         [SUGER]    0.500000
4       [CORNFLAKES]        [COFFEE]    0.500000
5           [COFFEE]    [CORNFLAKES]    0.500000
6             [MILK]         [BREAD]    0.800000
7       [CORNFLAKES]          [MILK]    0.500000
8          [BISCUIT]         [BREAD]    0.800000
9            [MAGGI]         [BREAD]    0.600000
10           [SUGER]         [BREAD]    0.500000
11             [TEA]         [MAGGI]    0.800000
12           [MAGGI]           [TEA]    0.800000
13      [CORNFLAKES]           [TEA]    0.500000
14           [SUGER]     [BOURNVITA]    0.500000
15       [BOURNVITA]         [SUGER]    1.000000
16      [JAM, MAGGI]         [BREAD]    1.000000
17    [BREAD, MAGGI]           [JAM]    0.666667
18      [JAM, BREAD]         [MAGGI]    1.000000
19          

**FP-Growth Algorithm**

In [42]:
class ItemData :
  def __init__(self, item, sup_count):
    self.item = item
    self.sup_count = sup_count
    self.con_sup_cnt = 0

  def __str__(self):
     return str({
         "item" : self.item,
         "sup_count" : self.sup_count
     })

  def __cmp__(self,other):
    return self.count<other.count

class TreeNode :
  def __init__(self, data):
      self.parent = None
      self.children = {}
      self.next = None
      self.data = data

class ReferenceNode : 
  def __init__(self) :
    self.head = None
    self.tail = None
    self.sup_cnt = 0
    self.con_sup_cnt = 0
  


In [31]:
def generate_fp_tree(min_sup_cnt):

  ############################# Part (1) ##################################################
  c_1 = [[item] for item in unique_items]
  itemset_df = calc_candidate_sup_cnt(database, c_1, 1)

  infrequent_rows = itemset_df[ itemset_df['support_cnt'] < min_sup_cnt ] # filter items with lower confidence and retrieve indices of them
  print(f'Infrequent Row : {infrequent_rows}\n')
  infrequent_items = set(row['itemset'][0] for index,row in infrequent_rows.iterrows()) ################################## TODO : Need to review ##########################################
  print( f'Infrequent Items : {infrequent_items}\n')
  
  #### Drop infrequent items ####
  itemset_df.drop(infrequent_rows.index , inplace=True)
  #### Sort according to support count ####
  itemset_df = itemset_df.sort_values(by = ['support_cnt'], ascending=False)
  print(f'Frequent single items : {itemset_df}\n\n')
 
  #### Remove infrequent items from every transaction and sort by support count in descending order ####
  database_node_model = []
  for transaction in database:
    transaction_prune = set(transaction)-infrequent_items # remove infrequent items from every transaction

    transaction_node_model = []
    for item in transaction_prune : 
      sup_cnt = itemset_df[itemset_df['key']==frozenset([item])]['support_cnt'].values[0]
      #### Create item object for each item in every transaction ###
      item_data = ItemData(item, sup_cnt)
      transaction_node_model.append(item_data)
    
    #### Sort item nodes in transaction accoring to support count (Descending order) ####
    transaction_node_model.sort(key = lambda c: c.sup_count, reverse=True) 
    database_node_model.append(transaction_node_model)
  print(f'Length of pruned database : {len(database_node_model)}')

  ############################# Part (2) ##################################################
  
  #### Root Node : Null ####
  root_data = ItemData(item = None, sup_count = None)
  root_node = TreeNode(root_data) 
  #### Item chain maintain dictionary ####
  reference_keeper = {}
  
  ### Construct the model ####
  for transaction in database_node_model :
    current_root_locator = root_node
    for search_data in transaction:
      current_root_locator = traverse(current_root_locator, search_data, reference_keeper)

  return root_node, reference_keeper



def traverse(root_node, search_data, reference_keeper):
  current_root_locator = root_node
  if search_data.item not in current_root_locator.children:
    search_data.sup_count = 1
    search_node = TreeNode(search_data)
    search_node.parent = current_root_locator
    current_root_locator.children[search_data.item] = search_node
    current_root_locator = search_node

    if search_data.item not in reference_keeper :
      ##### Create reference for the new item ####
      reference_node = ReferenceNode()
      reference_node.head = search_node
      reference_node.tail = search_node
      reference_node.sup_cnt = search_data.sup_count
      reference_keeper[search_data.item] = reference_node
    else :
      reference_node = reference_keeper[search_data.item]
      tail_node = reference_node.tail
      tail_node.next = search_node
      reference_node.tail = search_node
      reference_node.sup_cnt += search_data.sup_count
      reference_keeper[search_data.item] = reference_node
  else :
    current_root_locator = current_root_locator.children[search_data.item]
    current_root_locator.data.sup_count += 1

    ##### Update Reference  ####
    reference_node = reference_keeper[search_data.item]
    reference_node.sup_cnt += 1
  return current_root_locator
  
def print_tree(tree_node) : 
  if(tree_node == None ) :
    return
  print(f'{tree_node.data}')
  children = tree_node.children
  if(len(children)==0) :
    return
  for child in children.values():
    print_tree(child)

def generate_fp_itemsets(root_node, reference_keeper, conditional_itemset, min_sup_cnt):

  ### sort references in descending order of support count ####
  reference_keeper = dict(sorted(reference_keeper.items(), key=lambda item: item[1].sup_cnt))

  for (item,reference) in reference_keeper.items() :
    if(len(conditional_itemset)>0 and reference.con_sup_cnt < min_sup_cnt) :
      continue
    else :
      ##################### Update conditional support counts from bottom to top ###########################
      con_reference_keeper = {}

      current_x_ref = reference.head
      print(current_x_ref.data.item)
      # conditional_itemset = conditional_itemset.append(current_x_ref.data.item)
      # print(conditional_itemset)current_y_ref

      while(current_x_ref != None) : 
        current_x_ref.data.con_sup_cnt = 0
        current_y_ref = current_x_ref.parent

        #### Update conditional support counts through prefix paths from bottom to top ####
        while(current_y_ref.data.item != None): #### vertical movement ####

          if current_y_ref.data.item in con_reference_keeper :
            current_pointer = con_reference_keeper[current_y_ref.data.item]
            current_pointer.con_sup_cnt += 1
          else :
            print(reference_keeper)
            current_pointer = reference_keeper[current_y_ref.data.item]
            current_pointer.con_sup_cnt = 1
            con_reference_keeper[current_y_ref.data.item] = current_pointer

          current_y_ref.data.con_sup_cnt += 1
          current_y_ref = current_y_ref.parent

        current_x_ref = current_x_ref.next   #### Horizontal movement ####
      
      ###################### remove low support count nodes from the tree #################################
      # for (key, reference) in con_reference_keeper.items():
      #   if(reference.con_sup_cnt<min_sup_cnt):
      #     current_pointer = reference.head 
      #     current_parent = current_pointer.parent
      #     for child_node in current_pointer.children:
      #       child_node.parent = current_parent
      
      return conditional_itemset.append(generate_fp_itemsets(root_node, con_reference_keeper, conditional_itemset, min_sup_cnt))
      
      
      
      
      
      
      




########################################### Testing ###################################################################
(root_node, reference_keeper) = generate_fp_tree(min_sup_cnt = 2)
#### Print reference chain for each frequent items ####
# for (key,value) in reference_keeper.items() :
#   print(f'Support Count of {value.head.data.item} : {value.sup_cnt}')
#   temp = value.head
#   while(temp != None) :
#     print(temp.data)
#     temp = temp.next
#   print()

#### Generate itemsets using FP tree ####
generate_fp_itemsets(root_node, reference_keeper, [], 2)
  

Infrequent Row :       key item_cnt itemset support_cnt
7  (COCK)        1  [COCK]           1

Infrequent Items : {'COCK'}

Frequent single items :              key item_cnt       itemset support_cnt
4        (BREAD)        1       [BREAD]           9
2         (MILK)        1        [MILK]           5
3      (BISCUIT)        1     [BISCUIT]           5
5          (TEA)        1         [TEA]           5
6        (MAGGI)        1       [MAGGI]           5
1       (COFFEE)        1      [COFFEE]           4
8        (SUGER)        1       [SUGER]           4
9   (CORNFLAKES)        1  [CORNFLAKES]           4
0          (JAM)        1         [JAM]           2
10   (BOURNVITA)        1   [BOURNVITA]           2


Length of pruned database : 14
JAM
{'JAM': <__main__.ReferenceNode object at 0x7f64040289d0>, 'BOURNVITA': <__main__.ReferenceNode object at 0x7f6404028c50>, 'CORNFLAKES': <__main__.ReferenceNode object at 0x7f6404028850>, 'SUGER': <__main__.ReferenceNode object at 0x7f6404028

KeyError: ignored

References

https://www.analyticsvidhya.com/blog/2021/08/python-tutorial-working-with-csv-file-for-data-science/

https://towardsdatascience.com/magic-commands-for-profiling-in-jupyter-notebook-d2ef00e29a63

http://www.btechsmartclass.com/data_structures/tree-terminology.html

In [43]:
from mlxtend.frequent_patterns import apriori, association_rules

#Let's transform the list, with one-hot encoding
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(database).transform(database)
df = pd.DataFrame(a_data,columns=a.columns_)
df = df.replace(False,0)
df


df = apriori(df, min_support = 0.1, use_colnames = True)
print(f'Shape of frequent itemsets : {df.shape}')
print(f'Frequent itemsets from apriori \n{df.head()}')
print(f'Frequent itemsets from apriori approach \n{df.tail()}')

df_ar = association_rules(df, metric = "confidence", min_threshold = 0.5)
print(df_ar)
print(f'\n\nShape of association rules : {df_ar.shape}')
print(f'Association rules(Head) \n{df_ar.head()}')
print(f'Association rules(Tail) \n{df_ar.tail()}')


Shape of frequent itemsets : (30, 2)
Frequent itemsets from apriori 
    support      itemsets
0  0.357143     (BISCUIT)
1  0.142857   (BOURNVITA)
2  0.642857       (BREAD)
3  0.285714      (COFFEE)
4  0.285714  (CORNFLAKES)
Frequent itemsets from apriori approach 
     support                itemsets
25  0.285714            (MAGGI, TEA)
26  0.142857  (BISCUIT, BREAD, MILK)
27  0.142857   (BISCUIT, MAGGI, TEA)
28  0.142857     (JAM, MAGGI, BREAD)
29  0.142857     (MAGGI, BREAD, TEA)
         antecedents     consequents  antecedent support  consequent support  \
0          (BISCUIT)         (BREAD)            0.357143            0.642857   
1            (SUGER)     (BOURNVITA)            0.285714            0.142857   
2        (BOURNVITA)         (SUGER)            0.142857            0.285714   
3              (JAM)         (BREAD)            0.142857            0.642857   
4            (MAGGI)         (BREAD)            0.357143            0.642857   
5             (MILK)         (BR