In [1]:
from google.colab import drive
drive.mount('/content/drive')
!ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Small" "/content/"
!ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Large" "/content/"
!pip3 install line_profiler
%load_ext line_profiler

Mounted at /content/drive
Collecting line_profiler
  Downloading line_profiler-3.5.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.6 MB/s 
[?25hInstalling collected packages: line-profiler
Successfully installed line-profiler-3.5.1


In [2]:
from itertools import combinations
from collections import Counter
import numpy as np
import csv
import pandas as pd


In [3]:
def read_from_file(file_name) :
  csvfile = open(file = file_name, mode = 'r') # open file
  print(type(csvfile))

  csvreader = csv.reader(csvfile, skipinitialspace = True) # csv reader object
  rows = [] # empty list
  for row in csvreader:
    rows.append(row)

  csvfile.close() #' close file
  return rows

def write_to_file(file_name, data) :
  file = open(file = file_name, mode = 'w') # file object

  csvwriter = csv.writer(file) # csv reader object
  for row in data:
    csvwriter.writerow(row)

  file.close() # close file

In [4]:
transactions = read_from_file('Dataset.Small/GroceryStore.csv') # read transactions or database
write_to_file('Transactions.csv', transactions) # write transactions or database into another file

<class '_io.TextIOWrapper'>


**Brute-Force Approach(Frequent itemset mining)**

$
Number\ of\ unique\ items\ =\ d\\
Number\ of\ transactions\ =\ N\\
Average\ width\ of\ a\ transaction=\ w\\ \\
Number\ of\ all\ possible\ combinations\ =\
\displaystyle\sum_{r=1} ^{d} c_{r}\ =\ 2^d - 1\\
Complexity\ =\ O(Nw2^{d})
$

In [5]:
def search_database(database, itemset) :
  """
    Search occurance of given itemset in the transaction database
    Inputs
      database : Transactions : List of lists 
      itemset : Itemset that should be searched : List
    Outputs
      occurance : Number of occurance : integer
  """
  frequency = np.count_nonzero([all(item in transaction for item in itemset) for transaction in database])
  return frequency

def calc_candidate_sup_cnt(database, c_k, k) : 
  """
  Calculate support count for all candidate k-itemsets and generate dataframe
  Inputs
    database : Transactions : List of lists
    c_k : Candidate k-itemsets : List of tuples
  Output : 
    c_k_df : Candidate k-itemsets dataframe with support counts
  """
  itemset_df = pd.DataFrame(columns=['key', 'item_cnt','itemset','support_cnt'])
  for itemset in c_k: # itemset : tuple of items
      support_count = search_database(database, itemset)
      data_entry = {
        "key" : frozenset(itemset), # immutable set
        "item_cnt": k,
        "itemset": itemset,
        "support_cnt": support_count
      }
      itemset_df = itemset_df.append(data_entry, ignore_index = True)
  return itemset_df

def select_freq_itemsets(c_k, min_sup_cnt) :
  """
  Select frequent k-itemsets from candidate k-itemsets(Candidate elimination)
  Inputs
    c_k : Candidate k-itemsets : DataFrame(itemset,support_cnt)
    min_sup_cnt : minimum support count(hyper parameter) : integer
  Outputs
    f_k : Frequent k-itemsets : DataFrame(itemset,support_cnt)
  """
  f_k = c_k[c_k['support_cnt'] >= min_sup_cnt] 
  return f_k

def brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups):
  """ 
  Brute Force Method 
  Inputs
    database : Transaction database : List of lists
    unique_items : Unique items in the database : List of items
    min_sup_cnt : Minimum support count : intereger
    itemset_groups : Required item groups(k values)
  Outputs
    all_freq_itemsets = Frequent itemsets for given groups = DataFrame
  """
  all_candidate_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for k in itemset_groups: # loop number of itemset groups
    c_k = combinations(unique_items, k) # create combinations for each itemset group
    c_k_df = calc_candidate_sup_cnt(database, c_k, k)
    all_candidate_df = all_candidate_df.append(c_k_df, ignore_index = True)

  all_freq_itemsets = select_freq_itemsets(all_candidate_df, min_sup_cnt)
  return all_freq_itemsets

In [76]:
"""Small dataset"""
transaction_df = pd.read_csv('Dataset.Small/GroceryStore.csv', header=None)
transaction_df.drop_duplicates(subset =0,
                     keep = False, inplace = True)
database = list(transaction_df[0].apply(lambda x:x.split(",") ))
print(f'Size of transactions : {len(database)}')

unique_items = set()
for transaction in database:
  for item in transaction :
    unique_items.add(item)
# print(unique_items)


""" Large dataset """
transaction_df_large = pd.read_csv('Dataset.Large/OnlineRetail.csv', header=None)
print(f'Size of large dataset : {transaction_df_large.shape}')

min_sup_cnt = 2
itemset_groups = [1, 2, 3, 4, 5]
freq_itemsets = brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups)
print(f'\n\nShape of frequent itemsets : {freq_itemsets.shape}')
print(f'Frequent itemsets from brute-force approach \n{freq_itemsets.head()}')
print(f'Frequent itemsets from brute-force approach \n{freq_itemsets.tail()}')

Size of transactions : 14


  exec(code_obj, self.user_global_ns, self.user_ns)


Size of large dataset : (541910, 8)


Shape of frequent itemsets : (30, 4)
Frequent itemsets from brute-force approach 
  item_cnt       itemset support_cnt          key
0        1      (MAGGI,)           5      (MAGGI)
1        1      (SUGER,)           4      (SUGER)
2        1    (BISCUIT,)           5    (BISCUIT)
3        1  (BOURNVITA,)           2  (BOURNVITA)
4        1        (JAM,)           2        (JAM)
Frequent itemsets from brute-force approach 
    item_cnt                 itemset support_cnt                     key
62         2           (MILK, BREAD)           4           (MILK, BREAD)
78         3   (MAGGI, BISCUIT, TEA)           2   (MAGGI, BISCUIT, TEA)
95         3     (MAGGI, JAM, BREAD)           2     (MAGGI, BREAD, JAM)
104        3     (MAGGI, TEA, BREAD)           2     (MAGGI, BREAD, TEA)
171        3  (BISCUIT, MILK, BREAD)           2  (MILK, BREAD, BISCUIT)


**Apriori Algorithm(Frequent itemset mining)**

* Superset is frequent >>> Subset is frequent (Bottom Up)

* Sebset is infrequent >>> Superset is infrequent (Top down)

* This algorithm has **anti-monotone** propety >>> support of itemset can not exceed support of its subset


**Main Steps**

* Candidate Generation ($F_{k-1} >> C_{k}$)

* Candidate Pruning

* Support Counting

* Candidate Elimination ($C_{k} >> F_{k}$)

**Candidate Pruning ($F_{k-1}×F_{k-1}$)**

$Itemset\ is\ not\ frequent\ if\ one\ of\ its\ sub\ itemset\ is\ not\ frequent$

* Number of **(k-1)-size subsets** for a **k-itemset** = $C^{K}_{K-1}$ = $K$
* Number of **already verified (k-1)-size subsets** at the candidate generation = $2$
* Number of subsets for frequency verification stage per each **k-itemset** = $K-2$
* Total number of subsets for frequency verification for **all k-itemsets** = $L_{k}\times(K-2)$

In [77]:
def generate_candidates(f_prv, k):
  """
  Generate candidate set for k-itemsets
  Input : 
    f_prv : F(K-1) 
  Output : 
    c_k : List of candidate k-itemsets : List of lists
  """
  c_k = [] # candidate k-itemset
  f_prv_pairs = combinations(f_prv, 2) # select pair of frequent (k-1)-itemsets
  for f_prv_pair in f_prv_pairs:
    if(len(f_prv_pair[0]) == 1) : ### K=2 ###
      two_itemset = [*f_prv_pair[0],*f_prv_pair[1]]
      two_itemset.sort()         
      c_k.append(two_itemset)
    elif(f_prv_pair[0][0:k-2] == f_prv_pair[1][0:k-2]) :### K>=2 and first (k-2) items are same in both itemsets ###
      generated_itemset = f_prv_pair[0][0:k-2]
      delta_items = [f_prv_pair[0][k-2],f_prv_pair[1][k-2]]
      delta_items.sort()
      generated_itemset = [*generated_itemset,*delta_items]
      c_k.append(generated_itemset)
  return c_k

def prune_candidates(c_k, f_prv, k) :
  """
  Prune candidate set from k-itemsets
  Inputs
    c_k : candidate k-itemsets : List of lists
    f_prv : frequent (k-1)-itemsets 
  Output
    c_k_prune : pruned candidate k-itemsets : List of lists
  """
  infrq_itemsets = []
  ### search candidate k-itemsets to check frequency of its (k-1)-sized subsets
  for itemset in c_k :
    subsets = combinations(itemset, k-1)
    for subset in subsets:
      frequency = search_database(f_prv, subset)
      if(frequency==0):
        infrq_itemsets.append(itemset)
        break
  ### remove infrequent itemsets from candidate k-itemsets ###
  for itemset in infrq_itemsets :
    c_k.remove(itemset)

  return c_k


In [90]:
def apriori_algorithm(min_sup_cnt) :
  f_itemsets_store = {}
  all_frq_itemsets_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for k in itemset_groups:
    if(k>1) :
      f_prv = f_itemsets_store[k-1]['itemset']
      c_k = generate_candidates(f_prv, k) # generate candidates
      c_k = prune_candidates(c_k, f_prv, k) # prune candidates
      c_k = calc_candidate_sup_cnt(database, c_k, k)
      f_k = select_freq_itemsets(c_k, min_sup_cnt)
      all_frq_itemsets_df = all_frq_itemsets_df.append(f_k, ignore_index=True)
      f_itemsets_store[k] = f_k
    else :
      c_1 = [[item] for item in unique_items] # candidate 1-itemset
      c_1 = calc_candidate_sup_cnt(database, c_1, 1)
      f_1 = select_freq_itemsets(c_1, min_sup_cnt)
      all_frq_itemsets_df = all_frq_itemsets_df.append(f_1, ignore_index=True)
      f_itemsets_store[1] = f_1
  return all_frq_itemsets_df, f_itemsets_store

# all_itemsets_df = %lprun -f apriori_algorithm apriori_algorithm() # execute line profiler
all_frq_itemsets_df, f_itemsets_store = apriori_algorithm(2) # execute line profiler
print(f'Shape of frequent itemsets : {all_frq_itemsets_df.shape}')
print(f'Frequent itemsets from apriori approach \n{all_frq_itemsets_df.head()}')
print(f'Frequent itemsets from apriori approach \n{all_frq_itemsets_df.tail()}')


Shape of frequent itemsets : (30, 4)
Frequent itemsets from apriori approach 
  item_cnt      itemset support_cnt          key
0        1      [MAGGI]           5      (MAGGI)
1        1      [SUGER]           4      (SUGER)
2        1    [BISCUIT]           5    (BISCUIT)
3        1  [BOURNVITA]           2  (BOURNVITA)
4        1        [JAM]           2        (JAM)
Frequent itemsets from apriori approach 
   item_cnt                 itemset support_cnt                     key
25        2           [BREAD, MILK]           4           (MILK, BREAD)
26        3   [BISCUIT, MAGGI, TEA]           2   (MAGGI, BISCUIT, TEA)
27        3     [BREAD, JAM, MAGGI]           2     (MAGGI, BREAD, JAM)
28        3     [BREAD, MAGGI, TEA]           2     (MAGGI, BREAD, TEA)
29        3  [BISCUIT, BREAD, MILK]           2  (MILK, BREAD, BISCUIT)


**Rule Generation**

Antecedent -> Consequent

Ex : {X,Y} -> {P,Q}

In [80]:
def generate_rules(f_k, h_m, f_itemsets_store, min_conf) :
  """
  Generate rules from frequent itemsets
  Inputs:
    f_k : Frequent k-itemsets (X,Y,P,Q) : Tuple
    h_m : Consequent (P,Q) : Tuple
    f_itemsets_store : Frequent itemset store(key = k, value = support counts) : Dictionary(Key = Integer, value = DataFrame)
  """
  conf = 0
  if(len(h_m) > 0) :
    k = len(f_k)
    m = len(h_m[0])
  else :
    return pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
  
  if(k > m+1) : #### Check whether an item can be passed from antecedent to consequent >>> Terminating condition of the recursive function###
    h_next = generate_candidates(h_m, m+1) # Generate (m+1) candidates for consequent
    h_next = prune_candidates(h_next, h_m, m+1) # Prune (m+1) candidates
    h_next_invalid = []
    rule_df = pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
    for h_next_i in h_next : # Iterate candidate consequents
      consequent = h_next_i
      antecedent = list(set(f_k) - set(h_next_i))
      rule_sup_cnt = f_itemsets_store[k].loc[f_itemsets_store[k]['key']==frozenset(f_k)]
      antecedent_sup_cnt = f_itemsets_store[len(antecedent)].loc[f_itemsets_store[len(antecedent)]['key']==frozenset(antecedent)]

      if len(rule_sup_cnt)>0 and len(antecedent_sup_cnt)>0 :
        conf = rule_sup_cnt['support_cnt'].values[0]/antecedent_sup_cnt['support_cnt'].values[0]

      if(conf > min_conf) : 
        data_entry = {
          'antecedent': antecedent,
          'consequent': consequent,
          'confidence': conf
        }
        rule_df = rule_df.append(data_entry, ignore_index = True)
      else : 
        h_next_invalid.append(h_next_i)

    for h_next_invalid_i in h_next_invalid : # Remove invalid consequents 
      h_next.remove(h_next_invalid_i)

    ### recursively generate rules for subgraphs 
    return generate_rules(f_k, h_next, f_itemsets_store, min_conf).append(rule_df, ignore_index=True)
  else :
    return pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])

In [91]:
all_frq_itemsets_df, f_itemsets_store = apriori_algorithm(2)
min_conf = 0.5
print(all_frq_itemsets_df.shape)

rule_df = pd.DataFrame(columns = ['antecedent', 'consequent', 'confidence'])
for f_k in all_frq_itemsets_df['itemset'] :
  h_1_invalid = []
  for item in f_k :
    if len(f_k)==1 :
      continue
    consequent = []
    consequent.append(item)
    antecedent = list(set(f_k)-set(consequent))

    rule_sup_cnt = f_itemsets_store[len(f_k)].loc[f_itemsets_store[len(f_k)]['key']==frozenset(f_k)]
    antecedent_sup_cnt = f_itemsets_store[len(antecedent)].loc[f_itemsets_store[len(antecedent)]['key']==frozenset(antecedent)]

    if len(rule_sup_cnt)>0 and len(antecedent_sup_cnt)>0 :
      conf = rule_sup_cnt['support_cnt'].values[0]/antecedent_sup_cnt['support_cnt'].values[0]

    if(conf >= min_conf) : 
      data_entry = {
        'antecedent': antecedent,
        'consequent': consequent,
        'confidence': conf
      }
      rule_df = rule_df.append(data_entry, ignore_index = True)
    else : 
      h_1_invalid.append(consequent)

  h_1 = [[item] for item in f_k] # create consequents for single items
  if len(h_1_invalid)>0 : # remove low confidence items from consequents
    for h_1_invalid_i in h_1_invalid : # Remove invalid consequents 
      h_1.remove(h_1_invalid_i)
  if len(h_1)==0 :
    continue
    
  rule_df = rule_df.append(generate_rules(f_k, h_1, f_itemsets_store, min_conf), ignore_index=True)

print(rule_df)
print(f'\n\nShape of association rules : {rule_df.shape}')
print(f'Association rules(Head) \n{rule_df.head()}')
print(f'Association rules(Tail) \n{rule_df.tail()}')

(30, 4)
          antecedent      consequent  confidence
0              [JAM]         [MAGGI]    1.000000
1              [TEA]         [MAGGI]    0.800000
2            [MAGGI]           [TEA]    0.800000
3            [MAGGI]         [BREAD]    0.600000
4            [SUGER]     [BOURNVITA]    0.500000
5        [BOURNVITA]         [SUGER]    1.000000
6            [SUGER]        [COFFEE]    0.500000
7           [COFFEE]         [SUGER]    0.500000
8            [SUGER]         [BREAD]    0.500000
9          [BISCUIT]         [BREAD]    0.800000
10             [JAM]         [BREAD]    1.000000
11      [CORNFLAKES]           [TEA]    0.500000
12      [CORNFLAKES]          [MILK]    0.500000
13      [CORNFLAKES]        [COFFEE]    0.500000
14          [COFFEE]    [CORNFLAKES]    0.500000
15            [MILK]         [BREAD]    0.800000
16      [MAGGI, TEA]       [BISCUIT]    0.500000
17    [BISCUIT, TEA]         [MAGGI]    1.000000
18  [MAGGI, BISCUIT]           [TEA]    1.000000
19      [MAG

**FP-Growth Algorithm**

In [None]:
class Item:
  def __init__(self, item, sup_count):
    self.item = item
    self.sup_count = sup_count

class Node:
  def __init__(self, data):
      self.parent = None
      self.childrens = None
      self.data = data


In [96]:
def fp_algorithm(min_sup_cnt):
  c_1 = [[item] for item in unique_items]
  itemset_df = calc_candidate_sup_cnt(database, c_1, 1)
  print(itemset_df)

             key item_cnt       itemset support_cnt
0        (MAGGI)        1       [MAGGI]           5
1        (SUGER)        1       [SUGER]           4
2      (BISCUIT)        1     [BISCUIT]           5
3    (BOURNVITA)        1   [BOURNVITA]           2
4          (JAM)        1         [JAM]           2
5   (CORNFLAKES)        1  [CORNFLAKES]           4
6          (TEA)        1         [TEA]           5
7         (MILK)        1        [MILK]           5
8         (COCK)        1        [COCK]           1
9       (COFFEE)        1      [COFFEE]           4
10       (BREAD)        1       [BREAD]           9


References

https://www.analyticsvidhya.com/blog/2021/08/python-tutorial-working-with-csv-file-for-data-science/

https://towardsdatascience.com/magic-commands-for-profiling-in-jupyter-notebook-d2ef00e29a63

http://www.btechsmartclass.com/data_structures/tree-terminology.html

In [87]:
from mlxtend.frequent_patterns import apriori, association_rules

#Let's transform the list, with one-hot encoding
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(database).transform(database)
df = pd.DataFrame(a_data,columns=a.columns_)
df = df.replace(False,0)
df


df = apriori(df, min_support = 0.1, use_colnames = True)
print(f'Shape of frequent itemsets : {df.shape}')
print(f'Frequent itemsets from apriori \n{df.head()}')
print(f'Frequent itemsets from apriori approach \n{df.tail()}')

df_ar = association_rules(df, metric = "confidence", min_threshold = 0.5)
print(df_ar)
print(f'\n\nShape of association rules : {df_ar.shape}')
print(f'Association rules(Head) \n{df_ar.head()}')
print(f'Association rules(Tail) \n{df_ar.tail()}')


Shape of frequent itemsets : (30, 2)
Frequent itemsets from apriori 
    support      itemsets
0  0.357143     (BISCUIT)
1  0.142857   (BOURNVITA)
2  0.642857       (BREAD)
3  0.285714      (COFFEE)
4  0.285714  (CORNFLAKES)
Frequent itemsets from apriori approach 
     support                itemsets
25  0.285714            (MAGGI, TEA)
26  0.142857  (MILK, BREAD, BISCUIT)
27  0.142857   (MAGGI, BISCUIT, TEA)
28  0.142857     (MAGGI, BREAD, JAM)
29  0.142857     (MAGGI, BREAD, TEA)
         antecedents     consequents  antecedent support  consequent support  \
0          (BISCUIT)         (BREAD)            0.357143            0.642857   
1        (BOURNVITA)         (SUGER)            0.142857            0.285714   
2            (SUGER)     (BOURNVITA)            0.285714            0.142857   
3              (JAM)         (BREAD)            0.142857            0.642857   
4            (MAGGI)         (BREAD)            0.357143            0.642857   
5             (MILK)         (BR