https://www.analyticsvidhya.com/blog/2021/08/python-tutorial-working-with-csv-file-for-data-science/

In [32]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Small" "/content/"
# !ln -s "/content/drive/My Drive/Academic/CSCM35 - Big Data & Data Mining/coursework 2/Dataset.Large" "/content/"
!pip install -U memory_profiler
!sudo pip install psutil
!pip install line_profiler



In [28]:
from itertools import combinations
from collections import Counter
import numpy as np
import csv
import pandas as pd

from memory_profiler import profile
import tracemalloc
import os
import psutil

In [3]:
def read_from_file(file_name) :
  csvfile = open(file = file_name, mode = 'r') # open file
  print(type(csvfile))

  csvreader = csv.reader(csvfile, skipinitialspace = True) # csv reader object
  rows = [] # empty list
  for row in csvreader:
    rows.append(row)

  csvfile.close() #' close file
  return rows

def write_to_file(file_name, data) :
  file = open(file = file_name, mode = 'w') # file object

  csvwriter = csv.writer(file) # csv reader object
  for row in data:
    csvwriter.writerow(row)

  file.close() # close file

In [4]:
transactions = read_from_file('Dataset.Small/GroceryStore.csv') # read transactions or database
write_to_file('Transactions.csv', transactions) # write transactions or database into another file

<class '_io.TextIOWrapper'>


**Brute-Force Approach(Frequent itemset mining)**

$
Number\ of\ unique\ items\ =\ d\\
Number\ of\ transactions\ =\ N\\
Average\ width\ of\ a\ transaction=\ w\\ \\
Number\ of\ all\ possible\ combinations\ =\
\displaystyle\sum_{r=1} ^{d} c_{r}\ =\ 2^d - 1\\
Complexity\ =\ O(Nw2^{d})
$

In [5]:
def search_database(database, itemset) :
  """
    Search occurance of given itemset in the transaction database
    Inputs
      database : Transactions : List of lists 
      itemset : Itemset that should be searched : List
    Outputs
      occurance : Number of occurance : integer
  """
  frequency = np.count_nonzero([all(item in transaction for item in itemset) for transaction in database])
  return frequency

def calc_candidate_sup_cnt(database, c_k, k) : 
  """
  Calculate support count for all candidate k-itemsets and generate dataframe
  Inputs
    database : Transactions : List of lists
    c_k : Candidate k-itemsets : List of tuples
  Output : 
    c_k_df : Candidate k-itemsets dataframe with support counts
  """
  itemset_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for itemset in c_k: # itemset : tuple of items
      support_count = search_database(database, itemset)
      data_entry = {
        "item_cnt": k,
        "itemset": itemset,
        "support_cnt": support_count
      }
      itemset_df = itemset_df.append(data_entry, ignore_index = True)
  return itemset_df

def select_freq_itemsets(c_k, min_sup_cnt) :
  """
  Select frequent k-itemsets from candidate k-itemsets(Candidate elimination)
  Inputs
    c_k : Candidate k-itemsets : DataFrame(itemset,support_cnt)
    min_sup_cnt : minimum support count(hyper parameter) : integer
  Outputs
    f_k : Frequent k-itemsets : DataFrame(itemset,support_cnt)
  """
  f_k = c_k[c_k['support_cnt'] >= min_sup_cnt] 
  return f_k

def brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups):
  """ 
  Brute Force Method 
  Inputs
    database : Transaction database : List of lists
    unique_items : Unique items in the database : List of items
    min_sup_cnt : Minimum support count : intereger
    itemset_groups : Required item groups(k values)
  Outputs
    all_freq_itemsets = Frequent itemsets for given groups = DataFrame
  """
  all_candidate_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
  for k in itemset_groups: # loop number of itemset groups
    c_k = combinations(unique_items, k) # create combinations for each itemset group
    c_k_df = calc_candidate_sup_cnt(database, c_k, k)
    all_candidate_df = all_candidate_df.append(c_k_df, ignore_index = True)

  all_freq_itemsets = select_freq_itemsets(all_candidate_df, min_sup_cnt)
  return all_freq_itemsets

In [6]:
"""Small dataset"""
transactions = read_from_file('Dataset.Small/GroceryStore.csv') 
database = [] # transactions or database
for transaction in transactions:
  tr_str = ''.join(transaction)
  database.append(tr_str.split(','))
unique_items = read_from_file('Dataset.Small/Items.txt')[0] # unique items

""" Large dataset """
# transactions = read_from_file('Dataset.Large/OnlineRetail.csv') 
# database = [] # transactions or database
# for transaction in transactions:
#   tr_str = ''.join(transaction)
#   database.append(tr_str.split(','))
# unique_items = read_from_file('Dataset.Small/Items.txt')[0] # unique items
# print(database)

min_sup_cnt = 2
itemset_groups = [1, 2, 3, 4, 5]
freq_itemsets = brute_force_approach(database, unique_items, min_sup_cnt, itemset_groups)
print(freq_itemsets.shape)

<class '_io.TextIOWrapper'>
<class '_io.TextIOWrapper'>
(33, 3)


**Apriori Algorithm(Frequent itemset mining)**

* Superset is frequent >>> Subset is frequent (Bottom Up)

* Sebset is infrequent >>> Superset is infrequent (Top down)

* This algorithm has **anti-monotone** propety >>> support of itemset can not exceed support of its subset


**Main Steps**

* Candidate Generation ($F_{k-1} >> C_{k}$)

* Candidate Pruning

* Support Counting

* Candidate Elimination ($C_{k} >> F_{k}$)

**Candidate Pruning ($F_{k-1}×F_{k-1}$)**

$Itemset\ is\ not\ frequent\ if\ one\ of\ its\ sub\ itemset\ is\ not\ frequent$

* Number of **(k-1)-size subsets** for a **k-itemset** = $C^{K}_{K-1}$ = $K$
* Number of **already verified (k-1)-size subsets** at the candidate generation = $2$
* Number of subsets for frequency verification stage per each **k-itemset** = $K-2$
* Total number of subsets for frequency verification for **all k-itemsets** = $L_{k}\times(K-2)$

In [29]:
@profile
def generate_candidates(f_prv_df, k):
  """
  Generate candidate set for k-itemsets
  Input : 
    f_prv : F(K-1) : DataFrame 
  Output : 
    c_k : List of candidate k-itemsets : List of lists
  """
  tracemalloc.start()
  c_k = [] # candidate k-itemset
  f_prv = f_prv_df['itemset']
  f_prv_pairs = combinations(f_prv, 2) # select pair of frequent (k-1)-itemsets
  for f_prv_pair in f_prv_pairs:
    if(len(f_prv_pair[0]) == 1) : ### K=2 ###
      two_itemset = [*f_prv_pair[0],*f_prv_pair[1]]
      two_itemset.sort()         
      c_k.append(two_itemset)
    elif(f_prv_pair[0][0:k-2] == f_prv_pair[1][0:k-2]) :### K>=2 and first (k-2) items are same in both itemsets ###
      generated_itemset = f_prv_pair[0][0:k-2]
      delta_items = [f_prv_pair[0][k-2],f_prv_pair[1][k-2]]
      delta_items.sort()
      generated_itemset = [*generated_itemset,*delta_items]
      c_k.append(generated_itemset)
  print(tracemalloc.get_traced_memory())
  tracemalloc.stop()
  return c_k

def prune_candidates(c_k, f_prv_df, k) :
  """
  Prune candidate set from k-itemsets
  Inputs
    c_k : candidate k-itemsets : List of lists
    f_prv_df : frequent (k-1)-itemsets : DataFrame
  Output
    c_k_prune : pruned candidate k-itemsets : List of lists
  """
  f_prv = f_prv_df['itemset']
  infrq_itemsets = []
  ### search candidate k-itemsets to check frequency of its (k-1)-sized subsets
  for itemset in c_k :
    subsets = combinations(itemset, k-1)
    for subset in subsets:
      frequency = search_database(f_prv, subset)
      if(frequency==0):
        infrq_itemsets.append(itemset)
        break
  ### remove infrequent itemsets from candidate k-itemsets ###
  for itemset in infrq_itemsets :
    c_k.remove(itemset)


In [30]:

# items = ['Bread','Coke', 'Milk','Beer','Diaper','Eggs']
# database = [['Bread','Milk'],['Beer','Bread','Diaper','Eggs'],['Beer','Coke','Diaper','Milk'],['Beer','Bread','Diaper','Milk'],['Bread','Coke','Diaper','Milk']]
# min_sup_cnt = 3

# c_1 = [[item] for item in items] # candidate 1-itemset
# c_1 = calc_candidate_sup_cnt(database, c_1, 1)
# f_1 = select_freq_itemsets(c_1, min_sup_cnt)

# c_2 = generate_candidates(f_1,2) # generate 2-itemset
# c_2 = calc_candidate_sup_cnt(database, c_2, 2)
# f_2 = select_freq_itemsets(c_2, min_sup_cnt)

# c_3 = generate_candidates(f_2,3) # generate 3-itemset
# c_3 = calc_candidate_sup_cnt(database, c_3, 3)
# f_3 = select_freq_itemsets(c_3, min_sup_cnt)
# print(f_1)
# print(f_2)
# print(f_3)

# c_2 = [['Coke', 'Milk'], ['Beer', 'Bread'], ['Bread', 'Diaper'], ['Beer', 'Milk'], ['Diaper', 'Milk'], ['Beer', 'Diaper']]
# f_1 = pd.DataFrame({'itemset':[['Bread'],['Beer'],['Diaper']]})
# prune_candidates(c_2, f_1, 2)
# print(c_2)

f_itemsets_store = {}
all_itemsets_df = pd.DataFrame(columns=['item_cnt','itemset','support_cnt'])
for k in itemset_groups:
  if(k>1) :
    c_k = generate_candidates(f_itemsets_store[k-1],k) # generate 3-itemset
    c_k = calc_candidate_sup_cnt(database, c_k, k)
    f_k = select_freq_itemsets(c_k, min_sup_cnt)
    all_itemsets_df = all_itemsets_df.append(f_k, ignore_index=True)
    f_itemsets_store[k] = f_k
  else :
    c_1 = [[item] for item in unique_items] # candidate 1-itemset
    c_1 = calc_candidate_sup_cnt(database, c_1, 1)
    f_1 = select_freq_itemsets(c_1, min_sup_cnt)
    all_itemsets_df = all_itemsets_df.append(f_1, ignore_index=True)
    f_itemsets_store[1] = f_1

print(all_itemsets_df)


ERROR: Could not find file <ipython-input-29-5389b1594985>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
(2944, 2944)
ERROR: Could not find file <ipython-input-29-5389b1594985>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
(2584, 2600)
ERROR: Could not find file <ipython-input-29-5389b1594985>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
(1496, 2185)
ERROR: Could not find file <ipython-input-29-5389b1594985>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
(1296, 2281)
   item_cnt                        itemset support_cnt
0         1                          [JAM]           2
1         1                        [MAGGI]           5
2         1                       [COFFEE]           8
3         1                          [TEA]           7
4         1  

In [33]:
@profile

def my_func():

    a=[]

    for i in range(1000):

        a.append(i)

my_func()

ERROR: Could not find file <ipython-input-33-e9ddf1dc4c1f>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
