<h1> Task 1: Implement the Apriori algorithm to mine frequent itemsets </h1>

In [None]:
!pip install mlxtend

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict
import time
import os
from functools import reduce
from mlxtend.frequent_patterns import apriori, association_rules

Using the lecture notes explanation of the Apriori Algorithm, we have 4 steps to do.
1. Candidate Generation
2. Candidate Pruning
3. Support Counting
4. Candidate Elimination

## Functions for my apriori algorithm

In [31]:
# Apriori algorithm
# We combine the above steps to generate frequent itemsets with k+1 items
# from frequent itemsets with k items
# We continue this process until we get no frequent itemsets with k+1 items
# We then combine the frequent itemsets with k items to generate association rules
# We continue this process until we get no association rules
# We then combine the association rules to generate association rules with k+1 items

# Function to generate frequent itemsets with 1 item (initialisation)
def generate_freq_1_itemsets(data, min_support, combined_freq_itemsets):
    # Check the support in terms of ratio
    support_ratio = min_support / len(data)
    print(f'The support ratio is {support_ratio}\n')

    # Count the number of 0s and 1s in each column
    # The number of 1s is the number of times each item appears
    value_counts = data.apply(pd.value_counts)

    # Get the frequent itemsets with count greater than or equal to min_support
    columns = data.columns
    frequent_itemsets = {}
    for column in columns:
        # Append the itemset and its count to the dictionary if the count is greater than or equal to min_support
        if value_counts[column][1] >= min_support:
            frequent_itemsets[column] = value_counts[column][1]
            # frequent_itemsets.append((column, value_counts[column][1]))
            # data.drop(column, axis=1, inplace=True)

    dummy_dict = frequent_itemsets.copy()
    for key, item in dummy_dict.copy().items():
        # For dummy data
        # dummy_dict[(tuple(key))] = dummy_dict.pop(key)
        # To convert the key to a tuple for rule generation
        dummy_dict[(key,)] = dummy_dict.pop(key)
    print(dummy_dict)

    combined_freq_itemsets.update(dummy_dict)
    print(frequent_itemsets)
    return frequent_itemsets
    
# Function to generate frequent itemsets with k+1 items
def generate_k_plus_1_candidate_itemsets(frequent_itemsets, k):
    # Generate all possible combinations of frequent itemsets with k+1 items

    # If k = 1, we do not need to merge the combinations
    if k == 1:
        combinations = []
        combinations.append(list(itertools.combinations(frequent_itemsets.keys(), k+1)))
        return combinations
    
    else:
        # Merge the combinations if the first k-1 items are the same
        # and the last item is different
        # This is done to generate combinations with k+1 items
        # from combinations with k items
        # Compare first k-1 items of each combination
        # If they are the same, merge them
        # If they are not the same, do not merge them
        # The merged combinations are stored in a dictionary
        merged_combinations = {}
        

        for index, combination1 in enumerate(frequent_itemsets.keys()):
            for combination2 in list(frequent_itemsets.keys())[index+1:]:
                # Check if the first k-1 items are the same
                if combination1[:-1] == combination2[:-1]:
                    # Check if the last item is different
                    if combination1[-1] != combination2[-1]:
                        # Merge the combinations
                        merged_combinations[combination1 + (combination2[-1],)] = 0

    
        return merged_combinations

def candidate_pruning_using_subset_checking(k_plus_1_candidate_itemsets, k_frequent_itemsets, k):
    # Prune the combinations if any of the subsets of the combination is not in the frequent itemsets
    # The subsets of the combination are the combinations with k items
    # If any of the subsets of the combination is not in the frequent itemsets,
    # then the combination is not frequent
    for combination in k_plus_1_candidate_itemsets.copy().keys():
        # Get the subsets of the combination
        subsets = list(itertools.combinations(combination, k))
        # print(subsets)
        # Check if any of the subsets of the combination is not in the frequent itemsets
        # If any of the subsets of the combination is not in the frequent itemsets,
        # then the combination is not frequent
        if any([subset not in k_frequent_itemsets.keys() for subset in subsets]):
            k_plus_1_candidate_itemsets.pop(combination)
    return k_plus_1_candidate_itemsets

# Function to count the number of occurences of each combination in the candidate itemsets
def k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data):
    # If k = 1, we need to convert the list of lists of tuples to a list of tuples
    if k == 1:
        k_plus_1_candidate_itemsets = k_plus_1_candidate_itemsets[0]

    # Count the number of occurences of each combination in the data
    candidate_itemsets_count = {}
    for candidate_itemset in k_plus_1_candidate_itemsets:
        # Using groupby and size to count the number of occurences of each combination
        # Resetting the index to get the count of each combination as a column in the dataframe
        support_count = data.groupby(list(candidate_itemset)).size().reset_index(name='count')
        # Append the combination and its count to the dictionary
        # The count of each combination is the last value in the count column
        # Moreover, we need to check whether the last row is a combination of 1s instead of 1s and 0s
        # If it is a combination of 1s, then we append the combination and its count to the dictionary
        # Otherwise, we do not append it to the dictionary
        # if test[test.columns[0]].iloc[-1] == 1 and test[test.columns[1]].iloc[-1] == 1:
        #     candidate_itemsets_count[candidate_itemset] = test['count'].iloc[-1]
        num_ones = 0
        for i in range(len(support_count.columns)-1):
            if support_count[support_count.columns[i]].iloc[-1] != 1:
                break
            else:
                num_ones += 1
                continue
            
        if num_ones == len(support_count.columns)-1:
            candidate_itemsets_count[candidate_itemset] = support_count['count'].iloc[-1]

    return candidate_itemsets_count


def candidate_elimination(combinations_count, min_support, combined_freq_itemsets):
    
    # Prune the combinations with count less than min_support
    for combination in combinations_count.copy().keys():
        if combinations_count[combination] < min_support:
            combinations_count.pop(combination)
    
    combined_freq_itemsets.update(combinations_count)
    return combinations_count

def generate_rules(combined_freq_itemsets, min_confidence, target):
    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = {}
    for key in combined_freq_itemsets.keys():
        
        for i in range(1, len(key)+1):  # range will return the values 1,2,3,4 in this loop
            combinations = []
            combinations.append(list(itertools.combinations(key, i)))
            # print(target, key, combinations)
            if combinations:
                combinations = combinations[0]
                for combination in combinations:
                    
                    # Convert the combination to a tuple if it is a string
                    if type(combination) == str:
                        combination = (combination,)
                    
                    # Check if the target is in the combination
                    if target != None:
                        # Continue to the next combination if the target is not in the combination
                        if target not in combination or len(combination) == 1:
                            continue
                        
                        # Split the combination into two parts
                        # The first part is the antecedent and the second part is the consequent
                        # The antecedent is the combination without the target
                        # The consequent is the target

                        temp_target = (target,)
                        
                        # In order to keep the correct order of the items in the combination
                        difference = set(combination) - set(temp_target)
                        antecedent = tuple(item for item in combination if item in difference)
                        consequent = temp_target
                        
                        confidence = combined_freq_itemsets[combination] / combined_freq_itemsets[antecedent]

                    # If the target is None, then we do not need to assign the target to the consequent
                    else:
                        antecedent = combination
                        difference = set(key) - set(combination)
                        consequent = tuple(item for item in key if item in difference)
                        if consequent == ():
                            continue
                    # print(target, combination)
                    # print("Combinations is ", combinations, "Combination is: ", combination, "Antecedent is: ", antecedent, "Consequent is: ", consequent)
                        confidence = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent]
                        
                    # print('key: ', key, 'antecedent: ', antecedent, 'consequent: ', consequent, 'confidence: ', confidence)
                    if confidence >= min_confidence:
                        rules[(antecedent, consequent)] = confidence
                        
        # Split the combination into two parts
        # The first part is the antecedent and the second part is the consequent
        # for i in range(1, len(key)):
        #     antecedent_1 = key[:i]
        #     consequent_1 = key[i:]

        #     antecedent_2 = key[i:]
        #     consequent_2 = key[:i]
        #     # Calculate the confidence of the rule
        #     # Confidence = support of combination / support of antecedent
        #     confidence_1 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_1]
        #     confidence_2 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_2]

        #     print(antecedent_1, consequent_1, confidence_1)
        #     print(antecedent_2, consequent_2, confidence_2)
        #     # Check if the confidence is greater than min_confidence
        #     if confidence_1 >= min_confidence:
        #         # Append the rule to the rules dictionary
        #         rules[(antecedent_1, consequent_1)] = confidence_1

        #     if confidence_2 >= min_confidence:
        #         # Append the rule to the rules dictionary
        #         rules[(antecedent_2, consequent_2)] = confidence_2
                
    return rules
    

## Functions to deal with large dataset

In [32]:
# Function to sum-merge an abitrary number of dictionaries
def reducer(accumulator, element):
    # print('accu', accumulator, 'ele', element)
    for key, value in element.items():
        accumulator[key] = accumulator.get(key, 0) + value
    return accumulator

def generate_freq_1_itemsets_for_large_data(data, min_support, combined_freq_itemsets):

    data_length = 0
    columns = data.get_chunk(1).columns
    # Initalise the dataframe with 0.0 and 1.0 as rows
    index = [0.0, 1.0]
    temp_df = pd.DataFrame(index=index, columns=columns)

    # Change all columns to string type
    temp_df = temp_df.astype(float)
    # print(temp_df.columns.tolist())

    for i, chunk in enumerate(data):
        print(f'Generating frequent 1-itemsets for chunk {i+1}')
        chunk.fillna(0, inplace=True)
        temp_value_counts = chunk.apply(pd.value_counts)
        # Change all columns to float type
        temp_value_counts = temp_value_counts.astype(float)
        # print(temp_value_counts)
        # Sum the value counts of each chunk to get the total value counts
        temp_df = temp_df.add(temp_value_counts, fill_value=0)

        data_length += len(chunk)
    # print(temp_df)
    # Check the support in terms of ratio
    support_ratio = min_support / data_length
    print(f'The support ratio is {support_ratio}\n')
    
    # Get the frequent itemsets with count greater than or equal to min_support
    # Get the columns and values of the dataframe with 1.0 and more than min_support
    temp_df = temp_df[temp_df.columns[temp_df.iloc[1] >= min_support]]
    # Drop the row with 0.0
    temp_df  = temp_df.drop(index=0.0, axis=1)
    frequent_1_itemsets = temp_df.iloc[0].to_dict()
    # print('freq1_itemset', frequent_1_itemsets)
    dummy_dict = frequent_1_itemsets.copy()
    for key, item in frequent_1_itemsets.copy().items():
        # For dummy data
        # dummy_dict[(tuple(key))] = dummy_dict.pop(key)
        # For real data
        dummy_dict[(key,)] = dummy_dict.pop(key)
    # print(dummy_dict)

    combined_freq_itemsets.update(dummy_dict)
    # print('combined freq1 ', combined_freq_itemsets)

    return frequent_1_itemsets

## Calling the apriori algorithm

In [33]:
def my_apriori(file_location, min_support, min_confidence, target=None):
    
    # Combined dictionary of frequent itemsets
    combined_freq_itemsets = {}
    frequent_1_itemsets = defaultdict(int)
    # Chunksize for reading the data
    chunksize = 2000000
    # File size threshold in GB
    file_size_threshold = 4

    # Check the size of the file
    # If the file size is less than 4 GB, we can read the file directly
    # Otherwise, we need to read the file in chunks
    file_size = os.path.getsize(file_location)
    file_size_GB = file_size/1000000000
    print(f'The file size is {file_size} bytes, which is {file_size_GB} GB')

    
    start_time = time.time()
    if file_size_GB < file_size_threshold:
        print('File size is less than 4 GB, reading the file directly to generate frequent 1 itemset\n')
        data = pd.read_csv(file_location)
        frequent_1_itemsets = generate_freq_1_itemsets(data, min_support, combined_freq_itemsets)

    else:
        print('File size is more than 4 GB, reading the file in chunks to generate frequent 1 itemset\n')
        # Read the data in chunks of 10000 rows each to ensure that the data fits in memory
        data = pd.read_csv(file_location, chunksize=chunksize, iterator=True)
        chunk_size_in_bytes = data.get_chunk().memory_usage(deep=True).sum()
        print(f'Size of one chunk in bytes is {chunk_size_in_bytes}, which is {chunk_size_in_bytes/1000000000} GB')
        frequent_1_itemsets = generate_freq_1_itemsets_for_large_data(data, min_support, combined_freq_itemsets)

    k_plus_1_candidate_itemsets = None
    previous_frequent_itemsets = None
    k = 1

    while True:
        # print(k)
        if k == 1:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(frequent_1_itemsets, k)
        else:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(k_plus_1_frequent_itemsets, k)
            
        # print(combined_freq_itemsets)
        # print('k+1 candidate before pruning: ',k_plus_1_candidate_itemsets)
        
        # Prune the combinations if candidate itemset contains subsets of length k that are infrequent
        # We do not need to prune the combinations if k = 1
        if k != 1:
            k_plus_1_candidate_itemsets = candidate_pruning_using_subset_checking(k_plus_1_candidate_itemsets, previous_frequent_itemsets, k)
            # for combination in k_plus_1_candidate_itemsets.copy().keys():
            #     print('combination', combination)
            #     combinations = []
            #     combinations.append(list(itertools.combinations(combination, k)))
            #     print('combinations', combinations)
            #     if combinations:
            #         combinations = combinations[0]
            #         for subset in combinations:
            #             print('subsets of candidate itemsets: ', subset)
            #             if subset not in previous_frequent_itemsets.keys():
            #                 k_plus_1_candidate_itemsets.pop(combination)
            #                 break

        
        # print('previous frequent itemsets', previous_frequent_itemsets)
        # print('k+1 candidate after pruning: ', k_plus_1_candidate_itemsets)
        k_plus_1_itemsets_support_count = defaultdict(int)
        k_plus_1_frequent_itemsets = defaultdict(int)

        if file_size_GB < file_size_threshold:
            k_plus_1_itemsets_support_count = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data)
            # print('temp_dict', temp_dict)
            # k_plus_1_itemsets_support_count = reduce(reducer, [k_plus_1_itemsets_support_count, temp_dict])
            # k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, k_plus_1_itemsets_support_count, data)
        else:
            print('File size is more than 4 GB, reading the file in chunks to generate k+1 candidate itemsets...')
            # Need to read the data again as the data is read in chunks
            data = pd.read_csv(file_location, chunksize=chunksize, iterator=True)
            for i, chunk in enumerate(data):
                print(f'Support Counting for chunk {i+1}')
                chunk.fillna(0, inplace=True)
                # print(chunk)
                # temp_dict = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, chunk)
                # for key, value in temp_dict.items():
                #     k_plus_1_itemsets_support_count[key] += value
                # print(k_plus_1_itemsets_support_count)
                temp_dict = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, chunk)
                k_plus_1_itemsets_support_count = reduce(reducer, [k_plus_1_itemsets_support_count, temp_dict])
        print('k_plus_1_itemsets_support_counting', k_plus_1_itemsets_support_count)
        
        k_plus_1_frequent_itemsets = candidate_elimination(k_plus_1_itemsets_support_count, min_support, combined_freq_itemsets)
        # Store the frequent itemsets with k items to prune the combinations with subsets of length k that are infrequent
        previous_frequent_itemsets = k_plus_1_frequent_itemsets.copy()
        # print(combined_freq_itemsets)
        # print(k_plus_1_frequent_itemsets)
        k += 1
        print('k: ', k)
        # If there are no frequent itemsets with k+1 items, break
        if len(k_plus_1_frequent_itemsets) == 0:
            break

    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = generate_rules(combined_freq_itemsets, min_confidence, target)
    
    running_time = time.time() - start_time
    print(f'Time taken to generate frequent itemsets and rules is {running_time} seconds')
    return combined_freq_itemsets, rules, running_time, k


## Function to print frequent itemsets and associate rules

In [34]:
def print_freq_itemsets(freq_itemsets):
    print('The generated frequent itemsets are: ')
    for i, (key, value) in enumerate(freq_itemsets.items()):
        print(i+1, key, value)

def print_rules(rules):
    print('The generated associated rules are: ')
    index = 1
    for key, item in rules.items():
        for i in range(1, len(key)):
            antecedent = key[:i]
            consequent = key[i:]
            print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
            index += 1

## Verification of result of apriori algorithm

In [35]:
def verify_min_support_counting(data, freq_1_itemsets, min_support, k, large_data=False):
    combinations = []        
    for i in range(2, k):
        combinations.append(list(itertools.combinations(freq_1_itemsets.keys(), i)))

    # Make a flattened list of tuples
    combinations = [item for sublist in combinations for item in sublist]
    # Count the number of occurences of each combination in the data
    
    temp_combinations_count = {}
    for combination in combinations:
        # print(combination)
        # Using groupby and size to count the number of occurences of each combination
        # Resetting the index to get the count of each combination as a column in the dataframe
        support_count = data.groupby(list(combination)).size().reset_index(name='count')
        # Append the combination and its count to the dictionary
        # The count of each combination is the last value in the count column
        # Moreover, we need to check whether the last row is a combination of 1s instead of 1s and 0s
        # If it is a combination of 1s, then we append the combination and its count to the dictionary
        # Otherwise, we do not append it to the dictionary
        num_ones = 0
        for i in range(len(support_count.columns)-1):
            if support_count[support_count.columns[i]].iloc[-1] != 1:
                break
            else:
                num_ones += 1
                continue

        if not large_data:    
            if num_ones == len(support_count.columns)-1 and support_count['count'].iloc[-1] >= min_support:
                temp_combinations_count[combination] = support_count['count'].iloc[-1]
        else:
            if num_ones == len(support_count.columns)-1:
                temp_combinations_count[combination] = support_count['count'].iloc[-1]
    # print(combinations_count)
    return temp_combinations_count

In [7]:
def verify_apriori_frequent_itemsets(file_location, min_support, combined_freq_itemsets, k):
    # Check the size of the file
    # If the file size is less than 4 GB, we can read the file directly
    # Otherwise, we need to read the file in chunks
    file_size = os.path.getsize(file_location)
    file_size_GB = file_size/1000000000
    print(f'The file size is {file_size} bytes, which is {file_size_GB} GB')

    # Chunksize for reading the data
    chunksize = 2000000
    # File size threshold in GB
    file_size_threshold = 4

    # Verification combined dictionary of frequent itemsets
    combinations_count = defaultdict(int)

    if file_size_GB < file_size_threshold:
        print('File size is less than 4 GB, reading the file directly to verify frequent itemsets\n')
        data = pd.read_csv(file_location)
        
        freq_1_itemset = generate_freq_1_itemsets(data, min_support, combinations_count)
        print(freq_1_itemset)

        temp_combinations_count = verify_min_support_counting(data, freq_1_itemset, min_support, k)
        combinations_count = reduce(reducer, [combinations_count, temp_combinations_count])

        # Compare the combinations_count with the combined_freq_itemsets
        # If the combinations_count is not the same as the combined_freq_itemsets, then the frequent itemsets are not correct
        print('verify', combinations_count.keys())
        print('result', combined_freq_itemsets.keys())
        if combinations_count.keys() != combined_freq_itemsets.keys():
            print('The frequent itemsets mined are not correct.')
        else:
            print('The frequent itemsets mined are correct!')

    else:
        print('File size is more than 4 GB, reading the file in chunks to verify frequent itemsets\n')
        # Read the data in chunks of 10000 rows each to ensure that the data fits in memory
        data = pd.read_csv(file_location, chunksize=chunksize, iterator=True)
        chunk_size_in_bytes = data.get_chunk().memory_usage(deep=True).sum()
        print(f'Size of one chunk in bytes is {chunk_size_in_bytes}, which is {chunk_size_in_bytes/1000000000} GB')

        freq_1_itemset = generate_freq_1_itemsets_for_large_data(data, min_support, combinations_count)
        
        
        data = pd.read_csv(file_location, chunksize=chunksize, iterator=True)
        for i, chunk in enumerate(data):
            print(f'chunk {i}')
            chunk.fillna(0, inplace=True)
            
            temp_dict = verify_min_support_counting(chunk, freq_1_itemset, min_support, k, large_data=True)
            combinations_count = reduce(reducer, [combinations_count, temp_dict])
            # print(combinations_count)
        # Remove keys with count less than min_support
        for combination in combinations_count.copy().keys():
            if combinations_count[combination] < min_support:
                combinations_count.pop(combination)
        # Compare the combinations_count with the combined_freq_itemsets
        # If the combinations_count is not the same as the combined_freq_itemsets, then the frequent itemsets are not correct
        print('verify', combinations_count.items())
        print('result', combined_freq_itemsets.items())
        if combinations_count.keys() != combined_freq_itemsets.keys():
            print('The frequent itemsets mined are not correct.')
        else:
            print('The frequent itemsets mined are correct!')


<h1> Task 2: Use 3 datasets to run Apriori algorithm with different min-support thresholds </h1>

<h2> 1. Grocery store dataset </h2>

### Data Preprocessing

In [223]:
df = pd.read_csv('dataset/grocery_store/Market_Basket_Optimisation.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [224]:
# Data CLeaning
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [225]:
# Get the unique items in the dataset
unique_items = pd.unique(df.values.ravel('K'))
unique_items

array(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries', 'eggs', 'cookies', 'spaghetti', 'meatballs',
       'red wine', 'rice', 'parmesan cheese', 'ground beef',
       'sparkling water', 'herb & pepper', 'pickles', 'energy bar',
       'fresh tuna', 'escalope', 'avocado', 'tomato sauce',
       'clothes accessories', 'energy drink', 'chocolate',
       'grated cheese', 'yogurt cake', 'mint', 'asparagus', 'champagne',
       'ham', 'muffins', 'french wine', 'chicken', 'pasta', 'tomatoes',
       'pancakes', 'frozen smoothie', 'carrots', 'yams', 'shallot',
       'butter', 'light mayo', 'pepper', 'candy bars', 'cooking oil',
       'milk', 'green tea', 'bug spray', 'oil', 'olive oil', 'salmon',
       'cake', 'almonds', 'salt', 'strong cheese', 'hot dogs', 'pet food',
       'whole wheat rice', 'antioxydant juice', 'honey', 'sandwich',
       'salad', 'magazines', 'protein bar', '

In [226]:
# Set the unique items as the column names
transactions_data = pd.DataFrame(columns=unique_items)
transactions_data.drop(columns= 0, inplace=True)
transactions_data

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus


In [227]:
# Iterate through the supermarket dataset
# Each row is a transaction
# If the item is present in the transaction, set the value as 1

for i in range(0, len(df)):
    transaction = df.iloc[i, :].values
    # Remove the 0s from the transaction
    transaction = transaction[transaction != 0]

    # Set the value as 1 if the item is present in the transaction
    for item in transaction:
        transactions_data.at[i, item] = 1

In [228]:
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1.0,,,,1.0,1.0,,,,,...,,,,,,,,,,
1,,1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,1.0,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,


In [229]:
transactions_data.fillna(0, inplace=True)
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [230]:
transactions_data.columns = transactions_data.columns.str.strip()
transactions_data.columns

Index(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries',
       ...
       'ketchup', 'cream', 'hand protein bar', 'body spray', 'oatmeal',
       'zucchini', 'water spray', 'tea', 'napkins', 'asparagus'],
      dtype='object', length=120)

In [231]:
# Save the transactions_data dataframe as a csv file
transactions_data.to_csv('dataset/grocery_store/Market_Basket_Cleaned.csv', index=False)

### Trying out different values of min support

In [8]:
# Location of the cleaned dataset
file_location = 'dataset/grocery_store/Market_Basket_Cleaned.csv'

In [9]:
data = pd.read_csv(file_location)
data.shape

(7501, 120)

In [15]:
# Try out different values of min_support
min_support_values = [50, 100, 500, 1000, 2000]
results_dict = {}
combined_freq_itemsets_dict = {}
for min_support in min_support_values:
    print(f'min_support: {min_support}')
    combined_freq_itemsets, rules, running_time, k = my_apriori(file_location, min_support, 0.4)
    print_freq_itemsets(combined_freq_itemsets)
    print_rules(rules)
    print(f'Running time is {running_time} seconds\n')
    combined_freq_itemsets_dict[min_support] = [combined_freq_itemsets, k]
    results_dict[min_support] = [running_time, len(combined_freq_itemsets)]

min_support: 50
The file size is 1808993 bytes, which is 0.001808993 GB
File size is less than 4 GB, reading the file directly to generate frequent 1 itemset

The support ratio is 0.006665777896280496

{('shrimp',): 536, ('burgers',): 654, ('turkey',): 469, ('mineral water',): 1788, ('low fat yogurt',): 574, ('whole wheat pasta',): 221, ('soup',): 379, ('frozen vegetables',): 715, ('french fries',): 1282, ('eggs',): 1348, ('cookies',): 603, ('spaghetti',): 1306, ('meatballs',): 157, ('red wine',): 211, ('rice',): 141, ('parmesan cheese',): 149, ('ground beef',): 737, ('herb & pepper',): 371, ('energy bar',): 203, ('fresh tuna',): 167, ('escalope',): 595, ('avocado',): 250, ('tomato sauce',): 106, ('clothes accessories',): 63, ('energy drink',): 200, ('chocolate',): 1229, ('grated cheese',): 393, ('yogurt cake',): 205, ('mint',): 131, ('champagne',): 351, ('ham',): 199, ('muffins',): 181, ('french wine',): 169, ('chicken',): 450, ('pasta',): 118, ('tomatoes',): 513, ('pancakes',): 713, 

In [16]:
# Convert the dictionary to a dataframe
results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=['Running Time', 'Number of Frequent Itemsets'])
results_df.index.name = 'Minimum Support'
results_df.reset_index(inplace=True)
results_df

Unnamed: 0,Minimum Support,Running Time,Number of Frequent Itemsets
0,50,8.229676,470
1,100,5.271868,187
2,500,0.429124,17
3,1000,0.173665,5
4,2000,0.131948,0


<h3> Verify with mlextend Apriori library </h3>

In [216]:
from IPython.display import display, HTML
mlextend_result_dict = {}
for min_support in min_support_values:
    freq_items = apriori(data, min_support=min_support/len(data), use_colnames=True)
    print(f'min_support: {min_support}')
    print(len(freq_items))
    display(freq_items)

    mlextend_result_dict[min_support] = len(freq_items)

    if len(freq_items) != 0:
        rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
        display(rules)
    



min_support: 50
470


Unnamed: 0,support,itemsets
0,0.071457,(shrimp)
1,0.087188,(burgers)
2,0.062525,(turkey)
3,0.238368,(mineral water)
4,0.076523,(low fat yogurt)
...,...,...
465,0.006799,"(chocolate, pancakes, spaghetti)"
466,0.010932,"(chocolate, milk, spaghetti)"
467,0.007066,"(chocolate, green tea, spaghetti)"
468,0.007066,"(chocolate, olive oil, spaghetti)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
1,(rice),(mineral water),0.018797,0.238368,0.007732,0.411348,1.725681,0.003252,1.293856,0.428575
2,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
3,(olive oil),(mineral water),0.065858,0.238368,0.027596,0.419028,1.757904,0.011898,1.310962,0.461536
4,(salmon),(mineral water),0.042528,0.238368,0.017064,0.401254,1.683336,0.006927,1.272045,0.423972
5,(protein bar),(mineral water),0.018531,0.238368,0.007732,0.417266,1.750511,0.003315,1.306998,0.436833
6,(light cream),(mineral water),0.015598,0.238368,0.007332,0.470085,1.972098,0.003614,1.437273,0.500736
7,"(shrimp, frozen vegetables)",(mineral water),0.016664,0.238368,0.007199,0.432,1.812322,0.003227,1.340901,0.455818
8,"(shrimp, spaghetti)",(mineral water),0.021197,0.238368,0.008532,0.402516,1.68863,0.003479,1.274731,0.416635
9,"(chocolate, shrimp)",(mineral water),0.017998,0.238368,0.007599,0.422222,1.771303,0.003309,1.318209,0.443424




min_support: 100
187


Unnamed: 0,support,itemsets
0,0.071457,(shrimp)
1,0.087188,(burgers)
2,0.062525,(turkey)
3,0.238368,(mineral water)
4,0.076523,(low fat yogurt)
...,...,...
182,0.013465,"(chocolate, mineral water, eggs)"
183,0.017064,"(mineral water, ground beef, spaghetti)"
184,0.015865,"(chocolate, mineral water, spaghetti)"
185,0.015731,"(milk, mineral water, spaghetti)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
1,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
2,(olive oil),(mineral water),0.065858,0.238368,0.027596,0.419028,1.757904,0.011898,1.310962,0.461536
3,(salmon),(mineral water),0.042528,0.238368,0.017064,0.401254,1.683336,0.006927,1.272045,0.423972
4,"(chocolate, eggs)",(mineral water),0.033196,0.238368,0.013465,0.405622,1.701663,0.005552,1.281394,0.426498
5,"(mineral water, ground beef)",(spaghetti),0.040928,0.17411,0.017064,0.416938,2.394681,0.009938,1.41647,0.607262
6,"(ground beef, spaghetti)",(mineral water),0.039195,0.238368,0.017064,0.435374,1.826477,0.007722,1.348914,0.470957
7,"(chocolate, spaghetti)",(mineral water),0.039195,0.238368,0.015865,0.404762,1.698053,0.006522,1.279541,0.42786
8,"(milk, spaghetti)",(mineral water),0.035462,0.238368,0.015731,0.443609,1.861024,0.007278,1.368879,0.479672
9,"(chocolate, milk)",(mineral water),0.032129,0.238368,0.013998,0.435685,1.82778,0.00634,1.349656,0.467922


min_support: 500
17




Unnamed: 0,support,itemsets
0,0.071457,(shrimp)
1,0.087188,(burgers)
2,0.238368,(mineral water)
3,0.076523,(low fat yogurt)
4,0.095321,(frozen vegetables)
5,0.170911,(french fries)
6,0.179709,(eggs)
7,0.080389,(cookies)
8,0.17411,(spaghetti)
9,0.098254,(ground beef)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


min_support: 1000
5




Unnamed: 0,support,itemsets
0,0.238368,(mineral water)
1,0.170911,(french fries)
2,0.179709,(eggs)
3,0.17411,(spaghetti)
4,0.163845,(chocolate)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


min_support: 2000
0




Unnamed: 0,support,itemsets


In [127]:
# Convert the dictionary to a dataframe
mlextend_results_df = pd.DataFrame.from_dict(mlextend_result_dict, orient='index', columns=['Number of Frequent Itemsets'])
mlextend_results_df.index.name = 'Minimum Support'
mlextend_results_df.reset_index(inplace=True)
mlextend_results_df

Unnamed: 0,Minimum Support,Number of Frequent Itemsets
0,50,470
1,100,187
2,500,17
3,1000,5
4,2000,0


### Verify with my implemented verification function

In [17]:
for min_support in min_support_values:
    combined_freq_itemsets = combined_freq_itemsets_dict[min_support][0]
    k = combined_freq_itemsets_dict[min_support][1]
    print(f'min_support: {min_support}')
    verify_apriori_frequent_itemsets(file_location, min_support, combined_freq_itemsets, k)
    print('\n')

min_support: 50
The file size is 1808993 bytes, which is 0.001808993 GB
File size is less than 4 GB, reading the file directly to verify frequent itemsets

The support ratio is 0.006665777896280496

{('shrimp',): 536, ('burgers',): 654, ('turkey',): 469, ('mineral water',): 1788, ('low fat yogurt',): 574, ('whole wheat pasta',): 221, ('soup',): 379, ('frozen vegetables',): 715, ('french fries',): 1282, ('eggs',): 1348, ('cookies',): 603, ('spaghetti',): 1306, ('meatballs',): 157, ('red wine',): 211, ('rice',): 141, ('parmesan cheese',): 149, ('ground beef',): 737, ('herb & pepper',): 371, ('energy bar',): 203, ('fresh tuna',): 167, ('escalope',): 595, ('avocado',): 250, ('tomato sauce',): 106, ('clothes accessories',): 63, ('energy drink',): 200, ('chocolate',): 1229, ('grated cheese',): 393, ('yogurt cake',): 205, ('mint',): 131, ('champagne',): 351, ('ham',): 199, ('muffins',): 181, ('french wine',): 169, ('chicken',): 450, ('pasta',): 118, ('tomatoes',): 513, ('pancakes',): 713, ('f

<h2> 2. Titanic dataset </h2>

### Data Preprocessing

In [352]:
survival_df = pd.read_csv('dataset/titanic/gender_submission.csv')
survival_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [353]:
train_titanic_df = pd.read_csv('dataset/titanic/train.csv')
test_titanic_df = pd.read_csv('dataset/titanic/test.csv')

In [354]:
train_titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [355]:
# Drop the columns that are not required
train_titanic_df.drop(columns=['PassengerId', 'Name','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin'], inplace=True)
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,22.0,S
1,1,1,female,38.0,C
2,1,3,female,26.0,S
3,1,1,female,35.0,S
4,0,3,male,35.0,S


In [356]:
# Categorise the Age column
# Age 21 and below is a Child
# Age between 21 and 55 is an Adult
# Age above 55 is an Elderly
train_titanic_df['Age'] = pd.cut(train_titanic_df['Age'], bins=[0, 21, 55, 80], labels=['Child', 'Adult', 'Elderly'])
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,Adult,S
1,1,1,female,Adult,C
2,1,3,female,Adult,S
3,1,1,female,Adult,S
4,0,3,male,Adult,S


In [357]:
# Convert into one hot encoding
train_titanic_df = pd.get_dummies(train_titanic_df)
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex_female,Sex_male,Age_Child,Age_Adult,Age_Elderly,Embarked_C,Embarked_Q,Embarked_S
0,0,3,False,True,False,True,False,False,False,True
1,1,1,True,False,False,True,False,True,False,False
2,1,3,True,False,False,True,False,False,False,True
3,1,1,True,False,False,True,False,False,False,True
4,0,3,False,True,False,True,False,False,False,True


In [358]:
# Converet Pclass into one hot encoding
train_titanic_df = pd.get_dummies(train_titanic_df, columns=['Pclass'])
train_titanic_df.head()


Unnamed: 0,Survived,Sex_female,Sex_male,Age_Child,Age_Adult,Age_Elderly,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0,False,True,False,True,False,False,False,True,False,False,True
1,1,True,False,False,True,False,True,False,False,True,False,False
2,1,True,False,False,True,False,False,False,True,False,False,True
3,1,True,False,False,True,False,False,False,True,True,False,False
4,0,False,True,False,True,False,False,False,True,False,False,True


In [359]:
train_titanic_df.groupby(['Sex_male', 'Sex_female', 'Survived']).size().reset_index(name='count')

Unnamed: 0,Sex_male,Sex_female,Survived,count
0,False,True,0,81
1,False,True,1,233
2,True,False,0,468
3,True,False,1,109


In [360]:
len(train_titanic_df.columns)

12

In [112]:
combination = ('Survived', 'Sex_female')
target = ('Survived',)
tuple(set(combination) - set(target))

('Sex_female',)

In [361]:
train_titanic_df.shape

(891, 12)

In [362]:
train_titanic_df.to_csv('dataset/titanic/train_cleaned.csv', index=False)

### Trying out different values of min support

In [10]:
file_location = 'dataset/titanic/train_cleaned.csv'

In [11]:
data = pd.read_csv(file_location)
data.shape

(891, 12)

In [12]:
# Try out different values of min_support with target as Survived and 891 records
min_support_values = [10, 50, 100, 300, 600]
results_dict = {}
combined_freq_itemsets_dict = {}
for min_support in min_support_values:
    print(f'min_support: {min_support}')
    combined_freq_itemsets, rules, running_time, k = my_apriori(file_location, min_support, 0.4, target='Survived')
    print_freq_itemsets(combined_freq_itemsets)
    print_rules(rules)
    print(f'Running time is {running_time} seconds\n')
    combined_freq_itemsets_dict[min_support] = [combined_freq_itemsets, k]
    results_dict[min_support] = [running_time, len(combined_freq_itemsets)]

min_support: 10
The file size is 58216 bytes, which is 5.8216e-05 GB
File size is less than 4 GB, reading the file directly to generate frequent 1 itemset

The support ratio is 0.01122334455667789

{('Survived',): 342, ('Sex_female',): 314, ('Sex_male',): 577, ('Age_Child',): 204, ('Age_Adult',): 470, ('Age_Elderly',): 40, ('Embarked_C',): 168, ('Embarked_Q',): 77, ('Embarked_S',): 644, ('Pclass_1',): 216, ('Pclass_2',): 184, ('Pclass_3',): 491}
{'Survived': 342, 'Sex_female': 314, 'Sex_male': 577, 'Age_Child': 204, 'Age_Adult': 470, 'Age_Elderly': 40, 'Embarked_C': 168, 'Embarked_Q': 77, 'Embarked_S': 644, 'Pclass_1': 216, 'Pclass_2': 184, 'Pclass_3': 491}
k_plus_1_itemsets_support_counting {('Survived', 'Sex_female'): 233, ('Survived', 'Sex_male'): 109, ('Survived', 'Age_Child'): 87, ('Survived', 'Age_Adult'): 191, ('Survived', 'Age_Elderly'): 12, ('Survived', 'Embarked_C'): 93, ('Survived', 'Embarked_Q'): 30, ('Survived', 'Embarked_S'): 217, ('Survived', 'Pclass_1'): 136, ('Survived

In [13]:
# Convert the dictionary to a dataframe
results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=['Running Time', 'Number of Frequent Itemsets'])
results_df.index.name = 'Minimum Support'
results_df.reset_index(inplace=True)
results_df

Unnamed: 0,Minimum Support,Running Time,Number of Frequent Itemsets
0,10,0.814567,204
1,50,0.375874,98
2,100,0.187252,46
3,300,0.099412,11
4,600,0.010526,1


### Verify with mlextend Apriori library

In [15]:
mlextend_result_dict = {}
for min_support in min_support_values:
    freq_items = apriori(data, min_support=min_support/len(data), use_colnames=True)
    print(f'min_support: {min_support}')
    print(len(freq_items))
    display(freq_items)

    mlextend_result_dict[min_support] = len(freq_items)

    if len(freq_items) != 0:
        rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
        display(rules)

min_support: 10
204




Unnamed: 0,support,itemsets
0,0.383838,(Survived)
1,0.352413,(Sex_female)
2,0.647587,(Sex_male)
3,0.228956,(Age_Child)
4,0.527497,(Age_Adult)
...,...,...
199,0.020202,"(Embarked_S, Age_Adult, Sex_female, Pclass_3, ..."
200,0.013468,"(Sex_male, Embarked_S, Pclass_3, Age_Child, Su..."
201,0.01459,"(Pclass_1, Sex_male, Age_Adult, Embarked_C, Su..."
202,0.022447,"(Pclass_1, Sex_male, Embarked_S, Age_Adult, Su..."


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.745420
1,(Survived),(Sex_female),0.383838,0.352413,0.261504,0.681287,1.933205,0.126234,2.031878,0.783438
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
4,(Survived),(Age_Adult),0.383838,0.527497,0.214366,0.558480,1.058735,0.011892,1.070172,0.090035
...,...,...,...,...,...,...,...,...,...,...
425,"(Age_Adult, Sex_male, Embarked_S, Survived)",(Pclass_3),0.047138,0.551066,0.020202,0.428571,0.777713,-0.005774,0.785634,-0.230746
426,"(Survived, Sex_male, Embarked_S, Pclass_3)",(Age_Adult),0.038159,0.527497,0.020202,0.529412,1.003630,0.000073,1.004068,0.003760
427,"(Age_Adult, Sex_male, Survived, Pclass_3)",(Embarked_S),0.024691,0.722783,0.020202,0.818182,1.131988,0.002356,1.524691,0.119550
428,"(Age_Adult, Survived, Embarked_S, Pclass_3)",(Sex_male),0.040404,0.647587,0.020202,0.500000,0.772097,-0.005963,0.704826,-0.235242


min_support: 50
98




Unnamed: 0,support,itemsets
0,0.383838,(Survived)
1,0.352413,(Sex_female)
2,0.647587,(Sex_male)
3,0.228956,(Age_Child)
4,0.527497,(Age_Adult)
...,...,...
93,0.056117,"(Age_Adult, Pclass_2, Embarked_S, Survived)"
94,0.057239,"(Age_Adult, Sex_female, Embarked_S, Pclass_2)"
95,0.085297,"(Age_Child, Sex_male, Embarked_S, Pclass_3)"
96,0.071829,"(Age_Adult, Pclass_2, Sex_male, Embarked_S)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.745420
1,(Survived),(Sex_female),0.383838,0.352413,0.261504,0.681287,1.933205,0.126234,2.031878,0.783438
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
4,(Survived),(Age_Adult),0.383838,0.527497,0.214366,0.558480,1.058735,0.011892,1.070172,0.090035
...,...,...,...,...,...,...,...,...,...,...
193,"(Age_Adult, Sex_male, Pclass_3)",(Embarked_S),0.176207,0.722783,0.150393,0.853503,1.180856,0.023034,1.892305,0.185916
194,"(Age_Adult, Embarked_S, Pclass_3)",(Sex_male),0.198653,0.647587,0.150393,0.757062,1.169051,0.021748,1.450630,0.180453
195,"(Sex_male, Embarked_S, Pclass_3)",(Age_Adult),0.297419,0.527497,0.150393,0.505660,0.958603,-0.006495,0.955826,-0.057907
196,"(Age_Adult, Sex_male)","(Embarked_S, Pclass_3)",0.338945,0.396184,0.150393,0.443709,1.119956,0.016108,1.085431,0.162025


min_support: 100
46




Unnamed: 0,support,itemsets
0,0.383838,(Survived)
1,0.352413,(Sex_female)
2,0.647587,(Sex_male)
3,0.228956,(Age_Child)
4,0.527497,(Age_Adult)
5,0.188552,(Embarked_C)
6,0.722783,(Embarked_S)
7,0.242424,(Pclass_1)
8,0.20651,(Pclass_2)
9,0.551066,(Pclass_3)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.745420
1,(Survived),(Sex_female),0.383838,0.352413,0.261504,0.681287,1.933205,0.126234,2.031878,0.783438
2,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
3,(Survived),(Age_Adult),0.383838,0.527497,0.214366,0.558480,1.058735,0.011892,1.070172,0.090035
4,(Survived),(Embarked_S),0.383838,0.722783,0.243547,0.634503,0.877860,-0.033885,0.758465,-0.184211
...,...,...,...,...,...,...,...,...,...,...
70,"(Age_Adult, Sex_male, Pclass_3)",(Embarked_S),0.176207,0.722783,0.150393,0.853503,1.180856,0.023034,1.892305,0.185916
71,"(Age_Adult, Embarked_S, Pclass_3)",(Sex_male),0.198653,0.647587,0.150393,0.757062,1.169051,0.021748,1.450630,0.180453
72,"(Sex_male, Embarked_S, Pclass_3)",(Age_Adult),0.297419,0.527497,0.150393,0.505660,0.958603,-0.006495,0.955826,-0.057907
73,"(Age_Adult, Sex_male)","(Embarked_S, Pclass_3)",0.338945,0.396184,0.150393,0.443709,1.119956,0.016108,1.085431,0.162025


min_support: 300
11




Unnamed: 0,support,itemsets
0,0.383838,(Survived)
1,0.352413,(Sex_female)
2,0.647587,(Sex_male)
3,0.527497,(Age_Adult)
4,0.722783,(Embarked_S)
5,0.551066,(Pclass_3)
6,0.338945,"(Age_Adult, Sex_male)"
7,0.494949,"(Sex_male, Embarked_S)"
8,0.38945,"(Sex_male, Pclass_3)"
9,0.415264,"(Age_Adult, Embarked_S)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Age_Adult),(Sex_male),0.527497,0.647587,0.338945,0.642553,0.992227,-0.002655,0.985917,-0.016309
1,(Sex_male),(Age_Adult),0.647587,0.527497,0.338945,0.523397,0.992227,-0.002655,0.991397,-0.021746
2,(Sex_male),(Embarked_S),0.647587,0.722783,0.494949,0.764298,1.057437,0.026884,1.176132,0.15413
3,(Embarked_S),(Sex_male),0.722783,0.647587,0.494949,0.684783,1.057437,0.026884,1.118,0.195939
4,(Sex_male),(Pclass_3),0.647587,0.551066,0.38945,0.601386,1.091314,0.032587,1.126238,0.237431
5,(Pclass_3),(Sex_male),0.551066,0.647587,0.38945,0.706721,1.091314,0.032587,1.201631,0.186383
6,(Age_Adult),(Embarked_S),0.527497,0.722783,0.415264,0.787234,1.08917,0.033998,1.302918,0.173268
7,(Embarked_S),(Age_Adult),0.722783,0.527497,0.415264,0.574534,1.08917,0.033998,1.110554,0.295328
8,(Embarked_S),(Pclass_3),0.722783,0.551066,0.396184,0.548137,0.994684,-0.002117,0.993517,-0.018915
9,(Pclass_3),(Embarked_S),0.551066,0.722783,0.396184,0.718941,0.994684,-0.002117,0.986329,-0.011765




min_support: 600
1


Unnamed: 0,support,itemsets
0,0.722783,(Embarked_S)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


In [135]:
# Convert the dictionary to a dataframe
mlextend_results_df = pd.DataFrame.from_dict(mlextend_result_dict, orient='index', columns=['Number of Frequent Itemsets'])
mlextend_results_df.index.name = 'Minimum Support'
mlextend_results_df.reset_index(inplace=True)

mlextend_results_df

Unnamed: 0,Minimum Support,Number of Frequent Itemsets
0,10,204
1,50,98
2,100,46
3,300,11
4,600,1


### Verify with my implemented verification function

In [136]:
# Verify the frequent itemsets
for min_support in min_support_values:
    combined_freq_itemsets = combined_freq_itemsets_dict[min_support][0]
    k = combined_freq_itemsets_dict[min_support][1]
    print(f'min_support: {min_support}')
    verify_apriori_frequent_itemsets(file_location, min_support, combined_freq_itemsets, k)
    print('\n')

min_support: 10
The file size is 58216 bytes, which is 5.8216e-05 GB
File size is less than 4 GB, reading the file directly to verify frequent itemsets

The support ratio is 0.01122334455667789

{('Survived',): 342, ('Sex_female',): 314, ('Sex_male',): 577, ('Age_Child',): 204, ('Age_Adult',): 470, ('Age_Elderly',): 40, ('Embarked_C',): 168, ('Embarked_Q',): 77, ('Embarked_S',): 644, ('Pclass_1',): 216, ('Pclass_2',): 184, ('Pclass_3',): 491}
{'Survived': 342, 'Sex_female': 314, 'Sex_male': 577, 'Age_Child': 204, 'Age_Adult': 470, 'Age_Elderly': 40, 'Embarked_C': 168, 'Embarked_Q': 77, 'Embarked_S': 644, 'Pclass_1': 216, 'Pclass_2': 184, 'Pclass_3': 491}
{'Survived': 342, 'Sex_female': 314, 'Sex_male': 577, 'Age_Child': 204, 'Age_Adult': 470, 'Age_Elderly': 40, 'Embarked_C': 168, 'Embarked_Q': 77, 'Embarked_S': 644, 'Pclass_1': 216, 'Pclass_2': 184, 'Pclass_3': 491}
verify dict_keys([('Survived',), ('Sex_female',), ('Sex_male',), ('Age_Child',), ('Age_Adult',), ('Age_Elderly',), ('Emba

In [217]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
rules 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Survived),(Sex_female),0.383838,0.352413,0.261504,0.681287,1.933205,0.126234,2.031878,0.783438
1,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.745420
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
4,(Survived),(Age_Adult),0.383838,0.527497,0.214366,0.558480,1.058735,0.011892,1.070172,0.090035
...,...,...,...,...,...,...,...,...,...,...
278,"(Pclass_2, Survived, Sex_female)","(Age_Adult, Embarked_S)",0.078563,0.415264,0.051627,0.657143,1.582471,0.019003,1.705481,0.399460
279,"(Pclass_2, Age_Adult, Survived)","(Embarked_S, Sex_female)",0.061728,0.227834,0.051627,0.836364,3.670936,0.037564,4.718793,0.775458
280,"(Pclass_2, Age_Adult, Sex_female)","(Embarked_S, Survived)",0.062851,0.243547,0.051627,0.821429,3.372778,0.036320,4.236139,0.750690
281,"(Pclass_2, Survived)","(Age_Adult, Embarked_S, Sex_female)",0.097643,0.140292,0.051627,0.528736,3.768828,0.037929,1.824259,0.814163


In [188]:
rules['consequents'][2] == {'Survived'}

True

In [220]:
rules[rules['consequents'] == {'Survived'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.74542
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
5,(Embarked_C),(Survived),0.188552,0.383838,0.104377,0.553571,1.442199,0.032004,1.380202,0.377861
7,(Pclass_1),(Survived),0.242424,0.383838,0.152637,0.62963,1.640351,0.059586,1.663636,0.515294
8,(Pclass_2),(Survived),0.20651,0.383838,0.097643,0.472826,1.231836,0.018377,1.168801,0.237185
45,"(Age_Child, Sex_female)",(Survived),0.094276,0.383838,0.063973,0.678571,1.767857,0.027786,1.916947,0.479554
47,"(Age_Adult, Sex_female)",(Survived),0.188552,0.383838,0.148148,0.785714,2.046992,0.075775,2.875421,0.630328
51,"(Embarked_C, Sex_female)",(Survived),0.08193,0.383838,0.071829,0.876712,2.284066,0.040381,4.997755,0.612355
53,"(Embarked_S, Sex_female)",(Survived),0.227834,0.383838,0.157127,0.689655,1.796733,0.069675,1.98541,0.574273


<h2> 3. Yelp dataset </h2>

### Data Preprocessing

In [2]:
import json
import pandas as pd

In [None]:
data_file = open("dataset/yelp/yelp_academic_dataset_user.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
user_df = pd.DataFrame(data)
data_file.close()


In [18]:
user_df.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [23]:
user_df.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')

In [5]:
data_file = open("dataset/yelp/yelp_academic_dataset_review.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
review_df = pd.DataFrame(data)
data_file.close()

In [6]:
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [24]:
review_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [23]:
review_df['cool'].value_counts()

cool
0      5377964
1      1016736
2       296999
3       114763
4        56609
        ...   
133          1
205          1
306          1
370          1
304          1
Name: count, Length: 210, dtype: int64

In [11]:
review_df.drop(columns=['review_id', 'user_id', 'text', 'date'], inplace=True)

In [7]:
data_file = open("dataset/yelp/yelp_academic_dataset_business.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
business_df = pd.DataFrame(data)
data_file.close()

In [8]:
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
business_df.drop(columns=['address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'attributes', 'hours'], inplace=True)

In [13]:
# rename stars column to business_stars
business_df.rename(columns={'stars': 'business_stars'}, inplace=True)

In [30]:
# save the dataframes as csv files
# user_df.to_csv('dataset/yelp/user.csv', index=False)
review_df.to_csv('dataset/yelp/review.csv', index=False)
business_df.to_csv('dataset/yelp/business.csv', index=False)

In [3]:
review_df = pd.read_csv('dataset/yelp/review.csv')
business_df = pd.read_csv('dataset/yelp/business.csv')

In [5]:
review_df.head()

Unnamed: 0,business_id,stars,useful,funny,cool
0,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0
1,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1
2,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0
3,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1
4,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1


In [6]:
business_df.head()

Unnamed: 0,business_id,name,business_stars,review_count,is_open,categories
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",5.0,7,0,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,3.0,15,1,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,3.5,22,0,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,4.5,13,1,"Brewpubs, Breweries, Food"


In [7]:
# Convert the stars column into categorical data with 2 categories: 0 and 1
# 0 if the stars is less than or equal to 3
# 1 if the stars is greater than 3
review_df['stars'] = pd.cut(review_df['stars'], bins=[0, 4, 5], labels=[0, 1])
review_df.head()


Unnamed: 0,business_id,stars,useful,funny,cool
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0
1,7ATYjTIgM3jUlt4UM3IypQ,1,1,0,1
2,YjUWPpI6HXG530lwP-fb2A,0,0,0,0
3,kxX2SOes4o-D3ZQBkiMRfA,1,1,0,1
4,e4Vwtrqf-wpJfwesgvdgxQ,0,1,0,1


In [8]:
# Rename the stars column to review_stars
review_df.rename(columns={'stars': 'good review'}, inplace=True)

In [9]:
# Match the business_id in the review_df with the business_id in the business_df and concatenate the dataframes
review_business_df = pd.merge(review_df, business_df, on='business_id')
review_business_df.head()

Unnamed: 0,business_id,good review,useful,funny,cool,name,business_stars,review_count,is_open,categories
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,XQfwVwDr-v0ZS3_CbbE5Xw,0,2,0,1,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [10]:
review_business_df.shape

(6990280, 10)

In [11]:
review_business_df.drop(columns=['business_id', 'name'], inplace=True)

In [12]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [13]:
# Convert the stars column into categorical data
review_business_df['business_stars'] = pd.cut(review_business_df['business_stars'], bins=[0, 2, 3, 5], labels=['Bad', 'Average', 'Good'])

In [14]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [27]:
sorted(review_business_df['review_count'].unique())

[5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 1

In [15]:
# Convert the review_count column into categorical data
review_business_df['review_count'] = pd.cut(review_business_df['review_count'], bins=[0, 600, 700, 1200], labels=['Low', 'Medium', 'High'])

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [16]:
# COnvert stars and buisness_stars into one hot encoding
review_business_df = pd.get_dummies(review_business_df, columns=['business_stars'])
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,review_count,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good
0,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
1,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
2,0,2,0,1,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
3,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
4,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False


In [17]:
# Convert True and False for business_stars_Bad, business_stars_Average, business_stars_Good into 1 and 0
review_business_df['business_stars_Bad'] = review_business_df['business_stars_Bad'].astype(int)
review_business_df['business_stars_Average'] = review_business_df['business_stars_Average'].astype(int)
review_business_df['business_stars_Good'] = review_business_df['business_stars_Good'].astype(int)

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,review_count,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good
0,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
1,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
2,0,2,0,1,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
3,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
4,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0


In [18]:
# Convert the review_count column into one hot encoding
review_business_df = pd.get_dummies(review_business_df, columns=['review_count'])
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
1,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
2,0,2,0,1,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
3,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
4,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False


In [19]:
# Convert True and False for review_count_High, review_count_Low, review_count_Medium into 1 and 0
review_business_df['review_count_High'] = review_business_df['review_count_High'].astype(int)
review_business_df['review_count_Low'] = review_business_df['review_count_Low'].astype(int)
review_business_df['review_count_Medium'] = review_business_df['review_count_Medium'].astype(int)

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
1,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
2,0,2,0,1,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
3,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
4,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0


In [20]:
# Drop the categories column
review_business_df.drop(columns=['categories'], inplace=True)

In [21]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,2,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [32]:
# Convert useful into one hot encoding, any value greater than 1 is 1
review_business_df['useful'] = review_business_df['useful'].apply(lambda x: 1 if x > 1 else 0)
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [32]:
review_business_df = pd.read_csv('dataset/yelp/review_business.csv')

In [33]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,2,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [34]:
# Convert funny into one hot encoding, any value greater than 1 is 1
# Convert cool into one hot encoding, any value greater than 1 is 1

review_business_df['useful'] = review_business_df['useful'].apply(lambda x: 1 if x > 1 else 0)
review_business_df['funny'] = review_business_df['funny'].apply(lambda x: 1 if x > 1 else 0)
review_business_df['cool'] = review_business_df['cool'].apply(lambda x: 1 if x > 1 else 0)

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [35]:
# Save the dataframe as a csv file
review_business_df.to_csv('dataset/yelp/review_business.csv', index=False)

In [34]:
# Run the apriori algorithm
combined_freq_itemsets, rules = my_apriori(review_business_df, 1000000, 0.4, target='good review')

{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264}
{'good review': 3231627, 'useful': 1609831, 'is_open': 5791234, 'business_stars_Average': 1097467, 'business_stars_Good': 5540490, 'review_count_Low': 5949264}
{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264}
k:  2
{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264, ('good review', 'is_open'): 2769878, ('good review', 'business_stars_Good'): 2950480, ('good review', 'review_count_Low'): 2721194, ('useful', 'is_open'): 1310564, ('useful', 'business_stars_Good'): 1214578, ('useful', 'review_count_Low'): 1437504, ('is_open', 'business_stars_Good'): 4645111, ('is_o

In [35]:
# Print the frequent itemsets
for key, value in combined_freq_itemsets.items():
    print(key, value)

('good review',) 3231627
('useful',) 1609831
('is_open',) 5791234
('business_stars_Average',) 1097467
('business_stars_Good',) 5540490
('review_count_Low',) 5949264
('good review', 'is_open') 2769878
('good review', 'business_stars_Good') 2950480
('good review', 'review_count_Low') 2721194
('useful', 'is_open') 1310564
('useful', 'business_stars_Good') 1214578
('useful', 'review_count_Low') 1437504
('is_open', 'business_stars_Good') 4645111
('is_open', 'review_count_Low') 4822703
('business_stars_Average', 'review_count_Low') 1046301
('business_stars_Good', 'review_count_Low') 4553295
('good review', 'is_open', 'business_stars_Good') 2538001
('good review', 'is_open', 'review_count_Low') 2289952
('good review', 'business_stars_Good', 'review_count_Low') 2449424
('useful', 'is_open', 'review_count_Low') 1152015
('useful', 'business_stars_Good', 'review_count_Low') 1054991
('is_open', 'business_stars_Good', 'review_count_Low') 3724826
('good review', 'is_open', 'business_stars_Good', 're

In [36]:
# Print the rules
index = 1
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
        index += 1

Rule  1 : antecedent -> consequent:  ['is_open'] ->  ['good review'] confidence:  0.47828804707252376
Rule  2 : antecedent -> consequent:  ['business_stars_Good'] ->  ['good review'] confidence:  0.5325305162539775
Rule  3 : antecedent -> consequent:  ['review_count_Low'] ->  ['good review'] confidence:  0.45740010865209546
Rule  4 : antecedent -> consequent:  ['is_open', 'business_stars_Good'] ->  ['good review'] confidence:  0.5463811306123794
Rule  5 : antecedent -> consequent:  ['is_open', 'review_count_Low'] ->  ['good review'] confidence:  0.47482749818929343
Rule  6 : antecedent -> consequent:  ['business_stars_Good', 'review_count_Low'] ->  ['good review'] confidence:  0.5379453780174577
Rule  7 : antecedent -> consequent:  ['is_open', 'business_stars_Good', 'review_count_Low'] ->  ['good review'] confidence:  0.5547459129634512


### Trying out different values of min support

In [36]:
file_location = 'dataset/yelp/big_review_business.csv'

In [144]:
data = pd.read_csv(file_location)
data.shape

MemoryError: Unable to allocate 20.2 GiB for an array with shape (11, 246990280) and data type int64

In [37]:
# Try out different values of min_support with target as good review and 241 million records
min_support_values = [20000000, 40000000, 80000000, 160000000, 320000000]
results_dict = {}
combined_freq_itemsets_dict = {}
for min_support in min_support_values:
    print(f'min_support: {min_support}')
    combined_freq_itemsets, rules, running_time, k = my_apriori(file_location, min_support, 0.4, target='good review')
    print_freq_itemsets(combined_freq_itemsets)
    print_rules(rules)
    print(f'Running time is {running_time} seconds\n')
    combined_freq_itemsets_dict[min_support] = [combined_freq_itemsets, k]
    results_dict[min_support] = [running_time, len(combined_freq_itemsets)]

min_support: 20000000
The file size is 5680776596 bytes, which is 5.680776596 GB
File size is more than 4 GB, reading the file in chunks to generate frequent 1 itemset

Size of one chunk in bytes is 176000128, which is 0.176000128 GB
Generating frequent 1-itemsets for chunk 1
Generating frequent 1-itemsets for chunk 2
Generating frequent 1-itemsets for chunk 3
Generating frequent 1-itemsets for chunk 4
Generating frequent 1-itemsets for chunk 5
Generating frequent 1-itemsets for chunk 6
Generating frequent 1-itemsets for chunk 7
Generating frequent 1-itemsets for chunk 8
Generating frequent 1-itemsets for chunk 9
Generating frequent 1-itemsets for chunk 10
Generating frequent 1-itemsets for chunk 11
Generating frequent 1-itemsets for chunk 12
Generating frequent 1-itemsets for chunk 13
Generating frequent 1-itemsets for chunk 14
Generating frequent 1-itemsets for chunk 15
Generating frequent 1-itemsets for chunk 16
Generating frequent 1-itemsets for chunk 17
Generating frequent 1-items

In [38]:
# Convert the dictionary to a dataframe
results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=['Running Time', 'Number of Frequent Itemsets'])
results_df.index.name = 'Minimum Support'
results_df.reset_index(inplace=True)
results_df

Unnamed: 0,Minimum Support,Running Time,Number of Frequent Itemsets
0,20000000,1220.098638,29
1,40000000,850.724166,20
2,80000000,624.068643,14
3,160000000,390.127074,6
4,320000000,214.517378,0


### Verify with mlextend Apriori library

In [None]:
# Verify with mlxtend
mlextend_result_dict = {}
for min_support in min_support_values:
    freq_items = apriori(data, min_support=min_support/len(data), use_colnames=True)
    print(f'min_support: {min_support}')
    print(len(freq_items))
    print(freq_items)

    mlextend_result_dict[min_support] = len(freq_items)

    if len(freq_items) != 0:
        rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
        print(rules)

In [None]:
# Convert the dictionary to a dataframe
mlextend_results_df = pd.DataFrame.from_dict(mlextend_result_dict, orient='index', columns=['Number of Frequent Itemsets'])
mlextend_results_df.index.name = 'Minimum Support'
mlextend_results_df.reset_index(inplace=True)
mlextend_results_df

### Verify with my implemented verification function

In [208]:
# Verify the frequent itemsets
for min_support in min_support_values:
    combined_freq_itemsets = combined_freq_itemsets_dict[min_support][0]
    k = combined_freq_itemsets_dict[min_support][1]
    print(f'min_support: {min_support}')
    verify_apriori_frequent_itemsets(file_location, min_support, combined_freq_itemsets, k)
    print('\n')

min_support: 20000000
The file size is 5680776596 bytes, which is 5.680776596 GB
File size is more than 4 GB, reading the file in chunks to verify frequent itemsets

Size of one chunk in bytes is 176000128, which is 0.176000128 GB
The support ratio is 0.0816358921735013

chunk 0
chunk 1
chunk 2
chunk 3
chunk 4
chunk 5
chunk 6
chunk 7
chunk 8
chunk 9
chunk 10
chunk 11
chunk 12
chunk 13
chunk 14
chunk 15
chunk 16
chunk 17
chunk 18
chunk 19
chunk 20
chunk 21
chunk 22
chunk 23
chunk 24
chunk 25
chunk 26
chunk 27
chunk 28
chunk 29
chunk 30
chunk 31
chunk 32
chunk 33
chunk 34
chunk 35
chunk 36
chunk 37
chunk 38
chunk 39
chunk 40
chunk 41
chunk 42
chunk 43
chunk 44
chunk 45
chunk 46
chunk 47
chunk 48
chunk 49
chunk 50
chunk 51
chunk 52
chunk 53
chunk 54
chunk 55
chunk 56
chunk 57
chunk 58
chunk 59
chunk 60
chunk 61
chunk 62
chunk 63
chunk 64
chunk 65
chunk 66
chunk 67
chunk 68
chunk 69
chunk 70
chunk 71
chunk 72
chunk 73
chunk 74
chunk 75
chunk 76
chunk 77
chunk 78
chunk 79
chunk 80
chunk 81
