<h1> Task 1: Implement the Apriori algorithm to mine frequent itemsets </h1>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import dask.dataframe as dd

In [1]:
# Make dummy data
data = pd.DataFrame(np.random.randint(0, 2, size=(10, 8)), columns=list('ABCDEFGH'))
data

Unnamed: 0,A,B,C,D,E,F,G,H
0,0,1,1,1,1,0,0,0
1,0,1,0,0,1,1,1,1
2,0,0,1,0,1,0,0,0
3,1,0,1,1,0,1,0,1
4,0,0,1,1,1,0,1,0
5,0,0,1,1,0,1,0,0
6,1,0,1,0,0,0,0,0
7,1,1,1,1,0,0,0,1
8,0,1,0,0,1,1,1,1
9,1,0,1,1,0,1,0,0


In [2]:
# Count the number of 0s and 1s in each column
# The number of 1s is the number of times each item appears
value_counts = data.apply(pd.value_counts)
value_counts

Unnamed: 0,A,B,C,D,E,F,G,H
0,6,6,2,4,5,5,7,6
1,4,4,8,6,5,5,3,4


In [3]:
value_counts['A'][1]

4

Using the lecture notes explanation of the Apriori Algorithm, we have 4 steps to do.
1. Candidate Generation
2. Candidate Pruning
3. Support Counting
4. Candidate Elimination

Sample code for 1 and 2 itemset

Define the min support

In [4]:
min_support = 4

In [100]:
# Combined dictionary of frequent itemsets
combined_freq_itemsets = {}

Generate F1 (frequent 1-itemsets)

In [101]:
# Get the frequent itemsets with count greater than or equal to min_support
columns = data.columns
frequent_itemsets = {}
for column in columns:
    # Append the itemset and its count to the dictionary if the count is greater than or equal to min_support
    if value_counts[column][1] >= min_support:
        frequent_itemsets[column] = value_counts[column][1]
        # frequent_itemsets.append((column, value_counts[column][1]))
        # data.drop(column, axis=1, inplace=True)

print(frequent_itemsets)

dummy_dict = frequent_itemsets.copy()
for key, item in dummy_dict.copy().items():
    dummy_dict[(tuple(key))] = dummy_dict.pop(key)
print(dummy_dict)
    
combined_freq_itemsets.update(dummy_dict)

{'A': 4, 'B': 4, 'C': 8, 'D': 6, 'E': 5, 'F': 5, 'H': 4}
{('A',): 4, ('B',): 4, ('C',): 8, ('D',): 6, ('E',): 5, ('F',): 5, ('H',): 4}


Step 1: Candidate Generation

In [102]:
# Generate all possible combinations of frequent itemsets with k+1 items
combinations = []
k = 1
combinations.append(list(itertools.combinations(frequent_itemsets.keys(), k+1)))

combinations

[[('A', 'B'),
  ('A', 'C'),
  ('A', 'D'),
  ('A', 'E'),
  ('A', 'F'),
  ('A', 'H'),
  ('B', 'C'),
  ('B', 'D'),
  ('B', 'E'),
  ('B', 'F'),
  ('B', 'H'),
  ('C', 'D'),
  ('C', 'E'),
  ('C', 'F'),
  ('C', 'H'),
  ('D', 'E'),
  ('D', 'F'),
  ('D', 'H'),
  ('E', 'F'),
  ('E', 'H'),
  ('F', 'H')]]

Step 2: Candidate Pruning (do not need to prune for 2 itemset as F1 items are all frequent)

Step 3: Support Counting

In [103]:
# Convert the list of lists of tuples to a list of tuples
combinations = combinations[0]

In [104]:
# Count the number of occurences of each combination in the data
combinations_count = {}
for combination in combinations:
    # Using groupby and size to count the number of occurences of each combination
    # Resetting the index to get the count of each combination as a column in the dataframe
    test = data.groupby(list(combination)).size().reset_index(name='count')
    
    # Append the combination and its count to the dictionary
    # The count of each combination is the last value in the count column
    # Moreover, we need to check whether the last row is a combination of 1s instead of 1s and 0s
    # If it is a combination of 1s, then we append the combination and its count to the dictionary
    # Otherwise, we do not append it to the dictionary
    if test[test.columns[0]].iloc[-1] == 1 and test[test.columns[1]].iloc[-1] == 1:
        combinations_count[combination] = test['count'].iloc[-1]

# print(test)
combinations_count

{('A', 'B'): 1,
 ('A', 'C'): 4,
 ('A', 'D'): 3,
 ('A', 'F'): 2,
 ('A', 'H'): 2,
 ('B', 'C'): 2,
 ('B', 'D'): 2,
 ('B', 'E'): 3,
 ('B', 'F'): 2,
 ('B', 'H'): 3,
 ('C', 'D'): 6,
 ('C', 'E'): 3,
 ('C', 'F'): 3,
 ('C', 'H'): 2,
 ('D', 'E'): 2,
 ('D', 'F'): 3,
 ('D', 'H'): 2,
 ('E', 'F'): 2,
 ('E', 'H'): 2,
 ('F', 'H'): 3}

In [105]:
# test.index.values[-1].count(1)
test1 = test
count = test1['count'].iloc[-1]
count

3

Step 4: Candidate Elimination

In [106]:
# Prune the combinations with count less than min_support
for combination in combinations_count.copy().keys():
    if combinations_count[combination] < min_support:
        combinations_count.pop(combination)

print(combinations_count)
combined_freq_itemsets.update(combinations_count)

{('A', 'C'): 4, ('C', 'D'): 6}


Candidate generation for 2 or more frequent itemsets

In [107]:
# Merge the combinations if the first k-1 items are the same
# and the last item is different
# This is done to generate combinations with k+1 items
# from combinations with k items

# Compare first k-1 items of each combination
# If they are the same, merge them
# If they are not the same, do not merge them
# The merged combinations are stored in a dictionary
merged_combinations = {}
# for combination1 in combinations_count.keys():
#     for combination2 in combinations_count.keys():
#         # Check if the first k-1 items are the same
#         if combination1[:-1] == combination2[:-1]:
#             # Check if the last item is different
#             if combination1[-1] != combination2[-1]:
#                 # Merge the combinations
#                 merged_combinations[combination1 + (combination2[-1],)] = 0

for index, combination1 in enumerate(combinations_count.keys()):
    for combination2 in list(combinations_count.keys())[index+1:]:
        # Check if the first k-1 items are the same
        if combination1[:-1] == combination2[:-1]:
            # Check if the last item is different
            if combination1[-1] != combination2[-1]:
                # Merge the combinations
                merged_combinations[combination1 + (combination2[-1],)] = 0


merged_combinations


{}

Support counting

In [108]:
# Count the number of occurences of each combination in the data
merged_combinations_count = {}
for combination in merged_combinations.keys():
    # Using groupby and size to count the number of occurences of each combination
    # Resetting the index to get the count of each combination as a column in the dataframe
    test = data.groupby(list(combination)).size().reset_index(name='count')

    # Append the combination and its count to the dictionary
    # The count of each combination is the last value in the count column
    # as the last row of the dataframe is when both items are present in one transaction in the original data dataframe
    merged_combinations_count[combination] = test['count'].iloc[-1]

# print(test)
merged_combinations_count

{}

In [109]:
# Prune the combinations with count less than min_support
for combination in merged_combinations_count.copy().keys():
    if merged_combinations_count[combination] < min_support:
        merged_combinations_count.pop(combination)

print(merged_combinations_count)
combined_freq_itemsets.update(merged_combinations_count)


{}


In [110]:
combined_freq_itemsets 

{('A',): 4,
 ('B',): 4,
 ('C',): 8,
 ('D',): 6,
 ('E',): 5,
 ('F',): 5,
 ('H',): 4,
 ('A', 'C'): 4,
 ('C', 'D'): 6}

Part 2: Rule generation

In [166]:
lis = ['Mineral Water', 'Ground Beef', 'Spagetti']

for i in range(1, len(lis)):  #  xrange will return the values 1,2,3,4 in this loop
    combinations = []
    combinations.append(list(itertools.combinations(lis, i)))
    if combinations:
        combinations = combinations[0]
        print(combinations)
        for combination in combinations:
            print(combination)

combinations

[('Mineral Water',), ('Ground Beef',), ('Spagetti',)]
('Mineral Water',)
('Ground Beef',)
('Spagetti',)
[('Mineral Water', 'Ground Beef'), ('Mineral Water', 'Spagetti'), ('Ground Beef', 'Spagetti')]
('Mineral Water', 'Ground Beef')
('Mineral Water', 'Spagetti')
('Ground Beef', 'Spagetti')


[('Mineral Water', 'Ground Beef'),
 ('Mineral Water', 'Spagetti'),
 ('Ground Beef', 'Spagetti')]

In [137]:
# Generate rules for frequent itemsets with k+1 items with min confidence
# The rules are generated by splitting the combination into two parts
min_confidence = 0.5
rules = {}
for key in combined_freq_itemsets.keys():
    combinations = []
    for i in range(1, len(key)):  #  xrange will return the values 1,2,3,4 in this loop
        combinations.append(list(itertools.combinations(key, i)))
        if combinations:
            combinations = combinations[0]
            for combination in combinations:
                antecedent = combination
                consequent = tuple(set(key) - set(combination))
                confidence = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent]
                if confidence >= min_confidence:
                    rules[(antecedent, consequent)] = confidence
    # Split the combination into two parts
    # The first part is the antecedent and the second part is the consequent
    # for i in range(1, len(key)):
    #     antecedent_1 = key[:i]
    #     consequent_1 = key[i:]

    #     antecedent_2 = key[i:]
    #     consequent_2 = key[:i]

    #     # Calculate the confidence of the rule
    #     # Confidence = support of combination / support of antecedent
    #     confidence_1 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_1]
    #     confidence_2 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_2]

    #     # Check if the confidence is greater than min_confidence
    #     if confidence_1 >= min_confidence:
    #         # Append the rule to the rules dictionary
    #         rules[(antecedent_1, consequent_1)] = confidence_1
        
    #     if confidence_2 >= min_confidence:
    #         # Append the rule to the rules dictionary
    #         rules[(antecedent_2, consequent_2)] = confidence_2

rules


{(('A',), ('C',)): 1.0,
 (('C',), ('A',)): 0.5,
 (('C',), ('D',)): 0.75,
 (('D',), ('C',)): 1.0}

In [17]:
# Prune smaller rules based on confidence of larger rules
# If larger rule has confidence less than min_confidence, smaller rules are pruned

# Sort the rules in descending order of confidence
sorted_rules = sorted(rules.items(), key=lambda x: x[1], reverse=True)
sorted_rules

# Prune the rules
pruned_rules = {}
for rule in sorted_rules:
    # Append the rule to the pruned_rules dictionary if it is not a subset of any rule in the dictionary
    if not any([set(rule[0]).issubset(set(pruned_rule[0])) for pruned_rule in pruned_rules.keys()]):
        pruned_rules[rule[0]] = rule[1]

pruned_rules

{(('A',), ('C',)): 1.0,
 (('A',), ('E',)): 1.0,
 (('A',), ('C', 'E')): 1.0,
 (('A', 'C'), ('E',)): 1.0,
 (('C',), ('D',)): 0.75}

In [292]:
# Apriori algorithm
# We combine the above steps to generate frequent itemsets with k+1 items
# from frequent itemsets with k items
# We continue this process until we get no frequent itemsets with k+1 items
# We then combine the frequent itemsets with k items to generate association rules
# We continue this process until we get no association rules
# We then combine the association rules to generate association rules with k+1 items


# Function to generate frequent itemsets with 1 item (initialisation)
def generate_freq_1_itemsets(data, min_support, combined_freq_itemsets):

    # Count the number of 0s and 1s in each column
    # The number of 1s is the number of times each item appears
    value_counts = data.apply(pd.value_counts)

    # Get the frequent itemsets with count greater than or equal to min_support
    columns = data.columns
    frequent_itemsets = {}
    for column in columns:
        # Append the itemset and its count to the dictionary if the count is greater than or equal to min_support
        if value_counts[column][1] >= min_support:
            frequent_itemsets[column] = value_counts[column][1]
            # frequent_itemsets.append((column, value_counts[column][1]))
            # data.drop(column, axis=1, inplace=True)

    dummy_dict = frequent_itemsets.copy()
    for key, item in dummy_dict.copy().items():
        # For dummy data
        # dummy_dict[(tuple(key))] = dummy_dict.pop(key)
        # For real data
        dummy_dict[(key,)] = dummy_dict.pop(key)
    print(dummy_dict)

    combined_freq_itemsets.update(dummy_dict)

    print(frequent_itemsets)
    return frequent_itemsets


# Function to generate frequent itemsets with k+1 items
def generate_k_plus_1_candidate_itemsets(frequent_itemsets, k):
    # Generate all possible combinations of frequent itemsets with k+1 items

    # If k = 1, we do not need to merge the combinations
    if k == 1:
        combinations = []
        combinations.append(list(itertools.combinations(frequent_itemsets.keys(), k+1)))
        return combinations
    
    else:
        # Merge the combinations if the first k-1 items are the same
        # and the last item is different
        # This is done to generate combinations with k+1 items
        # from combinations with k items
        # Compare first k-1 items of each combination
        # If they are the same, merge them
        # If they are not the same, do not merge them
        # The merged combinations are stored in a dictionary
        merged_combinations = {}
        

        for index, combination1 in enumerate(frequent_itemsets.keys()):
            for combination2 in list(frequent_itemsets.keys())[index+1:]:
                # Check if the first k-1 items are the same
                if combination1[:-1] == combination2[:-1]:
                    # Check if the last item is different
                    if combination1[-1] != combination2[-1]:
                        # Merge the combinations
                        merged_combinations[combination1 + (combination2[-1],)] = 0

    
        return merged_combinations

# Function to count the number of occurences of each combination in the candidate itemsets
def k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data):
    # If k = 1, we need to convert the list of lists of tuples to a list of tuples
    if k == 1:
        k_plus_1_candidate_itemsets = k_plus_1_candidate_itemsets[0]

    # Count the number of occurences of each combination in the data
    candidate_itemsets_count = {}
    for candidate_itemset in k_plus_1_candidate_itemsets:
        # Using groupby and size to count the number of occurences of each combination
        # Resetting the index to get the count of each combination as a column in the dataframe
        test = data.groupby(list(candidate_itemset)).size().reset_index(name='count')
        # print(test)
        # Append the combination and its count to the dictionary
        # The count of each combination is the last value in the count column
        # Moreover, we need to check whether the last row is a combination of 1s instead of 1s and 0s
        # If it is a combination of 1s, then we append the combination and its count to the dictionary
        # Otherwise, we do not append it to the dictionary
        # if test[test.columns[0]].iloc[-1] == 1 and test[test.columns[1]].iloc[-1] == 1:
        #     candidate_itemsets_count[candidate_itemset] = test['count'].iloc[-1]
        num_ones = 0
        for i in range(len(test.columns)-1):
            if test[test.columns[i]].iloc[-1] != 1:
                break
            else:
                num_ones += 1
                continue
            
        if num_ones == len(test.columns)-1:
            candidate_itemsets_count[candidate_itemset] = test['count'].iloc[-1]

    return candidate_itemsets_count


def candidate_elimination(combinations_count, min_support, combined_freq_itemsets):
    
    # Prune the combinations with count less than min_support
    for combination in combinations_count.copy().keys():
        if combinations_count[combination] < min_support:
            combinations_count.pop(combination)
    
    combined_freq_itemsets.update(combinations_count)
    return combinations_count

def generate_rules(combined_freq_itemsets, min_confidence, target):
    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = {}
    for key in combined_freq_itemsets.keys():
        
        for i in range(1, len(key)+1):  # range will return the values 1,2,3,4 in this loop
            combinations = []
            combinations.append(list(itertools.combinations(key, i)))
            # print(target, key, combinations)
            if combinations:
                combinations = combinations[0]
                for combination in combinations:
                    
                    # Convert the combination to a tuple if it is a string
                    if type(combination) == str:
                        combination = (combination,)
                    
                    # Check if the target is in the combination
                    if target != None:
                        # Continue to the next combination if the target is not in the combination
                        if target not in combination or len(combination) == 1:
                            continue
                        
                        # Split the combination into two parts
                        # The first part is the antecedent and the second part is the consequent
                        # The antecedent is the combination without the target
                        # The consequent is the target

                        temp_target = (target,)
                        
                        # In order to keep the correct order of the items in the combination
                        difference = set(combination) - set(temp_target)
                        antecedent = tuple(item for item in combination if item in difference)
                        consequent = temp_target
                        
                        confidence = combined_freq_itemsets[combination] / combined_freq_itemsets[antecedent]

                    # If the target is None, then we do not need to assign the target to the consequent
                    else:
                        antecedent = combination
                        difference = set(key) - set(combination)
                        consequent = tuple(item for item in key if item in difference)
                        if consequent == ():
                            continue
                    # print(target, combination)
                    # print("Combinations is ", combinations, "Combination is: ", combination, "Antecedent is: ", antecedent, "Consequent is: ", consequent)
                        confidence = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent]
                        
                    print('key: ', key, 'antecedent: ', antecedent, 'consequent: ', consequent, 'confidence: ', confidence)
                    if confidence >= min_confidence:
                        rules[(antecedent, consequent)] = confidence
                        
        # Split the combination into two parts
        # The first part is the antecedent and the second part is the consequent
        # for i in range(1, len(key)):
        #     antecedent_1 = key[:i]
        #     consequent_1 = key[i:]

        #     antecedent_2 = key[i:]
        #     consequent_2 = key[:i]
        #     # Calculate the confidence of the rule
        #     # Confidence = support of combination / support of antecedent
        #     confidence_1 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_1]
        #     confidence_2 = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent_2]

        #     print(antecedent_1, consequent_1, confidence_1)
        #     print(antecedent_2, consequent_2, confidence_2)
        #     # Check if the confidence is greater than min_confidence
        #     if confidence_1 >= min_confidence:
        #         # Append the rule to the rules dictionary
        #         rules[(antecedent_1, consequent_1)] = confidence_1

        #     if confidence_2 >= min_confidence:
        #         # Append the rule to the rules dictionary
        #         rules[(antecedent_2, consequent_2)] = confidence_2
                
    return rules
    

In [316]:
from collections import defaultdict
def my_apriori(file_location, min_support, min_confidence, target=None):
    
    # Combined dictionary of frequent itemsets
    combined_freq_itemsets = {}
    frequent_1_itemsets = defaultdict(int)
    
    # Check the size of the file
    # If the file size is less than 1 GB, we can read the file directly
    # Otherwise, we need to read the file in chunks
    import os
    file_size = os.path.getsize(file_location)
    file_size_GB = file_size/1000000000
    print(f'The file size is {file_size} bytes, which is {file_size_GB} GB')
    if file_size_GB < 4:
        print('File size is less than 4 GB, reading the file directly to generate frequent 1 itemset')
        data = pd.read_csv(file_location)
        frequent_1_itemsets = generate_freq_1_itemsets(data, min_support, combined_freq_itemsets)

    else:
        print('File size is more than 4 GB, reading the file in chunks to generate frequent 1 itemset')
        # Read the data in chunks of 10000 rows each to ensure that the data fits in memory
        data = pd.read_csv(file_location, chunksize=10000, iterator=True)
        columns = data.get_chunk(1).columns

        # Initalise the dataframe with 0.0 and 1.0 as rows
        index = [0.0, 1.0]
        temp_df = pd.DataFrame(index=index, columns=columns)

        # Change all columns to string type
        temp_df = temp_df.astype(float)
        print(temp_df.columns.tolist())

        for chunk in data:
            chunk.fillna(0, inplace=True)
            temp_value_counts = chunk.apply(pd.value_counts)
            # Change all columns to string type
            temp_value_counts = temp_value_counts.astype(float)
            
            # Sum the value counts of each chunk to get the total value counts
            temp_df = temp_df.add(temp_value_counts, fill_value=0)
            # print('temp_df', temp_df)

        # Get the frequent itemsets with count greater than or equal to min_support
        # Get the columns and values of the dataframe with 1.0 and more than min_support
        temp_df = temp_df[temp_df.columns[temp_df.iloc[1] >= min_support]]
        # Drop the row with 0.0
        temp_df  = temp_df.drop(index=0.0, axis=1)
        frequent_1_itemsets = temp_df.iloc[0].to_dict()
        print(frequent_1_itemsets)
        dummy_dict = frequent_1_itemsets.copy()
        for key, item in frequent_1_itemsets.copy().items():
            # For dummy data
            # dummy_dict[(tuple(key))] = dummy_dict.pop(key)
            # For real data
            dummy_dict[(key,)] = dummy_dict.pop(key)
        print(dummy_dict)

        combined_freq_itemsets.update(dummy_dict)
        print(combined_freq_itemsets)
    

    k_plus_1_candidate_itemsets = None
    k = 1

    while True:
        # print(k)
        if k == 1:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(frequent_1_itemsets, k)
        else:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(k_plus_1_frequent_itemsets, k)
        # print(combined_freq_itemsets)
        print(k_plus_1_candidate_itemsets)
        k_plus_1_itemsets_support_count = defaultdict(int)
        k_plus_1_frequent_itemsets = defaultdict(int)

        if file_size_GB < 4:
            k_plus_1_itemsets_support_count = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data)
        
        else:
            print('File size is more than 4 GB, reading the file in chunks to generate k+1 candiate itemsets...')
            # Need to read the data again as the data is read in chunks
            data = pd.read_csv(file_location, chunksize=100, iterator=True)
            for chunk in data:
                chunk.fillna(0, inplace=True)
                # print(chunk)
                temp_dict = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, chunk)
                for key, value in temp_dict.items():
                    k_plus_1_itemsets_support_count[key] += value
                print(k_plus_1_itemsets_support_count)
        
        k_plus_1_frequent_itemsets = candidate_elimination(k_plus_1_itemsets_support_count, min_support, combined_freq_itemsets)
        # print(k_plus_1_frequent_itemsets)
        k += 1
        print('k: ', k)
        # If there are no frequent itemsets with k+1 items, break
        if len(k_plus_1_frequent_itemsets) == 0:
            break

    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = generate_rules(combined_freq_itemsets, min_confidence, target)
    
    return combined_freq_itemsets, rules


In [317]:
combined_freq_itemsets, rules = my_apriori(file_location, 100, 0.4)

The file size is 1808993 bytes, which is 0.001808993 GB
File size is less than 4 GB, reading the file directly to generate frequent 1 itemset
{('shrimp',): 536, ('burgers',): 654, ('turkey',): 469, ('mineral water',): 1788, ('low fat yogurt',): 574, ('whole wheat pasta',): 221, ('soup',): 379, ('frozen vegetables',): 715, ('french fries',): 1282, ('eggs',): 1348, ('cookies',): 603, ('spaghetti',): 1306, ('meatballs',): 157, ('red wine',): 211, ('rice',): 141, ('parmesan cheese',): 149, ('ground beef',): 737, ('herb & pepper',): 371, ('energy bar',): 203, ('fresh tuna',): 167, ('escalope',): 595, ('avocado',): 250, ('tomato sauce',): 106, ('energy drink',): 200, ('chocolate',): 1229, ('grated cheese',): 393, ('yogurt cake',): 205, ('mint',): 131, ('champagne',): 351, ('ham',): 199, ('muffins',): 181, ('french wine',): 169, ('chicken',): 450, ('pasta',): 118, ('tomatoes',): 513, ('pancakes',): 713, ('frozen smoothie',): 475, ('carrots',): 115, ('butter',): 226, ('light mayo',): 204, ('pe

In [139]:
combined_freq_itemsets, rules = my_apriori(data, 4, 0.5)

{('A',): 4, ('B',): 4, ('C',): 8, ('D',): 6, ('E',): 5, ('F',): 5, ('H',): 4}
{'A': 4, 'B': 4, 'C': 8, 'D': 6, 'E': 5, 'F': 5, 'H': 4}
{('A',): 4, ('B',): 4, ('C',): 8, ('D',): 6, ('E',): 5, ('F',): 5, ('H',): 4}
k:  2
{('A',): 4, ('B',): 4, ('C',): 8, ('D',): 6, ('E',): 5, ('F',): 5, ('H',): 4, ('A', 'C'): 4, ('C', 'D'): 6}
k:  3


In [140]:
print('combined frequent itemsets: ', combined_freq_itemsets)

combined frequent itemsets:  {('A',): 4, ('B',): 4, ('C',): 8, ('D',): 6, ('E',): 5, ('F',): 5, ('H',): 4, ('A', 'C'): 4, ('C', 'D'): 6}


In [141]:
freq_itemsets_df = pd.DataFrame.from_dict(combined_freq_itemsets, orient='index', columns=['support'])
freq_itemsets_df

Unnamed: 0,support
"(A,)",4
"(B,)",4
"(C,)",8
"(D,)",6
"(E,)",5
"(F,)",5
"(H,)",4
"(A, C)",4
"(C, D)",6


In [142]:
print('rules: ', rules)
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('antecedent: ', list(sum(antecedent, ())), '-> consequent: ', list(sum(consequent, ())), 'confidence: ', item)

rules:  {(('A',), ('C',)): 1.0, (('C',), ('A',)): 0.5, (('C',), ('D',)): 0.75, (('D',), ('C',)): 1.0}
antecedent:  ['A'] -> consequent:  ['C'] confidence:  1.0
antecedent:  ['C'] -> consequent:  ['A'] confidence:  0.5
antecedent:  ['C'] -> consequent:  ['D'] confidence:  0.75
antecedent:  ['D'] -> consequent:  ['C'] confidence:  1.0


<h3> Showing the results of my code is correct by using the actual official Apriori algorithm library extension </h3>

In [24]:
!pip install mlxtend

Collecting mlxtend
  Obtaining dependency information for mlxtend from https://files.pythonhosted.org/packages/73/da/d5d77a9a7a135c948dbf8d3b873655b105a152d69e590150c83d23c3d070/mlxtend-0.23.0-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.0-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -------- ------------------------------- 0.3/1.4 MB 9.6 MB/s eta 0:00:01
   ---------------- ----------------------- 0.6/1.4 MB 7.4 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.4 MB 6.6 MB/s eta 0:00:01
   ----------------------------- ---------- 1.1/1.4 MB 6.1 MB/s eta 0:00:01
   -------------------------------------- - 1.4/1.4 MB 6.4 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 6.2 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.0



[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: C:\Users\tengwei\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [143]:
from mlxtend.frequent_patterns import apriori, association_rules

freq_items = apriori(data, min_support=0.4, use_colnames=True)
freq_items



Unnamed: 0,support,itemsets
0,0.4,(A)
1,0.4,(B)
2,0.8,(C)
3,0.6,(D)
4,0.5,(E)
5,0.5,(F)
6,0.4,(H)
7,0.4,"(C, A)"
8,0.6,"(C, D)"


In [144]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(C),(A),0.8,0.4,0.4,0.5,1.25,0.08,1.2,1.0
1,(A),(C),0.4,0.8,0.4,1.0,1.25,0.08,inf,0.333333
2,(C),(D),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
3,(D),(C),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5


<h1> Task 2: Use 3 datasets to run Apriori algorithm with different min-support thresholds </h1>

<h2> 1. Grocery store dataset </h2>

In [223]:
df = pd.read_csv('dataset/grocery_store/Market_Basket_Optimisation.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [224]:
# Data CLeaning
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [225]:
# Get the unique items in the dataset
unique_items = pd.unique(df.values.ravel('K'))
unique_items

array(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries', 'eggs', 'cookies', 'spaghetti', 'meatballs',
       'red wine', 'rice', 'parmesan cheese', 'ground beef',
       'sparkling water', 'herb & pepper', 'pickles', 'energy bar',
       'fresh tuna', 'escalope', 'avocado', 'tomato sauce',
       'clothes accessories', 'energy drink', 'chocolate',
       'grated cheese', 'yogurt cake', 'mint', 'asparagus', 'champagne',
       'ham', 'muffins', 'french wine', 'chicken', 'pasta', 'tomatoes',
       'pancakes', 'frozen smoothie', 'carrots', 'yams', 'shallot',
       'butter', 'light mayo', 'pepper', 'candy bars', 'cooking oil',
       'milk', 'green tea', 'bug spray', 'oil', 'olive oil', 'salmon',
       'cake', 'almonds', 'salt', 'strong cheese', 'hot dogs', 'pet food',
       'whole wheat rice', 'antioxydant juice', 'honey', 'sandwich',
       'salad', 'magazines', 'protein bar', '

In [226]:
# Set the unique items as the column names
transactions_data = pd.DataFrame(columns=unique_items)
transactions_data.drop(columns= 0, inplace=True)
transactions_data

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus


In [227]:
# Iterate through the supermarket dataset
# Each row is a transaction
# If the item is present in the transaction, set the value as 1

for i in range(0, len(df)):
    transaction = df.iloc[i, :].values
    # Remove the 0s from the transaction
    transaction = transaction[transaction != 0]

    # Set the value as 1 if the item is present in the transaction
    for item in transaction:
        transactions_data.at[i, item] = 1

In [228]:
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1.0,,,,1.0,1.0,,,,,...,,,,,,,,,,
1,,1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,1.0,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,


In [229]:
transactions_data.fillna(0, inplace=True)
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [230]:
transactions_data.columns = transactions_data.columns.str.strip()
transactions_data.columns

Index(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries',
       ...
       'ketchup', 'cream', 'hand protein bar', 'body spray', 'oatmeal',
       'zucchini', 'water spray', 'tea', 'napkins', 'asparagus'],
      dtype='object', length=120)

In [231]:
# Save the transactions_data dataframe as a csv file
transactions_data.to_csv('dataset/grocery_store/Market_Basket_Cleaned.csv', index=False)

In [232]:
# Location of the cleaned dataset
file_location = 'dataset/grocery_store/Market_Basket_Cleaned.csv'

In [19]:
transactions_data = pd.read_csv(file_location)

In [22]:
transactions_data.fillna(0, inplace=True)
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# combined_freq_itemsets, rules = my_apriori(transactions_data, 100, 0.4)
combined_freq_itemsets, rules = my_apriori(file_location, 100, 0.4)

Dask DataFrame Structure:
                 1.0    0.0
npartitions=1              
               int64  int64
                 ...    ...
Dask Name: apply, 3 graph layers


KeyError: 'shrimp'

In [26]:
for key, value in combined_freq_itemsets.items():
    print(key, value)

('shrimp',) 536
('burgers',) 654
('turkey',) 469
('mineral water',) 1788
('low fat yogurt',) 574
('whole wheat pasta',) 221
('soup',) 379
('frozen vegetables',) 715
('french fries',) 1282
('eggs',) 1348
('cookies',) 603
('spaghetti',) 1306
('meatballs',) 157
('red wine',) 211
('rice',) 141
('parmesan cheese',) 149
('ground beef',) 737
('herb & pepper',) 371
('energy bar',) 203
('fresh tuna',) 167
('escalope',) 595
('avocado',) 250
('tomato sauce',) 106
('energy drink',) 200
('chocolate',) 1229
('grated cheese',) 393
('yogurt cake',) 205
('mint',) 131
('champagne',) 351
('ham',) 199
('muffins',) 181
('french wine',) 169
('chicken',) 450
('pasta',) 118
('tomatoes',) 513
('pancakes',) 713
('frozen smoothie',) 475
('carrots',) 115
('butter',) 226
('light mayo',) 204
('pepper',) 199
('cooking oil',) 383
('milk',) 972
('green tea',) 991
('oil',) 173
('olive oil',) 494
('salmon',) 319
('cake',) 608
('almonds',) 153
('hot dogs',) 243
('whole wheat rice',) 439
('honey',) 356
('protein bar',) 13

In [27]:
index = 1
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
        index += 1

Rule  1 : antecedent -> consequent:  ['soup'] ->  ['mineral water'] confidence:  0.45646437994722955
Rule  2 : antecedent -> consequent:  ['ground beef'] ->  ['mineral water'] confidence:  0.41655359565807326
Rule  3 : antecedent -> consequent:  ['olive oil'] ->  ['mineral water'] confidence:  0.4190283400809717
Rule  4 : antecedent -> consequent:  ['salmon'] ->  ['mineral water'] confidence:  0.4012539184952978
Rule  5 : antecedent -> consequent:  ['eggs', 'chocolate'] ->  ['mineral water'] confidence:  0.40562248995983935
Rule  6 : antecedent -> consequent:  ['mineral water', 'ground beef'] ->  ['spaghetti'] confidence:  0.4169381107491857
Rule  7 : antecedent -> consequent:  ['spaghetti', 'ground beef'] ->  ['mineral water'] confidence:  0.43537414965986393
Rule  8 : antecedent -> consequent:  ['spaghetti', 'chocolate'] ->  ['mineral water'] confidence:  0.40476190476190477
Rule  9 : antecedent -> consequent:  ['spaghetti', 'milk'] ->  ['mineral water'] confidence:  0.44360902255639

<h3> Verify with official Apriori library </h3>

In [96]:
transactions_data.shape

(7501, 120)

In [173]:
freq_items = apriori(transactions_data, min_support=0.0133, use_colnames=True)
freq_items



Unnamed: 0,support,itemsets
0,0.071457,(shrimp)
1,0.087188,(burgers)
2,0.062525,(turkey)
3,0.238368,(mineral water)
4,0.076523,(low fat yogurt)
...,...,...
182,0.013465,"(eggs, mineral water, chocolate)"
183,0.017064,"(ground beef, spaghetti, mineral water)"
184,0.015865,"(spaghetti, mineral water, chocolate)"
185,0.015731,"(spaghetti, mineral water, milk)"


In [174]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
1,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
2,(olive oil),(mineral water),0.065858,0.238368,0.027596,0.419028,1.757904,0.011898,1.310962,0.461536
3,(salmon),(mineral water),0.042528,0.238368,0.017064,0.401254,1.683336,0.006927,1.272045,0.423972
4,"(eggs, chocolate)",(mineral water),0.033196,0.238368,0.013465,0.405622,1.701663,0.005552,1.281394,0.426498
5,"(ground beef, spaghetti)",(mineral water),0.039195,0.238368,0.017064,0.435374,1.826477,0.007722,1.348914,0.470957
6,"(ground beef, mineral water)",(spaghetti),0.040928,0.17411,0.017064,0.416938,2.394681,0.009938,1.41647,0.607262
7,"(spaghetti, chocolate)",(mineral water),0.039195,0.238368,0.015865,0.404762,1.698053,0.006522,1.279541,0.42786
8,"(spaghetti, milk)",(mineral water),0.035462,0.238368,0.015731,0.443609,1.861024,0.007278,1.368879,0.479672
9,"(milk, chocolate)",(mineral water),0.032129,0.238368,0.013998,0.435685,1.82778,0.00634,1.349656,0.467922


<h2> 2. Titanic dataset </h2>

In [23]:
survival_df = pd.read_csv('dataset/titanic/gender_submission.csv')
survival_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [76]:
train_titanic_df = pd.read_csv('dataset/titanic/train.csv')
test_titanic_df = pd.read_csv('dataset/titanic/test.csv')

In [77]:
train_titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [78]:
# Drop the columns that are not required
train_titanic_df.drop(columns=['PassengerId', 'Name','SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin'], inplace=True)
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,22.0,S
1,1,1,female,38.0,C
2,1,3,female,26.0,S
3,1,1,female,35.0,S
4,0,3,male,35.0,S


In [79]:
# Categorise the Age column
# Age 21 and below is a Child
# Age between 21 and 55 is an Adult
# Age above 55 is an Elderly
train_titanic_df['Age'] = pd.cut(train_titanic_df['Age'], bins=[0, 21, 55, 80], labels=['Child', 'Adult', 'Elderly'])
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,Adult,S
1,1,1,female,Adult,C
2,1,3,female,Adult,S
3,1,1,female,Adult,S
4,0,3,male,Adult,S


In [80]:
# Convert into one hot encoding
train_titanic_df = pd.get_dummies(train_titanic_df)
train_titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex_female,Sex_male,Age_Child,Age_Adult,Age_Elderly,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,1,0,1,0,0,0,1
1,1,1,1,0,0,1,0,1,0,0
2,1,3,1,0,0,1,0,0,0,1
3,1,1,1,0,0,1,0,0,0,1
4,0,3,0,1,0,1,0,0,0,1


In [81]:
# Converet Pclass into one hot encoding
train_titanic_df = pd.get_dummies(train_titanic_df, columns=['Pclass'])
train_titanic_df.head()


Unnamed: 0,Survived,Sex_female,Sex_male,Age_Child,Age_Adult,Age_Elderly,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0,0,1,0,1,0,0,0,1,0,0,1
1,1,1,0,0,1,0,1,0,0,1,0,0
2,1,1,0,0,1,0,0,0,1,0,0,1
3,1,1,0,0,1,0,0,0,1,1,0,0
4,0,0,1,0,1,0,0,0,1,0,0,1


In [45]:
train_titanic_df.groupby(['Sex_male', 'Sex_female', 'Survived']).size().reset_index(name='count')

Unnamed: 0,Sex_male,Sex_female,Survived,count
0,0,1,0,81
1,0,1,1,233
2,1,0,0,468
3,1,0,1,109


In [82]:
len(train_titanic_df.columns)

12

In [112]:
combination = ('Survived', 'Sex_female')
target = ('Survived',)
tuple(set(combination) - set(target))

('Sex_female',)

In [177]:
train_titanic_df.shape

(891, 12)

In [211]:
# Run the apriori algorithm
combined_freq_itemsets, rules = my_apriori(train_titanic_df, 30, 0.4, target='Survived')

{('Survived',): 342, ('Sex_female',): 314, ('Sex_male',): 577, ('Age_Child',): 204, ('Age_Adult',): 470, ('Age_Elderly',): 40, ('Embarked_C',): 168, ('Embarked_Q',): 77, ('Embarked_S',): 644, ('Pclass_1',): 216, ('Pclass_2',): 184, ('Pclass_3',): 491}
{'Survived': 342, 'Sex_female': 314, 'Sex_male': 577, 'Age_Child': 204, 'Age_Adult': 470, 'Age_Elderly': 40, 'Embarked_C': 168, 'Embarked_Q': 77, 'Embarked_S': 644, 'Pclass_1': 216, 'Pclass_2': 184, 'Pclass_3': 491}
{('Survived',): 342, ('Sex_female',): 314, ('Sex_male',): 577, ('Age_Child',): 204, ('Age_Adult',): 470, ('Age_Elderly',): 40, ('Embarked_C',): 168, ('Embarked_Q',): 77, ('Embarked_S',): 644, ('Pclass_1',): 216, ('Pclass_2',): 184, ('Pclass_3',): 491}
k:  2
{('Survived',): 342, ('Sex_female',): 314, ('Sex_male',): 577, ('Age_Child',): 204, ('Age_Adult',): 470, ('Age_Elderly',): 40, ('Embarked_C',): 168, ('Embarked_Q',): 77, ('Embarked_S',): 644, ('Pclass_1',): 216, ('Pclass_2',): 184, ('Pclass_3',): 491, ('Survived', 'Sex_fema

In [212]:
for key, value in combined_freq_itemsets.items():
    print(key, value)

('Survived',) 342
('Sex_female',) 314
('Sex_male',) 577
('Age_Child',) 204
('Age_Adult',) 470
('Age_Elderly',) 40
('Embarked_C',) 168
('Embarked_Q',) 77
('Embarked_S',) 644
('Pclass_1',) 216
('Pclass_2',) 184
('Pclass_3',) 491
('Survived', 'Sex_female') 233
('Survived', 'Sex_male') 109
('Survived', 'Age_Child') 87
('Survived', 'Age_Adult') 191
('Survived', 'Embarked_C') 93
('Survived', 'Embarked_Q') 30
('Survived', 'Embarked_S') 217
('Survived', 'Pclass_1') 136
('Survived', 'Pclass_2') 87
('Survived', 'Pclass_3') 119
('Sex_female', 'Age_Child') 84
('Sex_female', 'Age_Adult') 168
('Sex_female', 'Embarked_C') 73
('Sex_female', 'Embarked_Q') 36
('Sex_female', 'Embarked_S') 203
('Sex_female', 'Pclass_1') 94
('Sex_female', 'Pclass_2') 76
('Sex_female', 'Pclass_3') 144
('Sex_male', 'Age_Child') 120
('Sex_male', 'Age_Adult') 302
('Sex_male', 'Age_Elderly') 31
('Sex_male', 'Embarked_C') 95
('Sex_male', 'Embarked_Q') 41
('Sex_male', 'Embarked_S') 441
('Sex_male', 'Pclass_1') 122
('Sex_male', 'P

In [213]:
index = 1
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
        index += 1

Rule  1 : antecedent -> consequent:  ['Sex_female'] ->  ['Survived'] confidence:  0.7420382165605095
Rule  2 : antecedent -> consequent:  ['Age_Child'] ->  ['Survived'] confidence:  0.4264705882352941
Rule  3 : antecedent -> consequent:  ['Age_Adult'] ->  ['Survived'] confidence:  0.40638297872340423
Rule  4 : antecedent -> consequent:  ['Embarked_C'] ->  ['Survived'] confidence:  0.5535714285714286
Rule  5 : antecedent -> consequent:  ['Pclass_1'] ->  ['Survived'] confidence:  0.6296296296296297
Rule  6 : antecedent -> consequent:  ['Pclass_2'] ->  ['Survived'] confidence:  0.47282608695652173
Rule  7 : antecedent -> consequent:  ['Sex_female', 'Age_Child'] ->  ['Survived'] confidence:  0.6785714285714286
Rule  8 : antecedent -> consequent:  ['Sex_female', 'Age_Adult'] ->  ['Survived'] confidence:  0.7857142857142857
Rule  9 : antecedent -> consequent:  ['Sex_female', 'Embarked_C'] ->  ['Survived'] confidence:  0.8767123287671232
Rule  10 : antecedent -> consequent:  ['Sex_female', 'E

<h3> Verify with official Apriori library </h3>

In [216]:
from mlxtend.frequent_patterns import apriori, association_rules
freq_items = apriori(train_titanic_df, min_support=0.0337, use_colnames=True)
freq_items



Unnamed: 0,support,itemsets
0,0.383838,(Survived)
1,0.352413,(Sex_female)
2,0.647587,(Sex_male)
3,0.228956,(Age_Child)
4,0.527497,(Age_Adult)
...,...,...
126,0.085297,"(Pclass_3, Sex_male, Embarked_S, Age_Child)"
127,0.052750,"(Sex_male, Pclass_1, Age_Adult, Embarked_S)"
128,0.071829,"(Pclass_2, Sex_male, Age_Adult, Embarked_S)"
129,0.150393,"(Age_Adult, Pclass_3, Sex_male, Embarked_S)"


In [217]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
rules 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Survived),(Sex_female),0.383838,0.352413,0.261504,0.681287,1.933205,0.126234,2.031878,0.783438
1,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.745420
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
4,(Survived),(Age_Adult),0.383838,0.527497,0.214366,0.558480,1.058735,0.011892,1.070172,0.090035
...,...,...,...,...,...,...,...,...,...,...
278,"(Pclass_2, Survived, Sex_female)","(Age_Adult, Embarked_S)",0.078563,0.415264,0.051627,0.657143,1.582471,0.019003,1.705481,0.399460
279,"(Pclass_2, Age_Adult, Survived)","(Embarked_S, Sex_female)",0.061728,0.227834,0.051627,0.836364,3.670936,0.037564,4.718793,0.775458
280,"(Pclass_2, Age_Adult, Sex_female)","(Embarked_S, Survived)",0.062851,0.243547,0.051627,0.821429,3.372778,0.036320,4.236139,0.750690
281,"(Pclass_2, Survived)","(Age_Adult, Embarked_S, Sex_female)",0.097643,0.140292,0.051627,0.528736,3.768828,0.037929,1.824259,0.814163


In [188]:
rules['consequents'][2] == {'Survived'}

True

In [220]:
rules[rules['consequents'] == {'Survived'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(Sex_female),(Survived),0.352413,0.383838,0.261504,0.742038,1.933205,0.126234,2.388577,0.74542
2,(Age_Child),(Survived),0.228956,0.383838,0.097643,0.426471,1.111068,0.009761,1.074333,0.129649
3,(Age_Adult),(Survived),0.527497,0.383838,0.214366,0.406383,1.058735,0.011892,1.037978,0.117409
5,(Embarked_C),(Survived),0.188552,0.383838,0.104377,0.553571,1.442199,0.032004,1.380202,0.377861
7,(Pclass_1),(Survived),0.242424,0.383838,0.152637,0.62963,1.640351,0.059586,1.663636,0.515294
8,(Pclass_2),(Survived),0.20651,0.383838,0.097643,0.472826,1.231836,0.018377,1.168801,0.237185
45,"(Age_Child, Sex_female)",(Survived),0.094276,0.383838,0.063973,0.678571,1.767857,0.027786,1.916947,0.479554
47,"(Age_Adult, Sex_female)",(Survived),0.188552,0.383838,0.148148,0.785714,2.046992,0.075775,2.875421,0.630328
51,"(Embarked_C, Sex_female)",(Survived),0.08193,0.383838,0.071829,0.876712,2.284066,0.040381,4.997755,0.612355
53,"(Embarked_S, Sex_female)",(Survived),0.227834,0.383838,0.157127,0.689655,1.796733,0.069675,1.98541,0.574273


<h2> 3. Yelp dataset </h2>

In [2]:
import json
import pandas as pd

In [None]:
data_file = open("dataset/yelp/yelp_academic_dataset_user.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
user_df = pd.DataFrame(data)
data_file.close()


In [18]:
user_df.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [23]:
user_df.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')

In [5]:
data_file = open("dataset/yelp/yelp_academic_dataset_review.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
review_df = pd.DataFrame(data)
data_file.close()

In [6]:
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [24]:
review_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [23]:
review_df['cool'].value_counts()

cool
0      5377964
1      1016736
2       296999
3       114763
4        56609
        ...   
133          1
205          1
306          1
370          1
304          1
Name: count, Length: 210, dtype: int64

In [11]:
review_df.drop(columns=['review_id', 'user_id', 'text', 'date'], inplace=True)

In [7]:
data_file = open("dataset/yelp/yelp_academic_dataset_business.json", encoding="utf8")
data = []
for line in data_file:
    data.append(json.loads(line))
business_df = pd.DataFrame(data)
data_file.close()

In [8]:
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
business_df.drop(columns=['address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'attributes', 'hours'], inplace=True)

In [13]:
# rename stars column to business_stars
business_df.rename(columns={'stars': 'business_stars'}, inplace=True)

In [30]:
# save the dataframes as csv files
# user_df.to_csv('dataset/yelp/user.csv', index=False)
review_df.to_csv('dataset/yelp/review.csv', index=False)
business_df.to_csv('dataset/yelp/business.csv', index=False)

In [3]:
review_df = pd.read_csv('dataset/yelp/review.csv')
business_df = pd.read_csv('dataset/yelp/business.csv')

In [5]:
review_df.head()

Unnamed: 0,business_id,stars,useful,funny,cool
0,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0
1,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1
2,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0
3,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1
4,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1


In [6]:
business_df.head()

Unnamed: 0,business_id,name,business_stars,review_count,is_open,categories
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",5.0,7,0,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,3.0,15,1,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,3.5,22,0,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,4.5,13,1,"Brewpubs, Breweries, Food"


In [7]:
# Convert the stars column into categorical data with 2 categories: 0 and 1
# 0 if the stars is less than or equal to 3
# 1 if the stars is greater than 3
review_df['stars'] = pd.cut(review_df['stars'], bins=[0, 4, 5], labels=[0, 1])
review_df.head()


Unnamed: 0,business_id,stars,useful,funny,cool
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0
1,7ATYjTIgM3jUlt4UM3IypQ,1,1,0,1
2,YjUWPpI6HXG530lwP-fb2A,0,0,0,0
3,kxX2SOes4o-D3ZQBkiMRfA,1,1,0,1
4,e4Vwtrqf-wpJfwesgvdgxQ,0,1,0,1


In [8]:
# Rename the stars column to review_stars
review_df.rename(columns={'stars': 'good review'}, inplace=True)

In [9]:
# Match the business_id in the review_df with the business_id in the business_df and concatenate the dataframes
review_business_df = pd.merge(review_df, business_df, on='business_id')
review_business_df.head()

Unnamed: 0,business_id,good review,useful,funny,cool,name,business_stars,review_count,is_open,categories
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,XQfwVwDr-v0ZS3_CbbE5Xw,0,2,0,1,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,XQfwVwDr-v0ZS3_CbbE5Xw,0,0,0,0,Turning Point of North Wales,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [10]:
review_business_df.shape

(6990280, 10)

In [11]:
review_business_df.drop(columns=['business_id', 'name'], inplace=True)

In [12]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,3.0,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [13]:
# Convert the stars column into categorical data
review_business_df['business_stars'] = pd.cut(review_business_df['business_stars'], bins=[0, 2, 3, 5], labels=['Bad', 'Average', 'Good'])

In [14]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,Average,169,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [27]:
sorted(review_business_df['review_count'].unique())

[5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 1

In [15]:
# Convert the review_count column into categorical data
review_business_df['review_count'] = pd.cut(review_business_df['review_count'], bins=[0, 600, 700, 1200], labels=['Low', 'Medium', 'High'])

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,business_stars,review_count,is_open,categories
0,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
2,0,2,0,1,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
3,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."
4,0,0,0,0,Average,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B..."


In [16]:
# COnvert stars and buisness_stars into one hot encoding
review_business_df = pd.get_dummies(review_business_df, columns=['business_stars'])
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,review_count,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good
0,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
1,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
2,0,2,0,1,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
3,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False
4,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",False,True,False


In [17]:
# Convert True and False for business_stars_Bad, business_stars_Average, business_stars_Good into 1 and 0
review_business_df['business_stars_Bad'] = review_business_df['business_stars_Bad'].astype(int)
review_business_df['business_stars_Average'] = review_business_df['business_stars_Average'].astype(int)
review_business_df['business_stars_Good'] = review_business_df['business_stars_Good'].astype(int)

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,review_count,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good
0,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
1,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
2,0,2,0,1,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
3,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0
4,0,0,0,0,Low,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0


In [18]:
# Convert the review_count column into one hot encoding
review_business_df = pd.get_dummies(review_business_df, columns=['review_count'])
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
1,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
2,0,2,0,1,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
3,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False
4,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,True,False,False


In [19]:
# Convert True and False for review_count_High, review_count_Low, review_count_Medium into 1 and 0
review_business_df['review_count_High'] = review_business_df['review_count_High'].astype(int)
review_business_df['review_count_Low'] = review_business_df['review_count_Low'].astype(int)
review_business_df['review_count_Medium'] = review_business_df['review_count_Medium'].astype(int)

review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,categories,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
1,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
2,0,2,0,1,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
3,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0
4,0,0,0,0,1,"Restaurants, Breakfast & Brunch, Food, Juice B...",0,1,0,1,0,0


In [20]:
# Drop the categories column
review_business_df.drop(columns=['categories'], inplace=True)

In [21]:
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,2,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [32]:
# Convert useful into one hot encoding, any value greater than 1 is 1
review_business_df['useful'] = review_business_df['useful'].apply(lambda x: 1 if x > 1 else 0)
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [33]:
# Convert funny into one hot encoding, any value greater than 1 is 1
# Convert cool into one hot encoding, any value greater than 1 is 1

review_business_df['useful'] = review_business_df['useful'].apply(lambda x: 1 if x > 1 else 0)
review_business_df['funny'] = review_business_df['funny'].apply(lambda x: 1 if x > 1 else 0)
review_business_df['cool'] = review_business_df['cool'].apply(lambda x: 1 if x > 1 else 0)


review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [22]:
# Save the dataframe as a csv file
review_business_df.to_csv('dataset/yelp/review_business.csv', index=False)

In [34]:
# Run the apriori algorithm
combined_freq_itemsets, rules = my_apriori(review_business_df, 1000000, 0.4, target='good review')

{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264}
{'good review': 3231627, 'useful': 1609831, 'is_open': 5791234, 'business_stars_Average': 1097467, 'business_stars_Good': 5540490, 'review_count_Low': 5949264}
{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264}
k:  2
{('good review',): 3231627, ('useful',): 1609831, ('is_open',): 5791234, ('business_stars_Average',): 1097467, ('business_stars_Good',): 5540490, ('review_count_Low',): 5949264, ('good review', 'is_open'): 2769878, ('good review', 'business_stars_Good'): 2950480, ('good review', 'review_count_Low'): 2721194, ('useful', 'is_open'): 1310564, ('useful', 'business_stars_Good'): 1214578, ('useful', 'review_count_Low'): 1437504, ('is_open', 'business_stars_Good'): 4645111, ('is_o

In [35]:
# Print the frequent itemsets
for key, value in combined_freq_itemsets.items():
    print(key, value)

('good review',) 3231627
('useful',) 1609831
('is_open',) 5791234
('business_stars_Average',) 1097467
('business_stars_Good',) 5540490
('review_count_Low',) 5949264
('good review', 'is_open') 2769878
('good review', 'business_stars_Good') 2950480
('good review', 'review_count_Low') 2721194
('useful', 'is_open') 1310564
('useful', 'business_stars_Good') 1214578
('useful', 'review_count_Low') 1437504
('is_open', 'business_stars_Good') 4645111
('is_open', 'review_count_Low') 4822703
('business_stars_Average', 'review_count_Low') 1046301
('business_stars_Good', 'review_count_Low') 4553295
('good review', 'is_open', 'business_stars_Good') 2538001
('good review', 'is_open', 'review_count_Low') 2289952
('good review', 'business_stars_Good', 'review_count_Low') 2449424
('useful', 'is_open', 'review_count_Low') 1152015
('useful', 'business_stars_Good', 'review_count_Low') 1054991
('is_open', 'business_stars_Good', 'review_count_Low') 3724826
('good review', 'is_open', 'business_stars_Good', 're

In [36]:
# Print the rules
index = 1
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
        index += 1

Rule  1 : antecedent -> consequent:  ['is_open'] ->  ['good review'] confidence:  0.47828804707252376
Rule  2 : antecedent -> consequent:  ['business_stars_Good'] ->  ['good review'] confidence:  0.5325305162539775
Rule  3 : antecedent -> consequent:  ['review_count_Low'] ->  ['good review'] confidence:  0.45740010865209546
Rule  4 : antecedent -> consequent:  ['is_open', 'business_stars_Good'] ->  ['good review'] confidence:  0.5463811306123794
Rule  5 : antecedent -> consequent:  ['is_open', 'review_count_Low'] ->  ['good review'] confidence:  0.47482749818929343
Rule  6 : antecedent -> consequent:  ['business_stars_Good', 'review_count_Low'] ->  ['good review'] confidence:  0.5379453780174577
Rule  7 : antecedent -> consequent:  ['is_open', 'business_stars_Good', 'review_count_Low'] ->  ['good review'] confidence:  0.5547459129634512


Verify with official Apriori library

In [37]:
from mlxtend.frequent_patterns import apriori, association_rules
freq_items = apriori(review_business_df, min_support=0.1667, use_colnames=True)
freq_items



Unnamed: 0,support,itemsets
0,0.462303,(good review)
1,0.230296,(useful)
2,0.82847,(is_open)
3,0.792599,(business_stars_Good)
4,0.851077,(review_count_Low)
5,0.396247,"(good review, is_open)"
6,0.422083,"(good review, business_stars_Good)"
7,0.389283,"(good review, review_count_Low)"
8,0.187484,"(useful, is_open)"
9,0.173752,"(useful, business_stars_Good)"


In [38]:
rules = association_rules(freq_items, metric='confidence', min_threshold=0.4)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(good review),(is_open),0.462303,0.82847,0.396247,0.857116,1.034577,0.013243,1.200484,0.062157
1,(is_open),(good review),0.82847,0.462303,0.396247,0.478288,1.034577,0.013243,1.03064,0.194843
2,(good review),(business_stars_Good),0.462303,0.792599,0.422083,0.913001,1.151908,0.055662,2.383956,0.245259
3,(business_stars_Good),(good review),0.792599,0.462303,0.422083,0.532531,1.151908,0.055662,1.150229,0.635847
4,(good review),(review_count_Low),0.462303,0.851077,0.389283,0.842051,0.989395,-0.004173,0.942856,-0.019545
5,(review_count_Low),(good review),0.851077,0.462303,0.389283,0.4574,0.989395,-0.004173,0.990964,-0.067143
6,(useful),(is_open),0.230296,0.82847,0.187484,0.8141,0.982656,-0.003309,0.922705,-0.022417
7,(useful),(business_stars_Good),0.230296,0.792599,0.173752,0.754475,0.9519,-0.00878,0.844726,-0.061604
8,(useful),(review_count_Low),0.230296,0.851077,0.205643,0.892953,1.049204,0.009644,1.391201,0.060928
9,(is_open),(business_stars_Good),0.82847,0.792599,0.66451,0.802093,1.011979,0.007866,1.047974,0.069008


In [40]:
rules[rules['consequents'] == {'good review'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(is_open),(good review),0.82847,0.462303,0.396247,0.478288,1.034577,0.013243,1.03064,0.194843
3,(business_stars_Good),(good review),0.792599,0.462303,0.422083,0.532531,1.151908,0.055662,1.150229,0.635847
5,(review_count_Low),(good review),0.851077,0.462303,0.389283,0.4574,0.989395,-0.004173,0.990964,-0.067143
17,"(is_open, business_stars_Good)",(good review),0.66451,0.462303,0.363076,0.546381,1.181868,0.055871,1.18535,0.458678
23,"(review_count_Low, is_open)",(good review),0.689916,0.462303,0.327591,0.474827,1.027092,0.008641,1.023848,0.085064
27,"(review_count_Low, business_stars_Good)",(good review),0.651375,0.462303,0.350404,0.537945,1.163621,0.049272,1.163709,0.403338
40,"(review_count_Low, is_open, business_stars_Good)",(good review),0.532858,0.462303,0.295601,0.554746,1.199962,0.049259,1.207618,0.356723


In [41]:
# Concatenate random 40 million rows to the review_business_df
big_review_business_df = pd.concat([review_business_df, review_business_df.sample(n=40000000, replace=True)], ignore_index=True)
big_review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [42]:
big_review_business_df.shape

(46990280, 11)

In [43]:
# Save the dataframe as a csv file
big_review_business_df.to_csv('dataset/yelp/big_review_business.csv', index=False)

In [44]:
# Not enough data, so we will concat 120 million rows to the review_business_df
big_review_business_df = pd.concat([review_business_df, review_business_df.sample(n=120000000, replace=True)], ignore_index=True)
big_review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,1,0,0,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [45]:
big_review_business_df.shape

(126990280, 11)

In [46]:
big_review_business_df.tail()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
126990275,0,0,0,0,1,0,0,1,1,0,0
126990276,0,0,0,0,1,0,1,0,1,0,0
126990277,0,0,0,0,1,0,0,1,1,0,0
126990278,0,0,0,0,1,0,0,1,1,0,0
126990279,0,1,1,1,1,0,0,1,1,0,0


In [47]:
# Save the dataframe as a csv file
big_review_business_df.to_csv('dataset/yelp/big_review_business.csv', index=False)

In [1]:
import pandas as pd

In [2]:
review_business_df = pd.read_csv('dataset/yelp/review_business.csv')
review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,2,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [3]:
# Not enough data, so we will concat 240 million rows to the review_business_df
big_review_business_df = pd.concat([review_business_df, review_business_df.sample(n=240000000, replace=True)], ignore_index=True)
big_review_business_df.head()

Unnamed: 0,good review,useful,funny,cool,is_open,business_stars_Bad,business_stars_Average,business_stars_Good,review_count_Low,review_count_Medium,review_count_High
0,0,0,0,0,1,0,1,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0,0
2,0,2,0,1,1,0,1,0,1,0,0
3,0,0,0,0,1,0,1,0,1,0,0
4,0,0,0,0,1,0,1,0,1,0,0


In [4]:
big_review_business_df.shape

(246990280, 11)

In [5]:
# Save the dataframe as a csv file
big_review_business_df.to_csv('dataset/yelp/big_review_business.csv', index=False)

In [4]:
big_review_business_df = pd.read_csv('dataset/yelp/big_review_business.csv')

In [5]:
# Run the apriori algorithm
combined_freq_itemsets, rules = my_apriori(big_review_business_df, 24699028*3, 0.4, target='good review')

{('good review',): 114174609.0, ('is_open',): 204629953.0, ('business_stars_Good',): 195763731.0, ('review_count_Low',): 210205206.0}
{'good review': 114174609.0, 'is_open': 204629953.0, 'business_stars_Good': 195763731.0, 'review_count_Low': 210205206.0}
{('good review',): 114174609.0, ('is_open',): 204629953.0, ('business_stars_Good',): 195763731.0, ('review_count_Low',): 210205206.0}
k:  2
{('good review',): 114174609.0, ('is_open',): 204629953.0, ('business_stars_Good',): 195763731.0, ('review_count_Low',): 210205206.0, ('good review', 'is_open'): 97863668, ('good review', 'business_stars_Good'): 104248379, ('good review', 'review_count_Low'): 96138474, ('is_open', 'business_stars_Good'): 164131193, ('is_open', 'review_count_Low'): 170407058, ('business_stars_Good', 'review_count_Low'): 160880739}
k:  3
{('good review',): 114174609.0, ('is_open',): 204629953.0, ('business_stars_Good',): 195763731.0, ('review_count_Low',): 210205206.0, ('good review', 'is_open'): 97863668, ('good re

In [6]:
# Print the frequent itemsets
for key, value in combined_freq_itemsets.items():
    print(key, value)


('good review',) 114174609.0
('is_open',) 204629953.0
('business_stars_Good',) 195763731.0
('review_count_Low',) 210205206.0
('good review', 'is_open') 97863668
('good review', 'business_stars_Good') 104248379
('good review', 'review_count_Low') 96138474
('is_open', 'business_stars_Good') 164131193
('is_open', 'review_count_Low') 170407058
('business_stars_Good', 'review_count_Low') 160880739
('good review', 'is_open', 'business_stars_Good') 89676062
('good review', 'is_open', 'review_count_Low') 80907105
('good review', 'business_stars_Good', 'review_count_Low') 86542851
('is_open', 'business_stars_Good', 'review_count_Low') 131613529


In [7]:
# Print the rules
index = 1
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('Rule ', index, ': antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)
        index += 1

Rule  1 : antecedent -> consequent:  ['is_open'] ->  ['good review'] confidence:  0.47824703356111314
Rule  2 : antecedent -> consequent:  ['business_stars_Good'] ->  ['good review'] confidence:  0.5325214148069133
Rule  3 : antecedent -> consequent:  ['review_count_Low'] ->  ['good review'] confidence:  0.4573553425693938
Rule  4 : antecedent -> consequent:  ['is_open', 'business_stars_Good'] ->  ['good review'] confidence:  0.5463681848702581
Rule  5 : antecedent -> consequent:  ['is_open', 'review_count_Low'] ->  ['good review'] confidence:  0.4747872884467027
Rule  6 : antecedent -> consequent:  ['business_stars_Good', 'review_count_Low'] ->  ['good review'] confidence:  0.5379317097741576
