<h1> Task 1: Implement the Apriori algorithm to mine frequent itemsets </h1>

In [365]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Make dummy data
data = pd.DataFrame(np.random.randint(0, 2, size=(10, 8)), columns=list('ABCDEFGH'))
data

Unnamed: 0,A,B,C,D,E,F,G,H
0,1,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0,1
2,1,1,1,1,0,0,0,1
3,1,0,0,0,1,1,0,0
4,0,0,0,1,1,1,0,0
5,1,0,0,1,0,0,0,1
6,1,0,1,1,0,1,1,1
7,0,1,1,1,1,0,1,1
8,1,1,0,0,0,1,1,0
9,0,1,1,1,0,1,1,1


In [426]:
# Count the number of 0s and 1s in each column
# The number of 1s is the number of times each item appears
value_counts = data.apply(pd.value_counts)
value_counts

Unnamed: 0,A,B,C,D,E,F,G,H
0,3,5,6,3,6,4,6,4
1,7,5,4,7,4,6,4,6


In [427]:
value_counts['A'][1]

7

Using the lecture notes explanation of the Apriori Algorithm, we have 4 steps to do.
1. Candidate Generation
2. Candidate Pruning
3. Support Counting
4. Candidate Elimination

Sample code for 1 and 2 itemset

Define the min support

In [428]:
min_support = 4

In [429]:
# Combined dictionary of frequent itemsets
combined_freq_itemsets = {}

Generate F1 (frequent 1-itemsets)

In [430]:
# Get the frequent itemsets with count greater than or equal to min_support
columns = data.columns
frequent_itemsets = {}
for column in columns:
    # Append the itemset and its count to the dictionary if the count is greater than or equal to min_support
    if value_counts[column][1] >= min_support:
        frequent_itemsets[column] = value_counts[column][1]
        # frequent_itemsets.append((column, value_counts[column][1]))
        # data.drop(column, axis=1, inplace=True)

print(frequent_itemsets)

dummy_dict = frequent_itemsets.copy()
for key, item in dummy_dict.copy().items():
    dummy_dict[(tuple(key))] = dummy_dict.pop(key)
print(dummy_dict)
    
combined_freq_itemsets.update(dummy_dict)

{'A': 7, 'B': 5, 'C': 4, 'D': 7, 'E': 4, 'F': 6, 'G': 4, 'H': 6}
{('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6}


Step 1: Candidate Generation

In [431]:
# Generate all possible combinations of frequent itemsets with k+1 items
combinations = []
k = 1
combinations.append(list(itertools.combinations(frequent_itemsets.keys(), k+1)))

combinations

[[('A', 'B'),
  ('A', 'C'),
  ('A', 'D'),
  ('A', 'E'),
  ('A', 'F'),
  ('A', 'G'),
  ('A', 'H'),
  ('B', 'C'),
  ('B', 'D'),
  ('B', 'E'),
  ('B', 'F'),
  ('B', 'G'),
  ('B', 'H'),
  ('C', 'D'),
  ('C', 'E'),
  ('C', 'F'),
  ('C', 'G'),
  ('C', 'H'),
  ('D', 'E'),
  ('D', 'F'),
  ('D', 'G'),
  ('D', 'H'),
  ('E', 'F'),
  ('E', 'G'),
  ('E', 'H'),
  ('F', 'G'),
  ('F', 'H'),
  ('G', 'H')]]

Step 2: Candidate Pruning (do not need to prune for 2 itemset as F1 items are all frequent)

Step 3: Support Counting

In [432]:
# Convert the list of lists of tuples to a list of tuples
combinations = combinations[0]

In [433]:
# Count the number of occurences of each combination in the data
combinations_count = {}
for combination in combinations:
    # Using groupby and size to count the number of occurences of each combination
    # Resetting the index to get the count of each combination as a column in the dataframe
    test = data.groupby(list(combination)).size().reset_index(name='count')

    # Append the combination and its count to the dictionary
    # The count of each combination is the last value in the count column
    # as the last row of the dataframe is when both items are present in one transaction in the original data dataframe
    combinations_count[combination] = test['count'].iloc[-1]

# print(test)
combinations_count

{('A', 'B'): 3,
 ('A', 'C'): 2,
 ('A', 'D'): 4,
 ('A', 'E'): 2,
 ('A', 'F'): 4,
 ('A', 'G'): 2,
 ('A', 'H'): 4,
 ('B', 'C'): 3,
 ('B', 'D'): 4,
 ('B', 'E'): 2,
 ('B', 'F'): 3,
 ('B', 'G'): 3,
 ('B', 'H'): 4,
 ('C', 'D'): 4,
 ('C', 'E'): 1,
 ('C', 'F'): 2,
 ('C', 'G'): 3,
 ('C', 'H'): 4,
 ('D', 'E'): 3,
 ('D', 'F'): 4,
 ('D', 'G'): 3,
 ('D', 'H'): 6,
 ('E', 'F'): 3,
 ('E', 'G'): 1,
 ('E', 'H'): 2,
 ('F', 'G'): 3,
 ('F', 'H'): 3,
 ('G', 'H'): 3}

In [434]:
# test.index.values[-1].count(1)
test1 = test
count = test1['count'].iloc[-1]
count

3

Step 4: Candidate Elimination

In [435]:
# Prune the combinations with count less than min_support
for combination in combinations_count.copy().keys():
    if combinations_count[combination] < min_support:
        combinations_count.pop(combination)

print(combinations_count)
combined_freq_itemsets.update(combinations_count)

{('A', 'D'): 4, ('A', 'F'): 4, ('A', 'H'): 4, ('B', 'D'): 4, ('B', 'H'): 4, ('C', 'D'): 4, ('C', 'H'): 4, ('D', 'F'): 4, ('D', 'H'): 6}


Candidate generation for 2 or more frequent itemsets

In [436]:
# Merge the combinations if the first k-1 items are the same
# and the last item is different
# This is done to generate combinations with k+1 items
# from combinations with k items

# Compare first k-1 items of each combination
# If they are the same, merge them
# If they are not the same, do not merge them
# The merged combinations are stored in a dictionary
merged_combinations = {}
# for combination1 in combinations_count.keys():
#     for combination2 in combinations_count.keys():
#         # Check if the first k-1 items are the same
#         if combination1[:-1] == combination2[:-1]:
#             # Check if the last item is different
#             if combination1[-1] != combination2[-1]:
#                 # Merge the combinations
#                 merged_combinations[combination1 + (combination2[-1],)] = 0

for index, combination1 in enumerate(combinations_count.keys()):
    for combination2 in list(combinations_count.keys())[index+1:]:
        # Check if the first k-1 items are the same
        if combination1[:-1] == combination2[:-1]:
            # Check if the last item is different
            if combination1[-1] != combination2[-1]:
                # Merge the combinations
                merged_combinations[combination1 + (combination2[-1],)] = 0


merged_combinations


{('A', 'D', 'F'): 0,
 ('A', 'D', 'H'): 0,
 ('A', 'F', 'H'): 0,
 ('B', 'D', 'H'): 0,
 ('C', 'D', 'H'): 0,
 ('D', 'F', 'H'): 0}

Support counting

In [437]:
# Count the number of occurences of each combination in the data
merged_combinations_count = {}
for combination in merged_combinations.keys():
    # Using groupby and size to count the number of occurences of each combination
    # Resetting the index to get the count of each combination as a column in the dataframe
    test = data.groupby(list(combination)).size().reset_index(name='count')

    # Append the combination and its count to the dictionary
    # The count of each combination is the last value in the count column
    # as the last row of the dataframe is when both items are present in one transaction in the original data dataframe
    merged_combinations_count[combination] = test['count'].iloc[-1]

# print(test)
merged_combinations_count

{('A', 'D', 'F'): 2,
 ('A', 'D', 'H'): 4,
 ('A', 'F', 'H'): 2,
 ('B', 'D', 'H'): 4,
 ('C', 'D', 'H'): 4,
 ('D', 'F', 'H'): 3}

In [438]:
# Prune the combinations with count less than min_support
for combination in merged_combinations_count.copy().keys():
    if merged_combinations_count[combination] < min_support:
        merged_combinations_count.pop(combination)

print(merged_combinations_count)
combined_freq_itemsets.update(merged_combinations_count)


{('A', 'D', 'H'): 4, ('B', 'D', 'H'): 4, ('C', 'D', 'H'): 4}


In [439]:
combined_freq_itemsets 

{('A',): 7,
 ('B',): 5,
 ('C',): 4,
 ('D',): 7,
 ('E',): 4,
 ('F',): 6,
 ('G',): 4,
 ('H',): 6,
 ('A', 'D'): 4,
 ('A', 'F'): 4,
 ('A', 'H'): 4,
 ('B', 'D'): 4,
 ('B', 'H'): 4,
 ('C', 'D'): 4,
 ('C', 'H'): 4,
 ('D', 'F'): 4,
 ('D', 'H'): 6,
 ('A', 'D', 'H'): 4,
 ('B', 'D', 'H'): 4,
 ('C', 'D', 'H'): 4}

Part 2: Rule generation

In [440]:
# Generate rules for frequent itemsets with k+1 items with min confidence
# The rules are generated by splitting the combination into two parts
min_confidence = 0.5
rules = {}
for key in combined_freq_itemsets.keys():
    # Split the combination into two parts
    # The first part is the antecedent and the second part is the consequent
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]

        # Calculate the confidence of the rule
        # Confidence = support of combination / support of antecedent
        confidence = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent]

        # Check if the confidence is greater than min_confidence
        if confidence > min_confidence:
            # Append the rule to the rules dictionary
            rules[(antecedent, consequent)] = confidence

rules

{(('A',), ('D',)): 0.5714285714285714,
 (('A',), ('F',)): 0.5714285714285714,
 (('A',), ('H',)): 0.5714285714285714,
 (('B',), ('D',)): 0.8,
 (('B',), ('H',)): 0.8,
 (('C',), ('D',)): 1.0,
 (('C',), ('H',)): 1.0,
 (('D',), ('F',)): 0.5714285714285714,
 (('D',), ('H',)): 0.8571428571428571,
 (('A',), ('D', 'H')): 0.5714285714285714,
 (('A', 'D'), ('H',)): 1.0,
 (('B',), ('D', 'H')): 0.8,
 (('B', 'D'), ('H',)): 1.0,
 (('C',), ('D', 'H')): 1.0,
 (('C', 'D'), ('H',)): 1.0}

In [441]:
# Prune smaller rules based on confidence of larger rules
# If larger rule has confidence less than min_confidence, smaller rules are pruned

# Sort the rules in descending order of confidence
sorted_rules = sorted(rules.items(), key=lambda x: x[1], reverse=True)
sorted_rules

# Prune the rules
pruned_rules = {}
for rule in sorted_rules:
    # Append the rule to the pruned_rules dictionary if it is not a subset of any rule in the dictionary
    if not any([set(rule[0]).issubset(set(pruned_rule[0])) for pruned_rule in pruned_rules.keys()]):
        pruned_rules[rule[0]] = rule[1]

pruned_rules

{(('C',), ('D',)): 1.0,
 (('C',), ('H',)): 1.0,
 (('A', 'D'), ('H',)): 1.0,
 (('B', 'D'), ('H',)): 1.0,
 (('C',), ('D', 'H')): 1.0,
 (('C', 'D'), ('H',)): 1.0,
 (('D',), ('H',)): 0.8571428571428571,
 (('B',), ('D',)): 0.8,
 (('B',), ('H',)): 0.8,
 (('B',), ('D', 'H')): 0.8,
 (('A',), ('D',)): 0.5714285714285714,
 (('A',), ('F',)): 0.5714285714285714,
 (('A',), ('H',)): 0.5714285714285714,
 (('D',), ('F',)): 0.5714285714285714,
 (('A',), ('D', 'H')): 0.5714285714285714}

In [419]:
# Apriori algorithm
# We combine the above steps to generate frequent itemsets with k+1 items
# from frequent itemsets with k items
# We continue this process until we get no frequent itemsets with k+1 items
# We then combine the frequent itemsets with k items to generate association rules
# We continue this process until we get no association rules
# We then combine the association rules to generate association rules with k+1 items


# Function to generate frequent itemsets with 1 item (initialisation)
def generate_freq_1_itemsets(data, min_support, combined_freq_itemsets):
    # Count the number of 0s and 1s in each column
    # The number of 1s is the number of times each item appears
    value_counts = data.apply(pd.value_counts)

    # Get the frequent itemsets with count greater than or equal to min_support
    columns = data.columns
    frequent_itemsets = {}
    for column in columns:
        # Append the itemset and its count to the dictionary if the count is greater than or equal to min_support
        if value_counts[column][1] >= min_support:
            frequent_itemsets[column] = value_counts[column][1]
            # frequent_itemsets.append((column, value_counts[column][1]))
            # data.drop(column, axis=1, inplace=True)

    dummy_dict = frequent_itemsets.copy()
    for key, item in dummy_dict.copy().items():
        # For dummy data
        # dummy_dict[(tuple(key))] = dummy_dict.pop(key)
        # For real data
        dummy_dict[(key,)] = dummy_dict.pop(key)
    print(dummy_dict)

    combined_freq_itemsets.update(dummy_dict)

    print(frequent_itemsets)
    return frequent_itemsets


# Function to generate frequent itemsets with k+1 items
def generate_k_plus_1_candidate_itemsets(frequent_itemsets, k):
    # Generate all possible combinations of frequent itemsets with k+1 items

    # If k = 1, we do not need to merge the combinations
    if k == 1:
        combinations = []
        combinations.append(list(itertools.combinations(frequent_itemsets.keys(), k+1)))
        return combinations
    
    else:
        # Merge the combinations if the first k-1 items are the same
        # and the last item is different
        # This is done to generate combinations with k+1 items
        # from combinations with k items
        # Compare first k-1 items of each combination
        # If they are the same, merge them
        # If they are not the same, do not merge them
        # The merged combinations are stored in a dictionary
        merged_combinations = {}
        

        for index, combination1 in enumerate(frequent_itemsets.keys()):
            for combination2 in list(frequent_itemsets.keys())[index+1:]:
                # Check if the first k-1 items are the same
                if combination1[:-1] == combination2[:-1]:
                    # Check if the last item is different
                    if combination1[-1] != combination2[-1]:
                        # Merge the combinations
                        merged_combinations[combination1 + (combination2[-1],)] = 0

    
        return merged_combinations

# Function to count the number of occurences of each combination in the candidate itemsets
def k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data):
    # If k = 1, we need to convert the list of lists of tuples to a list of tuples
    if k == 1:
        k_plus_1_candidate_itemsets = k_plus_1_candidate_itemsets[0]

    # Count the number of occurences of each combination in the data
    candidate_itemsets_count = {}
    for candidate_itemset in k_plus_1_candidate_itemsets:
        # Using groupby and size to count the number of occurences of each combination
        # Resetting the index to get the count of each combination as a column in the dataframe
        test = data.groupby(list(candidate_itemset)).size().reset_index(name='count')

        # Append the combination and its count to the dictionary
        # The count of each combination is the last value in the count column
        # as the last row of the dataframe is when both items are present in one transaction in the original data dataframe
        candidate_itemsets_count[candidate_itemset] = test['count'].iloc[-1]

    return candidate_itemsets_count


def candidate_elimination(combinations_count, min_support, combined_freq_itemsets):
    
    # Prune the combinations with count less than min_support
    for combination in combinations_count.copy().keys():
        if combinations_count[combination] < min_support:
            combinations_count.pop(combination)
    
    combined_freq_itemsets.update(combinations_count)
    return combinations_count

def generate_rules(combined_freq_itemsets, min_confidence):
    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = {}
    for key in combined_freq_itemsets.keys():
        # print(key, len(key))
        # Split the combination into two parts
        # The first part is the antecedent and the second part is the consequent
        for i in range(1, len(key)):
            antecedent = key[:i]
            consequent = key[i:]

            # Calculate the confidence of the rule
            # Confidence = support of combination / support of antecedent
            confidence = combined_freq_itemsets[key] / combined_freq_itemsets[antecedent]

            # Check if the confidence is greater than min_confidence
            if confidence > min_confidence:
                # Append the rule to the rules dictionary
                rules[(antecedent, consequent)] = confidence

    return rules
    

In [420]:
def apriori(data, min_support, min_confidence):
    
    # Combined dictionary of frequent itemsets
    combined_freq_itemsets = {}

    # Get frequent 1 itemsets
    frequent_1_itemsets = generate_freq_1_itemsets(data, min_support, combined_freq_itemsets)

    k_plus_1_candidate_itemsets = None
    k_plus_1_itemsets_support_count = None
    k_plus_1_frequent_itemsets = None
    
    k = 1

    while True:
        # print(k)
        if k == 1:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(frequent_1_itemsets, k)
        else:
            k_plus_1_candidate_itemsets = generate_k_plus_1_candidate_itemsets(k_plus_1_frequent_itemsets, k)
        print(combined_freq_itemsets)
        # print(k_plus_1_candidate_itemsets)
        k_plus_1_itemsets_support_count = k_plus_1_itemsets_support_counting(k_plus_1_candidate_itemsets, k, data)
        
        k_plus_1_frequent_itemsets = candidate_elimination(k_plus_1_itemsets_support_count, min_support, combined_freq_itemsets)
        # print(k_plus_1_frequent_itemsets)
        k += 1
        print('k: ', k)
        # If there are no frequent itemsets with k+1 items, break
        if len(k_plus_1_frequent_itemsets) == 0:
            break

    # Generate rules for frequent itemsets with k+1 items with min confidence
    # The rules are generated by splitting the combination into two parts
    rules = generate_rules(combined_freq_itemsets, min_confidence)
    
    return combined_freq_itemsets, rules


In [442]:
combined_freq_itemsets, rules = apriori(data, 4, 0.5)

{('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6}
{'A': 7, 'B': 5, 'C': 4, 'D': 7, 'E': 4, 'F': 6, 'G': 4, 'H': 6}
{('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6}
k:  2
{('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6, ('A', 'D'): 4, ('A', 'F'): 4, ('A', 'H'): 4, ('B', 'D'): 4, ('B', 'H'): 4, ('C', 'D'): 4, ('C', 'H'): 4, ('D', 'F'): 4, ('D', 'H'): 6}
k:  3
{('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6, ('A', 'D'): 4, ('A', 'F'): 4, ('A', 'H'): 4, ('B', 'D'): 4, ('B', 'H'): 4, ('C', 'D'): 4, ('C', 'H'): 4, ('D', 'F'): 4, ('D', 'H'): 6, ('A', 'D', 'H'): 4, ('B', 'D', 'H'): 4, ('C', 'D', 'H'): 4}
k:  4


In [443]:
print('combined frequent itemsets: ', combined_freq_itemsets)

combined frequent itemsets:  {('A',): 7, ('B',): 5, ('C',): 4, ('D',): 7, ('E',): 4, ('F',): 6, ('G',): 4, ('H',): 6, ('A', 'D'): 4, ('A', 'F'): 4, ('A', 'H'): 4, ('B', 'D'): 4, ('B', 'H'): 4, ('C', 'D'): 4, ('C', 'H'): 4, ('D', 'F'): 4, ('D', 'H'): 6, ('A', 'D', 'H'): 4, ('B', 'D', 'H'): 4, ('C', 'D', 'H'): 4}


In [444]:
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('antecedent: ', list(sum(antecedent, ())), '-> consequent: ', list(sum(consequent, ())), 'confidence: ', item)

antecedent:  ['A'] -> consequent:  ['D'] confidence:  0.5714285714285714
antecedent:  ['A'] -> consequent:  ['F'] confidence:  0.5714285714285714
antecedent:  ['A'] -> consequent:  ['H'] confidence:  0.5714285714285714
antecedent:  ['B'] -> consequent:  ['D'] confidence:  0.8
antecedent:  ['B'] -> consequent:  ['H'] confidence:  0.8
antecedent:  ['C'] -> consequent:  ['D'] confidence:  1.0
antecedent:  ['C'] -> consequent:  ['H'] confidence:  1.0
antecedent:  ['D'] -> consequent:  ['F'] confidence:  0.5714285714285714
antecedent:  ['D'] -> consequent:  ['H'] confidence:  0.8571428571428571
antecedent:  ['A'] -> consequent:  ['D', 'H'] confidence:  0.5714285714285714
antecedent:  ['A', 'D'] -> consequent:  ['H'] confidence:  1.0
antecedent:  ['B'] -> consequent:  ['D', 'H'] confidence:  0.8
antecedent:  ['B', 'D'] -> consequent:  ['H'] confidence:  1.0
antecedent:  ['C'] -> consequent:  ['D', 'H'] confidence:  1.0
antecedent:  ['C', 'D'] -> consequent:  ['H'] confidence:  1.0


<h1> Task 2: Use 3 datasets to run Apriori algorithm with different min-support thresholds </h1>

In [387]:
df = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [388]:
# Data CLeaning
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [389]:
# Get the unique items in the dataset
unique_items = pd.unique(df.values.ravel('K'))
unique_items

array(['shrimp', 'burgers', 'chutney', 'turkey', 'mineral water',
       'low fat yogurt', 'whole wheat pasta', 'soup', 'frozen vegetables',
       'french fries', 'eggs', 'cookies', 'spaghetti', 'meatballs',
       'red wine', 'rice', 'parmesan cheese', 'ground beef',
       'sparkling water', 'herb & pepper', 'pickles', 'energy bar',
       'fresh tuna', 'escalope', 'avocado', 'tomato sauce',
       'clothes accessories', 'energy drink', 'chocolate',
       'grated cheese', 'yogurt cake', 'mint', 'asparagus', 'champagne',
       'ham', 'muffins', 'french wine', 'chicken', 'pasta', 'tomatoes',
       'pancakes', 'frozen smoothie', 'carrots', 'yams', 'shallot',
       'butter', 'light mayo', 'pepper', 'candy bars', 'cooking oil',
       'milk', 'green tea', 'bug spray', 'oil', 'olive oil', 'salmon',
       'cake', 'almonds', 'salt', 'strong cheese', 'hot dogs', 'pet food',
       'whole wheat rice', 'antioxydant juice', 'honey', 'sandwich',
       'salad', 'magazines', 'protein bar', '

In [390]:
transactions_data = pd.DataFrame(columns=unique_items)
transactions_data.drop(columns= 0, inplace=True)
transactions_data

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus


In [391]:
for i in range(0, len(df)):
    transaction = df.iloc[i, :].values
    transaction = transaction[transaction != 0]
    for item in transaction:
        transactions_data.at[i, item] = 1

In [392]:
transactions_data.head()

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1.0,,,,1.0,1.0,,,,,...,,,,,,,,,,
1,,1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,1.0,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,


In [393]:
transactions_data.fillna(0, inplace=True)
transactions_data

Unnamed: 0,shrimp,burgers,chutney,turkey,mineral water,low fat yogurt,whole wheat pasta,soup,frozen vegetables,french fries,...,ketchup,cream,hand protein bar,body spray,oatmeal,zucchini,water spray,tea,napkins,asparagus
0,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [421]:
combined_freq_itemsets, rules = apriori(transactions_data, 100, 0.5)

{('shrimp',): 536, ('burgers',): 654, ('turkey',): 469, ('mineral water',): 1788, ('low fat yogurt',): 574, ('whole wheat pasta',): 221, ('soup',): 379, ('frozen vegetables',): 715, ('french fries',): 1282, ('eggs',): 1348, ('cookies',): 603, ('spaghetti',): 1306, ('meatballs',): 157, ('red wine',): 211, ('rice',): 141, ('parmesan cheese',): 149, ('ground beef',): 737, ('herb & pepper',): 371, ('energy bar',): 203, ('fresh tuna',): 167, ('escalope',): 595, ('avocado',): 250, ('tomato sauce',): 106, ('energy drink',): 200, ('chocolate',): 1229, ('grated cheese',): 393, ('yogurt cake',): 205, ('mint',): 131, ('champagne',): 351, ('ham',): 199, ('muffins',): 181, ('french wine',): 169, ('chicken',): 450, ('pasta',): 118, ('tomatoes',): 513, ('pancakes',): 713, ('frozen smoothie',): 475, ('carrots',): 115, ('butter',): 226, ('light mayo',): 204, ('pepper',): 199, ('cooking oil',): 383, ('milk',): 972, ('green tea',): 991, ('oil',): 173, ('olive oil',): 494, ('salmon',): 319, ('cake',): 608

In [422]:
print('combined frequent itemsets: ', combined_freq_itemsets)

combined frequent itemsets:  {('shrimp',): 536, ('burgers',): 654, ('turkey',): 469, ('mineral water',): 1788, ('low fat yogurt',): 574, ('whole wheat pasta',): 221, ('soup',): 379, ('frozen vegetables',): 715, ('french fries',): 1282, ('eggs',): 1348, ('cookies',): 603, ('spaghetti',): 1306, ('meatballs',): 157, ('red wine',): 211, ('rice',): 141, ('parmesan cheese',): 149, ('ground beef',): 737, ('herb & pepper',): 371, ('energy bar',): 203, ('fresh tuna',): 167, ('escalope',): 595, ('avocado',): 250, ('tomato sauce',): 106, ('energy drink',): 200, ('chocolate',): 1229, ('grated cheese',): 393, ('yogurt cake',): 205, ('mint',): 131, ('champagne',): 351, ('ham',): 199, ('muffins',): 181, ('french wine',): 169, ('chicken',): 450, ('pasta',): 118, ('tomatoes',): 513, ('pancakes',): 713, ('frozen smoothie',): 475, ('carrots',): 115, ('butter',): 226, ('light mayo',): 204, ('pepper',): 199, ('cooking oil',): 383, ('milk',): 972, ('green tea',): 991, ('oil',): 173, ('olive oil',): 494, ('s

In [425]:
for key, item in rules.items():
    for i in range(1, len(key)):
        antecedent = key[:i]
        consequent = key[i:]
        print('antecedent -> consequent: ', list(sum(antecedent, ())), '-> ', list(sum(consequent, ())), 'confidence: ', item)

antecedent -> consequent:  ['cookies'] ->  ['meatballs'] confidence:  1.0
antecedent -> consequent:  ['meatballs'] ->  ['white wine'] confidence:  1.0
antecedent -> consequent:  ['energy bar'] ->  ['gums'] confidence:  1.0
antecedent -> consequent:  ['tomato sauce'] ->  ['pasta'] confidence:  1.0
antecedent -> consequent:  ['energy drink'] ->  ['mushroom cream sauce'] confidence:  1.0
antecedent -> consequent:  ['mint'] ->  ['white wine'] confidence:  1.0
antecedent -> consequent:  ['ham'] ->  ['mushroom cream sauce'] confidence:  1.0
antecedent -> consequent:  ['pasta'] ->  ['butter'] confidence:  1.0
antecedent -> consequent:  ['pasta'] ->  ['light cream'] confidence:  1.0
antecedent -> consequent:  ['pasta'] ->  ['black tea'] confidence:  1.0
antecedent -> consequent:  ['pepper'] ->  ['cottage cheese'] confidence:  1.0
antecedent -> consequent:  ['hot dogs'] ->  ['black tea'] confidence:  1.0
antecedent -> consequent:  ['mushroom cream sauce'] ->  ['cereals'] confidence:  1.0
antece