<a href="https://colab.research.google.com/github/davidofitaly/07_association_rules_projects/blob/main/01_shopping_basket_analysis_visualizing_purchase_patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Table of contents:
1. [Import of libraries](#0)
2. [Data creation](#1)
3. [Data preprocessing](#2)
4. [Apriori algorithm](#3)


### <a name='0'> </a> Import of libraries

In [None]:
# Libraries used throughout the notebook
import pandas as pd

# Render every float with two decimal places in displayed/printed frames
pd.set_option('display.float_format', '{:.2f}'.format)

# Record the library version this notebook was run with
print(f'Pandas: {pd.__version__}')

Pandas: 2.2.2


### <a name='1'> </a> Data creation

In [None]:
# Toy shopping-basket dataset used as input for market basket analysis
# (frequent itemsets and association rules).
# - `dataset` is a list of 30 transactions; each transaction is a list of the
#   product names (lowercase strings) bought together in one basket.
# - Every basket holds five or six products drawn from 20 distinct grocery items
#   (e.g. milk, bread, eggs, butter, coffee, chicken).
# - Item order inside a basket carries no meaning for the analysis below.
dataset = [
    ['milk', 'bread', 'eggs', 'apple', 'carrot', 'chicken'],
    ['bread', 'butter', 'cheese', 'lettuce', 'tomato'],
    ['milk', 'bread', 'butter', 'yogurt', 'orange'],
    ['bread', 'butter', 'eggs', 'coffee', 'sugar', 'flour'],
    ['milk', 'butter', 'eggs', 'banana', 'lettuce'],
    ['tea', 'bread', 'butter', 'onion', 'carrot'],
    ['milk', 'chocolate', 'bread', 'eggs', 'chicken', 'coffee'],
    ['apple', 'banana', 'milk', 'bread', 'lettuce', 'orange'],
    ['butter', 'cheese', 'yogurt', 'broccoli', 'tomato'],
    ['coffee', 'bread', 'butter', 'milk', 'banana', 'flour'],
    ['milk', 'eggs', 'chocolate', 'yogurt', 'apple', 'tomato'],
    ['tea', 'coffee', 'milk', 'bread', 'lettuce', 'carrot'],
    ['bread', 'cheese', 'apple', 'onion', 'chicken'],
    ['banana', 'yogurt', 'milk', 'carrot', 'lettuce'],
    ['chocolate', 'cheese', 'bread', 'butter', 'eggs', 'flour'],
    ['apple', 'banana', 'bread', 'onion', 'broccoli'],
    ['coffee', 'tea', 'milk', 'butter', 'eggs', 'carrot'],
    ['yogurt', 'bread', 'eggs', 'lettuce', 'tomato'],
    ['tea', 'chocolate', 'apple', 'milk', 'broccoli', 'flour'],
    ['coffee', 'banana', 'butter', 'bread', 'cheese', 'chicken'],
    ['milk', 'eggs', 'lettuce', 'chicken', 'tomato', 'onion'],
    ['apple', 'carrot', 'bread', 'yogurt', 'lettuce', 'butter'],
    ['coffee', 'milk', 'butter', 'orange', 'sugar'],
    ['tea', 'bread', 'eggs', 'carrot', 'flour'],
    ['banana', 'bread', 'chocolate', 'coffee', 'lettuce'],
    ['milk', 'chicken', 'eggs', 'broccoli', 'orange'],
    ['tea', 'sugar', 'lettuce', 'apple', 'butter'],
    ['yogurt', 'bread', 'banana', 'flour', 'chocolate'],
    ['coffee', 'bread', 'onion', 'lettuce', 'milk'],
    ['apple', 'broccoli', 'chicken', 'bread', 'cheese']
]

### <a name='2'> </a> Data preprocessing

In [None]:
# TransactionEncoder (mlxtend.preprocessing) converts a list of transactions into a
# one-hot encoded boolean array with one column per distinct product.
from mlxtend.preprocessing import TransactionEncoder

# Shallow copy of the transaction list, so `dataset` itself is left untouched
baskets = list(dataset)

# First learn the set of distinct products, then encode each basket against it
encoder = TransactionEncoder().fit(baskets)
encoder_product = encoder.transform(baskets)



In [None]:
# Wrap the one-hot encoded array in a DataFrame named 'df'; the product names
# learned by the encoder (encoder.columns_) are used as the column labels.
df = pd.DataFrame(data=encoder_product, columns=encoder.columns_)
df

Unnamed: 0,apple,banana,bread,broccoli,butter,carrot,cheese,chicken,chocolate,coffee,eggs,flour,lettuce,milk,onion,orange,sugar,tea,tomato,yogurt
0,True,False,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False
1,False,False,True,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False
2,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True
3,False,False,True,False,True,False,False,False,False,True,True,True,False,False,False,False,True,False,False,False
4,False,True,False,False,True,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False
5,False,False,True,False,True,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False
6,False,False,True,False,False,False,False,True,True,True,True,False,False,True,False,False,False,False,False,False
7,True,True,True,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False
8,False,False,False,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True
9,False,True,True,False,True,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False


### <a name='3'> </a> Apriori algorithm

In [None]:
# mlxtend helpers for frequent pattern mining: `apriori` finds frequent itemsets,
# `association_rules` derives rules from them
from mlxtend.frequent_patterns import apriori, association_rules

# Mine itemsets with a minimum support of 0.10 (i.e. present in at least 10% of
# the transactions), order them from most to least frequent, and renumber the rows.
supports = (
    apriori(df, min_support=0.10, use_colnames=True)
    .sort_values(by='support', ascending=False)
    .reset_index(drop=True)
)
supports


Unnamed: 0,support,itemsets
0,0.67,(bread)
1,0.50,(milk)
2,0.43,(butter)
3,0.37,(eggs)
4,0.37,(lettuce)
...,...,...
86,0.10,"(apple, lettuce)"
87,0.10,"(flour, eggs)"
88,0.10,"(lettuce, eggs)"
89,0.10,"(butter, flour)"


In [None]:
# Iterate over different min_support values to see how the threshold affects
# the number of frequent product sets that Apriori returns.
for i in [0.01, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70]:
    # Itemsets for the current threshold, most frequent first, rows renumbered
    supports_i = (
        apriori(df, min_support=i, use_colnames=True)
        .sort_values(by='support', ascending=False)
        .reset_index(drop=True)
    )

    # Display the number of product sets
    print(f'Product sets for min_support = {i}: {len(supports_i)}')

    # Show the sets mined for THIS threshold. The check and the printout must use
    # `supports_i`; using the notebook-level `supports` frame here would print the
    # same table on every iteration and never reach the "no sets" branch.
    if not supports_i.empty:
        print(f'Product sets:\n{supports_i}\n')
    else:
        print('No product sets found.\n')

Product sets for min_support = 0.01: 890
Product sets:
     support                                       itemsets
0       0.67                                        (bread)
1       0.50                                         (milk)
2       0.43                                       (butter)
3       0.37                                         (eggs)
4       0.37                                      (lettuce)
..       ...                                            ...
885     0.03                      (yogurt, lettuce, butter)
886     0.03                          (sugar, milk, butter)
887     0.03                            (milk, tea, butter)
888     0.03                         (milk, yogurt, butter)
889     0.03  (lettuce, eggs, onion, tomato, milk, chicken)

[890 rows x 2 columns]

Product sets for min_support = 0.1: 91
Product sets:
     support                                       itemsets
0       0.67                                        (bread)
1       0.50               

In [None]:
# Generate association rules from the frequent itemsets in `supports`.
# 'confidence' is the filtering metric with a minimum threshold of 0.6, i.e. only
# rules with a confidence of at least 60% are kept.
rules = association_rules(supports, metric='confidence', min_threshold=0.6)

# Keep the core rule columns by NAME. Slicing by position (e.g. dropping the last
# three columns) silently changes which metrics are kept whenever the installed
# mlxtend version returns a different number or order of metric columns.
rule_columns = [
    'antecedents',
    'consequents',
    'antecedent support',
    'consequent support',
    'support',
    'confidence',
    'lift',
]

# Strongest rules first, with a clean 0..n-1 index
rules = (
    rules[rule_columns]
    .sort_values(by='confidence', ascending=False)
    .reset_index(drop=True)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,"(coffee, banana)",(bread),0.1,0.67,0.1,1.0,1.5
1,(orange),(milk),0.13,0.5,0.13,1.0,2.0
2,"(butter, flour)",(bread),0.1,0.67,0.1,1.0,1.5
3,"(cheese, chicken)",(bread),0.1,0.67,0.1,1.0,1.5
4,"(coffee, lettuce)",(bread),0.1,0.67,0.1,1.0,1.5
5,"(flour, eggs)",(bread),0.1,0.67,0.1,1.0,1.5
6,"(chicken, apple)",(bread),0.1,0.67,0.1,1.0,1.5
7,(sugar),(butter),0.1,0.43,0.1,1.0,2.31
8,"(milk, chicken)",(eggs),0.13,0.37,0.13,1.0,2.73
9,"(chicken, eggs)",(milk),0.13,0.5,0.13,1.0,2.0
