In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

  and should_run_async(code)




# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [58]:
# load the data set ans show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df.head()

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


In [59]:
unique_values = df['6'].unique()
print(unique_values)

['Diaper' 'Pencil' nan 'Bagel' 'Cheese' 'Milk' 'Meat' 'Bread' 'Eggs'
 'Wine']


  and should_run_async(code)


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [61]:
# create an itemset based on the products
itemset = set()
for col in df.columns:
    itemset.update(df[col].unique())

# encoding the feature
encoded_data = []
for _, row in df.iterrows():
    transaction_dict = {product: (1 if product in row.values else 0) for product in itemset}
    encoded_data.append(transaction_dict)

encoded_data[0]

  and should_run_async(code)


{'Wine': 1,
 'Eggs': 1,
 'Bread': 1,
 'Diaper': 1,
 'Meat': 1,
 'Milk': 0,
 'Cheese': 1,
 'Pencil': 1,
 'Bagel': 0,
 nan: 0}

In [64]:
# Fill the NaN column with a specific value
df = df.fillna('NaN')

# Flatten the data
flattened_data = df.values.flatten()

# Use OneHotEncoder, including 'No Product' (which represents missing values)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(flattened_data.reshape(-1, 1))

# Create new dataframe from the encoded features with item names as columns
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0])

# Create a DataFrame with 0 and 1
new_df = pd.DataFrame(0, index=df.index, columns=encoder.categories_[0])

# Loop through each row and mark presence of products, including 'No Product'
for i, row in df.iterrows():
    for product in row:
        new_df.loc[i, product] = 1

# show the new dataframe
new_df.head()

  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,NaN,Pencil,Wine
0,0,1,1,1,1,1,0,0,1,1
1,0,1,1,1,0,1,1,0,1,1
2,0,0,1,0,1,1,1,1,0,1
3,0,0,1,0,1,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [66]:
# Since, the encoded dataframe consist of the empty column. We will drop the NaN column or u can use the index.
new_df = new_df.drop(columns=['NaN'], axis=1)
new_df.head()

  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [75]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules
apriori(new_df, min_support=0.2, use_colnames=True)

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [77]:
frequent_itemsets = apriori(new_df, min_support=0.2, use_colnames=True)
ass_rule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

ass_rule.drop(columns='zhangs_metric', axis = 1)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
3,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667
9,"(Meat, Cheese)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

Explanation:
- Antecedent Support: The proportion of transactions that contain the antecedent item(s) in a rule.
- Consequent Support: The proportion of transactions that contain the consequent item(s) in a rule.
- Support: Support is an indication of how frequently the item set appears in the data set.
- Confidence: The proportion of transactions containing the antecedent that also contain the consequent. It measures the strength of the rule.
- Lift: The degree of association between X and Y, as compared to the independence of X and Y. It measures how much more likely the consequent is given the antecedent.
- Leverage: Measures the difference between the observed frequency of X and Y appearing together and the expected frequency of X and Y appearing together if they were independent.
- Conviction: Measures how dependent the consequent is on the antecedent by considering the frequency of the consequent not occurring when the antecedent is present.

Interpretation:

- The table shows how items in a market basket are related and often bought together. Each rule links items (e.g., {Meat, Eggs} → {Cheese}) and measures how strong the connection is. For instance, "Cheese" is 1.61 times more likely to be bought when "Meat" and "Eggs" are purchased together (highest lift), and "Meat" is bought 72.3% of the time when "Eggs" and "Cheese" are bought (highest confidence). The rule {Meat, Milk} → {Cheese} shows a strong dependency with a high conviction of 2.95. These insights can help businesses bundle items, suggest related products, and arrange inventory more effectively.

### References
https://towardsdatascience.com/association-rules-2-aa9a77241654

https://medium.com/@iqra.bismi/how-to-perform-market-basket-analysis-using-apriori-algorithm-and-association-rules-3f6ba61d6e4b

https://chatgpt.com/c/674834fe-2a84-800d-b2c3-98321e157591