# Market Basket Analysis - Association Rule Mining

**Objective:** Discover patterns in customer purchase behavior using Association Rule Mining.

**What is Association Rule Mining?**
- **Market Basket Analysis:** Find products frequently bought together
- **Association Rules:** If a customer buys X, they are likely to buy Y as well
- **Applications:** Product recommendations, store layout optimization, cross-selling strategies

**Key Concepts:**
- **Support:** How often items appear together (popularity)
- **Confidence:** How often the rule holds, i.e. the share of transactions containing X that also contain Y (reliability)
- **Lift:** How much more likely items are bought together vs independently

**Algorithms Used:**
1. **Apriori:** Classic algorithm, bottom-up approach
2. **FP-Growth:** Faster algorithm using tree structure

**Key Steps:**
1. Load and prepare transaction data
2. Apply both Apriori and FP-Growth
3. Generate association rules
4. Compare algorithms


In [None]:
# Import necessary libraries for association rule mining
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Visualization
import seaborn as sns  # Statistical plots
from sklearn.preprocessing import StandardScaler  # Not used in this notebook
from mlxtend.preprocessing import TransactionEncoder  # Convert transactions to binary matrix
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules  # Mining algorithms

In [None]:
# Read the raw sales export into a DataFrame.
# The file is semicolon-separated, so the delimiter is passed explicitly.
data = pd.read_csv(
    "../Data/Sales.csv",
    delimiter=";",
)

In [None]:
# Keep only the first 10,000 rows so the mining steps below run quickly.
# NOTE(review): the cut is by row, not by bill; rows are grouped per BillNo
# later, so the last bill kept here may be incomplete.
data = data.iloc[:10000]

In [None]:
# Preview the first rows of the truncated data
data.head()

In [None]:
# (rows, columns) of the truncated data
data.shape

In [None]:
# Keep only the bill identifier and the item name; these are the two columns
# used to build per-bill item lists further down
data = data[['BillNo', 'Itemname']]

In [None]:
# Count missing values per column
data.isnull().sum()

In [None]:
# Drop rows with a missing BillNo or Itemname.
# Reassign instead of using inplace=True: `data` was derived from a slice of
# the original frame, and in-place mutation of such a frame can emit
# SettingWithCopyWarning.
data = data.dropna()

In [None]:
# Column dtypes and non-null counts after dropping missing rows
data.info()

In [None]:
# Preview the first 10 cleaned rows
data.head(10)

In [None]:
# Collapse the line items into one row per bill, where "Itemname" holds the
# list of every item that appears on that bill.
transactions = (
    data
    .groupby("BillNo")["Itemname"]
    .apply(list)
    .reset_index()
)
transactions.head()

In [None]:
# Encode the per-bill item lists as a binary matrix:
# one row per bill, one column per distinct item name.
te = TransactionEncoder()
transactions_bool_list = te.fit_transform(transactions["Itemname"])

# Wrap the matrix in a DataFrame labelled with the item names learned by the encoder
data_transaction = pd.DataFrame(data=transactions_bool_list, columns=te.columns_)
data_transaction.shape

In [None]:
# Lift the column display limit so wide frames are shown in full,
# then list the item columns of the encoded matrix
pd.options.display.max_columns = None
data_transaction.columns

# Evaluation for Apriori

In [None]:
# Mine itemsets that occur in at least 5% of the bills using Apriori
frequent_itemsets = apriori(data_transaction, min_support=0.05, use_colnames=True)

# Derive rules with lift >= 1.0.
# num_itemsets is documented as the number of transactions in the input data,
# so pass the real row count of the encoded matrix instead of a fixed 100.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
    num_itemsets=len(data_transaction),
)

# Most reliable rules first; ties broken by the stronger lift
rules = rules.sort_values(["confidence", "lift"], ascending=[False, False])

rules[["antecedents", "consequents", "support", "confidence", "lift"]].head(10)

# Evaluation for FP-Growth

In [None]:
# Mine itemsets that occur in at least 5% of the bills using FP-Growth
frequent_itemsets_fp = fpgrowth(data_transaction, min_support=0.05, use_colnames=True)

# Derive rules with lift >= 1.0.
# num_itemsets is documented as the number of transactions in the input data,
# so pass the real row count of the encoded matrix instead of a fixed 100.
rules_fp = association_rules(
    frequent_itemsets_fp,
    metric="lift",
    min_threshold=1.0,
    num_itemsets=len(data_transaction),
)

# Most reliable rules first; ties broken by the stronger lift
rules_fp = rules_fp.sort_values(["confidence", "lift"], ascending=[False, False])

rules_fp[["antecedents", "consequents", "support", "confidence", "lift"]].head(10)

# Compare Both Algorithms

In [None]:
# Compare execution time and results
import time

# num_itemsets is documented as the number of transactions in the input data.
# It is passed explicitly so these calls match the earlier rule-generation
# cells (some mlxtend releases also require the argument).
n_transactions = len(data_transaction)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() can be coarse or affected by system
# clock adjustments. Each measurement covers frequent-itemset mining plus
# rule generation.
# NOTE: each algorithm is timed once, so small differences may be noise.

# Test Apriori speed
start = time.perf_counter()
freq_ap = apriori(data_transaction, min_support=0.05, use_colnames=True)
rules_ap = association_rules(
    freq_ap, metric="lift", min_threshold=1.0, num_itemsets=n_transactions
)
apriori_time = time.perf_counter() - start

# Test FP-Growth speed
start = time.perf_counter()
freq_fp = fpgrowth(data_transaction, min_support=0.05, use_colnames=True)
rules_fp = association_rules(
    freq_fp, metric="lift", min_threshold=1.0, num_itemsets=n_transactions
)
fpgrowth_time = time.perf_counter() - start

print("Comparison Results:")
print("-" * 50)
print(f"Apriori Time: {apriori_time:.3f} seconds")
print(f"FP-Growth Time: {fpgrowth_time:.3f} seconds")
print(f"\nApriori Rules: {len(rules_ap)}")
print(f"FP-Growth Rules: {len(rules_fp)}")
# With no rules at all, the mean of an empty column is NaN and prints as "nan"
print(f"\nAverage Confidence (Apriori): {rules_ap['confidence'].mean():.3f}")
print(f"Average Confidence (FP-Growth): {rules_fp['confidence'].mean():.3f}")

In [None]:
# Bar chart of the two measured run times, one bar per algorithm
plt.figure(figsize=(8, 5))
plt.bar(
    x=['Apriori', 'FP-Growth'],
    height=[apriori_time, fpgrowth_time],
    color=['blue', 'red'],
)
plt.title('Algorithm Speed Comparison')
plt.ylabel('Time (seconds)')
plt.show()

In [None]:
# Side-by-side scatter plots of support (x) against confidence (y),
# with each point coloured by the rule's lift.
plt.figure(figsize=(12, 5))

# Both panels are drawn identically; only the rule set, colour map and title differ.
for _panel, (_rules, _cmap, _title) in enumerate(
    [
        (rules_ap, 'viridis', 'Apriori Rules'),
        (rules_fp, 'plasma', 'FP-Growth Rules'),
    ],
    start=1,
):
    plt.subplot(1, 2, _panel)
    plt.scatter(
        _rules['support'],
        _rules['confidence'],
        c=_rules['lift'],
        cmap=_cmap,
        alpha=0.6,
    )
    plt.colorbar(label='Lift')
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title(_title)

plt.tight_layout()
plt.show()

In [None]:
# Show the top rules by lift (up to 10)
top_rules = rules_fp.nlargest(10, 'lift')

if top_rules.empty:
    # Nothing to draw; an empty chart would only be confusing
    print("No association rules available to plot.")
else:
    # Label each bar with the full rule. Antecedents and consequents are item
    # sets, so join every item (sorted for a stable label) instead of showing
    # only the first one. The arrow is written as an escape so it survives
    # file-encoding round trips.
    rules_text = [
        f"{', '.join(sorted(map(str, lhs)))} \u2192 {', '.join(sorted(map(str, rhs)))}"
        for lhs, rhs in zip(top_rules['antecedents'], top_rules['consequents'])
    ]

    # Size the axis from the rules actually available: there may be fewer
    # than 10, and a fixed range(10) would then mismatch the bar data.
    positions = range(len(top_rules))

    plt.figure(figsize=(10, 6))
    plt.barh(positions, top_rules['lift'], color='coral')
    plt.yticks(positions, rules_text)
    plt.xlabel('Lift')
    plt.title(f'Top {len(top_rules)} Association Rules')
    plt.tight_layout()
    plt.show()

# Conclusion

In [None]:
# Summarise the comparison using the values measured in the comparison cell
# (rules_ap, rules_fp, apriori_time, fpgrowth_time).
print("Final Results:")
print("=" * 60)

# Check that both algorithms produced the same antecedent -> consequent pairs
# before claiming so. Only rule identity is compared here, not the metrics.
pairs_ap = set(zip(rules_ap["antecedents"], rules_ap["consequents"]))
pairs_fp = set(zip(rules_fp["antecedents"], rules_fp["consequents"]))
if pairs_ap == pairs_fp:
    print("\nBoth algorithms give the same rules (same support, confidence, and lift)")
else:
    print("\nWARNING: Apriori and FP-Growth produced different rule sets")

# Report whichever algorithm was actually faster in this run instead of
# assuming FP-Growth won.
if fpgrowth_time <= apriori_time:
    fast_name, fast_time, slow_time = "FP-Growth", fpgrowth_time, apriori_time
else:
    fast_name, fast_time, slow_time = "Apriori", apriori_time, fpgrowth_time

# Guard the division: a very fast run can measure as 0 seconds
speedup_pct = (slow_time - fast_time) / slow_time * 100 if slow_time > 0 else 0.0

print(f"But {fast_name} is faster: {fast_time:.3f}s vs {slow_time:.3f}s")
print(f"{fast_name} is {speedup_pct:.1f}% faster\n")
print(f"Recommendation: Use {fast_name} for better performance")
print("=" * 60)