## Step 1: Import Libraries

In [None]:
# Import necessary libraries for association rule mining
import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Visualization
import seaborn as sns  # Statistical plots
import time  # For execution time measurement
from mlxtend.preprocessing import TransactionEncoder  # Convert transactions to binary matrix
from mlxtend.frequent_patterns import apriori, association_rules  # Apriori algorithm

## Step 2: Load and Explore Data

In [None]:
# Read the raw sales records from disk.
# The file is semicolon-delimited, so the separator is passed explicitly.
sales_csv_path = "../Data/Sales.csv"
data = pd.read_csv(sales_csv_path, sep=";")

In [None]:
# Preview the top of the table (5 rows, which is also head()'s default)
data.head(n=5)

In [None]:
# Report the dataset dimensions.
# A row is one line of the file, not necessarily one transaction: rows are
# identified by their 'BillNo', so transactions are counted as the number of
# distinct bill numbers rather than the number of rows.
# NOTE(review): assumes several rows can share a BillNo (one row per item
# on a bill) -- confirm against the data.
print(f"Dataset shape: {data.shape}")
print(f"Number of rows: {len(data)}")
print(f"Number of transactions: {data['BillNo'].nunique()}")

## Step 3: Data Preprocessing

In [None]:
# Keep just the bill identifier and the item name; the other columns are
# dropped from `data` from here on.
relevant_columns = ['BillNo', 'Itemname']
data = data[relevant_columns]

## Step 5: Apply Apriori Algorithm

In [None]:
# Time a single Apriori run and report how many frequent itemsets it finds
banner = "=" * 60
print(banner)
print("Running APRIORI Algorithm...")
print(banner)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Generate frequent itemsets using Apriori
# min_support=0.05 keeps itemsets that appear in at least 5% of transactions
# use_colnames=True reports itemsets by column name instead of column index
frequent_itemsets = apriori(data_transaction, min_support=0.05, use_colnames=True)

end_time = time.perf_counter()
execution_time = end_time - start_time

# The check mark is written as a real Unicode character; the previous text
# was the mis-decoded (mojibake) byte sequence of the same symbol.
print(f"\n✓ Apriori algorithm completed in {execution_time:.4f} seconds")
print(f"✓ Frequent itemsets found: {len(frequent_itemsets)}")

In [None]:
# Show the 15 itemsets with the highest support
print("\nFrequent Itemsets:")
itemsets_by_support = frequent_itemsets.sort_values('support', ascending=False)
itemsets_by_support.head(15)

## Step 6: Generate Association Rules

In [None]:
# Generate association rules from the frequent itemsets
# metric="lift" selects lift as the measure the rules are filtered on
# min_threshold=1.0 keeps rules whose lift is at least 1.0 (lift above 1
# indicates the items co-occur more often than independence would predict)
# num_itemsets is documented by mlxtend as the number of transactions in the
# original input, so it is taken from the encoded transaction table instead
# of being hard-coded.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
    num_itemsets=len(data_transaction),
)

# Rank rules by confidence first, breaking ties by lift (both descending)
rules = rules.sort_values(["confidence", "lift"], ascending=[False, False])

# The check mark is written as a real Unicode character; the previous text
# was the mis-decoded (mojibake) byte sequence of the same symbol.
print(f"\n✓ Association rules generated: {len(rules)}")

In [None]:
# Show the first ten rules (in the current ordering of `rules`), restricted
# to the columns that matter for interpretation
summary_columns = ["antecedents", "consequents", "support", "confidence", "lift"]
print("\nTop 10 Association Rules:")
print("=" * 80)
rules[summary_columns].head(10)

## Step 7: Visualize Results

In [None]:
# Support vs. confidence for every rule, with marker colour encoding lift
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    rules['support'],
    rules['confidence'],
    c=rules['lift'],
    cmap='viridis',
    s=100,
    alpha=0.6,
    edgecolors='black',
)
plt.colorbar(scatter, label='Lift')

# Axis labels and title
plt.xlabel('Support', **axis_label_style)
plt.ylabel('Confidence', **axis_label_style)
plt.title(
    'Apriori - Association Rules (Support vs Confidence)',
    fontsize=14,
    fontweight='bold',
)

# Light grid, tidy margins, render
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of rule lift values, with the mean lift marked by a dashed line
lift_values = rules['lift']
mean_lift = lift_values.mean()
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(10, 6))
plt.hist(lift_values, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel('Lift', **axis_label_style)
plt.ylabel('Frequency', **axis_label_style)
plt.title(
    'Distribution of Lift Values in Association Rules',
    fontsize=14,
    fontweight='bold',
)

# Vertical marker at the mean; its label feeds the legend
plt.axvline(
    mean_lift,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {mean_lift:.2f}',
)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## Step 8: Results Summary and Interpretation

In [None]:
# Summary statistics for the Apriori run and the rules it produced.
# The emoji and bullet characters are written as real Unicode characters;
# the previous text was their mis-decoded (mojibake) byte sequences and
# printed as garbage.
print("="*70)
print(" APRIORI ALGORITHM - RESULTS SUMMARY")
print("="*70)
print("\n📊 Performance Metrics:")
print(f"   • Execution Time: {execution_time:.4f} seconds")
print(f"   • Frequent Itemsets: {len(frequent_itemsets)}")
print(f"   • Association Rules: {len(rules)}")
# The two thresholds below are reported as literals; they are not read from
# the calls that produced `frequent_itemsets` and `rules`, so keep them in
# sync by hand if those calls change.
print("   • Minimum Support: 0.05 (5%)")
print("   • Minimum Lift: 1.0")

print("\n📈 Rule Statistics:")
print(f"   • Average Support: {rules['support'].mean():.4f}")
print(f"   • Average Confidence: {rules['confidence'].mean():.4f}")
print(f"   • Average Lift: {rules['lift'].mean():.4f}")
print(f"   • Max Lift: {rules['lift'].max():.4f}")
print(f"   • Max Confidence: {rules['confidence'].max():.4f}")

print("\n" + "="*70)

In [None]:
# Interpret the first 3 rules in the current ordering of `rules`.
# The light-bulb emoji is written as a real Unicode character; the previous
# text was its mis-decoded (mojibake) byte sequence.
print("\n💡 Business Insights - Top 3 Rules:\n")

# Number the rules by position (1, 2, 3). The DataFrame index cannot be used
# for this: iterrows() yields the index labels, and those no longer match
# the row positions once the frame has been re-ordered or filtered.
for rank, (_, row) in enumerate(rules.head(3).iterrows(), start=1):
    # Itemsets are unordered collections; sort the names so the printed text
    # is the same on every run.
    antecedents = ', '.join(sorted(row['antecedents']))
    consequents = ', '.join(sorted(row['consequents']))
    print(f"Rule {rank}:")
    print(f"  If customer buys: {antecedents}")
    print(f"  Then likely buys: {consequents}")
    print(f"  Confidence: {row['confidence']:.2%} | Lift: {row['lift']:.2f}")
    print()

## Algorithm Characteristics

**Apriori Algorithm:**
- **Approach:** Breadth-first, level-wise search
- **Candidate Generation:** Generates candidates at each level
- **Database Scans:** Multiple scans — one pass per itemset size, so up to K+1 passes when the largest frequent itemset contains K items
- **Memory:** Moderate memory usage
- **Speed:** Slower for large datasets
- **Best For:** Small to medium-sized datasets, educational purposes

**Advantages:**
- Simple and easy to understand
- Easy to implement
- Works well for smaller datasets

**Disadvantages:**
- Multiple database scans required
- Generates many candidate itemsets
- Can be slow on large datasets

**Business Applications:**
- Product bundling recommendations
- Store layout optimization
- Cross-selling strategies
- Inventory management