## Step 1: Import Libraries

In [None]:
# Libraries used by the preprocessing steps below
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so pandas deprecation / chained-assignment warnings are hidden too.
warnings.filterwarnings('ignore')

## Step 2: Load Raw Data

In [None]:
# Load the raw sales transactions from the semicolon-delimited CSV file
print("Loading sales data...")
data = pd.read_csv("../Data/Sales.csv", sep=";")
# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print("✓ Data loaded successfully")
print(f"  Shape: {data.shape}")
print(f"  Columns: {list(data.columns)}")

In [None]:
# Preview the top of the raw table (the trailing expression is the cell output)
print("\nFirst 5 rows of raw data:")
data.head(n=5)

In [None]:
# Print the frame's structural summary under a heading
heading = "\nDataset Info:"
print(heading)
data.info()

## Step 3: Data Exploration

In [None]:
# Headline counts for the raw table, printed under a banner
rule = "=" * 60
print(rule)
print(" DATA EXPLORATION")
print(rule)

row_count = len(data)
print(f"\nTotal rows: {row_count}")
column_count = len(data.columns)
print(f"Total columns: {column_count}")

# Fall back to 'N/A' when an expected column is absent from the file
if 'BillNo' in data.columns:
    bill_count = data['BillNo'].nunique()
else:
    bill_count = 'N/A'
print(f"\nUnique transactions (BillNo): {bill_count}")

if 'Itemname' in data.columns:
    item_count = data['Itemname'].nunique()
else:
    item_count = 'N/A'
print(f"Unique items: {item_count}")

In [None]:
# Report null counts per column, then the grand total across all columns
print("\nMissing Values:")
null_counts = data.isnull().sum()
print(null_counts)
total_nulls = null_counts.sum()
print(f"\nTotal missing values: {total_nulls}")

## Step 4: Data Cleaning

In [None]:
# Cap the number of raw rows so the later steps run faster.
# Set MAX_ROWS to None to process the full dataset.
# NOTE(review): a positional row cut may split the last bill's items across
# the boundary, leaving that transaction incomplete.
MAX_ROWS = 10_000
if MAX_ROWS is not None:
    print(f"\nSubsetting data to {MAX_ROWS:,} rows for faster processing...")
    data = data.iloc[:MAX_ROWS, :]
# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print(f"✓ Using {len(data)} rows")

In [None]:
# Keep only the transaction id and item name; nothing else is used downstream
# in this notebook.
print("\nSelecting relevant columns...")
# .copy() makes the two-column frame independent of the frame it was cut from,
# so later cleaning does not operate on a slice pandas flags as a possible copy.
data = data[['BillNo', 'Itemname']].copy()
# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print(f"✓ Selected columns: {list(data.columns)}")

In [None]:
# Drop every row that has a missing value in any remaining column and report
# how many rows were removed.
print("\nHandling missing values...")
initial_rows = len(data)
# Reassign instead of dropna(inplace=True): `data` was derived by slicing, and
# in-place mutation of such a frame triggers pandas' chained-assignment warning.
data = data.dropna()
rows_removed = initial_rows - len(data)
# The check marks were stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print(f"✓ Removed {rows_removed} rows with missing values")
print(f"✓ Remaining rows: {len(data)}")

In [None]:
# Show the first ten cleaned rows (the trailing expression is the cell output)
print("\nCleaned data sample:")
data.head(n=10)

## Step 5: Transform to Transaction Format

In [None]:
# Collapse the row-per-item table into one row per bill: each BillNo is paired
# with the list of item names recorded under it.
print("\nGrouping items by transaction...")
# Build the groupby once and reuse it for both the item lists and the size stats
by_bill = data.groupby("BillNo")
transactions = by_bill["Itemname"].apply(list).reset_index()
# The check marks were stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print(f"✓ Created {len(transactions)} transactions")
print(f"✓ Average items per transaction: {by_bill.size().mean():.2f}")

In [None]:
# Show the first five grouped transactions (the trailing expression is the cell output)
print("\nTransaction Format (first 5 transactions):")
transactions.head(n=5)

## Step 6: Binary Encoding (One-Hot Encoding)

In [None]:
# Convert the per-bill item lists into a binary matrix:
# one row per transaction, one column per distinct item,
# True where the item appears in that transaction and False otherwise.
print("\nEncoding transactions to binary matrix...")
te = TransactionEncoder()
# fit_transform learns the item vocabulary and encodes in a single call
transactions_bool_list = te.fit_transform(transactions["Itemname"])

# Wrap the encoded array in a DataFrame labelled with the learned item names
data_transaction = pd.DataFrame(transactions_bool_list, columns=te.columns_)
# The check marks were stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print("✓ Binary encoding completed")
print(f"✓ Matrix shape: {data_transaction.shape}")
print(f"✓ Number of unique items: {data_transaction.shape[1]}")

In [None]:
# Lift pandas' column display limit, then print every item (column) name
pd.set_option('display.max_columns', None)
print("\nAll items in dataset:")
item_names = data_transaction.columns.tolist()
print(item_names)

In [None]:
# Show the first ten encoded transactions (the trailing expression is the cell output)
print("\nBinary Encoded Transaction Matrix (first 10 rows):")
data_transaction.head(n=10)

## Step 7: Data Validation

In [None]:
# Sanity-check the encoded matrix before it is written out
print("="*60)
print(" DATA VALIDATION")
print("="*60)

# Check for any issues
no_missing = data_transaction.isnull().sum().sum() == 0
# Test every column's dtype: looking only at the first distinct dtype reports
# True for a mixed-dtype frame and raises IndexError on a frame with no columns.
all_boolean = all(dtype == bool for dtype in data_transaction.dtypes)
# The "âœ“" and "Ã—" sequences were mis-decoded UTF-8; restored to "✓" and "×"
print(f"\n✓ No missing values: {no_missing}")
print(f"✓ All values are boolean: {all_boolean}")
print(f"✓ Matrix dimensions: {data_transaction.shape[0]} transactions × {data_transaction.shape[1]} items")

# Calculate sparsity: the percentage of cells that are False
total_cells = data_transaction.shape[0] * data_transaction.shape[1]
true_cells = data_transaction.sum().sum()
# Guard the ratio so an empty matrix reports NaN instead of dividing by zero
sparsity = (1 - true_cells / total_cells) * 100 if total_cells else float("nan")
print(f"✓ Matrix sparsity: {sparsity:.2f}% (typical for transaction data)")
print(f"✓ Average items per transaction: {data_transaction.sum(axis=1).mean():.2f}")

## Step 8: Save Processed Data

In [None]:
# Write the binary matrix to CSV and report the size of the file on disk
import os

output_path = "../Data/processed_transactions.csv"
print(f"\nSaving processed data to {output_path}...")
data_transaction.to_csv(output_path, index=False)
# The check marks were stored as mis-decoded UTF-8 ("âœ“"); restored to "✓"
print("✓ Processed data saved successfully!")
# Measure the written file itself: DataFrame.memory_usage() is the in-memory
# footprint of the frame, which is not the size of the CSV.
print(f"✓ File size: {os.path.getsize(output_path) / 1024:.2f} KB")

## Preprocessing Summary

In [None]:
# Final summary of the preprocessing run.
# The "ðŸ“Š", "â€¢", "Ã—" and "âœ“" sequences were mis-decoded UTF-8; restored
# to the intended "📊", "•", "×" and "✓".
print("="*70)
print(" PREPROCESSING COMPLETED SUCCESSFULLY")
print("="*70)
print("\n📊 Summary:")
# initial_rows is captured after the row subset and column selection, so it is
# the row count entering the missing-value step, not the raw file's row count.
print(f"   • Before cleaning: {initial_rows} rows")
print(f"   • After cleaning: {len(data)} rows")
print(f"   • Unique transactions: {len(transactions)}")
print(f"   • Unique items: {data_transaction.shape[1]}")
print(f"   • Binary matrix: {data_transaction.shape[0]} × {data_transaction.shape[1]}")
print("   • Output file: processed_transactions.csv")
print("\n✓ Data is now ready for Apriori and FP-Growth algorithms!")
print("\n" + "="*70)

## Next Steps

The processed data has been saved to `../Data/processed_transactions.csv`

**You can now run:**
1. `Apriori_Model.ipynb` - For Apriori algorithm
2. `FPGrowth_Model.ipynb` - For FP-Growth algorithm

Both models will load the preprocessed data automatically.