In [2]:
# Importing required libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [10]:
# Load dataset
data_path = '../data/Online Retail.csv'
df = pd.read_csv(data_path, encoding='latin1', on_bad_lines='skip', delimiter=';')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/2010 08:26,255,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01/12/2010 08:26,339,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/2010 08:26,275,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/2010 08:26,339,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/2010 08:26,339,17850.0,United Kingdom


In [11]:
# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("Columns:", df.columns)

Dataset Shape: (541909, 8)
Columns: Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')


In [13]:
# Clean the dataset
# Drop rows with missing values and filter transactions with non-positive quantities or values
df.dropna(subset=['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice'], inplace=True)

# Ensure 'Quantity' and 'UnitPrice' are numeric
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
df['UnitPrice'] = pd.to_numeric(df['UnitPrice'], errors='coerce')

# Drop rows where conversion failed (NaN values)
df.dropna(subset=['Quantity', 'UnitPrice'], inplace=True)

# Filter transactions with positive quantities and unit prices
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]


In [14]:
# Create a basket analysis format (Transaction-Item matrix)
# Grouping by InvoiceNo and Description
basket = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().fillna(0)

In [15]:
# Convert to binary format (presence or absence of item in transaction)
basket = basket.applymap(lambda x: 1 if x > 0 else 0)

In [16]:
# Set minimum support (e.g., 0.01) and minimum confidence (e.g., 0.5)
minsupp = 0.01
minconf = 0.5

In [24]:
# Perform Apriori
frequent_itemsets = apriori(basket, min_support=minsupp, use_colnames=True)

# Check for issues in frequent_itemsets
print("Frequent Itemsets:\n", frequent_itemsets.head())

# Generate association rules
try:
    num_itemsets = frequent_itemsets['itemsets'].apply(len).max()
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=minconf, num_itemsets=num_itemsets)
    print("Association Rules:\n", rules)
    
    # Save results to CSV
    frequent_itemsets.to_csv('../data/frequent_itemsets.csv', index=False)
    rules.to_csv('../data/association_rules.csv', index=False)
    print("Frequent itemsets and association rules have been saved.")
except TypeError as e:
    print("Error with association_rules:", e)
    print("Please check your 'mlxtend' version or consult its documentation.")
    # Handle case where rules are not generated
    if 'rules' in locals():
        rules.to_csv('../data/association_rules.csv', index=False)
        

Frequent Itemsets:
     support                         itemsets
0  0.038439  (BOTANICAL GARDENS WALL CLOCK )
1  0.083383                       (CARRIAGE)
2  0.011236                 (DOTCOM POSTAGE)
3  0.024246  (LOVE SEAT ANTIQUE WHITE METAL)
4  0.030751                         (Manual)
Association Rules:
 Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []
Frequent itemsets and association rules have been saved.


