In [1]:
!pip install mlxtend



In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv("combined_unique.csv", encoding='latin1')
print(df.shape)
print(df.head())
df.info()

  df = pd.read_csv("combined_unique.csv", encoding='latin1')


(1008898, 8)
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

           InvoiceDate  Price  Customer ID         Country  
0  2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3  2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4  2009-12-01 07:45:00   1.25      13085.0  United Kingdom  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008898 entries, 0 to 1008897
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  


In [4]:
#convert InvoiceDate to datetime format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d-%m-%Y %H:%M', errors='coerce')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008898 entries, 0 to 1008897
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Invoice      1008898 non-null  object        
 1   StockCode    1008898 non-null  object        
 2   Description  1008898 non-null  object        
 3   Quantity     1008898 non-null  int64         
 4   InvoiceDate  0 non-null        datetime64[ns]
 5   Price        1008898 non-null  float64       
 6   Customer ID  779495 non-null   float64       
 7   Country      1008898 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 61.6+ MB


In [5]:
from mlxtend.frequent_patterns import apriori, association_rules

In [6]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
# Subset to avoid memory issues
sample_invoices = df['Invoice'].drop_duplicates().sample(5000, random_state=42)
df_sample = df[df['Invoice'].isin(sample_invoices)]

# Create basket matrix (Invoice Ã— Description)
basket = (
    df_sample.groupby(['Invoice', 'Description'])['Quantity']
    .sum()
    .unstack()
    .fillna(0)
)
basket = (basket > 0).astype(int)

# Apriori model
frequent_items = apriori(basket, min_support=0.02, use_colnames=True)

# Generate rules and compute metrics
rules = association_rules(frequent_items, metric="lift", min_threshold=1.0)

# Filter rules for meaningful recommendations
rules = rules[
    (rules['support'] >= 0.02) &  # at least 2% of transactions
    (rules['confidence'] >= 0.2) &  # at least 20% confidence
    (rules['lift'] >= 2)          # stronger-than-chance relationships
]

# Sort by lift (best recommendations first)
rules = rules.sort_values('lift', ascending=False)

# Show top 10 strongest associations
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


                             antecedents  \
8       (PINK REGENCY TEACUP AND SAUCER)   
9      (GREEN REGENCY TEACUP AND SAUCER)   
10    (ROSES REGENCY TEACUP AND SAUCER )   
11     (GREEN REGENCY TEACUP AND SAUCER)   
34      (PINK REGENCY TEACUP AND SAUCER)   
35    (ROSES REGENCY TEACUP AND SAUCER )   
3            (ALARM CLOCK BAKELIKE RED )   
2           (ALARM CLOCK BAKELIKE GREEN)   
7                 (DOLLY GIRL LUNCH BOX)   
6                  (SPACEBOY LUNCH BOX )   
38      (SWEETHEART CERAMIC TRINKET BOX)   
39      (STRAWBERRY CERAMIC TRINKET BOX)   
14            (LOVE BUILDING BLOCK WORD)   
15            (HOME BUILDING BLOCK WORD)   
42   (WOODEN PICTURE FRAME WHITE FINISH)   
43         (WOODEN FRAME ANTIQUE WHITE )   
12               (HEART OF WICKER LARGE)   
13               (HEART OF WICKER SMALL)   
0          (60 TEATIME FAIRY CAKE CASES)   
1   (PACK OF 60 PINK PAISLEY CAKE CASES)   
33          (LUNCH BAG SPACEBOY DESIGN )   
32                  (LUNCH BAG W

In [8]:
# number of transactions used (sampled invoices)
n_transactions = basket.shape[0]


In [9]:
# Build a readable result table
rules_out = rules[['antecedents','consequents','support','confidence','lift']].copy()
rules_out['antecedents'] = rules_out['antecedents'].apply(lambda s: ', '.join(sorted(list(s))))
rules_out['consequents'] = rules_out['consequents'].apply(lambda s: ', '.join(sorted(list(s))))
rules_out['support_count'] = (rules_out['support'] * n_transactions).round().astype(int)

# reorder columns for presentation
rules_out = rules_out[['antecedents','consequents','support_count','support','confidence','lift']]

# show results
print(rules_out)

# save to CSV for reporting
rules_out.to_csv('apriori_rules.csv', index=False)


                           antecedents                         consequents  \
8       PINK REGENCY TEACUP AND SAUCER     GREEN REGENCY TEACUP AND SAUCER   
9      GREEN REGENCY TEACUP AND SAUCER      PINK REGENCY TEACUP AND SAUCER   
10    ROSES REGENCY TEACUP AND SAUCER      GREEN REGENCY TEACUP AND SAUCER   
11     GREEN REGENCY TEACUP AND SAUCER    ROSES REGENCY TEACUP AND SAUCER    
34      PINK REGENCY TEACUP AND SAUCER    ROSES REGENCY TEACUP AND SAUCER    
35    ROSES REGENCY TEACUP AND SAUCER       PINK REGENCY TEACUP AND SAUCER   
3            ALARM CLOCK BAKELIKE RED           ALARM CLOCK BAKELIKE GREEN   
2           ALARM CLOCK BAKELIKE GREEN           ALARM CLOCK BAKELIKE RED    
7                 DOLLY GIRL LUNCH BOX                 SPACEBOY LUNCH BOX    
6                  SPACEBOY LUNCH BOX                 DOLLY GIRL LUNCH BOX   
38      SWEETHEART CERAMIC TRINKET BOX      STRAWBERRY CERAMIC TRINKET BOX   
39      STRAWBERRY CERAMIC TRINKET BOX      SWEETHEART CERAMIC T