In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import pyfpgrowth
from mlxtend.preprocessing import TransactionEncoder



In [2]:
df = pd.read_excel('./Datasets/Online Retail.xlsx', engine='openpyxl')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [4]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [6]:
df.tail()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


**We only need InvoiceNo with no cancellation and StockCode for each transaction.**

In [7]:
# Assuming your dataframe is named df
grouped_df = df.groupby('InvoiceNo').agg({'Description': lambda x: list(x)}).reset_index()

# If you want to prefix "T" to each InvoiceNo
grouped_df['InvoiceNo'] = 'T' + grouped_df['InvoiceNo'].astype(str)

In [8]:
grouped_df

Unnamed: 0,InvoiceNo,Description
0,T536365,"[WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET..."
1,T536366,"[HAND WARMER UNION JACK, HAND WARMER RED POLKA..."
2,T536367,"[ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHO..."
3,T536368,"[JAM MAKING SET WITH JARS, RED COAT RACK PARIS..."
4,T536369,[BATH BUILDING BLOCK WORD]
...,...,...
25895,TC581484,"[PAPER CRAFT , LITTLE BIRDIE]"
25896,TC581490,"[VICTORIAN GLASS HANGING T-LIGHT, ZINC T-LIGHT..."
25897,TC581499,[Manual]
25898,TC581568,[VICTORIAN SEWING BOX LARGE]


In [9]:
# Convert all descriptions to strings
grouped_df['Description'] = grouped_df['Description'].apply(lambda x: [str(item) for item in x])

# Convert 'Description' to a list of lists
transactions = grouped_df['Description'].tolist()


In [10]:
# Use one-hot encoding
te = TransactionEncoder()
te_ary = te.fit(grouped_df['Description']).transform(grouped_df['Description'])
df_onehot = pd.DataFrame(te_ary, columns=te.columns_)
print(df_onehot.head())

    4 PURPLE FLOCK DINNER CANDLES   50'S CHRISTMAS GIFT BAG LARGE  \
0                           False                           False   
1                           False                           False   
2                           False                           False   
3                           False                           False   
4                           False                           False   

    DOLLY GIRL BEAKER   I LOVE LONDON MINI BACKPACK  \
0               False                         False   
1               False                         False   
2               False                         False   
3               False                         False   
4               False                         False   

    I LOVE LONDON MINI RUCKSACK   NINE DRAWER OFFICE TIDY  \
0                         False                     False   
1                         False                     False   
2                         False                     False   
3         

In [11]:
# Implementing Apriori algorithm
def run_apriori(df, min_support):
 frequent_itemsets = apriori(df, min_support=min_support,
use_colnames=True)
 return frequent_itemsets
# Implementing FP-Growth algorithm
def run_fpgrowth(transactions, min_support):
 patterns = pyfpgrowth.find_frequent_patterns(transactions,
min_support * len(transactions))
 return patterns
# Generate association rules
def generate_rules(frequent_itemsets, min_confidence):
 rules = association_rules(frequent_itemsets, metric="confidence",
min_threshold=min_confidence)
 return rules

In [12]:
# Example usage
min_support = 0.025
min_confidence = 0.7
# Using Apriori
frequent_itemsets_apriori = run_apriori(df_onehot, min_support)
rules_apriori = generate_rules(frequent_itemsets_apriori,min_confidence)
# Print results or perform further analysis
print("Apriori frequent itemsets:")
print(frequent_itemsets_apriori)
print("Apriori association rules:")
print(rules_apriori[['antecedents', 'consequents', 'confidence']])

Apriori frequent itemsets:
      support                                           itemsets
0    0.037104                           (6 RIBBONS RUSTIC CHARM)
1    0.032278                      (60 TEATIME FAIRY CAKE CASES)
2    0.038649                       (ALARM CLOCK BAKELIKE GREEN)
3    0.030849                        (ALARM CLOCK BAKELIKE PINK)
4    0.041737                        (ALARM CLOCK BAKELIKE RED )
..        ...                                                ...
114  0.030270  (ROSES REGENCY TEACUP AND SAUCER , GREEN REGEN...
115  0.032162  (JUMBO BAG RED RETROSPOT, JUMBO BAG PINK POLKA...
116  0.026371  (JUMBO BAG RED RETROSPOT, JUMBO SHOPPER VINTAG...
117  0.028301  (JUMBO STORAGE BAG SUKI, JUMBO BAG RED RETROSPOT)
118  0.025019  (LUNCH BAG  BLACK SKULL., LUNCH BAG RED RETROS...

[119 rows x 2 columns]
Apriori association rules:
                          antecedents                         consequents  \
0  (ROSES REGENCY TEACUP AND SAUCER )   (GREEN REGENCY TEACUP AND

In [13]:
# Example usage
min_support = 0.025
min_confidence = 0.7
patterns_fpgrowth = run_fpgrowth(transactions, min_support)
rules_fpgrowth =pyfpgrowth.generate_association_rules(patterns_fpgrowth,min_confidence)
# Print results or perform further analysis
print("FP-Growth frequent itemsets:\n")
print(patterns_fpgrowth)
print("FP-Growth association rules:\n")
print(rules_fpgrowth)

FP-Growth frequent itemsets:

{('72 SWEETHEART FAIRY CAKE CASES',): 650, ('60 CAKE CASES VINTAGE CHRISTMAS',): 652, ('HOMEMADE JAM SCENTED CANDLES',): 655, ('RETROSPOT HEART HOT WATER BOTTLE',): 655, ('LOVE BUILDING BLOCK WORD',): 656, ('HOT WATER BOTTLE TEA AND SYMPATHY',): 656, ('SMALL WHITE HEART OF WICKER',): 657, ('SET OF 60 PANTRY DESIGN CAKE CASES ',): 659, ('JUMBO BAG OWLS',): 673, ('JUMBO BAG SCANDINAVIAN BLUE PAISLEY',): 675, ('RED  HARMONICA IN BOX ',): 676, ('PACK OF 72 SKULL CAKE CASES',): 678, ('CHILLI LIGHTS',): 679, ('HOT WATER BOTTLE I AM SO POORLY',): 679, ('HAND OVER THE CHOCOLATE   SIGN ',): 686, ('HAND WARMER OWL DESIGN',): 687, ('GINGERBREAD MAN COOKIE CUTTER',): 691, ('LUNCH BAG VINTAGE LEAF DESIGN',): 692, ('PLASTERS IN TIN SPACEBOY',): 697, ('WOOD BLACK BOARD ANT WHITE FINISH',): 698, ('SET OF 3 HEART COOKIE CUTTERS',): 700, ('JUMBO BAG SPACEBOY DESIGN',): 702, ('LUNCH BAG DOLLY GIRL DESIGN',): 706, ('SET OF 3 BUTTERFLY COOKIE CUTTERS',): 707, ('DOORMAT RED RET

In [14]:
len(rules_fpgrowth)

5