<a href="https://colab.research.google.com/github/gulcenurcagiran/market-basket-analysis/blob/main/market_basket_analysis2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import io

# Load the groceries purchase log straight from the GitHub-hosted CSV.
url = 'https://raw.githubusercontent.com/gulcenurcagiran/groceries-dataset/refs/heads/main/Groceries%20data.csv'
data = pd.read_csv(url)

data_head = data.head()

# DataFrame.info() writes its report to a buffer and returns None, so assigning
# its return value only ever stores None. Capture the text explicitly so that
# `data_info` really holds the report, then print it.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
print(data_info)

data_summary = data.describe(include='all')

# Last expression of the cell: preview rows and summary statistics.
data_head, data_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
 3   year             38765 non-null  int64 
 4   month            38765 non-null  int64 
 5   day              38765 non-null  int64 
 6   day_of_week      38765 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 2.1+ MB


(   Member_number        Date   itemDescription  year  month  day  day_of_week
 0           1808  2015-07-21    tropical fruit  2015      7   21            1
 1           2552  2015-05-01        whole milk  2015      5    1            4
 2           2300  2015-09-19         pip fruit  2015      9   19            5
 3           1187  2015-12-12  other vegetables  2015     12   12            5
 4           3037  2015-01-02        whole milk  2015      1    2            4,
 None,
         Member_number        Date itemDescription          year         month  \
 count    38765.000000       38765           38765  38765.000000  38765.000000   
 unique            NaN         728             167           NaN           NaN   
 top               NaN  2015-01-21      whole milk           NaN           NaN   
 freq              NaN          96            2502           NaN           NaN   
 mean      3003.641868         NaN             NaN   2014.528518      6.477570   
 std       1153.611031    

In [3]:
def check_data_quality(df):
    """Print a short data-quality report for a purchases DataFrame.

    The report lists, in order: the number of fully duplicated rows, the
    per-column count of missing values, the number of distinct values in the
    ``itemDescription`` column, and the first ten distinct item names.

    Args:
        df: DataFrame that contains an ``itemDescription`` column.

    Returns:
        None. The report is written to standard output.
    """
    # Each check is wrapped in a callable so it is evaluated immediately
    # before its own line is printed, in the order listed here.
    checks = (
        ("Duplicate Records:", lambda: df.duplicated().sum()),
        ("\nMissing Values:\n", lambda: df.isnull().sum()),
        ("\nUnique Items:", lambda: df['itemDescription'].nunique()),
        ("\nSample Unique Items:\n", lambda: df['itemDescription'].unique()[:10]),
    )
    for heading, compute in checks:
        print(heading, compute())

# Print the duplicate / missing-value / unique-item report for the raw frame.
check_data_quality(data)

def clean_data(df, strip_punctuation=False):
    """Return a cleaned three-column copy of the raw purchases frame.

    Steps, applied to a copy so the caller's frame is never modified:

    1. Keep only ``Member_number``, ``Date`` and ``itemDescription``.
    2. Parse ``Date`` to datetime; unparseable values become ``NaT``.
    3. Normalise ``itemDescription``: lower-case, optionally remove
       punctuation, collapse runs of spaces to a single space, and trim
       leading/trailing whitespace.
    4. Drop rows that are duplicates on the three kept columns.

    Args:
        df: DataFrame with at least the three columns listed above.
        strip_punctuation: When True, remove every character that is neither
            a word character nor whitespace (``'rolls/buns'`` becomes
            ``'rollsbuns'``). The default of False keeps punctuation, which
            matches item names such as ``'rolls/buns'`` and
            ``'misc. beverages'`` seen in the notebook's recorded output.

    Returns:
        A new DataFrame with columns ``Member_number``, ``Date``,
        ``itemDescription`` and no duplicate rows.

    Raises:
        AssertionError: If any value is missing after cleaning (for example a
            date that could not be parsed).
    """
    # Project first and copy, so the input is untouched and duplicates are
    # judged on exactly the columns that are returned.
    cleaned_df = df[['Member_number', 'Date', 'itemDescription']].copy()

    cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], errors='coerce')

    items = cleaned_df['itemDescription'].str.lower()
    if strip_punctuation:
        # Raw string + explicit regex=True: without them the pattern is a
        # literal (and therefore a no-op) on pandas >= 2.0, but a regex on
        # older versions.
        items = items.str.replace(r'[^\w\s]', '', regex=True)
    # Collapse any run of two or more spaces; a single literal '  ' -> ' '
    # pass leaves double spaces behind when three or more are present.
    items = items.str.replace(r' {2,}', ' ', regex=True).str.strip()
    cleaned_df['itemDescription'] = items

    cleaned_df = cleaned_df.drop_duplicates()

    assert cleaned_df.isnull().sum().sum() == 0, "There are missing values!"

    return cleaned_df

# Clean a copy so the raw `data` frame stays available unchanged.
cleaned_data = clean_data(data.copy())

print("\nCleaned Data Info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line after the report.
cleaned_data.info()
print("\nSample Cleaned Data:")
print(cleaned_data.head())

print("\nSummary Statistics:")
# Every row is one purchased item, not one basket, so the row count is
# labelled as item records rather than transactions.
print("Total Item Records:", len(cleaned_data))
print("Unique Customers:", cleaned_data['Member_number'].nunique())
print("Unique Products:", cleaned_data['itemDescription'].nunique())
print("Date Range:", cleaned_data['Date'].min(), "to", cleaned_data['Date'].max())

Duplicate Records: 759

Missing Values:
 Member_number      0
Date               0
itemDescription    0
year               0
month              0
day                0
day_of_week        0
dtype: int64

Unique Items: 167

Sample Unique Items:
 ['tropical fruit' 'whole milk' 'pip fruit' 'other vegetables' 'rolls/buns'
 'pot plants' 'citrus fruit' 'beef' 'frankfurter' 'chicken']

Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 38006 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Member_number    38006 non-null  int64         
 1   Date             38006 non-null  datetime64[ns]
 2   itemDescription  38006 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 1.2+ MB
None

Sample Cleaned Data:
   Member_number       Date   itemDescription
0           1808 2015-07-21    tropical fruit
1           2552 2015-05-01        whole milk

In [5]:
import numpy as np

def create_transactions(df):
    """Group purchase rows into baskets and print basic basket statistics.

    A basket (transaction) is every item bought by one ``Member_number`` on
    one ``Date``. Repeated items inside a basket are collapsed while keeping
    first-seen order.

    Args:
        df: DataFrame with ``Member_number``, ``Date`` and
            ``itemDescription`` columns.

    Returns:
        A list of baskets, each a list of item names. Baskets are ordered by
        the sorted (Member_number, Date) group keys. An empty frame yields an
        empty list.
    """
    if df.empty:
        # Nothing to group; skip pandas entirely so the statistics below are
        # reported as zeros instead of raising on max() of an empty list.
        transactions = []
    else:
        transactions = (
            df.groupby(['Member_number', 'Date'])['itemDescription']
            # dict.fromkeys removes repeats while preserving insertion order.
            .apply(lambda items: list(dict.fromkeys(items)))
            .tolist()
        )

    basket_sizes = [len(basket) for basket in transactions]
    average_size = np.mean(basket_sizes) if basket_sizes else 0.0
    largest_size = max(basket_sizes, default=0)

    print(f"Total number of transactions: {len(transactions)}")
    print(f"Average basket size: {average_size:.2f}")
    print(f"Maximum basket size: {largest_size}")

    return transactions

# Build the basket list from the cleaned purchase rows.
transactions = create_transactions(cleaned_data)

# Preview the first five baskets, numbered from 1.
print("\nSample transactions:")
for i, transaction in enumerate(transactions[:5], start=1):
    print(
        f"\nTransaction {i}:",
        f"Number of items: {len(transaction)}",
        f"Items: {transaction}",
        sep="\n",
    )

Total number of transactions: 14963
Average basket size: 2.54
Maximum basket size: 10

Sample transactions:

Transaction 1:
Number of items: 3
Items: ['whole milk', 'pastry', 'salty snack']

Transaction 2:
Number of items: 4
Items: ['sausage', 'whole milk', 'semi-finished bread', 'yogurt']

Transaction 3:
Number of items: 2
Items: ['soda', 'pickled vegetables']

Transaction 4:
Number of items: 2
Items: ['canned beer', 'misc. beverages']

Transaction 5:
Number of items: 2
Items: ['sausage', 'hygiene articles']


In [6]:
from mlxtend.preprocessing import TransactionEncoder

def encode_transactions(transactions):
    """Encode a list of baskets as a one-column-per-item DataFrame.

    Fits an mlxtend ``TransactionEncoder`` on the baskets, transforms the
    same baskets with it, and wraps the resulting array in a DataFrame whose
    columns are the encoder's learned item names. The matrix shape and the
    number of item columns are printed.

    Args:
        transactions: List of baskets, each a list of item names.

    Returns:
        DataFrame with one row per basket and one column per distinct item.
    """
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded_array = encoder.transform(transactions)
    item_columns = encoder.columns_

    df_encoded = pd.DataFrame(encoded_array, columns=item_columns)

    print(f"Shape of encoded matrix: {df_encoded.shape}")
    print(f"Number of unique items: {len(item_columns)}")

    return df_encoded

# One-hot encode the baskets for the frequent-pattern miners.
df_encoded = encode_transactions(transactions)

# Show only a small corner of the matrix; the full frame is very wide.
print("\nFirst 5 rows and 10 columns of encoded data:")
print(df_encoded.head(5).iloc[:, :10])

# Sparsity = share of False cells in the boolean matrix, as a percentage.
sparsity = (~df_encoded.to_numpy()).sum() / df_encoded.size * 100
print(f"\nMatrix sparsity: {sparsity:.2f}%")

Shape of encoded matrix: (14963, 167)
Number of unique items: 167

First 5 rows and 10 columns of encoded data:
   abrasive cleaner  artif. sweetener  baby cosmetics   bags  baking powder  \
0             False             False           False  False          False   
1             False             False           False  False          False   
2             False             False           False  False          False   
3             False             False           False  False          False   
4             False             False           False  False          False   

   bathroom cleaner   beef  berries  beverages  bottled beer  
0             False  False    False      False         False  
1             False  False    False      False         False  
2             False  False    False      False         False  
3             False  False    False      False         False  
4             False  False    False      False         False  

Matrix sparsity: 98.48%


In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

def apply_apriori(df_encoded, min_support=0.01, min_confidence=0.2):
    """Mine frequent itemsets with Apriori and derive association rules.

    Prints the number of frequent itemsets, the ten itemsets with the highest
    support, the number of rules, and (when any rules exist) the top ten
    rules ranked by lift and by confidence.

    Args:
        df_encoded: One-hot encoded transaction DataFrame (one row per
            transaction, one column per item).
        min_support: Minimum support passed to ``apriori``.
        min_confidence: Threshold passed to ``association_rules`` with
            ``metric="confidence"``.

    Returns:
        Tuple ``(frequent_itemsets, rules)`` of DataFrames as returned by
        ``apriori`` and ``association_rules``.
    """
    frequent_itemsets = apriori(df_encoded,
                                min_support=min_support,
                                use_colnames=True)

    print("Frequent Itemsets Analysis:")
    print(f"Number of frequent itemsets found: {len(frequent_itemsets)}")
    print("\nTop 10 frequent itemsets by support:")
    print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))

    # num_itemsets is documented as the number of transactions in the
    # original input data, i.e. the rows of the encoded matrix -- not the
    # number of frequent itemsets that were found.
    rules = association_rules(df=frequent_itemsets,
                              metric="confidence",
                              min_threshold=min_confidence,
                              num_itemsets=len(df_encoded))

    print("\nAssociation Rules Analysis:")
    print(f"Number of rules generated: {len(rules)}")

    if len(rules) > 0:
        shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
        # Same report twice, once per ranking metric.
        for ranking_metric in ('lift', 'confidence'):
            print(f"\nTop 10 rules by {ranking_metric}:")
            top_rules = rules.sort_values(ranking_metric, ascending=False).head(10)
            print(top_rules[shown_columns])

    return frequent_itemsets, rules

# First attempt with the default thresholds.
frequent_itemsets, rules = apply_apriori(
    df_encoded,
    min_support=0.01,
    min_confidence=0.2,
)

# No rules at those thresholds: retry once with looser support/confidence.
if not len(rules):
    print("\nTrying with lower threshold values...")
    frequent_itemsets, rules = apply_apriori(df_encoded, min_support=0.005, min_confidence=0.1)

  and should_run_async(code)


Frequent Itemsets Analysis:
Number of frequent itemsets found: 69

Top 10 frequent itemsets by support:
     support            itemsets
62  0.157923        (whole milk)
39  0.122101  (other vegetables)
45  0.110005        (rolls/buns)
51  0.097106              (soda)
63  0.085879            (yogurt)
46  0.069572   (root vegetables)
56  0.067767    (tropical fruit)
4   0.060683     (bottled water)
48  0.060349           (sausage)
14  0.053131      (citrus fruit)

Association Rules Analysis:
Number of rules generated: 0

Trying with lower threshold values...
Frequent Itemsets Analysis:
Number of frequent itemsets found: 126

Top 10 frequent itemsets by support:
     support            itemsets
87  0.157923        (whole milk)
52  0.122101  (other vegetables)
65  0.110005        (rolls/buns)
74  0.097106              (soda)
88  0.085879            (yogurt)
66  0.069572   (root vegetables)
80  0.067767    (tropical fruit)
5   0.060683     (bottled water)
69  0.060349           (sausage)
1

In [14]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

def apply_fpgrowth(df_encoded, min_support=0.01, min_confidence=0.2):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Prints the number of frequent itemsets, the ten itemsets with the highest
    support, the number of rules, and (when any rules exist) the top ten
    rules ranked by lift and by confidence.

    Args:
        df_encoded: One-hot encoded transaction DataFrame (one row per
            transaction, one column per item).
        min_support: Minimum support passed to ``fpgrowth``.
        min_confidence: Threshold passed to ``association_rules`` with
            ``metric="confidence"``.

    Returns:
        Tuple ``(frequent_itemsets, rules)`` of DataFrames as returned by
        ``fpgrowth`` and ``association_rules``.
    """
    frequent_itemsets = fpgrowth(df_encoded,
                                 min_support=min_support,
                                 use_colnames=True)

    print("Frequent Itemsets Analysis:")
    print(f"Number of frequent itemsets found: {len(frequent_itemsets)}")
    print("\nTop 10 frequent itemsets by support:")
    print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))

    # num_itemsets is documented as the number of transactions in the
    # original input data, i.e. the rows of the encoded matrix -- not the
    # number of frequent itemsets that were found.
    rules = association_rules(df=frequent_itemsets,
                              metric="confidence",
                              min_threshold=min_confidence,
                              num_itemsets=len(df_encoded))

    print("\nAssociation Rules Analysis:")
    print(f"Number of rules generated: {len(rules)}")

    if len(rules) > 0:
        shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
        # Same report twice, once per ranking metric.
        for ranking_metric in ('lift', 'confidence'):
            print(f"\nTop 10 rules by {ranking_metric}:")
            top_rules = rules.sort_values(ranking_metric, ascending=False).head(10)
            print(top_rules[shown_columns])

    return frequent_itemsets, rules

# First attempt with the default thresholds.
frequent_itemsets_fp, rules_fp = apply_fpgrowth(
    df_encoded,
    min_support=0.01,
    min_confidence=0.2,
)

# No rules at those thresholds: retry once with looser support/confidence.
if not len(rules_fp):
    print("\nTrying with lower threshold values...")
    frequent_itemsets_fp, rules_fp = apply_fpgrowth(df_encoded, min_support=0.005, min_confidence=0.1)

  and should_run_async(code)


Frequent Itemsets Analysis:
Number of frequent itemsets found: 69

Top 10 frequent itemsets by support:
     support            itemsets
0   0.157923        (whole milk)
16  0.122101  (other vegetables)
9   0.110005        (rolls/buns)
5   0.097106              (soda)
3   0.085879            (yogurt)
23  0.069572   (root vegetables)
18  0.067767    (tropical fruit)
29  0.060683     (bottled water)
4   0.060349           (sausage)
42  0.053131      (citrus fruit)

Association Rules Analysis:
Number of rules generated: 0

Trying with lower threshold values...
Frequent Itemsets Analysis:
Number of frequent itemsets found: 126

Top 10 frequent itemsets by support:
     support            itemsets
0   0.157923        (whole milk)
18  0.122101  (other vegetables)
11  0.110005        (rolls/buns)
6   0.097106              (soda)
3   0.085879            (yogurt)
25  0.069572   (root vegetables)
20  0.067767    (tropical fruit)
34  0.060683     (bottled water)
4   0.060349           (sausage)
5

In [15]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
# Note: each measurement also includes the report printing done inside the
# apply_* functions.
start_time = time.perf_counter()
frequent_itemsets, rules = apply_apriori(df_encoded, min_support=0.005, min_confidence=0.1)
apriori_time = time.perf_counter() - start_time

start_time = time.perf_counter()
frequent_itemsets_fp, rules_fp = apply_fpgrowth(df_encoded, min_support=0.005, min_confidence=0.1)
fpgrowth_time = time.perf_counter() - start_time

print("\nExecution Time Comparison:")
print(f"Apriori Time: {apriori_time:.2f} seconds")
print(f"FP-Growth Time: {fpgrowth_time:.2f} seconds")

# Always report a ratio >= 1 (slower run / faster run) so that "Nx slower"
# reads correctly, and avoid dividing by a zero elapsed time.
shorter_time, longer_time = sorted((apriori_time, fpgrowth_time))
speed_ratio = longer_time / shorter_time if shorter_time > 0 else float('inf')
direction = 'faster' if fpgrowth_time < apriori_time else 'slower'
print(f"FP-Growth is {speed_ratio:.2f}x {direction} than Apriori")

  and should_run_async(code)


Frequent Itemsets Analysis:
Number of frequent itemsets found: 126

Top 10 frequent itemsets by support:
     support            itemsets
87  0.157923        (whole milk)
52  0.122101  (other vegetables)
65  0.110005        (rolls/buns)
74  0.097106              (soda)
88  0.085879            (yogurt)
66  0.069572   (root vegetables)
80  0.067767    (tropical fruit)
5   0.060683     (bottled water)
69  0.060349           (sausage)
18  0.053131      (citrus fruit)

Association Rules Analysis:
Number of rules generated: 19

Top 10 rules by lift:
        antecedents         consequents   support  confidence      lift
5     (frankfurter)  (other vegetables)  0.005146    0.136283  1.116150
0    (bottled beer)        (whole milk)  0.007151    0.157817  0.999330
14        (sausage)        (whole milk)  0.008955    0.148394  0.939663
7      (newspapers)        (whole milk)  0.005614    0.144330  0.913926
4   (domestic eggs)        (whole milk)  0.005280    0.142342  0.901341
6     (frankfurter