In [2]:
# Basket analysis based on Apriori algorithm
import pandas as pd
import numpy as np
from apyori import apriori

# reading data
ds=pd.read_csv('C:/Users/donad/OneDrive/Desktop/Internship/1_Machine Learning/5_Unsupervised_Learning/3_Association/dataset/Groceries_dataset.csv')
ds

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [3]:
# Combine items by Member_number and Date
data = ds.groupby(['Member_number', 'Date'])['itemDescription'].apply(lambda x: ', '.join(x)).reset_index()

In [4]:
# Drop Member_number and Date columns
data.drop(['Member_number', 'Date'], axis=1, inplace=True)
data

Unnamed: 0,itemDescription
0,"sausage, whole milk, semi-finished bread, yogurt"
1,"whole milk, pastry, salty snack"
2,"canned beer, misc. beverages"
3,"sausage, hygiene articles"
4,"soda, pickled vegetables"
...,...
14958,"tropical fruit, berries, other vegetables, yog..."
14959,"bottled water, herbs"
14960,"fruit/vegetable juice, onions"
14961,"soda, root vegetables, semi-finished bread"


In [5]:
# Save to CSV
data.to_csv("itemDescription.csv", index=False)

In [6]:
# Read the CSV file and remove duplicates
txn = pd.read_csv("itemDescription.csv", header=None).drop_duplicates()

# Print the transaction data
print(txn)


                                                       0
0                                        itemDescription
1       sausage, whole milk, semi-finished bread, yogurt
2                        whole milk, pastry, salty snack
3                           canned beer, misc. beverages
4                              sausage, hygiene articles
...                                                  ...
14957                                    berries, onions
14958                        other vegetables, detergent
14959  tropical fruit, berries, other vegetables, yog...
14961                      fruit/vegetable juice, onions
14962         soda, root vegetables, semi-finished bread

[9112 rows x 1 columns]


In [7]:
# Split itemDescription into separate items
split_data = data['itemDescription'].str.split(',', expand=True)

In [8]:
split_data.isnull().sum()

0         0
1         0
2     10080
3     12778
4     14168
5     14512
6     14687
7     14767
8     14912
9     14962
10    14962
dtype: int64

In [9]:
# Fill NaN values with empty strings
split_data.fillna('', inplace=True)
split_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,sausage,whole milk,semi-finished bread,yogurt,,,,,,,
1,whole milk,pastry,salty snack,,,,,,,,
2,canned beer,misc. beverages,,,,,,,,,
3,sausage,hygiene articles,,,,,,,,,
4,soda,pickled vegetables,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
14958,tropical fruit,berries,other vegetables,yogurt,kitchen towels,napkins,,,,,
14959,bottled water,herbs,,,,,,,,,
14960,fruit/vegetable juice,onions,,,,,,,,,
14961,soda,root vegetables,semi-finished bread,,,,,,,,


In [10]:
# Create records
records = []
for i in range(0, split_data.shape[0]):
    records.append([str(split_data.values[i, j]).strip() for j in range(0, split_data.shape[1]) if str(split_data.values[i, j]).strip()])

In [11]:
# Apply Apriori algorithm
association_rules = apriori(records, min_support=0.00030, min_confidence=0.05)
association_results = list(association_rules)

print(f"There are {len(association_results)} Relation derived.")

There are 1198 Relation derived.


In [12]:
# Define a function to inspect results
def inspect(results):
    lhs = []
    rhs = []
    supports = []
    confidences = []
    lifts = []
    for result in results:
        for rule in result.ordered_statistics:
            lhs.append(', '.join(rule.items_base))
            rhs.append(', '.join(rule.items_add))
            supports.append(result.support)
            confidences.append(rule.confidence)
            lifts.append(rule.lift)
    return list(zip(lhs, rhs, supports, confidences, lifts))

In [13]:
# Create a DataFrame with the results
resultsinDataFrame = pd.DataFrame(inspect(association_results), columns=["Left hand side", "Right hand side", "Support", "Confidence", "Lift"])

In [15]:
# Print first 10 rows with the highest Lift
resultsinDataFrame.nlargest(n=10, columns="Lift")

Unnamed: 0,Left hand side,Right hand side,Support,Confidence,Lift
797,soups,seasonal products,0.000334,0.104167,14.704206
1215,"fruit/vegetable juice, curd",sausage,0.000334,0.5,8.285161
1749,"other vegetables, pastry","soda, whole milk",0.000334,0.090909,7.817659
1029,"brown bread, frozen vegetables",canned beer,0.000334,0.357143,7.612434
1030,"canned beer, frozen vegetables",brown bread,0.000334,0.25,6.644316
1118,"yogurt, soda",chewing gum,0.000401,0.068966,5.73295
1754,"other vegetables, soda, whole milk",pastry,0.000334,0.294118,5.685895
1066,"yogurt, butter milk",canned beer,0.000334,0.263158,5.609162
1756,"rolls/buns, sausage","yogurt, whole milk",0.000334,0.0625,5.599925
1750,"soda, pastry","other vegetables, whole milk",0.000334,0.081967,5.524664
