In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [22]:
dataset = pd.read_csv("data/market_basket_optimisation.csv")
dataset.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


In [24]:
dataset.shape

(7500, 20)

In [25]:
transaction = dataset.values.flatten()
transaction

array(['burgers', 'meatballs', 'eggs', ..., nan, nan, nan], dtype=object)

In [26]:
df = pd.DataFrame(transaction, columns=["items"])
df.head()

Unnamed: 0,items
0,burgers
1,meatballs
2,eggs
3,
4,


In [27]:
df["incident_count"] = 1
df.dropna(subset=['items'], inplace=True)
df_table = df.groupby("items").sum().sort_values("incident_count", ascending=False).reset_index()
df_table.head(5).style.background_gradient(cmap='Greens')

Unnamed: 0,items,incident_count
0,mineral water,1787
1,eggs,1348
2,spaghetti,1306
3,french fries,1282
4,chocolate,1230


# Data pre-processing

In [28]:
from mlxtend.preprocessing import TransactionEncoder

In [29]:
# Transform Every Transaction to Seperate List & Gather Them into Numpy Array
transaction = []
for i in range(dataset.shape[0]):
    transaction.append([str(dataset.values[i,j]) for j in range(dataset.shape[1])])
# creating the numpy array of the transactions
transaction = np.array(transaction)
# importing the required module
from mlxtend.preprocessing import TransactionEncoder
# initializing the transactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transaction).transform(transaction)
dataset = pd.DataFrame(te_ary, columns=te.columns_)
# dataset after encoded
dataset.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Assuming you have a DataFrame named 'data' containing transactional data

# Convert the data into a binary format using one-hot encoding
te = TransactionEncoder()
te_ary = te.fit(data).transform(data)
data_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running the fpgrowth algorithm with min_support = 0.05 and using column names in the result
res = fpgrowth(data_encoded, min_support=0.05, use_colnames=True)

# Printing the top 10 frequent itemsets
print(res.head(10))

Empty DataFrame
Columns: [support, itemsets]
Index: []


In [30]:
# select top 30 items
first30 = df_table["items"].head(30).values 
# Extract Top 30
dataset = dataset.loc[:,first30] 
# shape of the dataset
dataset.shape

(7500, 30)

**FP growth algorithm**

In [31]:
#Importing Libraries
from mlxtend.frequent_patterns import fpgrowth
#running the fpgrowth algorithm
res=fpgrowth(dataset,min_support=0.05, use_colnames=True)
# printing top 10
res.head(10)

Unnamed: 0,support,itemsets
0,0.179733,(eggs)
1,0.0872,(burgers)
2,0.062533,(turkey)
3,0.238267,(mineral water)
4,0.132,(green tea)
5,0.1296,(milk)
6,0.058533,(whole wheat rice)
7,0.0764,(low fat yogurt)
8,0.170933,(french fries)
9,0.050533,(soup)


**Association rules**

In [32]:
from mlxtend.frequent_patterns import association_rules
res=association_rules(res, metric="lift", min_threshold=1)
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(mineral water),(eggs),0.238267,0.179733,0.050933,0.213766,1.189351,0.008109,1.043286,0.209004
1,(eggs),(mineral water),0.179733,0.238267,0.050933,0.283383,1.189351,0.008109,1.062957,0.19409
2,(spaghetti),(mineral water),0.174133,0.238267,0.059733,0.343032,1.439698,0.018243,1.159468,0.369806
3,(mineral water),(spaghetti),0.238267,0.174133,0.059733,0.250699,1.439698,0.018243,1.102184,0.400941
4,(chocolate),(mineral water),0.163867,0.238267,0.052667,0.3214,1.348907,0.013623,1.122506,0.309351
5,(mineral water),(chocolate),0.238267,0.163867,0.052667,0.221041,1.348907,0.013623,1.073398,0.339566


In [33]:
# Sorting values based on confidence
res.sort_values("confidence",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(spaghetti),(mineral water),0.174133,0.238267,0.059733,0.343032,1.439698,0.018243,1.159468,0.369806
4,(chocolate),(mineral water),0.163867,0.238267,0.052667,0.3214,1.348907,0.013623,1.122506,0.309351
1,(eggs),(mineral water),0.179733,0.238267,0.050933,0.283383,1.189351,0.008109,1.062957,0.19409
3,(mineral water),(spaghetti),0.238267,0.174133,0.059733,0.250699,1.439698,0.018243,1.102184,0.400941
5,(mineral water),(chocolate),0.238267,0.163867,0.052667,0.221041,1.348907,0.013623,1.073398,0.339566
0,(mineral water),(eggs),0.238267,0.179733,0.050933,0.213766,1.189351,0.008109,1.043286,0.209004
