In [39]:
# Importing necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
# from apyori import apriori


Build another apriori algorithm on ‘groceries’ dataset:

a. Set the support value to be 0.003 and the confidence value to be 0.7. Store the resulting ‘rule 2’

b. Inspect the rule

c. Plot the rule and set the method to be ‘grouped’

In [40]:
# Reading the dataset
data = pd.read_csv('./data/Groceries_dataset.csv')

In [41]:
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [42]:
# EDA: Checking for missing values
print("Number of missing values:")
print(data.isnull().sum())

Number of missing values:
Member_number      0
Date               0
itemDescription    0
dtype: int64


In [43]:
# Handling missing values (replace with mean, or any desired method)
data['Member_number'].fillna(data['Member_number'].mean(), inplace=True)

In [44]:
# Handling missing values in 'Date' column
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')  # Convert 'Date' to datetime
data['Date'].fillna(pd.to_datetime('1900-01-01'), inplace=True)  # Replace missing values with a specific value (e.g., 1900-01-01)

In [45]:
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,tropical fruit
1,2552,2015-01-05,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-02-01,whole milk


In [46]:
print(data.isnull().sum())

Member_number      0
Date               0
itemDescription    0
dtype: int64


In [47]:
# Assuming you have a DataFrame named 'df'
index_col = data.index.name  # Get the name of the index column
index_values = data.index  # Get the index values
index_dtype = data.index.dtype  # Get the data type of the index

print("Index column:", index_col)
print("Index values:", index_values)
print("Index dtype:", index_dtype)

Index column: None
Index values: RangeIndex(start=0, stop=38765, step=1)
Index dtype: int64


In [48]:
data.columns

Index(['Member_number', 'Date', 'itemDescription'], dtype='object')

In [49]:
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,tropical fruit
1,2552,2015-01-05,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-02-01,whole milk


In [50]:
data.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


In [51]:
# Perform one-hot encoding
one_hot_encoder = TransactionEncoder()
one_hot_data = one_hot_encoder.fit_transform(data.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).tolist())
one_hot_data = pd.DataFrame(one_hot_data, columns=one_hot_encoder.columns_)


In [52]:
# Apply the Apriori algorithm
frequent_itemsets = apriori(one_hot_data, min_support=0.003, use_colnames=True)

In [57]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)

In [58]:
# Check if there are any rules generated
if len(rules) > 1:
    # Store the resulting 'rule 2'
    rule_2 = rules.iloc[1]
    
    # Inspect the rule
    print(rule_2)
    
    # Plot the rule
    plt.figure(figsize=(8, 4))
    plt.scatter(rule_2['support'], rule_2['confidence'], c='r', alpha=0.5)
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Association Rule')
    plt.show()
else:
    print("No second rule found.")

No second rule found.
