In [1]:
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns

#mlxtend library for applying the apriori algorithm and association rules.
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import time

In [None]:
#Reading the file into a pandas dataframe
# NOTE(review): read_excel loads only the first worksheet unless sheet_name is given —
# confirm the first sheet of this workbook is the one intended for the analysis.
df=pd.read_excel('online_retail_II.xlsx' , header=0)
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

So, CustomerID has the most null values; the rows with missing values need to be removed.

In [None]:
# Drop every row that has at least one missing value (df is modified in place),
# then report how many rows remain.
df.dropna(axis=0, how='any', inplace=True)
df.shape[0]

In [None]:
# Display the dataframe as it stands after the rows with missing values were dropped.
df

In [None]:
# Count the distinct values in the Country column and report the total.
n_countries = df['Country'].nunique()
print(f"Number of unique Country: {n_countries}")

In [None]:
#Number of rows left per country (ten largest)
# NOTE(review): value_counts counts dataframe rows, not distinct invoices — confirm
# whether "transactions" here is meant as invoice lines or as invoices.
top10 = df["Country"].value_counts().head(10)
print(top10)

In [None]:
# Row total: Quantity multiplied by Price, stored as a new Amount column.
df['Amount']=df['Quantity']*df['Price']
df.head()

In [None]:
# 'YYYY-MM' label for each row, used to group rows by calendar month.
# The .dt accessor requires InvoiceDate to be a datetime column.
df['year_month']=df['InvoiceDate'].dt.strftime('%Y-%m')

In [None]:
# Total Amount per calendar month, drawn as a line to show the trend over time.
df1 = df.groupby('year_month', as_index=False)['Amount'].sum()

fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=df1, x='year_month', y='Amount', ax=ax)
plt.xticks(rotation=30);

So, it can be observed that the total sales amount was highest between October 2011 and December 2011.

In [None]:
# Bar chart of the ten most frequent countries. value_counts() already returns the
# counts in descending order, so head(10) yields the top ten without re-slicing or re-sorting.
df['Country'].value_counts().head(10).plot.bar(figsize=(10, 7),xlabel='Country',ylabel='Number of Transactions');

It can be observed that the United Kingdom has the highest number of transactions, so only the UK data is kept for the analysis.

In [None]:
# Build the Invoice x Description table for United Kingdom rows: each cell holds the
# summed Quantity of that item on that invoice, with 0 where the item does not appear.
basket = (
    df.loc[df['Country'].eq("United Kingdom")]
    .groupby(['Invoice', 'Description'])["Quantity"]
    .sum()
    .unstack()      # Invoice stays as the index, one column per Description
    .fillna(0)
)

basket.head()

The next task is to one-hot encode our data. To do this, we can create a simple function that converts any value above zero to 1 and any value of zero or below to 0. We can then apply the function using the applymap() method.

[ ]


In [None]:
def encode(x):
    """Binarise a quantity: 1 if it is strictly positive, otherwise 0.

    Parameters
    ----------
    x : int or float
        A single cell value from the basket table.

    Returns
    -------
    int
        1 when ``x > 0``, else 0 (including zero, negative values and NaN).
    """
    # One comparison covers every input. The previous two-branch version matched
    # neither branch for 0 < x < 1 or NaN and silently returned None.
    return 1 if x > 0 else 0

# Apply encode to every cell of the basket table.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use map when this pandas version provides it and fall back otherwise.
if hasattr(pd.DataFrame, "map"):
    encoded_df = basket.map(encode)
else:
    encoded_df = basket.applymap(encode)

encoded_df

We don't need transactions in which only one item was sold, so we will remove them.

In [None]:
# Count the flagged items in each invoice row and keep the rows with at least two.
items_per_invoice = (encoded_df > 0).sum(axis=1)
filtered_df = encoded_df.loc[items_per_invoice >= 2]

filtered_df

Apply Apriori Algorithm

In [None]:
# Time only the Apriori call so the figure is comparable with the FP-Growth timing,
# which does not include any sorting either.
# perf_counter() is a monotonic, high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()
frequent_itemsets = apriori(filtered_df, min_support=0.03, use_colnames=True)
end_time = time.perf_counter()

# Order the itemsets from most to least frequent; they are displayed in a later cell.
frequent_itemsets = frequent_itemsets.sort_values("support",ascending=False)

In [None]:
# Seconds between the start_time and end_time captured by the timing cell that ran last.
# NOTE(review): these names are reused by the FP-Growth timing cell, so this value
# depends on cell execution order.
elapsed_time = end_time - start_time
elapsed_time

In [None]:
# Display the frequent itemsets found by Apriori.
frequent_itemsets

After getting the frequent itemsets, we need to find association rules.

In [None]:
# Derive rules from the Apriori itemsets, keeping those with lift >= 0.8,
# and rank them from highest to lowest lift with a fresh 0..n-1 index.
assoc_rules = (
    association_rules(frequent_itemsets, metric="lift", min_threshold=0.8)
    .sort_values("lift", ascending=False)
    .reset_index(drop=True)
)
assoc_rules

FP GROWTH - ASSOCIATION RULES

The FP Growth algorithm can be seen as Apriori’s modern version, as it is faster and more efficient while obtaining the same goal.

Compute the frequent itemsets using FP Growth algorithm

In [None]:
from mlxtend.frequent_patterns.fpgrowth import fpgrowth


In [None]:
# Time the FP-Growth call.
# perf_counter() is a monotonic, high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()
f_patterns = fpgrowth(filtered_df, min_support=0.03, use_colnames=True)
end_time = time.perf_counter()
# The frequent itemsets are displayed in a later cell; a bare expression in the
# middle of a cell produces no output.

In [None]:
# Seconds between the start_time and end_time captured by the timing cell that ran last.
# NOTE(review): these names are shared with the Apriori timing cell, so this value
# depends on cell execution order.
elapsed_time = end_time - start_time
elapsed_time

In [None]:
# Display the frequent itemsets found by FP-Growth.
f_patterns

In the last step, we need to use the association_rules function to convert those frequent itemsets into association rules

In [None]:
# Compute the association rules based on the frequent itemsets
from mlxtend.frequent_patterns import association_rules

# Keep rules with lift >= 0.80 and rank them by lift so the strongest rules come first,
# matching the ordering used for the Apriori rules. The result is stored so it can be
# reused in later cells instead of being recomputed.
fp_rules = (
    association_rules(f_patterns, metric="lift", min_threshold=0.80)
    .sort_values("lift", ascending=False)
    .reset_index(drop=True)
)
fp_rules