<a href="https://www.kaggle.com/code/dilekdd/association-rule-based-recommender-script-de?scriptVersionId=197934370" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="text-align: center; font-size: 40px; font-weight: bold; color: hotpink;">
     Association Rule Based Recommender Script Germany
</div>

Below are the basket information of 3 different users. Make the most appropriate product suggestion for this basket information using the association rule. Product suggestions can be 1 or more than 1. Derive the decision rules from 2010-2011 Germany customers.

The ID of the product in User 1's basket: 21987 The ID of the product in User 2's basket: 23235 The ID of the product in User 3's basket: 22747

The dataset named Online Retail II includes online sales transactions of a UK-based retail company between 01/12/2009 - 09/12/2011. The company's product catalogue includes gift items and it is known that most of its customers are wholesalers.


| **Column**     | **Description**                                                                    |
|----------------|------------------------------------------------------------------------------------|
| InvoiceNo      | Invoice number (If the code starts with 'C', it indicates the transaction was canceled). |
| StockCode      | Product code (Unique for each product).                                             |
| Description    | Product name.                                                                      |
| Quantity       | Product quantity (How many of each product were sold in the invoice).               |
| InvoiceDate    | Invoice date.                                                                      |
| UnitPrice      | Invoice price (in British pounds).                                                  |
| CustomerID     | Unique customer number.                                                            |
| Country        | Country name.                                                                      |

In [1]:
!pip install mlxtend
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
# çıktının tek bir satırda olmasını sağlar.
pd.set_option('display.expand_frame_repr', False)
from mlxtend.frequent_patterns import apriori, association_rules



In [2]:
df_ = pd.read_excel("/kaggle/input/online-retail-ii/online_retail_II.xlsx",
                    sheet_name="Year 2010-2011", engine="openpyxl")

In [3]:
df = df_.copy()
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      541910 non-null  object        
 1   StockCode    541910 non-null  object        
 2   Description  540456 non-null  object        
 3   Quantity     541910 non-null  int64         
 4   InvoiceDate  541910 non-null  datetime64[ns]
 5   Price        541910 non-null  float64       
 6   Customer ID  406830 non-null  float64       
 7   Country      541910 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [5]:
df.isnull().sum()

Invoice             0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
Price               0
Customer ID    135080
Country             0
dtype: int64

In [6]:
df.shape

(541910, 8)

In [7]:
def outlier_thresholds(dataframe, variable):
    quartile1 = dataframe[variable].quantile(0.01)
    quartile3 = dataframe[variable].quantile(0.99)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

def retail_data_prep(dataframe):
    dataframe.dropna(inplace=True)
    dataframe["Invoice"] = dataframe["Invoice"].astype(str)
    dataframe = dataframe[~dataframe["Invoice"].str.contains("C", na=False)]
    dataframe = dataframe[dataframe["Quantity"] > 0]
    dataframe = dataframe[dataframe["Price"] > 0]
    replace_with_thresholds(dataframe, "Quantity")
    replace_with_thresholds(dataframe, "Price")
    return dataframe


def create_invoice_product_df(dataframe, id=False):
    if id:
        return dataframe.groupby(['Invoice', "StockCode"])['Quantity'].sum().unstack().fillna(0). \
            applymap(lambda x: 1 if x > 0 else 0)
    else:
        return dataframe.groupby(['Invoice', 'Description'])['Quantity'].sum().unstack().fillna(0). \
            applymap(lambda x: 1 if x > 0 else 0)


def check_id(dataframe, stock_code):
    product_name = dataframe[dataframe["StockCode"] == stock_code][["Description"]].values[0].tolist()
    print(product_name)


def create_rules(dataframe, id=True, country="Germany"):
    dataframe = dataframe[dataframe['Country'] == country]
    dataframe = create_invoice_product_df(dataframe, id)
    frequent_itemsets = apriori(dataframe, min_support=0.01, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
    return rules

In [8]:
df = retail_data_prep(df)
rules = create_rules(df)

  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
  applymap(lambda x: 1 if x > 0 else 0)


In [9]:
rules[(rules["support"]>0.05) & (rules["confidence"]>0.1) & (rules["lift"]>5)]. \
sort_values("lift", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6153,"(POST, 20719)",(20724),0.115974,0.070022,0.054705,0.471698,6.736439,0.046584,1.760316,0.963267
6156,(20724),"(POST, 20719)",0.070022,0.115974,0.054705,0.78125,6.736439,0.046584,4.041263,0.915671
264,(20724),(20719),0.070022,0.126915,0.059081,0.84375,6.648168,0.050194,5.587746,0.913551
265,(20719),(20724),0.126915,0.070022,0.059081,0.465517,6.648168,0.050194,1.739959,0.973081
6157,(20719),"(POST, 20724)",0.126915,0.065646,0.054705,0.431034,6.566092,0.046373,1.642199,0.970927
6152,"(POST, 20724)",(20719),0.065646,0.126915,0.054705,0.833333,6.566092,0.046373,5.238512,0.90726


In [10]:
def arl_recommender(rules_df, product_id, rec_count=1):
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for i, product in enumerate(sorted_rules["antecedents"]):
        for j in list(product):
            if j == product_id:
                recommendation_list.append(list(sorted_rules.iloc[i]["consequents"])[0])

    return recommendation_list[0:rec_count]

In [11]:
product_ids = [21987, 23235, 22747]
rec_counts = [1, 2, 3]

for p_id, r_count in zip(product_ids, rec_counts):
    recommendations = arl_recommender(rules, p_id, r_count)
    print(f"Recommendations for the product {p_id}: {recommendations}")

Recommendations for the product 21987: [21086]
Recommendations for the product 23235: [23244, 'POST']
Recommendations for the product 22747: [22746, 22746, 22746]
