# Case 3: Recheio Customer Data Enrichment & Recommendation Engine

## Overview
This case study focuses on enriching Recheio Cash & Carry’s customer dataset to deliver more relevant product recommendations across digital channels such as the website, mobile app, POS, and contact center. The project combines customer segmentation and association rule mining to develop systems like **Smart Baskets** and **Did You Forget**, enhancing customer experience and increasing Recheio’s share in total customer purchases.

## Business Problem
- Recheio serves two distinct segments (HoReCa and Traditional Retail), each with unique needs.  
- Customer relationship and loyalty are heavily dependent on personalized and insightful interactions.  
- Current data systems are fragmented, limiting the ability to generate impactful recommendations.  
- There is a need to increase **Recheio’s share of wallet** by becoming more relevant in each customer’s purchasing behavior.

---

**This notebook was developed by:**  
- João Venichand - 20211644  
- Gonçalo Custódio - 20211643  
- Diogo Correia - 20211586  
- Duarte Emanuel - 20240564

# 1. Import Libraries

In [1]:
import calendar
import warnings
import pandas as pd
import random
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from matplotlib.colors import Normalize
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.patches as patches

# Dataset

In [2]:
# Load the transaction-level dataset (one row per purchased product) that already carries the
# customer segment label in its `Cluster` column, then display it.
data = pd.read_excel('data_with_clusters.xlsx')
data

  now = datetime.datetime.utcnow()
  now = datetime.datetime.utcnow()


Unnamed: 0,Date,Client ID,ID Product,YearMonth,ZIP Code,ID Client Type,Product Description,ID Product Category,Cluster
0,2022-01-02,210100281,224780,2022-01,4400,Café/Pastelaria,FARINHA ESPIGA AMIDO MILHO LUS.2KG,FARINHAS,2
1,2022-01-02,210100281,276806,2022-01,4400,Café/Pastelaria,LARANJA CAL7 (67/76) RCH,FRUTAS FRESCAS,2
2,2022-01-02,210100281,276809,2022-01,4400,Café/Pastelaria,LIMAO CAL 3/4 RCH,FRUTAS FRESCAS,2
3,2022-01-02,210100281,277674,2022-01,4400,Café/Pastelaria,COGUMELO BRANCO MÉDIO RCH,LEGUMES FRESCOS,2
4,2022-01-02,210100281,277917,2022-01,4400,Café/Pastelaria,TOMATE BB 67/82 1CAM RCH,LEGUMES FRESCOS,2
...,...,...,...,...,...,...,...,...,...
884094,2022-12-31,210106386,949447,2022-12,4000,Hotelaria,"BATATA WEDGES C PELE ECOFROST CG 2,5 KG",BATATA,0
884095,2022-12-31,210106386,954062,2022-12,4000,Hotelaria,FIAMBRE FATIADO CASA PORTUGUESA 750G,FIAMBRES,0
884096,2022-12-31,210199916,106702,2022-12,4480,,MOSTARDA DONA SARAH 950GR,MOLHOS,4
884097,2022-12-31,210199916,906800,2022-12,4480,,MOLHO INGLES UNCLE THOMAS 1 LT,MOLHOS,4


In [3]:
# List the available columns of the clustered transaction data.
data.columns

Index(['Date', 'Client ID', 'ID Product', 'YearMonth', 'ZIP Code',
       'ID Client Type', 'Product Description', 'ID Product Category',
       'Cluster'],
      dtype='object')

In [4]:
# Load the HoReCa transaction file (read from a separate workbook) and display it.
horeca = pd.read_excel('horeca_data.xlsx')
horeca

  now = datetime.datetime.utcnow()
  now = datetime.datetime.utcnow()


Unnamed: 0,Date,Client ID,ID Product,YearMonth,ZIP Code,ID Client Type,Product Description,ID Product Category,Recency,Frequency,...,Sunday,YearWeek,Avg_Products_Per_Week,Avg_Products_Per_Purchase,Region,Region_Centro,Region_Lisboa,Region_Norte,Region_Porto,Region_Sul
0,2022-01-02,210105700,941436,2022-01,4400,,* LOMBO DE PORCO C AMEIXAS INOX,PRATO DE CARNE,16,36,...,0.321918,2022-01,7.300000,4.055556,Porto,0,0,0,1,0
1,2022-01-02,210105700,948765,2022-01,4400,,* EMPADÃO DE CARNE INOX,PRATO DE CARNE,16,36,...,0.321918,2022-01,7.300000,4.055556,Porto,0,0,0,1,0
2,2022-01-02,210105700,10001162,2022-01,4400,,JARDINEIRA DE SOJA *,PRATO VEGETARIANO,16,36,...,0.321918,2022-01,7.300000,4.055556,Porto,0,0,0,1,0
3,2022-01-02,210105969,10005577,2022-01,3004,Coletiva/Instituição/Cantina,* CREME DE CENOURA SS 3LT,SOPAS,1,286,...,0.150018,2022-01,105.115385,19.111888,Centro,1,0,0,0,0
4,2022-01-02,210105969,10005957,2022-01,3004,Coletiva/Instituição/Cantina,* CENOURA COZIDA SI,ACOMPANHAMENTO,1,286,...,0.150018,2022-01,105.115385,19.111888,Centro,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10019,2022-12-31,210105969,10006520,2022-12,3004,Coletiva/Instituição/Cantina,* LEG CAMPESTRES SI,ACOMPANHAMENTO,1,286,...,0.150018,2022-52,105.115385,19.111888,Centro,1,0,0,0,0
10020,2022-12-31,210105969,10006532,2022-12,3004,Coletiva/Instituição/Cantina,* MASSA PENNE SI,ACOMPANHAMENTO,1,286,...,0.150018,2022-52,105.115385,19.111888,Centro,1,0,0,0,0
10021,2022-12-31,210105969,10006710,2022-12,3004,Coletiva/Instituição/Cantina,* PERNAS DE FRANGO COZIDAS SI,PRATO DE CARNE,1,286,...,0.150018,2022-52,105.115385,19.111888,Centro,1,0,0,0,0
10022,2022-12-31,210105969,10006758,2022-12,3004,Coletiva/Instituição/Cantina,* COSTELETAS A SALSICHEIRO SI,PRATO DE CARNE,1,286,...,0.150018,2022-52,105.115385,19.111888,Centro,1,0,0,0,0


In [5]:
# List the available columns of the HoReCa data.
horeca.columns

Index(['Date', 'Client ID', 'ID Product', 'YearMonth', 'ZIP Code',
       'ID Client Type', 'Product Description', 'ID Product Category',
       'Recency', 'Frequency', 'Monetary', 'Log_Recency', 'Log_Frequency',
       'Log_Monetary', 'Weekday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday', 'YearWeek', 'Avg_Products_Per_Week',
       'Avg_Products_Per_Purchase', 'Region', 'Region_Centro', 'Region_Lisboa',
       'Region_Norte', 'Region_Porto', 'Region_Sul'],
      dtype='object')

# Market Basket Analysis (Association Rules)

In [6]:
def mba(data, cluster_col='Cluster', top_n_products=500, min_support=0.01, max_len=2,
        min_confidence=0.2):
    """Mine association rules separately for every customer cluster.

    A basket is the list of products one client bought on one date. For each
    cluster the baskets are one-hot encoded, restricted to the most frequent
    products, and passed through Apriori and rule generation.

    Args:
        data: Transactions with 'Client ID', 'Date', 'Product Description'
            and the column named by `cluster_col`.
        cluster_col: Column holding the cluster label of each row.
        top_n_products: Number of most frequent products kept per cluster
            (bounds the size of the Apriori search).
        min_support: Minimum support passed to `apriori`.
        max_len: Maximum itemset length passed to `apriori`.
        min_confidence: Rules must have a confidence strictly above this value.

    Returns:
        dict mapping each cluster label to a DataFrame of rules sorted by
        descending lift. The DataFrame is empty (no columns) when the cluster
        has no frequent itemsets or Apriori ran out of memory.
    """
    cluster_rules = {}

    # Missing labels are skipped: NaN never equals itself, so the equality filter
    # below would select no rows and the encoder would run on an empty basket list.
    for cluster_id in sorted(data[cluster_col].dropna().unique()):
        print(f"Running MBA for Cluster {cluster_id}...")

        # Filter data for current cluster
        cluster_data = data[data[cluster_col] == cluster_id]

        # Step 1: Basket format (one list of products per client and date)
        baskets = cluster_data.groupby(['Client ID', 'Date'])['Product Description'].apply(list)

        # Step 2: Encode to binary matrix
        transactions = baskets.tolist()
        te = TransactionEncoder()
        te_matrix = te.fit(transactions).transform(transactions)
        df_basket = pd.DataFrame(te_matrix, columns=te.columns_)

        # Step 3: Top frequent products only
        top_products = df_basket.sum().sort_values(ascending=False).head(top_n_products).index
        df_filtered = df_basket[top_products]

        # Step 4: Apriori + Rules
        try:
            frequent_itemsets = apriori(df_filtered, min_support=min_support, use_colnames=True, max_len=max_len)
            if frequent_itemsets.empty:
                # association_rules() rejects an empty itemset table; without this guard
                # one sparse cluster would abort the analysis of all remaining clusters.
                print(f"Cluster {cluster_id}: no frequent itemsets at min_support={min_support}.")
                rules = pd.DataFrame()
            else:
                rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
                rules = rules[rules['confidence'] > min_confidence].dropna(subset=['antecedents', 'consequents'])
                rules = rules.sort_values(by='lift', ascending=False).reset_index(drop=True)
        except MemoryError:
            print(f"Cluster {cluster_id}: MemoryError — try lowering `top_n_products` or `max_len`.")
            rules = pd.DataFrame()

        cluster_rules[cluster_id] = rules

    return cluster_rules

# Run the per-cluster analysis on the clustered transactions, keeping the 400 most frequent
# products per cluster and pairwise itemsets only; the result is a dict keyed by cluster label.
mba_clusters = mba(data, top_n_products=400, max_len=2)
mba_clusters

Running MBA for Cluster 0...
Running MBA for Cluster 1...
Running MBA for Cluster 2...
Running MBA for Cluster 3...
Running MBA for Cluster 4...
Running MBA for Cluster 5...


{0:                                     antecedents  \
 0            (ICE TEA LIPTON PÊSSEGO LATA 33CL)   
 1              (ICE TEA LIPTON LIMÃO LATA 33CL)   
 2             (DOCE BONNE MAMAN MORANG.15X30GR)   
 3             (DOCE BONNE MAMAN LARANJA15X30GR)   
 4                 (DOCE BONNE MAMAN MEL15X30GR)   
 ..                                          ...   
 156           (AGUA C/GAS PEDRAS SALGADAS 25CL)   
 157           (BOLACHA AMANHECER MARIA 4X200GR)   
 158                (POLPA DE TOMATE GULOSO 1KG)   
 159  (REFRIGERANTE COCA COLA ORIGINAL LATA33CL)   
 160     (REFRIGERANTE COCA COLA ZERO LATA 33CL)   
 
                             consequents  antecedent support  \
 0      (ICE TEA LIPTON LIMÃO LATA 33CL)            0.015675   
 1    (ICE TEA LIPTON PÊSSEGO LATA 33CL)            0.015538   
 2     (DOCE BONNE MAMAN LARANJA15X30GR)            0.015812   
 3     (DOCE BONNE MAMAN MORANG.15X30GR)            0.015538   
 4     (DOCE BONNE MAMAN LARANJA15X30GR)           

# **Cluster-Based Association Rule Highlights**

Below is a summary of the most interesting association rules discovered through Market Basket Analysis (MBA) for each cluster, based on support, confidence, and lift.

---

## Cluster 0 – Convenience & Retail Drink Buyers

**Top Rules:**
- `ICE TEA LIPTON PÊSSEGO → ICE TEA LIPTON LIMÃO`  
  - Confidence: 69.0%, Lift: 44.4
- `DOCE BONNE MAMAN MORANGO → DOCE BONNE MAMAN LARANJA`  
  - Confidence: 67.1%, Lift: 43.2

**Conclusion:**  
Buyers in this cluster frequently co-purchase **complementary flavors** and **branded ready-to-consume items**. Likely small retail stores or vending operators focused on variety drinks and snacks.

---

## Cluster 1 – Institutional Meal Providers

**Top Rules:**
- `PASTEIS BACALHAU → CROQUETES CARNE`  
  - Confidence: 74.1%, Lift: 12.3
- `TOMATE CONCENTRADO → ÓLEO ALTO RENDIMENTO`  
  - Confidence: 66.0%, Lift: 10.5

**Conclusion:**  
This cluster shows high co-occurrence of **bulk frozen traditional foods** and **staple cooking products**, typical of **canteens, caterers, or school kitchens**.

---

## Cluster 2 – Health-Conscious / Snack Cafés

**Top Rules:**
- `IOGURTE COCO → IOGURTE MORANGO`  
  - Confidence: 75.8%, Lift: 43.0
- `POLVO CONGELADO → MAIONESE MCHEF`  
  - Confidence: 78.8%, Lift: 40.0

**Conclusion:**  
Strong focus on **dairy items**, **seafood**, and light meal ingredients. This cluster likely represents **vegetarian cafés, health-food stores, or light meal caterers**.

---

## Cluster 3 – Beverage-Focused Businesses

**Top Rules:**
- `VINHO BEIRA → VINHO DOURO VILA REAL`  
  - Confidence: 96.2%, Lift: 85.1
- `VINHO DOURO QTA ACIPRESTES → VINHO BEIRA`  
  - Confidence: 86.2%, Lift: 82.2

**Conclusion:**  
Almost all rules involve **wines and premium drinks** with exceptionally high lift. This clearly represents **liquor stores, bars, or clubs** with high-end product overlap.

---

## Cluster 4 – Dairy and Sweet Snack Buyers

**Top Rules:**
- `IOGURTE MORANGO → IOGURTE COCO`  
  - Confidence: 64.3%, Lift: 56.5
- `IOGURTE BANANA → IOGURTE MORANGO`  
  - Confidence: 95.6%, Lift: 56.2

**Conclusion:**  
Tight clustering around **flavored yogurts** and **dairy snack combos**. This cluster is likely small **pastry cafés or snack bars** with a fast-moving inventory.

---

## Cluster 5 – Gourmet & Full-Service Buyers

**Top Rules:**
- `BOVINO LOMBO → PETIT GÂTEAU CACAU`  
  - Confidence: 87.5%, Lift: 62.9
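- `PETIT GÂTEAU CACAU → PERNA DE PATO CONGELADA`  
  - Confidence: 81.8%, Lift: 60.6

**Conclusion:**  
The top rules pair **premium meat cuts** (beef loin, duck leg) with a **plated dessert** (petit gâteau), which is consistent with **full-service restaurants** that buy complete menus rather than single categories.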

**TODO:** Consider adding a graph (network) visualization of the strongest association rules per cluster.

## **MBA for Horeca**

In [None]:
def mba_horeca(data, top_n_products=500, min_support=0.01, max_len=2, min_confidence=0.2):
    """Mine association rules over all transactions in `data` (no cluster split).

    A basket is the list of products one client bought on one date.

    Args:
        data: Transactions with 'Client ID', 'Date' and 'Product Description'.
        top_n_products: Number of most frequent products kept before Apriori.
        min_support: Minimum support passed to `apriori`.
        max_len: Maximum itemset length passed to `apriori`.
        min_confidence: Rules must have a confidence strictly above this value.

    Returns:
        DataFrame of rules sorted by descending lift. It is empty (no columns)
        when `data` has no rows, no itemset reaches `min_support`, or Apriori
        ran out of memory.
    """
    # Nothing to mine: return the same "no rules" value used by the other fallbacks.
    if data.empty:
        return pd.DataFrame()

    # Step 1: Convert data to basket format (group by order = client + date)
    baskets = data.groupby(['Client ID', 'Date'])['Product Description'].apply(list)

    # Step 2: Encode transactions to binary matrix
    transactions = baskets.tolist()
    te = TransactionEncoder()
    te_matrix = te.fit(transactions).transform(transactions)
    df_basket = pd.DataFrame(te_matrix, columns=te.columns_)

    # Step 3: Keep only top N most frequent products
    top_products = df_basket.sum().sort_values(ascending=False).head(top_n_products).index
    df_filtered = df_basket[top_products]

    # Step 4: Apply Apriori with error handling
    try:
        frequent_itemsets = apriori(df_filtered, min_support=min_support, use_colnames=True, max_len=max_len)
        if frequent_itemsets.empty:
            # association_rules() rejects an empty itemset table, so stop here.
            return pd.DataFrame()
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
        rules = rules[rules['confidence'] > min_confidence].dropna(subset=['antecedents', 'consequents'])
        rules = rules.sort_values(by='lift', ascending=False).reset_index(drop=True)
    except MemoryError:
        print("MemoryError: Try reducing `top_n_products` or `max_len`.")
        rules = pd.DataFrame()

    return rules

# Mine pairwise rules on the HoReCa transactions, keeping the 400 most frequent products.
horeca_rules = mba_horeca(horeca, top_n_products=400, max_len=2)
horeca_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(* BACALHAU C/ BROA SI),(* BIFE DE PERU C COGUMELOS SI),0.020969,0.011184,0.010718,0.511111,45.701852,1.0,0.010483,2.022579,0.999069,0.500000,0.505582,0.734722
1,(* BIFE DE PERU C COGUMELOS SI),(* BACALHAU C/ BROA SI),0.011184,0.020969,0.010718,0.958333,45.701852,1.0,0.010483,23.496738,0.989182,0.500000,0.957441,0.734722
2,(* CARIL DE GRÃO SI),(* BIFE DE PERU PANADO SI),0.017707,0.019571,0.013514,0.763158,38.993734,1.0,0.013167,4.139588,0.991919,0.568627,0.758430,0.726817
3,(* BIFE DE PERU PANADO SI),(* CARIL DE GRÃO SI),0.019571,0.017707,0.013514,0.690476,38.993734,1.0,0.013167,3.173561,0.993805,0.568627,0.684897,0.726817
4,(* PESCADA DOURADA SI),(* LASANHA DE CARNE SI),0.018639,0.015843,0.011184,0.600000,37.870588,1.0,0.010888,2.460391,0.992086,0.480000,0.593561,0.652941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289,(* CREME DE LEGUMES SS 3LT),(* CREME DE ABOBORA SS 3LT),0.068500,0.067568,0.021435,0.312925,4.631293,1.0,0.016807,1.357105,0.841736,0.186992,0.263137,0.315083
2290,(* CREME DE ABOBORA SS 3LT),(* CREME DE LEGUMES SS 3LT),0.067568,0.068500,0.021435,0.317241,4.631293,1.0,0.016807,1.364319,0.840895,0.186992,0.267034,0.315083
2291,(* FILETES PERCA-DO-NILO NO FORNO SI),(* CREME DE LEGUMES SS 3LT),0.036347,0.068500,0.011184,0.307692,4.491889,1.0,0.008694,1.345501,0.806697,0.119403,0.256782,0.235479
2292,(* CREME DE CENOURA),(* SOPA DE GRÃO C/ESPINAFRES),0.082479,0.085741,0.030289,0.367232,4.283039,1.0,0.023217,1.444856,0.835426,0.219595,0.307889,0.360246


# Association Rules Analysis – HoReCa Segment

This analysis focuses on the HoReCa dataset, which includes products marked for professional foodservice use. The top rules reveal strong patterns of co-purchasing, which are useful for understanding purchasing behavior in institutional foodservice.

---

## Key Association Rules

### Ready-to-Serve Main Dishes
- **Rule:** `* BACALHAU C/ BROA SI → * BIFE DE PERU C COGUMELOS SI`  
  - Confidence: 51.1%, Lift: 45.7  
- **Reverse Rule:** `* BIFE DE PERU C COGUMELOS SI → * BACALHAU C/ BROA SI`  
  - Confidence: 95.8%, Lift: 45.7

**Conclusion:** These two dishes are frequently purchased together, indicating that they are likely part of a recurring menu in foodservice environments.

---

### Vegetarian / Healthy Dishes
- **Rule:** `* CARIL DE GRÃO SI → * BIFE DE PERU PANADO SI`  
  - Confidence: 76.3%, Lift: 39.0  
- **Rule:** `* SOPA DE GRÃO C/ESPINAFRES → * CREME DE CENOURA`  
  - Confidence: 35.3%, Lift: 4.28

**Conclusion:** Institutions offering vegetarian or healthy options tend to purchase these items together, showing a focus on balanced meal preparation.

---

### Soups & Starters
- **Rule:** `* CREME DE LEGUMES SS 3LT ↔ * CREME DE ABOBORA SS 3LT`  
  - Confidence: 31.3% – 31.7%, Lift: 4.63

**Conclusion:** Soup items are often purchased together, suggesting efficient meal planning in institutional kitchens.

---

## Overall Insights

- High confidence and lift values indicate that these product pairs are commonly co-purchased, making them ideal for promotions or suggestions.
- Most products are ready-made meals or soups, suggesting that HoReCa clients are mostly canteens, institutional kitchens, or meal prep services.

# Similarity Measures

Jaccard, Cosine and Dice Similarity

In [None]:
# Monthly basket construction
baskets = data.groupby(['Client ID', pd.Grouper(key='Date', freq='M')])['Product Description'].apply(list).reset_index()
transactions = baskets['Product Description'].tolist()

# Encode
te = TransactionEncoder()
te_matrix = te.fit(transactions).transform(transactions)
df_basket = pd.DataFrame(te_matrix, columns=te.columns_)

# Filter by frequency
df_basket = df_basket[df_basket.sum().sort_values(ascending=False).head(10).index]

In [None]:
def dice_similarity_matrix(X):
    """Return the pairwise Dice coefficient between the columns of `X`.

    Args:
        X: DataFrame with one row per basket and one column per item; any
           truthy cell counts as "item present".

    Returns:
        Square DataFrame indexed and labelled by the columns of `X`, holding
        2 * |A ∩ B| / (|A| + |B|) for each pair, with the diagonal set to 1.0.
    """
    presence = X.astype(bool).astype(int)
    labels = presence.columns
    flags = presence.values

    # Baskets containing both items (numerator) and per-item basket counts (denominator).
    shared = flags.T @ flags
    totals = flags.sum(axis=0)
    scores = 2 * shared / np.add.outer(totals, totals)

    # An item is defined to be fully similar to itself.
    np.fill_diagonal(scores, 1.0)
    return pd.DataFrame(scores, index=labels, columns=labels)

def compute_similarity_matrices(basket_matrix):
    """Compute item-to-item Jaccard, cosine and Dice similarities.

    Args:
        basket_matrix: DataFrame with one row per basket and one column per
            item; values greater than zero mean the item is in the basket.

    Returns:
        dict with keys 'jaccard', 'cosine' and 'dice', each a square DataFrame
        indexed and labelled by the item columns.
    """
    X = (basket_matrix > 0).astype(int)
    labels = X.columns
    item_vectors = X.T.values  # one row per item, one column per basket

    def _labelled(values):
        return pd.DataFrame(values, index=labels, columns=labels)

    return {
        # Jaccard similarity is the complement of the Jaccard distance.
        'jaccard': _labelled(1 - pairwise_distances(item_vectors, metric='jaccard')),
        'cosine': _labelled(cosine_similarity(item_vectors)),
        'dice': dice_similarity_matrix(X),
    }

In [None]:
# Compute the three item-to-item similarity matrices for the products kept in `df_basket`
# and preview the Jaccard one.
similarities = compute_similarity_matrices(df_basket)
print(similarities['jaccard'].head())

In [None]:
# Preview the cosine similarity matrix.
print(similarities['cosine'].head())

In [None]:
# Preview the Dice similarity matrix.
print(similarities['dice'].head())

Visualization

In [None]:
# Disabled heatmap: the plotting code is wrapped in a string literal, so the cell only evaluates
# a string and draws nothing.
# NOTE(review): the title and the [:20, :20] slice refer to 20 products, but `df_basket` was
# reduced to its 10 most frequent products earlier — reconcile before re-enabling.
"""plt.figure(figsize=(12, 8))
sns.heatmap(similarities['jaccard'].iloc[:20, :20], cmap="Blues")
plt.title("Jaccard Similarity (Top 20 Products)")
plt.show()"""

# Page Rank

In [None]:
def build_cooccurrence_matrix(df_basket):
    """Count, for every pair of items, the baskets that contain both.

    Args:
        df_basket: Basket x item indicator table (boolean or 0/1).

    Returns:
        Square NumPy array; entry (i, j) is the number of baskets containing
        items i and j, and the diagonal holds each item's own basket count.
    """
    indicators = np.asarray(df_basket.astype(int))
    return indicators.T @ indicators

def normalize_transition_matrix(co_matrix):
    """Scale every row of `co_matrix` so that it sums to 1.

    Rows that sum to zero are divided by 1 instead, so they stay all-zero
    rather than producing a division by zero.

    Args:
        co_matrix: Square NumPy array of co-occurrence counts.

    Returns:
        Float array of the same shape with row-normalised weights.
    """
    weights = co_matrix.astype(float)
    totals = weights.sum(axis=1, keepdims=True)
    safe_totals = np.where(totals == 0, 1, totals)
    return weights / safe_totals

def _pagerank_power_iteration(trans_matrix, damping=0.85, max_iter=100, tol=1e-6):
    """Run damped power iteration on a row-stochastic transition matrix."""
    trans_matrix = np.asarray(trans_matrix, dtype=float)
    n = trans_matrix.shape[0]
    if n == 0:
        return np.zeros(0)

    teleport = np.full(n, 1.0 / n)
    rank = teleport.copy()

    # Row i of `trans_matrix` holds the probabilities of moving from i to each j, so the rank
    # arriving at j is sum_i rank[i] * P[i, j], i.e. P.T @ rank. Multiplying by P itself leaves
    # a uniform vector unchanged (each row sums to 1) and gives every item the same score.
    incoming = trans_matrix.T

    for _ in range(max_iter):
        prev_rank = rank
        rank = damping * incoming.dot(prev_rank) + (1 - damping) * teleport
        if np.abs(rank - prev_rank).sum() < tol:
            break
    return rank


def pagerank_scores(trans_matrix, damping=0.85, max_iter=100, tol=1e-6):
    """Compute PageRank scores by power iteration.

    Args:
        trans_matrix: Square row-stochastic matrix; entry (i, j) is the
            probability of moving from item i to item j.
        damping: Probability of following a transition rather than
            teleporting to a uniformly chosen item.
        max_iter: Maximum number of iterations.
        tol: Stop once the L1 change between two iterations is below this.

    Returns:
        1-D NumPy array with one score per item, in matrix order.
    """
    return _pagerank_power_iteration(trans_matrix, damping=damping, max_iter=max_iter, tol=tol)


def compute_pagerank_recommendations(df_basket, damping=0.85, top_n=None):
    """Rank products by PageRank on the product co-occurrence graph.

    Args:
        df_basket: Basket x product indicator DataFrame.
        damping: Damping factor used by the power iteration.
        top_n: If given, return only the `top_n` highest-ranked products.

    Returns:
        Series of scores indexed by product, sorted in descending order.
    """
    co_matrix = build_cooccurrence_matrix(df_basket)
    # Remove self co-occurrence so a product's own frequency does not create a self-loop.
    np.fill_diagonal(co_matrix, 0)
    trans_matrix = normalize_transition_matrix(co_matrix)
    # Call the private helper rather than `pagerank_scores`: the notebook later rebinds that
    # public name to the resulting Series, which would break a re-run of this function.
    scores = _pagerank_power_iteration(trans_matrix, damping=damping)
    ranked = pd.Series(scores, index=df_basket.columns).sort_values(ascending=False)
    return ranked if top_n is None else ranked.head(top_n)

In [None]:
# Rank the products kept in `df_basket` by PageRank on their co-occurrence graph.
# NOTE(review): this assignment rebinds the name `pagerank_scores`, which is also the name of the
# power-iteration function defined in the previous cell; after this cell runs, the name refers to
# the resulting Series. Consider a distinct variable name (later cells would need the same rename).
pagerank_scores = compute_pagerank_recommendations(df_basket)
pagerank_scores

# Integrated Recommendation System

## **SMART BASKET (Check-In)**

In [None]:
def smart_basket_recommendations(df_basket, similarities, pagerank_scores, top_n=10):
    """Build the check-in "Smart Basket": top-ranked products plus similar extras.

    Args:
        df_basket: Not read by this function.
        similarities: dict whose 'cosine' entry is a square item-to-item
            similarity DataFrame.
        pagerank_scores: Series of scores indexed by product; its first
            `top_n` entries are taken as the anchor products.
        top_n: Number of anchor products.

    Returns:
        DataFrame with one row per anchor product ('Product') and up to three
        most cosine-similar products that are not anchors themselves
        ('Recommended due to similarity').
    """
    anchors = pagerank_scores.head(top_n).index.tolist()

    def _closest_non_anchors(anchor):
        ordered = similarities['cosine'][anchor].drop(index=anchor).sort_values(ascending=False)
        picks = []
        for candidate in ordered.index:
            if candidate in anchors:
                continue
            picks.append(candidate)
            if len(picks) == 3:
                break
        return picks

    rows = [
        {'Product': anchor, 'Recommended due to similarity': _closest_non_anchors(anchor)}
        for anchor in anchors
    ]
    return pd.DataFrame(rows)

In [None]:
# Build the check-in suggestions for the 5 highest-ranked products and print them.
smart_basket_df = smart_basket_recommendations(df_basket, similarities, pagerank_scores, top_n=5)
print("Smart Basket Recommendations (Check-In):")
print(smart_basket_df)

## **"DID YOU FORGET?" (Check-Out)**

In [None]:
def checkout_recommendations(current_basket, rules_df, similarities, top_n=5):
    """Suggest items the customer may have forgotten, based on association rules.

    A rule fires when all of its antecedents are in the basket and at least one
    of its consequents is missing. Rules are scanned in the order of `rules_df`,
    so the caller controls priority (e.g. by sorting on lift).

    Args:
        current_basket: Iterable of product descriptions already in the basket.
        rules_df: DataFrame with 'antecedents', 'consequents', 'confidence'
            and 'lift' columns. A DataFrame without rows yields no suggestions.
        similarities: dict whose 'dice' entry is a square item-to-item
            similarity DataFrame, used to attach up to two similar products
            that are not in the basket.
        top_n: Maximum number of recommendations.

    Returns:
        DataFrame with columns 'Recommended Item', 'Because you bought',
        'Confidence', 'Lift' and 'Similar popular items'; a DataFrame without
        columns when nothing is recommended.
    """
    recommendations = []
    if top_n <= 0:
        return pd.DataFrame(recommendations)

    basket = set(current_basket)
    # Track recommended items explicitly: testing a product name against the list of
    # result dicts never matches, so duplicates would not be filtered.
    already_recommended = set()

    for _, rule in rules_df.iterrows():
        antecedents = set(rule['antecedents'])
        consequents = set(rule['consequents'])

        if not antecedents.issubset(basket) or consequents.issubset(basket):
            continue

        # Sorted iteration keeps the output deterministic (set order is not).
        for item in sorted(consequents, key=str):
            # Skip what the customer already has and what was already suggested.
            if item in basket or item in already_recommended:
                continue
            already_recommended.add(item)

            if item in similarities['dice'].columns:
                similar_items = similarities['dice'][item].drop(index=item).sort_values(ascending=False)
                similar_filtered = [p for p in similar_items.index if p not in basket][:2]
            else:
                similar_filtered = []

            recommendations.append({
                'Recommended Item': item,
                'Because you bought': ', '.join(sorted(antecedents)),
                'Confidence': rule['confidence'],
                'Lift': rule['lift'],
                'Similar popular items': similar_filtered
            })

            if len(recommendations) >= top_n:
                return pd.DataFrame(recommendations)

    return pd.DataFrame(recommendations)

In [None]:
current_basket = {'LEITE MCHEF UHT M/GORDO LT', 'ACUCAR AMANH BCO PAP KG'}

# `check_out` was not defined anywhere in the notebook, so this cell raised a NameError on a
# fresh kernel. Build the rule table from the per-cluster rules mined earlier, strongest
# (highest lift) first.
# NOTE(review): pooling all clusters is an assumption — if the customer's cluster is known,
# use `mba_clusters[<cluster id>]` instead.
_cluster_rule_tables = [rules for rules in mba_clusters.values() if not rules.empty]
check_out = (
    pd.concat(_cluster_rule_tables, ignore_index=True)
    .sort_values(by='lift', ascending=False)
    .reset_index(drop=True)
    if _cluster_rule_tables else pd.DataFrame()
)

checkout_df = checkout_recommendations(current_basket, check_out, similarities, top_n=5)

print("\nDid You Forget? Recommendations (Check-Out):")
print(checkout_df)

# **Monte Carlo Cross-Validation (MCCV)**

In [None]:
def monte_carlo_validation(data, rules_df, similarities, iterations=5, cutoff='2022-10-01', top_n=5,
                           sample_frac=0.1, seed=None):
    """Estimate the hit rate of the check-out recommender by Monte Carlo cross-validation.

    In every iteration a random share of clients is drawn. For each client, rules
    are mined from the purchases before `cutoff`; one basket from `cutoff` onwards
    is then sampled, one product is held out of it, and the trial is a hit when
    the recommender suggests the held-out product from the rest of the basket.

    Args:
        data: Transactions with 'Client ID', 'Date' and 'Product Description'.
        rules_df: Not used. Rules are re-mined from each client's observation
            window so that the held-out purchases cannot leak into them; the
            parameter is kept for backward compatibility.
        similarities: Passed through to `checkout_recommendations`.
        iterations: Number of Monte Carlo rounds.
        cutoff: Date separating the observation window from the test window.
        top_n: Number of recommendations requested per trial.
        sample_frac: Share of clients drawn in each round.
        seed: Optional seed that makes the sampling reproducible.

    Returns:
        Fraction of evaluated trials that were hits (0 when no trial could be
        evaluated). The value is also printed.
    """
    rng = random.Random(seed)
    clients = list(data['Client ID'].dropna().unique())
    # Draw at least one client whenever any exist, and never more than are available.
    sample_size = min(len(clients), max(1, int(len(clients) * sample_frac)))
    # Group once instead of rescanning the full frame for every sampled client.
    by_client = data.groupby('Client ID')
    results = []

    for _ in range(iterations):
        for client_id in rng.sample(clients, k=sample_size):
            client_data = by_client.get_group(client_id)
            obs_data = client_data[client_data['Date'] < cutoff]
            test_data = client_data[client_data['Date'] >= cutoff]

            if obs_data.empty or test_data.empty:
                continue

            # Pick one future basket; a derived random_state keeps it tied to `seed`.
            future_baskets = test_data.groupby('Date')['Product Description'].apply(list)
            sampled = future_baskets.sample(1, random_state=rng.randrange(2**32)).iloc[0]
            # Drop repeated products: a held-out item that is still in the basket could
            # never be recommended and would always count as a miss.
            future_basket = list(dict.fromkeys(sampled))
            if len(future_basket) < 2:
                continue

            held_out = future_basket.pop()
            current_basket = set(future_basket)

            # Mine rules only after the basket is known to be usable (this is the costly step).
            rules = mba_horeca(obs_data, top_n_products=400, max_len=2)

            recs = checkout_recommendations(current_basket, rules, similarities, top_n=top_n)

            if not recs.empty and 'Recommended Item' in recs.columns:
                hit = held_out in recs['Recommended Item'].values
            else:
                hit = False

            results.append(hit)

    accuracy = sum(results) / len(results) if results else 0
    print(f"Monte Carlo Cross-Validation Accuracy: {accuracy:.2%}")
    return accuracy

In [None]:
# Validate on the clustered transactions: 10 rounds, November 2022 onwards as the test window.
# NOTE(review): `mba_clusters` is a dict of per-cluster rule tables (as returned by `mba`), not a
# single rules DataFrame — confirm that this is the intended value for the `rules_df` argument.
monte_carlo_validation(data, mba_clusters, similarities, iterations=10, cutoff='2022-11-01', top_n=5)