# Case 3: Recheio Customer Data Enrichment & Recommendation Engine

## Overview
This case study focuses on enriching Recheio Cash & Carry’s customer dataset to deliver more relevant product recommendations across digital channels such as the website, mobile app, POS, and contact center. The project combines customer segmentation and association rule mining to develop systems like **Smart Baskets** and **Did You Forget**, enhancing customer experience and increasing Recheio’s share in total customer purchases.

## Business Problem
- Recheio serves two distinct segments (HoReCa and Traditional Retail), each with unique needs.  
- Customer relationship and loyalty are heavily dependent on personalized and insightful interactions.  
- Current data systems are fragmented, limiting the ability to generate impactful recommendations.  
- There is a need to increase **Recheio’s share of wallet** by becoming more relevant in each customer’s purchasing behavior.

---

**This notebook was developed by:**  
- João Venichand - 20211644  
- Gonçalo Custódio - 20211643  
- Diogo Correia - 20211586  
- Duarte Emanuel - 20240564

# 1. Import Libraries

In [None]:
import calendar
import warnings
import pandas as pd
import random
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
from matplotlib.colors import Normalize
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.patches as patches

# Dataset

In [None]:
data = pd.read_excel('data_with_clusters.xlsx')
horeca = pd.read_excel('horeca_data.xlsx')

# Market Basket Analysis (Association Rules)

In [124]:
def run_market_basket_analysis(data, top_n_products=500, min_support=0.01, max_len=2):
    # Step 1: Convert data to basket format (group by order = client + date)
    baskets = data.groupby(['Client ID', 'Date'])['Product Description'].apply(list).reset_index()

    # Step 2: Encode transactions to binary matrix
    transactions = baskets['Product Description'].tolist()
    te = TransactionEncoder()
    te_matrix = te.fit(transactions).transform(transactions)
    df_basket = pd.DataFrame(te_matrix, columns=te.columns_)

    # Step 3: Keep only top N most frequent products
    top_products = df_basket.sum().sort_values(ascending=False).head(top_n_products).index
    df_filtered = df_basket[top_products]

    # Step 4: Apply Apriori with error handling
    try:
        frequent_itemsets = apriori(
            df_filtered,
            min_support=min_support,
            use_colnames=True,
            max_len=max_len
        )
    except MemoryError:
        print("MemoryError: Try reducing `top_n_products` or `max_len`.")
        return pd.DataFrame()

    # Step 5: Extract strong rules with confidence and lift filters
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    rules = rules[rules['confidence'] > 0.2].dropna(subset=['antecedents', 'consequents'])
    rules = rules.sort_values(by='lift', ascending=False).reset_index(drop=True)

    return rules

check_out = run_market_basket_analysis(data, top_n_products=400, max_len=2)
check_out

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(IOG PEDAÇOS MORANGO 5KG RTM),(IOG PEDAÇOS PÊSS/MARACUJÁ 5KG RTM),0.014498,0.014046,0.012391,0.854671,60.846490,1.0,0.012187,6.784300,0.998035,0.767081,0.852601,0.868407
1,(IOG PEDAÇOS PÊSS/MARACUJÁ 5KG RTM),(IOG PEDAÇOS MORANGO 5KG RTM),0.014046,0.014498,0.012391,0.882143,60.846490,1.0,0.012187,8.361836,0.997578,0.767081,0.880409,0.868407
2,(RISSOIS CAMARAO MINI MCHEF CG 46UN 920G),(PASTEIS BACALHAU MINI MCHEF CG 55UN 825G),0.013896,0.013545,0.011137,0.801444,59.170317,1.0,0.010949,4.968148,0.996953,0.683077,0.798718,0.811833
3,(PASTEIS BACALHAU MINI MCHEF CG 55UN 825G),(RISSOIS CAMARAO MINI MCHEF CG 46UN 920G),0.013545,0.013896,0.011137,0.822222,59.170317,1.0,0.010949,5.546836,0.996598,0.683077,0.819717,0.811833
4,(CROQUETES CARNE MINI MCHEF CG 55UN 825G),(PASTEIS BACALHAU MINI MCHEF CG 55UN 825G),0.015351,0.013545,0.011388,0.741830,54.769039,1.0,0.011180,3.820953,0.997047,0.650430,0.738285,0.791285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6194,(AGUA MINERAL CALDAS DE PENACOVA 5LT),(ACUCAR AMANH BCO PAP KG),0.062005,0.147035,0.012742,0.205502,1.397635,1.0,0.003625,1.073589,0.303313,0.064912,0.068545,0.146081
6195,(COUVE CORACAO DE BOI C/FOLHAS RCH),(ACUCAR AMANH BCO PAP KG),0.054028,0.147035,0.011087,0.205200,1.395582,1.0,0.003143,1.073181,0.299642,0.058358,0.068191,0.140300
6196,(AGUA MINERAL CALDAS DE PENACOVA 5LT),(LEITE MCHEF UHT M/GORDO LT),0.062005,0.172670,0.013695,0.220874,1.279169,1.0,0.002989,1.061869,0.232669,0.061975,0.058264,0.150094
6197,(ARROZ AGULHA MASTERCHEF 5 KG),(LEITE MCHEF UHT M/GORDO LT),0.053125,0.172670,0.011087,0.208687,1.208593,1.0,0.001913,1.045516,0.182275,0.051636,0.043535,0.136447


## Aqui Meter a função de Categorias dos produtos

Graph Visualization Maybe?

# Similarity Measures

Jaccard, Cosine and Dice Similarity

In [125]:
# Monthly basket construction
baskets = data.groupby(['Client ID', pd.Grouper(key='Date', freq='M')])['Product Description'].apply(list).reset_index()
transactions = baskets['Product Description'].tolist()

# Encode
te = TransactionEncoder()
te_matrix = te.fit(transactions).transform(transactions)
df_basket = pd.DataFrame(te_matrix, columns=te.columns_)

# Filter by frequency
df_basket = df_basket[df_basket.sum().sort_values(ascending=False).head(10).index]

  baskets = data.groupby(['Client ID', pd.Grouper(key='Date', freq='M')])['Product Description'].apply(list).reset_index()


In [126]:
def dice_similarity_matrix(X):
    X = X.astype(bool).astype(int)
    intersection = np.dot(X.T, X)
    row_sums = X.sum(axis=0).values
    dice = 2 * intersection / (row_sums[:, None] + row_sums[None, :])
    np.fill_diagonal(dice, 1.0)
    return pd.DataFrame(dice, index=X.columns, columns=X.columns)

def compute_similarity_matrices(basket_matrix):
    X = (basket_matrix > 0).astype(int)

    # Jaccard
    jaccard_sim = 1 - pairwise_distances(X.T.values, metric='jaccard')
    df_jaccard = pd.DataFrame(jaccard_sim, index=X.columns, columns=X.columns)

    # Cosine
    cosine_sim = cosine_similarity(X.T.values)
    df_cosine = pd.DataFrame(cosine_sim, index=X.columns, columns=X.columns)

    # Dice
    df_dice = dice_similarity_matrix(X)

    return {
        'jaccard': df_jaccard,
        'cosine': df_cosine,
        'dice': df_dice
    }

In [127]:
similarities = compute_similarity_matrices(df_basket)
print(similarities['jaccard'].head())

                                         LEITE MCHEF UHT M/GORDO LT  \
LEITE MCHEF UHT M/GORDO LT                                 1.000000   
ACUCAR AMANH BCO PAP KG                                    0.310148   
SAL AMANHECER CRISTAL KG                                   0.229154   
FARINHA AMANH S/FERMENTO 1KG                               0.250161   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                    0.199615   

                                         ACUCAR AMANH BCO PAP KG  \
LEITE MCHEF UHT M/GORDO LT                              0.310148   
ACUCAR AMANH BCO PAP KG                                 1.000000   
SAL AMANHECER CRISTAL KG                                0.361345   
FARINHA AMANH S/FERMENTO 1KG                            0.350214   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                 0.321014   

                                         SAL AMANHECER CRISTAL KG  \
LEITE MCHEF UHT M/GORDO LT                               0.229154   
ACUCAR AMANH BCO PAP KG   



In [128]:
print(similarities['cosine'].head())

                                         LEITE MCHEF UHT M/GORDO LT  \
LEITE MCHEF UHT M/GORDO LT                                 1.000000   
ACUCAR AMANH BCO PAP KG                                    0.473550   
SAL AMANHECER CRISTAL KG                                   0.378908   
FARINHA AMANH S/FERMENTO 1KG                               0.409233   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                    0.344096   

                                         ACUCAR AMANH BCO PAP KG  \
LEITE MCHEF UHT M/GORDO LT                              0.473550   
ACUCAR AMANH BCO PAP KG                                 1.000000   
SAL AMANHECER CRISTAL KG                                0.537655   
FARINHA AMANH S/FERMENTO 1KG                            0.528341   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                 0.500055   

                                         SAL AMANHECER CRISTAL KG  \
LEITE MCHEF UHT M/GORDO LT                               0.378908   
ACUCAR AMANH BCO PAP KG   

In [129]:
print(similarities['dice'].head())

                                         LEITE MCHEF UHT M/GORDO LT  \
LEITE MCHEF UHT M/GORDO LT                                 1.000000   
ACUCAR AMANH BCO PAP KG                                    0.473455   
SAL AMANHECER CRISTAL KG                                   0.372864   
FARINHA AMANH S/FERMENTO 1KG                               0.400206   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                    0.332798   

                                         ACUCAR AMANH BCO PAP KG  \
LEITE MCHEF UHT M/GORDO LT                              0.473455   
ACUCAR AMANH BCO PAP KG                                 1.000000   
SAL AMANHECER CRISTAL KG                                0.530864   
FARINHA AMANH S/FERMENTO 1KG                            0.518753   
VINAGRE AMANHECER DE VINHO BRANCO 1000M                 0.486012   

                                         SAL AMANHECER CRISTAL KG  \
LEITE MCHEF UHT M/GORDO LT                               0.372864   
ACUCAR AMANH BCO PAP KG   

Visualization

In [140]:
"""plt.figure(figsize=(12, 8))
sns.heatmap(similarities['jaccard'].iloc[:20, :20], cmap="Blues")
plt.title("Jaccard Similarity (Top 20 Products)")
plt.show()"""

'plt.figure(figsize=(12, 8))\nsns.heatmap(similarities[\'jaccard\'].iloc[:20, :20], cmap="Blues")\nplt.title("Jaccard Similarity (Top 20 Products)")\nplt.show()'

# Page Rank

In [131]:
def build_cooccurrence_matrix(df_basket):
    binary = df_basket.astype(int)
    return np.dot(binary.T, binary)

def normalize_transition_matrix(co_matrix):
    trans_matrix = co_matrix.astype(float)
    row_sums = trans_matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1
    return trans_matrix / row_sums

def pagerank_scores(trans_matrix, damping=0.85, max_iter=100, tol=1e-6):
    n = trans_matrix.shape[0]
    rank = np.ones(n) / n
    teleport = np.ones(n) / n
    for _ in range(max_iter):
        prev_rank = rank.copy()
        rank = damping * trans_matrix.dot(rank) + (1 - damping) * teleport
        if np.linalg.norm(rank - prev_rank, ord=1) < tol:
            break
    return rank

def compute_pagerank_recommendations(df_basket, damping=0.85, top_n=None):
    co_matrix = build_cooccurrence_matrix(df_basket)
    np.fill_diagonal(co_matrix, 0)
    trans_matrix = normalize_transition_matrix(co_matrix)
    scores = pagerank_scores(trans_matrix, damping=damping)
    ranked = pd.Series(scores, index=df_basket.columns).sort_values(ascending=False)
    return ranked if top_n is None else ranked.head(top_n)

In [132]:
pagerank_scores = compute_pagerank_recommendations(df_basket)
pagerank_scores

SAL AMANHECER CRISTAL KG                   0.1
FARINHA AMANH S/FERMENTO 1KG               0.1
LEITE MCHEF UHT M/GORDO LT                 0.1
ACUCAR AMANH BCO PAP KG                    0.1
VINAGRE AMANHECER DE VINHO BRANCO 1000M    0.1
POLPA TOMATE MCHEF 1LT                     0.1
BOLACHA AMANHECER MARIA 4X200GR            0.1
ESPARGUETE MASTERCHEF 1KG                  0.1
MAIONESE GULOSO 5L                         0.1
ERVILHA MCHEF CONG 2,5 KG                  0.1
dtype: float64

# Integrated Recommendation System

## **SMART BASKET (Check-In)**

In [133]:
def smart_basket_recommendations(df_basket, similarities, pagerank_scores, top_n=10):
    top_pagerank = pagerank_scores.head(top_n).index.tolist()
    recommendations = []

    for product in top_pagerank:
        similar_products = similarities['cosine'][product].drop(index=product).sort_values(ascending=False)
        similar_filtered = [p for p in similar_products.index if p not in top_pagerank][:3]
        recommendations.append({
            'Product': product,
            'Recommended due to similarity': similar_filtered
        })

    return pd.DataFrame(recommendations)

In [134]:
smart_basket_df = smart_basket_recommendations(df_basket, similarities, pagerank_scores, top_n=5)
print("Smart Basket Recommendations (Check-In):")
print(smart_basket_df)

Smart Basket Recommendations (Check-In):
                                   Product  \
0                 SAL AMANHECER CRISTAL KG   
1             FARINHA AMANH S/FERMENTO 1KG   
2               LEITE MCHEF UHT M/GORDO LT   
3                  ACUCAR AMANH BCO PAP KG   
4  VINAGRE AMANHECER DE VINHO BRANCO 1000M   

                       Recommended due to similarity  
0  [ESPARGUETE MASTERCHEF 1KG, BOLACHA AMANHECER ...  
1  [ERVILHA MCHEF CONG 2,5 KG, MAIONESE GULOSO 5L...  
2  [POLPA TOMATE MCHEF 1LT, ERVILHA MCHEF CONG 2,...  
3  [BOLACHA AMANHECER MARIA 4X200GR, ERVILHA MCHE...  
4  [MAIONESE GULOSO 5L, POLPA TOMATE MCHEF 1LT, E...  


# **"DID YOU FORGET?" (Check-Out)**

In [135]:
def checkout_recommendations(current_basket, rules_df, similarities, top_n=5):
    recommendations = []

    for _, rule in rules_df.iterrows():
        antecedents = set(rule['antecedents'])
        consequents = set(rule['consequents'])

        if antecedents.issubset(current_basket) and not consequents.issubset(current_basket):
            for item in consequents:
                if item not in recommendations:
                    if item in similarities['dice'].columns:
                        similar_items = similarities['dice'][item].drop(index=item).sort_values(ascending=False)
                        similar_filtered = [p for p in similar_items.index if p not in current_basket][:2]
                    else:
                        similar_filtered = []

                    recommendations.append({
                        'Recommended Item': item,
                        'Because you bought': ', '.join(antecedents),
                        'Confidence': rule['confidence'],
                        'Lift': rule['lift'],
                        'Similar popular items': similar_filtered
                    })

                if len(recommendations) >= top_n:
                    break

        if len(recommendations) >= top_n:
            break

    return pd.DataFrame(recommendations)

In [136]:
current_basket = {'LEITE MCHEF UHT M/GORDO LT', 'ACUCAR AMANH BCO PAP KG'}
checkout_df = checkout_recommendations(current_basket, check_out, similarities, top_n=5)

print("\nDid You Forget? Recommendations (Check-Out):")
print(checkout_df)


Did You Forget? Recommendations (Check-Out):
                   Recommended Item       Because you bought  Confidence  \
0      FARINHA AMANH S/FERMENTO 1KG  ACUCAR AMANH BCO PAP KG    0.256909   
1                    MORANGO KG RCH  ACUCAR AMANH BCO PAP KG    0.201296   
2                MAIONESE GULOSO 5L  ACUCAR AMANH BCO PAP KG    0.200273   
3  OVO PASTEURIZADO TETRA BRIK 1 Lt  ACUCAR AMANH BCO PAP KG    0.227226   

       Lift                              Similar popular items  
0  3.214829  [SAL AMANHECER CRISTAL KG, VINAGRE AMANHECER D...  
1  2.675096                                                 []  
2  2.487377  [VINAGRE AMANHECER DE VINHO BRANCO 1000M, FARI...  
3  2.262501                                                 []  


# **MCCV**

In [137]:
def monte_carlo_validation(data, rules_df, similarities, iterations=5, cutoff='2022-10-01', top_n=5):
    clients = data['Client ID'].unique()
    results = []

    for _ in range(iterations):
        sampled_clients = random.sample(list(clients), k=int(len(clients) * 0.1))

        for client_id in sampled_clients:
            client_data = data[data['Client ID'] == client_id]
            obs_data = client_data[client_data['Date'] < cutoff]
            test_data = client_data[client_data['Date'] >= cutoff]

            if obs_data.empty or test_data.empty:
                continue

            baskets = obs_data.groupby(['Client ID', 'Date'])['Product Description'].apply(list).reset_index()
            transactions = baskets['Product Description'].tolist()
            te = TransactionEncoder()
            te_matrix = te.fit(transactions).transform(transactions)
            df_basket = pd.DataFrame(te_matrix, columns=te.columns_)
            df_basket = df_basket[df_basket.sum().sort_values(ascending=False).head(400).index]

            rules = run_market_basket_analysis(obs_data, top_n_products=400, max_len=2)

            future_basket = test_data.groupby('Date')['Product Description'].apply(list).sample(1).values[0]
            if len(future_basket) < 2:
                continue

            held_out = future_basket.pop()
            current_basket = set(future_basket)

            recs = checkout_recommendations(current_basket, rules, similarities, top_n=top_n)

            if not recs.empty and 'Recommended Item' in recs.columns:
                hit = held_out in recs['Recommended Item'].values
            else:
                hit = False

            results.append(hit)

    accuracy = sum(results) / len(results) if results else 0
    print(f"Monte Carlo Cross-Validation Accuracy: {accuracy:.2%}")

In [138]:
monte_carlo_validation(data, check_out, similarities, iterations=10, cutoff='2022-11-01', top_n=5)

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
  ce

Monte Carlo Cross-Validation Accuracy: 16.72%


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
