In [1]:
# Importações necessárias
import warnings
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import shap
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
import matplotlib.pyplot as plt
import io
import os
from datetime import datetime

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Função auxiliar para converter figuras do Matplotlib em imagens para o PDF
def fig_to_image(fig):
    """Converte uma figura matplotlib para um objeto Image do ReportLab"""
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
    buf.seek(0)
    return Image(buf, width=6*inch, height=3*inch)

In [3]:
# Função que realiza o pré-processamento da base de dados
def load_and_preprocess_data(file_path):
    """Carrega e pré-processa os dados."""
    df = pd.read_parquet(file_path)
    df = df[['Customer Name', 'Product ID', 'Product Name', 'Sales', 'Category', 'Sub-Category']].copy()

    # Codificação
    customer_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    category_encoder = LabelEncoder()
    subcategory_encoder = LabelEncoder()

    df['Customer ID Enc'] = customer_encoder.fit_transform(df['Customer Name'])
    df['Product ID Enc'] = product_encoder.fit_transform(df['Product ID'])
    df['Category Enc'] = category_encoder.fit_transform(df['Category'])
    df['Sub-Category Enc'] = subcategory_encoder.fit_transform(df['Sub-Category'])

    # Normalização
    scaler = MinMaxScaler()
    df['Sales Normalized'] = scaler.fit_transform(df[['Sales']])

    return df, customer_encoder, product_encoder, category_encoder, subcategory_encoder

In [4]:
# Função para dividir a base de dados em treino e teste
def split_data(df):
    """Divide os dados em treino e teste."""
    customer_ids = df['Customer ID Enc'].values
    product_ids = df['Product ID Enc'].values
    category_ids = df['Category Enc'].values
    subcategory_ids = df['Sub-Category Enc'].values
    sales = df['Sales Normalized'].values

    return train_test_split(
        customer_ids, product_ids, category_ids, subcategory_ids, sales,
        test_size=0.2, random_state=42
    )

In [5]:
# Função que efetivamente cria o modelo de recomendação de produtos
def create_model(num_customers, num_products, num_categories, num_subcategories, embedding_dim):
    """Cria o modelo de recomendação."""
    customer_input = layers.Input(shape=(1,), name='customer_input')
    product_input = layers.Input(shape=(1,), name='product_input')
    category_input = layers.Input(shape=(1,), name='category_input')
    subcategory_input = layers.Input(shape=(1,), name='subcategory_input')

    customer_embeddings = layers.Embedding(input_dim=num_customers, output_dim=embedding_dim, name='customer_embeddings')(customer_input)
    product_embeddings = layers.Embedding(input_dim=num_products, output_dim=embedding_dim, name='product_embeddings')(product_input)
    category_embeddings = layers.Embedding(input_dim=num_categories, output_dim=embedding_dim, name='category_embeddings')(category_input)
    subcategory_embeddings = layers.Embedding(input_dim=num_subcategories, output_dim=embedding_dim, name='subcategory_embeddings')(subcategory_input)

    customer_vec = layers.Flatten(name='customer_flatten')(customer_embeddings)
    product_vec = layers.Flatten(name='product_flatten')(product_embeddings)
    category_vec = layers.Flatten(name='category_flatten')(category_embeddings)
    subcategory_vec = layers.Flatten(name='subcategory_flatten')(subcategory_embeddings)

    concat_vec = layers.Concatenate(name='concat')([customer_vec, product_vec, category_vec, subcategory_vec])

    dense_1 = layers.Dense(64, activation='relu', name='dense_1')(concat_vec)
    dense_2 = layers.Dense(32, activation='relu', name='dense_2')(dense_1)
    output = layers.Dense(1, activation='linear', name='output')(dense_2)

    model = tf.keras.Model([customer_input, product_input, category_input, subcategory_input], output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

In [6]:
# Função que treina o modelo de recomendação de produtos
def train_model(model, customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train, sales_train, epochs=20, batch_size=32):
    """Treina o modelo."""
    model.fit(
        [customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train],
        sales_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0
    )
    return model

In [7]:
# Função já treinada que agora faz efetivamente a recomendação dos até 7 melhores produtos para o consumidor
def recomendar_produtos(nome_cliente, df, model, customer_encoder, product_encoder, category_encoder, subcategory_encoder, top_k=7):
    """Retorna os top-k produtos recomendados para um cliente específico."""
    try:
        cliente_id = customer_encoder.transform([nome_cliente])[0]
    except ValueError:
        cliente_id = -1

    if cliente_id == -1:
        sales_by_category = df.groupby(['Category', 'Product ID'])['Sales'].sum().reset_index()
        recommended_products = pd.DataFrame()
        for category in df['Category'].unique():
            top_products = sales_by_category[sales_by_category['Category'] == category].nlargest(top_k, 'Sales')
            recommended_products = pd.concat([recommended_products, top_products])

        recommended_product_ids = recommended_products['Product ID'].unique()
        valid_product_ids = [pid for pid in recommended_product_ids if pid in product_encoder.classes_]
        
        if not valid_product_ids:
            top_products = df.nlargest(top_k, 'Sales')['Product ID'].unique()
            valid_product_ids = [pid for pid in top_products if pid in product_encoder.classes_]
            
        if not valid_product_ids:
            valid_product_ids = product_encoder.classes_[:top_k]
            
        recommended_products_names = product_encoder.inverse_transform(product_encoder.transform(valid_product_ids))
        recommended_df = pd.DataFrame({'Product ID': recommended_products_names[:top_k]})
        recommended_df = recommended_df.merge(df[['Product ID', 'Product Name', 'Category', 'Sub-Category']], on='Product ID', how='left').drop_duplicates()
    else:
        num_products = len(product_encoder.classes_)
        product_ids = np.arange(num_products)
        customer_ids = np.full((num_products,), cliente_id)
        category_ids = np.zeros(num_products)
        subcategory_ids = np.zeros(num_products)

        product_scores = model.predict([customer_ids.reshape(-1, 1), product_ids.reshape(-1, 1), 
                                      category_ids.reshape(-1, 1), subcategory_ids.reshape(-1, 1)], 
                                     verbose=0).flatten()

        top_product_indices = np.argsort(product_scores)[::-1]
        recommended_products = []
        categories_seen = set()
        
        i = 0
        while len(recommended_products) < top_k and i < len(top_product_indices):
            product_idx = top_product_indices[i]
            product_id_encoded = product_encoder.classes_[product_idx]
            product_category = df[df['Product ID Enc'] == product_idx]['Category'].iloc[0]
            
            if product_category not in categories_seen:
                recommended_products.append(product_id_encoded)
                categories_seen.add(product_category)
            i += 1

        if len(recommended_products) < top_k:
            remaining_products = product_encoder.inverse_transform(top_product_indices[:top_k - len(recommended_products)])
            recommended_products.extend(remaining_products)
        recommended_products = recommended_products[:top_k]

        recommended_df = pd.DataFrame({'Product ID': recommended_products})
        recommended_df = recommended_df.merge(df[['Product ID', 'Product Name', 'Category', 'Sub-Category']], 
                                           on='Product ID', how='left').drop_duplicates()

    recommended_df.insert(0, 'Ranking', range(1, len(recommended_df) + 1))
    return recommended_df

In [8]:
# Função que explica as recomendações (método SHAP)
def explain_recommendations_shap(model, background_data, customer_id, product_id, category_id, subcategory_id):
    """Gera explicações SHAP para as recomendações."""
    def model_predict(X):
        return model.predict([
            X[:,0].reshape(-1,1),  # customer
            X[:,1].reshape(-1,1),  # product
            X[:,2].reshape(-1,1),  # category
            X[:,3].reshape(-1,1)   # subcategory
        ])
    
    explainer = shap.KernelExplainer(model_predict, background_data)
    test_data = np.array([[customer_id, product_id, category_id, subcategory_id]])
    shap_values = explainer.shap_values(test_data, l1_reg="aic")
    
    return shap_values[0]  # Retorna apenas os valores para a primeira instância

### RELATÓRIO:
Essa é a função que gera o relatório para a consumidora **Irene Maddox**. O modelo pode gerar relatórios para cada consumidor.

In [10]:
# Função para gerar relatórios SHAP a respeito das escolhas no algoritmo
def gerar_relatorio_irene_maddox(df, model, customer_encoder, product_encoder, category_encoder, subcategory_encoder,
                                customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train,
                                file_path="relatorio_irene_maddox.pdf"):
    """Gera um relatório em PDF com explicações SHAP para Irene Maddox."""
    
    # Configuração do PDF
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    doc = SimpleDocTemplate(file_path, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []
    
    # Estilos
    h1_style = styles['h1']
    h2_style = styles['h2']
    normal_style = styles['Normal']
    
    # 1. Cabeçalho
    story.append(Paragraph("Relatório de Interpretabilidade - Irene Maddox", h1_style))
    story.append(Paragraph(f"Data: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
    story.append(Spacer(1, 0.5 * inch))
    
    # 2. Recomendações
    story.append(Paragraph("Recomendações para Irene Maddox", h2_style))
    recomendacoes_df = recomendar_produtos("Irene Maddox", df, model, customer_encoder, product_encoder, 
                                          category_encoder, subcategory_encoder)
    
    # Tabela de recomendações
    data = [recomendacoes_df.columns.tolist()] + recomendacoes_df.values.tolist()
    table = Table(data)
    table.setStyle(TableStyle([('GRID', (0, 0), (-1, -1), 1, 'black'),
                              ('FONTSIZE', (0, 0), (-1, -1), 8),
                              ('VALIGN', (0, 0), (-1, -1), 'MIDDLE')]))
    story.append(table)
    story.append(Spacer(1, 0.5 * inch))
    
    # 3. Explicações SHAP
    story.append(Paragraph("Explicação das Recomendações", h2_style))
    story.append(Paragraph("Esta seção explica como o modelo chegou a essas recomendações usando valores SHAP.", normal_style))
    story.append(Spacer(1, 0.2 * inch))
    
    # Preparar dados para SHAP
    cliente_id = customer_encoder.transform(["Irene Maddox"])[0]
    
    # Amostrar dados de background
    background_size = min(100, len(customer_ids_train))
    background_indices = np.random.choice(len(customer_ids_train), size=background_size, replace=False)
    background_data = np.column_stack([
        customer_ids_train[background_indices],
        product_ids_train[background_indices],
        category_ids_train[background_indices],
        subcategory_ids_train[background_indices]
    ])
    
    # Gerar explicações SHAP para os produtos recomendados
    recommended_product_indices = [product_encoder.transform([pid])[0] for pid in recomendacoes_df['Product ID']]
    
    for i, product_idx in enumerate(recommended_product_indices[:5]):  # Analisar apenas os 5 primeiros para o relatório
        product_id = recomendacoes_df.iloc[i]['Product ID']
        product_name = recomendacoes_df.iloc[i]['Product Name']
        
        story.append(Paragraph(f"Produto {i+1}: {product_name} ({product_id})", h2_style))
        
        # Obter valores SHAP para este produto específico
        shap_values = explain_recommendations_shap(
            model, background_data,
            cliente_id, product_idx, 0, 0
        )
        
        # Gráfico de importância das features
        feature_names = ['Customer', 'Product', 'Category', 'Subcategory']
        shap_values = np.array(shap_values).flatten()
        
        fig, ax = plt.subplots(figsize=(10, 6))
        shap.bar_plot(shap_values, feature_names=feature_names, show=False)
        ax.set_title(f'Contribuição das Features', pad=20)
        ax.set_xlabel('Valor SHAP', labelpad=10)
        ax.tick_params(axis='both', which='major', labelsize=10)
        plt.tight_layout()
        
        story.append(fig_to_image(fig))
        story.append(Spacer(1, 0.2 * inch))
        
        # Explicação textual
        explanation = (
            f"Este gráfico mostra como cada característica contribuiu para a recomendação deste produto. "
            f"Valores SHAP positivos indicam que a característica aumentou a probabilidade de recomendação, "
            f"enquanto valores negativos diminuíram."
        )
        story.append(Paragraph(explanation, normal_style))
        story.append(Spacer(1, 0.5 * inch))
        
        plt.close(fig)
    
    doc.build(story)

if __name__ == '__main__':
    # Caminho para os dados
    from src.config import DADOS_TRATADOS
    file_path = DADOS_TRATADOS
    
    # Caminho para o PDF
    pdf_file_path = r"C:\Users\flavi\Documents\GitHub\Projeto_7_Sistema_de_Recomendacao\Notebooks\relatorio_irene_maddox.pdf"
    
    # Carregar e pré-processar os dados
    df, customer_encoder, product_encoder, category_encoder, subcategory_encoder = load_and_preprocess_data(file_path)
    
    # Dividir os dados
    (customer_ids_train, customer_ids_test,
     product_ids_train, product_ids_test,
     category_ids_train, category_ids_test,
     subcategory_ids_train, subcategory_ids_test,
     sales_train, sales_test) = split_data(df)
    
    # Criar e treinar o modelo
    num_customers = len(customer_encoder.classes_)
    num_products = len(product_encoder.classes_)
    num_categories = len(category_encoder.classes_)
    num_subcategories = len(subcategory_encoder.classes_)
    embedding_dim = 50
    
    model = create_model(num_customers, num_products, num_categories, num_subcategories, embedding_dim)
    model = train_model(model, customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train, sales_train)
    
    # Gerar relatório específico para Irene Maddox
    gerar_relatorio_irene_maddox(df, model, customer_encoder, product_encoder, category_encoder, subcategory_encoder,
                                customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train,
                                pdf_file_path)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.22it/s]


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.57it/s]


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.06it/s]


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.92it/s]
