In [1]:
# Configuração do ambiente

# Garante que o notebook está na raiz do projeto
%cd .. 

# Verifica o diretório atual (Linux/Mac)
# !pwd  

# Verifica o diretório atual (Windows)
!cd 

C:\Users\flavi\Documents\GitHub\Projeto_7_Sistema_de_Recomendacao
C:\Users\flavi\Documents\GitHub\Projeto_7_Sistema_de_Recomendacao


In [2]:
# Importações Necessárias

import os
import sys
from pathlib import Path
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict
from datetime import datetime
from tensorflow.keras.metrics import MeanSquaredError 
from pathlib import Path

from src.config.paths import MODELS_DIR
from src.config.paths import DADOS_TRATADOS 

In [3]:
# --- Carregamento e Pré-processamento dos Dados ---

def load_and_preprocess_data(file_path):
    """Carrega e pré-processa os dados."""

    df = pd.read_parquet(file_path)
    df = df[['Customer Name', 'Product ID', 'Product Name', 'Sales', 'Category', 'Sub-Category']].copy()

    # Codificação
    customer_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    category_encoder = LabelEncoder()
    subcategory_encoder = LabelEncoder()

    df['Customer ID Enc'] = customer_encoder.fit_transform(df['Customer Name'])
    df['Product ID Enc'] = product_encoder.fit_transform(df['Product ID'])
    df['Category Enc'] = category_encoder.fit_transform(df['Category'])
    df['Sub-Category Enc'] = subcategory_encoder.fit_transform(df['Sub-Category'])

    # Normalização
    scaler = MinMaxScaler()
    df['Sales Normalized'] = scaler.fit_transform(df[['Sales']])

    return df, customer_encoder, product_encoder, category_encoder, subcategory_encoder

In [4]:
# --- Divisão dos Dados ---

def split_data(df):
    """Divide os dados em treino e teste."""

    customer_ids = df['Customer ID Enc'].values
    product_ids = df['Product ID Enc'].values
    category_ids = df['Category Enc'].values
    subcategory_ids = df['Sub-Category Enc'].values
    sales = df['Sales Normalized'].values

    customer_ids_train, customer_ids_test, \
    product_ids_train, product_ids_test, \
    category_ids_train, category_ids_test, \
    subcategory_ids_train, subcategory_ids_test, \
    sales_train, sales_test = train_test_split(
        customer_ids, product_ids, category_ids, subcategory_ids, sales,
        test_size=0.2, random_state=42
    )

    return (customer_ids_train, customer_ids_test,
            product_ids_train, product_ids_test,
            category_ids_train, category_ids_test,
            subcategory_ids_train, subcategory_ids_test,
            sales_train, sales_test)

In [5]:
# --- Modelo ---

def create_model(num_customers, num_products, num_categories, num_subcategories, embedding_dim):
    """Cria o modelo de recomendação."""

    customer_input = layers.Input(shape=(1,), name='customer_input')
    product_input = layers.Input(shape=(1,), name='product_input')
    category_input = layers.Input(shape=(1,), name='category_input')
    subcategory_input = layers.Input(shape=(1,), name='subcategory_input')

    customer_embeddings = layers.Embedding(input_dim=num_customers, output_dim=embedding_dim, name='customer_embeddings')(customer_input)
    product_embeddings = layers.Embedding(input_dim=num_products, output_dim=embedding_dim, name='product_embeddings')(product_input)
    category_embeddings = layers.Embedding(input_dim=num_categories, output_dim=embedding_dim, name='category_embeddings')(category_input)
    subcategory_embeddings = layers.Embedding(input_dim=num_subcategories, output_dim=embedding_dim, name='subcategory_embeddings')(subcategory_input)

    customer_vec = layers.Flatten(name='customer_flatten')(customer_embeddings)
    product_vec = layers.Flatten(name='product_flatten')(product_embeddings)
    category_vec = layers.Flatten(name='category_flatten')(category_embeddings)
    subcategory_vec = layers.Flatten(name='subcategory_flatten')(subcategory_embeddings)

    concat_vec = layers.Concatenate(name='concat')([customer_vec, product_vec, category_vec, subcategory_vec])

    dense_1 = layers.Dense(64, activation='relu', name='dense_1')(concat_vec)
    dense_2 = layers.Dense(32, activation='relu', name='dense_2')(dense_1)
    output = layers.Dense(1, activation='linear', name='output')(dense_2)

    model = tf.keras.Model([customer_input, product_input, category_input, subcategory_input], output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')  # Otimizador Adam e taxa de aprendizado padrão
    return model

In [6]:
# --- Métricas de Avaliação ---

def evaluate_model(model, customer_ids, product_ids, category_ids, subcategory_ids, sales, top_k=7):
    """Avalia o modelo usando MSE e métricas de recomendação (precisão@k, recall@k, F1@k)."""

    predictions = model.predict([customer_ids, product_ids, category_ids, subcategory_ids], verbose=0).flatten()
    mse_calculator = MeanSquaredError()  
    mse = mse_calculator(sales, predictions).numpy()  

    # Métricas de recomendação
    relevant_items = defaultdict(list)
    recommended_items = defaultdict(list)

    for i in range(len(customer_ids)):
        customer = customer_ids[i]
        true_sale = sales[i]
        prediction = predictions[i]

        if true_sale > 0.5:  # Define o limiar para relevância 
            relevant_items[customer].append((product_ids[i], true_sale))

        recommended_items[customer].append((product_ids[i], prediction))

    precision_at_k_sum = 0
    recall_at_k_sum = 0
    f1_at_k_sum = 0
    num_users_with_relevant_items = 0

    for customer in relevant_items:
        if len(relevant_items[customer]) > 0:
            num_users_with_relevant_items += 1
            # Sort recommended items by prediction score for this customer
            recommended_items[customer].sort(key=lambda x: x[1], reverse=True)
            top_k_recommended = [item[0] for item in recommended_items[customer][:top_k]]
            
            relevant_set = {item[0] for item in relevant_items[customer]}
            
            hits = len(relevant_set.intersection(top_k_recommended))
            precision = hits / top_k if top_k else 0
            recall = hits / len(relevant_set) if len(relevant_set) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            
            precision_at_k_sum += precision
            recall_at_k_sum += recall
            f1_at_k_sum += f1

    avg_precision_at_k = precision_at_k_sum / num_users_with_relevant_items if num_users_with_relevant_items > 0 else 0
    avg_recall_at_k = recall_at_k_sum / num_users_with_relevant_items if num_users_with_relevant_items > 0 else 0
    avg_f1_at_k = f1_at_k_sum / num_users_with_relevant_items if num_users_with_relevant_items > 0 else 0

    return mse, avg_precision_at_k, avg_recall_at_k, avg_f1_at_k

In [7]:
# --- Função de Recomendação ---

def recomendar_produtos(nome_cliente, df, model, customer_encoder, product_encoder, category_encoder, subcategory_encoder, top_k=7):
    """
    Retorna os top-k produtos recomendados para um cliente específico,
    lidando com o cold start e considerando diversidade básica.
    """

    try:
        cliente_id = customer_encoder.transform([nome_cliente])[0]
    except ValueError:
        # Cliente não encontrado (Cold Start)
        cliente_id = -1

    if cliente_id == -1:
        # Estratégia de Cold Start: Produtos mais vendidos por categoria
        # Calcula as vendas por categoria
        sales_by_category = df.groupby(['Category', 'Product ID'])['Sales'].sum().reset_index()

        # Para cada categoria, obtém os top-k produtos mais vendidos
        recommended_products = pd.DataFrame()
        for category in df['Category'].unique():
            top_products = sales_by_category[sales_by_category['Category'] == category].nlargest(top_k, 'Sales')
            recommended_products = pd.concat([recommended_products, top_products])

        # Decodifica os IDs dos produtos
        recommended_product_ids = recommended_products['Product ID'].unique()
        recommended_products_names = product_encoder.inverse_transform(recommended_product_ids)

        recommended_df = pd.DataFrame({'Product ID': recommended_products_names})
        recommended_df = recommended_df.merge(df[['Product ID', 'Product Name', 'Category', 'Sub-Category']], on='Product ID', how='left').drop_duplicates()

    else:
        # Cliente conhecido: Prever pontuações
        num_products = len(product_encoder.classes_)
        product_ids = np.arange(num_products)
        customer_ids = np.full((num_products,), cliente_id)
        category_ids = np.zeros(num_products)  # Usando 0 como placeholder
        subcategory_ids = np.zeros(num_products)  # Usando 0 como placeholder


        product_scores = model.predict([customer_ids.reshape(-1, 1), product_ids.reshape(-1, 1), category_ids.reshape(-1, 1), subcategory_ids.reshape(-1, 1)], verbose=0).flatten()

        # Obter os top-k produtos recomendados com diversidade básica
        # (Seleciona um pouco de cada categoria, se possível)
        top_product_indices = np.argsort(product_scores)[::-1]
        recommended_products = []
        categories_seen = set()
        
        i = 0
        while len(recommended_products) < top_k and i < len(top_product_indices):
            product_idx = top_product_indices[i]
            product_id_encoded = product_encoder.classes_[product_idx]
            product_category = df[df['Product ID Enc'] == product_idx]['Category'].iloc[0] #Pegar a categoria do produto original
            
            if product_category not in categories_seen:
                recommended_products.append(product_id_encoded)
                categories_seen.add(product_category)
            i += 1

        # Se não conseguiu diversificar totalmente, preenche com os melhores restantes
        if len(recommended_products) < top_k:
            remaining_products = product_encoder.inverse_transform(top_product_indices[:top_k - len(recommended_products)])
            recommended_products.extend(remaining_products)
            recommended_products = recommended_products[:top_k]  # Garante que não ultrapassa top_k

        recommended_df = pd.DataFrame({'Product ID': recommended_products})
        recommended_df = recommended_df.merge(df[['Product ID', 'Product Name', 'Category', 'Sub-Category']], on='Product ID', how='left').drop_duplicates()

    recommended_df.insert(0, 'Ranking', range(1, len(recommended_df) + 1))
    return recommended_df

In [8]:
# --- Treinamento e Avaliação ---

def train_and_evaluate(df, customer_encoder, product_encoder, category_encoder, subcategory_encoder, epochs=20, batch_size=32):
    """Treina e avalia o modelo."""

    (customer_ids_train, customer_ids_test,
     product_ids_train, product_ids_test,
     category_ids_train, category_ids_test,
     subcategory_ids_train, subcategory_ids_test,
     sales_train, sales_test) = split_data(df)

    num_customers = len(customer_encoder.classes_)
    num_products = len(product_encoder.classes_)
    num_categories = len(category_encoder.classes_)
    num_subcategories = len(subcategory_encoder.classes_)
    embedding_dim = 16  

    model = create_model(num_customers, num_products, num_categories, num_subcategories, embedding_dim)

    # Callbacks (TensorBoard)
    log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    model.fit(
        [customer_ids_train.reshape(-1, 1), product_ids_train.reshape(-1, 1), category_ids_train.reshape(-1, 1), subcategory_ids_train.reshape(-1, 1)],
        sales_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        callbacks=[tensorboard_callback]
    )

    # Avaliação
    mse_train, precision_train, recall_train, f1_train = evaluate_model(
        model, customer_ids_train, product_ids_train, category_ids_train, subcategory_ids_train, sales_train
    )
    mse_test, precision_test, recall_test, f1_test = evaluate_model(
        model, customer_ids_test, product_ids_test, category_ids_test, subcategory_ids_test, sales_test
    )

    print(f"Treino: MSE = {mse_train:.4f}, Precision@{7} = {precision_train:.4f}, Recall@{7} = {recall_train:.4f}, F1@{7} = {f1_train:.4f}")
    print(f"Teste: MSE = {mse_test:.4f}, Precision@{7} = {precision_test:.4f}, Recall@{7} = {recall_test:.4f}, F1@{7} = {f1_test:.4f}")

    return model

In [9]:
# Sobe para a raiz do projeto (assumindo que o notebook está em /notebooks)
project_root = Path.cwd()
sys.path.append(str(project_root))  # Adiciona a raiz ao Python path

# Importação correta
from src.config.paths import DADOS_TRATADOS

# Verificação
print(f"Caminho completo: {DADOS_TRATADOS}")
print(f"Arquivo existe? {DADOS_TRATADOS.exists()}")

# Uso no código
file_path = DADOS_TRATADOS

Caminho completo: C:\Users\flavi\Documents\GitHub\Projeto_7_Sistema_de_Recomendacao\dados\dados_tratados.parquet
Arquivo existe? True


In [10]:
# --- Main ---

if __name__ == '__main__':
    # Caminho para os dados 
    file_path = DADOS_TRATADOS

    # Carregar e pré-processar os dados
    df, customer_encoder, product_encoder, category_encoder, subcategory_encoder = load_and_preprocess_data(file_path)

    # Treinar e avaliar o modelo
    trained_model = train_and_evaluate(df, customer_encoder, product_encoder, category_encoder, subcategory_encoder)

    # Gerar recomendações para um cliente específico
    cliente_para_recomendar = "Irene Maddox"  
    recomendacoes = recomendar_produtos(cliente_para_recomendar, df, trained_model, customer_encoder, product_encoder, category_encoder, subcategory_encoder)

Epoch 1/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 4.5799e-04  
Epoch 2/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3.1083e-04  
Epoch 3/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.5649e-04    
Epoch 4/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.0751e-04  
Epoch 5/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.0802e-04
Epoch 6/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.2405e-05    
Epoch 7/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 5.3471e-05  
Epoch 8/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.4858e-05
Epoch 9/20
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.9822e-05  
Epoch 10/20
[1m245/245[0m [32m━━━━

In [11]:
print(f"\nRecomendações para {cliente_para_recomendar}:\n")
recomendacoes


Recomendações para Irene Maddox:



Unnamed: 0,Ranking,Product ID,Product Name,Category,Sub-Category
0,1,OFF-SU-10002881,Martin Yale Chadless Opener Electric Letter Op...,Office Supplies,Supplies
6,2,FUR-CH-10001215,Global Troy Executive Leather Low-Back Tilter,Furniture,Chairs
15,3,TEC-CO-10004722,Canon imageCLASS 2200 Advanced Copier,Technology,Copiers
26,4,OFF-AR-10002671,Hunt BOSTON Model 1606 High-Volume Electric Pe...,Office Supplies,Art
36,5,OFF-AP-10002945,Honeywell Enviracaire Portable HEPA Air Cleane...,Office Supplies,Appliances


### Salvando o modelo para persistência

In [12]:
def save_model(model, filename: str, overwrite: bool = False) -> Path:
    """
    Salva o modelo na pasta models com tratamento de erros.
    
    Parâmetros:
        model: Modelo treinado (Keras/TensorFlow)
        filename: Nome do arquivo (com extensão .keras ou .h5)
        overwrite: Se True, sobrescreve arquivos existentes
        
    Retorna:
        Caminho completo do modelo salvo
    """
    model_path = MODELS_DIR / filename
    
    if model_path.exists() and not overwrite:
        raise FileExistsError(f"Arquivo {model_path} já existe. Use overwrite=True para substituir.")
    
    # Garante que a extensão é válida
    if not filename.endswith(('.keras', '.h5')):
        raise ValueError("Extensão inválida. Use '.keras' ou '.h5'")
    
    model.save(model_path)
    
    # Verificação pós-salvamento
    if not model_path.exists():
        raise RuntimeError(f"Falha ao salvar o modelo em {model_path}")
    
    return model_path

# Uso:
try:
    saved_path = save_model(trained_model, "best_model_recomendacao.keras")
    print(f"✅ Modelo salvo com sucesso em:\n{saved_path}")
except Exception as e:
    print(f"❌ Erro ao salvar modelo: {str(e)}")

✅ Modelo salvo com sucesso em:
C:\Users\flavi\Documents\GitHub\Projeto_7_Sistema_de_Recomendacao\models\best_model_recomendacao.keras
