
# Trabajo Práctico: Predicción de Gastos de Clientes durante Black Sales
Este notebook contiene la solución a los puntos a), b), c) y d) del trabajo práctico.
El objetivo es predecir cuánto un cliente está dispuesto a gastar, utilizando deep learning.


In [4]:
# Punto a) - Análisis y preparación del dataset

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
dataset = pd.read_csv('./dataset_black_sales.csv')

# Exploratory analysis of the dataset
print("Información del dataset:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() would append a stray "None" line after the report.
dataset.info()

print("\nDescripción de la columna 'Purchase':")
print(dataset['Purchase'].describe())

# Check for missing values in every column
print("\nValores faltantes por columna:")
print(dataset.isnull().sum())

# Categorización de la columna Purchase
def categorize_purchase(purchase):
    """Map a purchase amount to one of four spending buckets.

    Returns 0 for amounts below 5000, 1 for [5000, 10000), 2 for
    [10000, 15000) and 3 for anything that falls under none of those bounds.
    """
    # Upper bounds are scanned in ascending order; the position of the first
    # bound the amount is strictly below is the bucket label.
    for bucket, upper_bound in enumerate((5000, 10000, 15000)):
        if purchase < upper_bound:
            return bucket
    return 3

# Derive the category column from the raw purchase amount
dataset['Purchase_Category'] = dataset['Purchase'].map(categorize_purchase)

# Show how many rows fall into each category
print("\nDistribución de categorías de 'Purchase':")
category_counts = dataset['Purchase_Category'].value_counts()
print(category_counts)

# Integer-encode the categorical columns, keeping one dedicated encoder per
# column so each mapping can be inspected or inverted later.
# NOTE(review): 'Stay_In_Current_City_Years' is also an object column in the
# info() report but is not encoded here.
label_encoder_gender = LabelEncoder()
label_encoder_age = LabelEncoder()
label_encoder_city = LabelEncoder()
for column_name, column_encoder in (
    ('Gender', label_encoder_gender),
    ('Age', label_encoder_age),
    ('City_Category', label_encoder_city),
):
    dataset[column_name] = column_encoder.fit_transform(dataset[column_name])

# Replace missing values in Product_Category_2 and Product_Category_3 with 0
for column_name in ('Product_Category_2', 'Product_Category_3'):
    dataset[column_name] = dataset[column_name].fillna(0)

# Dataset ready for the following parts
print("\nPrimeras filas del dataset preparado:")
print(dataset.head())

Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage

In [None]:
# Punto b) - Modelo de deep learning sin embeddings
# Preprocesamiento y creación del modelo sin embeddings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

# Preprocessing
# Gender, Age and City_Category are already integer-encoded by the preparation
# cell. Re-fitting the encoders on those integers would overwrite their classes_
# with integers and lose the original label mapping, so it is not repeated here.

# Select the relevant columns
FEATURE_COLUMNS = ['Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years',
                   'Marital_Status', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']

# Work on a copy so this cell is idempotent and leaves `dataset` untouched.
X = dataset[FEATURE_COLUMNS].copy()

# 'Stay_In_Current_City_Years' is reported as an object column, which
# StandardScaler cannot convert to float. Presumably it holds strings such as
# '4+' (verify against the CSV): strip a trailing '+' and parse as a number.
X['Stay_In_Current_City_Years'] = pd.to_numeric(
    X['Stay_In_Current_City_Years'].astype(str).str.rstrip('+')
)

# Impute missing values (assignment instead of chained inplace=True, which
# may operate on a temporary object and leave the frame unchanged).
for column_name in ('Product_Category_2', 'Product_Category_3'):
    X[column_name] = X[column_name].fillna(0)

# Target variable
y = dataset['Purchase_Category']

# Split into training and validation BEFORE scaling so the scaler statistics
# are computed from training rows only (no leakage from the validation set).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features: fit on train, apply the same transform to validation
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
# Kept for any later cell that expects the full scaled matrix.
X_scaled = scaler.transform(X)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Definir el modelo
class SimpleNN(nn.Module):
    """Feed-forward classifier: input -> 64 -> 32 -> class logits.

    Each hidden layer is followed by ReLU and dropout; the output layer returns
    raw logits (no softmax), as expected by nn.CrossEntropyLoss.

    Args:
        input_dim: Number of features per sample. Defaults to 9.
        num_classes: Number of output logits. Defaults to 4.
        dropout: Dropout probability applied after each hidden layer.
            Defaults to 0.3.
    """

    def __init__(self, input_dim=9, num_classes=4, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, input_dim)."""
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)

# Seed PyTorch's global RNG so weight initialisation, dropout masks and batch
# shuffling are reproducible across runs.
torch.manual_seed(42)

model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader` and return (mean loss, accuracy in percent).

    When `optimizer` is given the model is put in training mode and its weights
    are updated after every batch; otherwise the model is evaluated with
    gradients disabled. The loss is averaged per sample, so a smaller final
    batch does not carry extra weight.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(is_training):
        for inputs, labels in loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # criterion returns the batch mean; scale back to a per-sample sum.
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    return loss_sum / n_seen, 100 * n_correct / n_seen


# Model training
n_epochs = 10
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(n_epochs):
    train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, optimizer)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # Validation
    val_loss, val_accuracy = _run_epoch(model, val_loader, criterion)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

# Plot loss and accuracy curves side by side
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))

ax_loss.plot(train_losses, label='Training Loss')
ax_loss.plot(val_losses, label='Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Loss per epoch')
ax_loss.legend()

ax_acc.plot(train_accuracies, label='Training Accuracy')
ax_acc.plot(val_accuracies, label='Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy (%)')
ax_acc.set_title('Accuracy per epoch')
ax_acc.legend()

fig.tight_layout()
plt.show()


In [None]:
# Punto c) - Modelo de deep learning con embeddings

class EmbeddingNN(nn.Module):
    """Classifier that combines learned user/product embeddings with dense features.

    The user and product embedding vectors are concatenated with the remaining
    features and passed through a 64 -> 32 -> num_classes feed-forward stack
    (ReLU + dropout after each hidden layer). The output is raw logits.

    Args:
        num_users: Number of rows in the user embedding table; user indices
            passed to forward() must lie in [0, num_users).
        num_products: Number of rows in the product embedding table; product
            indices must lie in [0, num_products).
        embedding_dim: Size of each embedding vector.
        num_other_features: Number of columns in `other_features`. Defaults to 7.
        num_classes: Number of output logits. Defaults to 4.
        dropout: Dropout probability. Defaults to 0.3.
    """

    def __init__(self, num_users, num_products, embedding_dim,
                 num_other_features=7, num_classes=4, dropout=0.3):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.product_embedding = nn.Embedding(num_products, embedding_dim)

        # Input width = user vector + product vector + dense feature columns.
        self.fc1 = nn.Linear(embedding_dim * 2 + num_other_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, user_id, product_id, other_features):
        """Return logits of shape (batch, num_classes).

        `user_id` and `product_id` are integer index tensors of shape (batch,);
        `other_features` is a float tensor of shape (batch, num_other_features).
        """
        user_embedded = self.user_embedding(user_id)
        product_embedded = self.product_embedding(product_id)

        x = torch.cat([user_embedded, product_embedded, other_features], dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)

# Build the embedding model, its loss function and its optimizer.
# The embedding tables get one row per distinct user / product in the dataset.
embedding_dim = 10
num_users = dataset['User_ID'].nunique()
num_products = dataset['Product_ID'].nunique()

model_embedding = EmbeddingNN(
    num_users=num_users,
    num_products=num_products,
    embedding_dim=embedding_dim,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_embedding.parameters(), lr=1e-3)

# The rest of the training procedure is similar to part b).
# NOTE(review): nn.Embedding looks rows up by integer position, so the raw
# User_ID / Product_ID values presumably need mapping to 0..n-1 before they are
# fed to the model - confirm when the training loop is written.


In [None]:
# Punto d) - Función de recomendación basada en embeddings

def recommend_similar_users(user_id, n=5, model=None):
    """Return the users whose embeddings are closest to the given user's.

    Similarity is the cosine similarity between rows of the model's user
    embedding table. The query user is excluded from the result, so at most
    (table size - 1) users can be returned.

    Args:
        user_id: Row index into the user embedding table (0-based), i.e. the
            same index the model receives as `user_id` - not a raw identifier.
        n: Maximum number of similar users to return.
        model: Object exposing a `user_embedding` nn.Embedding. Defaults to the
            module-level `model_embedding`.

    Returns:
        List of embedding row indices, ordered from most to least similar.

    Raises:
        IndexError: If `user_id` is outside the embedding table.
    """
    if model is None:
        model = model_embedding

    # detach(): similarity ranking must not record autograd history.
    all_user_embeddings = model.user_embedding.weight.detach()
    num_rows = all_user_embeddings.size(0)
    if not 0 <= user_id < num_rows:
        raise IndexError(
            f"user_id {user_id} is out of range for an embedding table with {num_rows} rows"
        )

    query = all_user_embeddings[user_id].unsqueeze(0)
    similarities = torch.cosine_similarity(query, all_user_embeddings, dim=1)

    ranked = similarities.argsort(descending=True)
    # A user is always maximally similar to itself; drop it from the ranking.
    ranked = ranked[ranked != user_id]
    return ranked[:n].tolist()

# Example usage
# The argument is used as a row index into the user embedding table, not as a
# raw User_ID value. A hard-coded value such as 12345 is out of range whenever
# the table has fewer than 12346 rows; index 0 is valid for any non-empty table.
user_id = 0
similar_users = recommend_similar_users(user_id, n=5)
print(f"Usuarios con comportamientos de compra similares al usuario {user_id}: {similar_users}")
