In [33]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [34]:
# Load the raw interaction events.
# Later cells rely on the `timestamp`, `visitorid` and `itemid` columns of this file.
events_df = pd.read_csv('./data/events.csv')

In [35]:
# Convert timestamps to datetime for better readability.
# unit='ms' makes pandas interpret the raw values as milliseconds since the Unix epoch.
# NOTE(review): the column is overwritten in place, so this cell is meant to run once
# per load -- confirm that re-running it on already-converted values is harmless.
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'], unit='ms')

In [36]:
# Sample 100,000 interactions (fixed seed so the same rows are drawn on every run)
sampled_events_df = events_df.sample(n=100000, random_state=42)

In [37]:
# Temporal split: the 80th percentile of the sampled timestamps is the cut-off.
# Interactions strictly before it are used for training, the rest for testing.
event_times = sampled_events_df['timestamp']
split_date = event_times.quantile(0.8)
print("Split date:", split_date)
train_df = sampled_events_df[event_times < split_date]
test_df = sampled_events_df[event_times >= split_date]

Split date: 2015-08-18 15:23:50.822800128


In [38]:
# Sanity check: report how many interactions ended up on each side of the split.
print("Number of interactions in the training set:", len(train_df))
print("Number of interactions in the test set:", len(test_df))

Number of interactions in the training set: 80000
Number of interactions in the test set: 20000


In [39]:
# Collapse repeated interactions so each (user, item) pair appears once per split
pair_columns = ['visitorid', 'itemid']
train_df = train_df.drop_duplicates(subset=pair_columns)
test_df = test_df.drop_duplicates(subset=pair_columns)

In [40]:
print("Number of unique (user, item) pairs in the training set:", len(train_df))
print("Number of unique (user, item) pairs in the test set:", len(test_df))

Number of unique (user, item) pairs in the training set: 78342
Number of unique (user, item) pairs in the test set: 19655


In [41]:
# Keep only test interactions whose user also appears in the training set
known_visitors = train_df['visitorid']
test_df = test_df.loc[test_df['visitorid'].isin(known_visitors)]

In [42]:
# Keep only test interactions whose item also appears in the training set
known_items = train_df['itemid']
test_df = test_df.loc[test_df['itemid'].isin(known_items)]
print("\nConjunto de teste após filtrar usuários e itens não vistos:\n", test_df.head())


Conjunto de teste após filtrar usuários e itens não vistos:
                       timestamp  visitorid      event  itemid  transactionid
1223771 2015-09-04 15:45:22.989     914082       view  105108            NaN
1149787 2015-08-31 20:33:47.398     899857       view   76512            NaN
1380228 2015-09-12 18:06:25.336     530559  addtocart  277490            NaN
1024747 2015-08-24 22:24:25.585     247235       view  395749            NaN
1434974 2015-09-15 16:56:46.717     276250       view  411126            NaN


In [43]:
# Encode user and item IDs: one LabelEncoder per ID space
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [44]:
# Create explicit copies of the DataFrames to avoid SettingWithCopyWarning
# when the encoded `user` / `item` columns are added below
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

In [45]:
# Fit the encoders on the training IDs and store the resulting integer codes
# in the new `user` / `item` columns (these codes index the embedding tables later on)
train_df_copy['user'] = user_encoder.fit_transform(train_df_copy['visitorid'])
train_df_copy['item'] = item_encoder.fit_transform(train_df_copy['itemid'])

In [46]:
# Preview the encoded training set (first five rows only)
print("\nConjunto de treinamento codificado:\n", train_df_copy.head())

# Show the visitorid -> user mapping for those same five rows
print("\nMapeamento de usuários (visitorid -> user):")
print(dict(zip(train_df_copy['visitorid'].head(), train_df_copy['user'].head())))

# Show the itemid -> item mapping for those same five rows
print("\nMapeamento de itens (itemid -> item):")
print(dict(zip(train_df_copy['itemid'].head(), train_df_copy['item'].head())))


Conjunto de treinamento codificado:
                       timestamp  visitorid event  itemid  transactionid  \
486798  2015-06-25 00:46:56.976      50734  view    4442            NaN   
1601366 2015-05-10 17:50:37.515    1066758  view  221329            NaN   
843976  2015-08-13 23:17:44.399    1049477  view   23683            NaN   
2524686 2015-07-22 21:39:19.324     143239  view    6552            NaN   
2361757 2015-07-14 21:27:06.237     976898  view   82224            NaN   

          user   item  
486798    2534    386  
1601366  53091  20145  
843976   52184   2123  
2524686   7167    582  
2361757  48592   7530  

Mapeamento de usuários (visitorid -> user):
{50734: 2534, 1066758: 53091, 1049477: 52184, 143239: 7167, 976898: 48592}

Mapeamento de itens (itemid -> item):
{4442: 386, 221329: 20145, 23683: 2123, 6552: 582, 82224: 7530}


In [47]:
# Transform test data using the encoders fitted on the training set.
# The test set was filtered earlier to users/items present in the training set,
# so every ID here is one the encoders have already seen.
test_df_copy['user'] = user_encoder.transform(test_df_copy['visitorid'])
test_df_copy['item'] = item_encoder.transform(test_df_copy['itemid'])

# Preview the encoded test set (first five rows only)
print("\nConjunto de teste codificado:\n", test_df_copy.head())


Conjunto de teste codificado:
                       timestamp  visitorid      event  itemid  transactionid  \
1223771 2015-09-04 15:45:22.989     914082       view  105108            NaN   
1149787 2015-08-31 20:33:47.398     899857       view   76512            NaN   
1380228 2015-09-12 18:06:25.336     530559  addtocart  277490            NaN   
1024747 2015-08-24 22:24:25.585     247235       view  395749            NaN   
1434974 2015-09-15 16:56:46.717     276250       view  411126            NaN   

          user   item  
1223771  45476   9628  
1149787  44790   6975  
1380228  26548  25408  
1024747  12436  36301  
1434974  13889  37611  


In [48]:
# Custom dataset for PyTorch
class InteractionDataset(Dataset):
    """Map-style dataset of index-aligned (user, item, label) triples.

    Args:
        users: Sequence of user indices.
        items: Sequence of item indices, aligned with ``users``.
        labels: Sequence of labels, aligned with ``users``.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, users, items, labels):
        # Fail early: a length mismatch would otherwise surface as an IndexError
        # deep inside the DataLoader, or silently drop trailing samples.
        if not (len(users) == len(items) == len(labels)):
            raise ValueError(
                "users, items and labels must have the same length, got "
                f"{len(users)}, {len(items)} and {len(labels)}"
            )
        self.users = users
        self.items = items
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

In [49]:
# Generate positive and negative samples for training
def generate_samples(df, num_negatives=1, random_state=None):
    """Build a shuffled set of positive and randomly drawn negative samples.

    Every distinct (user, item) pair in ``df`` becomes a positive sample (label 1).
    For each user, ``num_negatives`` candidate items per row of that user are drawn
    uniformly, with replacement, from the items present in ``df``. Candidates the
    user has already interacted with are discarded, so a user can end up with
    fewer negatives than requested and the same negative pair may occur twice.

    Args:
        df: DataFrame with ``user`` and ``item`` columns.
        num_negatives: Number of negative candidates drawn per row of ``df``.
        random_state: Optional integer seed for the negative draws. When ``None``
            (the default) the global NumPy random state is used.

    Returns:
        Tuple ``(users, items, labels)`` of equally long NumPy arrays. The rows
        are shuffled with a fixed seed (42).
    """
    user_item_set = set(zip(df['user'], df['item']))
    all_items = df['item'].unique()
    # RandomState exposes the same ``choice`` API as the module-level functions.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # ``assign`` returns a new frame, so no column is set on a derived slice.
    positive_samples = df[['user', 'item']].drop_duplicates().assign(label=1)

    # Count rows per user in a single pass (users in order of first appearance)
    # instead of re-filtering the whole frame once per user.
    rows_per_user = df.groupby('user', sort=False).size()

    negative_samples = []
    for user, n_rows in rows_per_user.items():
        candidates = rng.choice(all_items, size=num_negatives * n_rows, replace=True)
        negative_samples.extend(
            (user, item, 0) for item in candidates if (user, item) not in user_item_set
        )

    frames = [positive_samples]
    if negative_samples:
        # Skip the empty case: concatenating an empty object-dtype frame could
        # change the dtypes of the integer columns.
        frames.append(pd.DataFrame(negative_samples, columns=['user', 'item', 'label']))
    samples = pd.concat(frames).sample(frac=1, random_state=42).reset_index(drop=True)

    return samples['user'].values, samples['item'].values, samples['label'].values

In [50]:
# Generate samples for training.
# Negative sampling draws from NumPy's global random state, so seed it first
# to get the same negatives on every Restart & Run All.
np.random.seed(42)
train_users, train_items, train_labels = generate_samples(train_df_copy)

# Preview the first five generated samples
print("\nAmostras de treinamento:\n")
print("Usuários:", train_users[:5])
print("Itens:", train_items[:5])
print("Labels:", train_labels[:5])


Amostras de treinamento:

Usuários: [  239 44551 24850 16593 64381]
Itens: [25459 20676  7936 16160 17315]
Labels: [1 1 0 1 0]


In [51]:
# Create DataLoader for training
train_dataset = InteractionDataset(train_users, train_items, train_labels)
# Mini-batches of 32 samples, reshuffled every epoch; num_workers=0 loads data in the main process
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

In [52]:
# Define the neural network model
class SimpleNeuralCollaborativeFiltering(nn.Module):
    """Small neural collaborative-filtering model.

    A user embedding and an item embedding are concatenated and passed through
    a 16 -> 8 -> 1 stack of linear layers with ReLU activations; a final
    sigmoid maps the result to a score in (0, 1).

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_size=5):  # embedding size reduced even further
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        # The first layer consumes the concatenated user and item vectors.
        self.fc1 = nn.Linear(2 * embedding_size, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

    def forward(self, user, item):
        """Score (user, item) index pairs; the output keeps a trailing dim of size 1."""
        pair_features = torch.cat(
            (self.user_embedding(user), self.item_embedding(item)), dim=-1
        )
        hidden = self.fc1(pair_features).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).sigmoid()

In [53]:
# Seed torch's global RNG so weight initialisation and the DataLoader's
# shuffling order are the same on every Restart & Run All.
torch.manual_seed(42)
# Size the embedding tables from the classes the encoders learned on the training set
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
model = SimpleNeuralCollaborativeFiltering(num_users, num_items)

In [54]:
# Loss function and optimizer
# BCELoss takes probabilities, matching the sigmoid applied at the end of the model's forward pass
criterion = nn.BCELoss()
# Adam over all model parameters (embeddings and linear layers) with learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [55]:
# Training settings
# Everything runs on the CPU; the training and recommendation code moves its tensors to `device`
device = torch.device('cpu')
# As the last expression of the cell, this also displays the module structure
model.to(device)

SimpleNeuralCollaborativeFiltering(
  (user_embedding): Embedding(69990, 5)
  (item_embedding): Embedding(42628, 5)
  (fc1): Linear(in_features=10, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=1, bias=True)
)

In [56]:
# Train the neural network model
num_epochs = 2  # kept small for an initial test run
log_every = 10  # print a progress line every `log_every` batches
for epoch in range(num_epochs):
    model.train()  # put the model in training mode
    epoch_loss_sum = 0.0
    num_batches = 0
    for batch_idx, (user_ids, item_ids, labels) in enumerate(train_loader):
        user_ids = user_ids.to(device)
        item_ids = item_ids.to(device)
        labels = labels.to(device).float()  # the loss compares float scores with float targets
        optimizer.zero_grad()  # clear gradients left over from the previous step
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse a final batch of size 1 to a 0-dim tensor, which no longer
        # matches the shape of `labels` and makes BCELoss raise.
        outputs = model(user_ids, item_ids).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()  # back-propagate
        optimizer.step()  # update the weights
        batch_loss = loss.item()
        epoch_loss_sum += batch_loss
        num_batches += 1
        if batch_idx % log_every == 0:  # periodic progress checkpoint
            print(f'Batch {batch_idx}/{len(train_loader)}, Loss: {batch_loss}')
    # Report the mean over the epoch rather than whichever batch happened to be
    # last; also avoids a NameError on `loss` when the loader is empty.
    mean_loss = epoch_loss_sum / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch+1}/{num_epochs}, Mean Loss: {mean_loss}')

Batch 0/4897, Loss: 0.6885088682174683
Batch 10/4897, Loss: 0.6953601837158203
Batch 20/4897, Loss: 0.6951708793640137
Batch 30/4897, Loss: 0.6869214177131653
Batch 40/4897, Loss: 0.6833251118659973
Batch 50/4897, Loss: 0.6968539357185364
Batch 60/4897, Loss: 0.6945077180862427
Batch 70/4897, Loss: 0.6935948133468628
Batch 80/4897, Loss: 0.6943491697311401
Batch 90/4897, Loss: 0.6972706317901611
Batch 100/4897, Loss: 0.6997047662734985
Batch 110/4897, Loss: 0.6999765634536743
Batch 120/4897, Loss: 0.6961562037467957
Batch 130/4897, Loss: 0.6889839768409729
Batch 140/4897, Loss: 0.6952756643295288
Batch 150/4897, Loss: 0.6885349750518799
Batch 160/4897, Loss: 0.6911840438842773
Batch 170/4897, Loss: 0.6964337229728699
Batch 180/4897, Loss: 0.6913577318191528
Batch 190/4897, Loss: 0.6850990056991577
Batch 200/4897, Loss: 0.6912617683410645
Batch 210/4897, Loss: 0.6935731172561646
Batch 220/4897, Loss: 0.7000172734260559
Batch 230/4897, Loss: 0.6913942694664001
Batch 240/4897, Loss: 0.696

In [57]:
# Function to get recommendations
def get_recommendations(user_id, num_recommendations, verbose=True):
    """Return the original IDs of the highest-scoring items for one user.

    Scores every encoded item index ``0 .. num_items - 1`` for the given user
    with the global ``model`` and maps the best indices back to original item
    IDs through the global ``item_encoder``.

    Args:
        user_id: Encoded user index (a value of the ``user`` column).
        num_recommendations: Number of items to return; capped at ``num_items``.
        verbose: When True (the default, matching the previous behaviour) the
            intermediate tensors are printed. Pass False when calling this in
            a loop to avoid flooding the output.

    Returns:
        Array of original item IDs, ordered from highest to lowest score.

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    user_idx = torch.tensor([user_id]).to(device)
    item_indices = torch.arange(num_items).to(device)

    if verbose:
        print(f"User ID (encoded): {user_idx}")
        print(f"Item Indices: {item_indices}")

    user_idx_repeated = user_idx.repeat(item_indices.shape[0])

    model.eval()  # inference: disable any training-only behaviour
    with torch.no_grad():
        # squeeze(-1) keeps a 1-D score vector even when there is a single item.
        scores = model(user_idx_repeated, item_indices).squeeze(-1)

    if verbose:
        print(f"Scores: {scores}")

    # topk raises if k exceeds the number of scored items.
    k = min(num_recommendations, scores.shape[0])
    _, top_item_indices = torch.topk(scores, k)

    if verbose:
        print(f"Top Item Indices: {top_item_indices}")

    recommended_item_ids = item_encoder.inverse_transform(top_item_indices.cpu().numpy())

    if verbose:
        print(f"Recommended Item IDs: {recommended_item_ids}")

    return recommended_item_ids

In [58]:
# Function to evaluate the model
def evaluate_model(test_df, num_recommendations):
    """Compute precision, recall and hit count of the top-N recommendations.

    For every user in ``test_df`` the recommended items are compared with the
    items that user actually interacted with in the test set.

    Args:
        test_df: DataFrame with an encoded ``user`` column and the original
            ``itemid`` column.
        num_recommendations: Number of items requested per user.

    Returns:
        Tuple ``(precision, recall, hits)`` where precision is hits divided by
        the total number of recommended items, recall is hits divided by the
        total number of relevant items, and each ratio is 0 when its
        denominator is 0.
    """
    hits = 0
    total_relevant = 0
    total_recommended = 0

    # get_recommendations returns ORIGINAL item IDs, so the ground truth must be
    # taken from `itemid`. Comparing against the encoded `item` column would mix
    # two different ID spaces and make the hit count meaningless.
    # Grouping once also avoids re-filtering the whole frame for every user.
    for user_id, user_item_ids in test_df.groupby('user', sort=False)['itemid']:
        actual_items = set(user_item_ids)
        recommended_items = set(get_recommendations(user_id, num_recommendations))

        hits += len(actual_items.intersection(recommended_items))
        total_relevant += len(actual_items)
        total_recommended += len(recommended_items)

    precision = hits / total_recommended if total_recommended > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0
    return precision, recall, hits


In [60]:
# Evaluate the model.
# The cutoff is defined once so the printed labels always match the k that was evaluated.
TOP_K = 5
precision, recall, hits = evaluate_model(test_df_copy, num_recommendations=TOP_K)

print(f'Precision@{TOP_K}: {precision}')
print(f'Recall@{TOP_K}: {recall}')
print(f'Hits@{TOP_K}: {hits}')

User ID (encoded): tensor([45476])
Item Indices: tensor([    0,     1,     2,  ..., 42625, 42626, 42627])
Scores: tensor([0.4951, 0.4762, 0.5277,  ..., 0.5015, 0.4563, 0.5388])
Top Item Indices: tensor([ 6829, 25831, 37279, 35718,  6352])
Recommended Item IDs: [ 75188 281826 407396 389607  70175]
User ID (encoded): tensor([44790])
Item Indices: tensor([    0,     1,     2,  ..., 42625, 42626, 42627])
Scores: tensor([0.4990, 0.4874, 0.5351,  ..., 0.5038, 0.4674, 0.5281])
Top Item Indices: tensor([37279,  3187, 24763, 26272, 41868])
Recommended Item IDs: [407396  35202 270291 286647 458265]
User ID (encoded): tensor([26548])
Item Indices: tensor([    0,     1,     2,  ..., 42625, 42626, 42627])
Scores: tensor([0.4843, 0.4823, 0.5252,  ..., 0.4950, 0.4706, 0.5244])
Top Item Indices: tensor([37279, 33559,  3187, 25831, 35718])
Recommended Item IDs: [407396 366879  35202 281826 389607]
User ID (encoded): tensor([12436])
Item Indices: tensor([    0,     1,     2,  ..., 42625, 42626, 42627])
