In this notebook, I am aiming to improve the efficiency and performance of my VAE model in VAE2.ipynb

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from sklearn.preprocessing import LabelEncoder

In [3]:
user_data = pd.read_csv('../unpushed_work/last_fm_data/user_artists.dat', sep='\t')

user_interaction_counts = user_data.groupby('userID').size()
users_with_50_interactions = user_interaction_counts[user_interaction_counts >= 50].index
user_data_filtered = user_data[user_data['userID'].isin(users_with_50_interactions)]

unique_users = user_data_filtered['userID'].unique()

# Ensure that test users have at least 50 interactions in 'test_data'
# We need to carefully select 'test_users' to satisfy this condition

np.random.seed(42)

shuffled_users = np.random.permutation(unique_users)

train_users = []
test_users = []

# We'll collect test users until we have enough that have at least 50 interactions
for user in shuffled_users:
    user_data_temp = user_data_filtered[user_data_filtered['userID'] == user]
    if len(test_users) < int(0.2 * len(unique_users)):
        # Tentatively add to test_users
        test_users.append(user)
    else:
        train_users.append(user)

# Recreate test_data and train_data
train_data = user_data_filtered[user_data_filtered['userID'].isin(train_users)]
test_data = user_data_filtered[user_data_filtered['userID'].isin(test_users)]

# Now check that each user in test_data has 50 interactions
# Remove any users from test_users who don't meet this criterion
valid_test_users = []
for user in test_users:
    user_data_temp = test_data[test_data['userID'] == user]
    if len(user_data_temp) == 50:
        valid_test_users.append(user)

# Update test_users and test_data
test_users = valid_test_users
test_data = test_data[test_data['userID'].isin(test_users)]

# Update train_data to include any users removed from test_users
removed_test_users = set(shuffled_users) - set(train_users) - set(test_users)
if removed_test_users:
    train_users.extend(list(removed_test_users))
    train_data = user_data_filtered[user_data_filtered['userID'].isin(train_users)]

# Now proceed to split test_data into test_x and test_y
test_x = pd.DataFrame(columns=test_data.columns)
test_y = pd.DataFrame(columns=test_data.columns)

for user in test_users:
    user_data_temp = test_data[test_data['userID'] == user]
    user_data_shuffled = user_data_temp.sample(frac=1, random_state=42)
    user_test_x = user_data_shuffled.iloc[:25]
    user_test_y = user_data_shuffled.iloc[25:50]
    test_x = pd.concat([test_x, user_test_x], ignore_index=True)
    test_y = pd.concat([test_y, user_test_y], ignore_index=True)

In [4]:
# Encode artistIDs
# Essential since ML models require numerical input (also efficient)
artist_encoder = LabelEncoder()
all_artistIDs = user_data_filtered['artistID'].unique()
artist_encoder.fit(all_artistIDs)
num_artists = len(artist_encoder.classes_)

# Encode train users
train_user_encoder = LabelEncoder()
train_user_encoder.fit(train_users)
num_train_users = len(train_user_encoder.classes_)

# Encode test users
test_user_encoder = LabelEncoder()
test_user_encoder.fit(test_users)
num_test_users = len(test_user_encoder.classes_)

In [5]:
# Function to create user-item interaction matrix
def create_user_item_matrix(data, user_encoder, num_users):
    user_item_matrix = np.zeros((num_users, num_artists))
    for idx, row in data.iterrows():
        user_idx = user_encoder.transform([row['userID']])[0]
        artist_idx = artist_encoder.transform([row['artistID']])[0]
        weight = row['weight']
        user_item_matrix[user_idx, artist_idx] = weight
    return user_item_matrix

train_user_item_matrix = create_user_item_matrix(train_data, train_user_encoder, num_train_users)
test_x_user_item_matrix = create_user_item_matrix(test_x, test_user_encoder, num_test_users)
test_y_user_item_matrix = create_user_item_matrix(test_y, test_user_encoder, num_test_users)

# Scale data before inputting it into autoencoder
# Find the maximum and minimum values across all matrices for consistent scaling
max_value = max(train_user_item_matrix.max(), test_x_user_item_matrix.max(), test_y_user_item_matrix.max())
min_value = min(train_user_item_matrix.min(), test_x_user_item_matrix.min(), test_y_user_item_matrix.min())

# Define a function to normalize a matrix
def normalise(matrix, min_value, max_value):
    return (matrix - min_value) / (max_value - min_value)

# Normalize each matrix
train_user_item_matrix = normalise(train_user_item_matrix, min_value, max_value)
test_x_user_item_matrix = normalise(test_x_user_item_matrix, min_value, max_value)
test_y_user_item_matrix = normalise(test_y_user_item_matrix, min_value, max_value)

In [6]:
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # Encoder
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder
        self.fc_decode = nn.Linear(latent_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, input_dim)
        
    def encode(self, x):
        h = torch.relu(self.fc1(x))
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        logvar = torch.clamp(logvar, min=-10, max=10)
        return mu, logvar
    
    def reparameterise(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std) # Adds noise by sampling from standard normal dist.
        return mu + eps * std
    
    def decode(self, z):
        h = torch.relu(self.fc_decode(z))
        return torch.sigmoid(self.fc_out(h))
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterise(mu, logvar)
        return self.decode(z), mu, logvar

In [7]:
# Convert train data to tensor
train_tensor = torch.FloatTensor(train_user_item_matrix)

# Create DataLoader
batch_size = 64
train_dataset = torch.utils.data.TensorDataset(train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, optimizer, and loss function
input_dim = num_artists
hidden_dim = 256
latent_dim = 50

model = VAE(input_dim, hidden_dim, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_function(recon_x, x, mu, logvar):
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for data_batch in train_loader:
        data = data_batch[0]
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {train_loss / len(train_loader.dataset):.4f}")

Epoch 1, Loss: 2228.4047
Epoch 2, Loss: 226.2412
Epoch 3, Loss: 21.8895
Epoch 4, Loss: 11.3289
Epoch 5, Loss: 10.5715
Epoch 6, Loss: 9.8042
Epoch 7, Loss: 9.2067
Epoch 8, Loss: 8.5201
Epoch 9, Loss: 8.2623
Epoch 10, Loss: 7.4779


In [8]:
# Convert test data to tensor
test_x_tensor = torch.FloatTensor(test_x_user_item_matrix)
test_y_tensor = torch.FloatTensor(test_y_user_item_matrix)

# Get the model's predictions on test_x
model.eval()
with torch.no_grad():
    predictions, _, _ = model(test_x_tensor) # Reconstructed predictions

mse_loss = nn.functional.mse_loss(predictions, test_y_tensor)
print(f"Test MSE: {mse_loss.item():.4f}")

Test MSE: 0.0002


In [14]:
# Instead of using MSE, we need a better way to evaluate our model. Since the entries of our matrices are normalised (i.e. between 0 and 1), the MSE will
# always be small, potentially leading to false confidence in our model.
# First we will unnormalise the matrices and then find the MSE

def unnormalise(matrix, min_value, max_value):
    return matrix * (max_value - min_value) + min_value

predictions_not_normal = unnormalise(predictions, min_value, max_value)
test_y_tensor_not_normal = unnormalise(test_y_tensor, min_value, max_value)

mse_loss = nn.functional.mse_loss(predictions_not_normal, test_y_tensor_not_normal)
print(f"Unnormalised test MSE: {mse_loss.item():.4f}")

Unnormalised test MSE: 19942642.0000


In [45]:
# As we can see, this MSE is extremely high, however it is not necessarily reflective of the predictive power of our model. 
# We will now try the Hit Rate@k (HR@k) as a metric, which measures the fraction of users for which the recommender system successfully recomends at least one
# relevant item within the top-k recommendations

def hit_rate_at_k(true_matrix, predicted_matrix, k):
    hits = 0
    n_users = true_matrix.shape[0]

    for user_idx in range(n_users):

        _, top_k_indices = torch.topk(predicted_matrix[user_idx], k=k, largest=True, sorted=True)

        if torch.any(true_matrix[user_idx][top_k_indices] > 0):
            hits += 1

    return hits / n_users

In [56]:
print(f'Hit Rate for k=5 : {hit_rate_at_k(test_y_tensor, predictions, k=5)}')
print(f'Hit Rate for k=7 : {hit_rate_at_k(test_y_tensor, predictions, k=7)}')
print(f'Hit Rate for k=10 : {hit_rate_at_k(test_y_tensor, predictions, k=10)}')
print(f'Hit Rate for k=20 : {hit_rate_at_k(test_y_tensor, predictions, k=20)}')
print(f'Hit Rate for k=50 : {hit_rate_at_k(test_y_tensor, predictions, k=50)}')
print(f'Hit Rate for k=100 : {hit_rate_at_k(test_y_tensor, predictions, k=100)}')
print(f'Hit Rate for k=300 : {hit_rate_at_k(test_y_tensor, predictions, k=300)}')
print(f'Hit Rate for k=500 : {hit_rate_at_k(test_y_tensor, predictions, k=500)}')

Hit Rate for k=5 : 0.005479452054794521
Hit Rate for k=7 : 0.00821917808219178
Hit Rate for k=10 : 0.0136986301369863
Hit Rate for k=20 : 0.03561643835616438
Hit Rate for k=50 : 0.057534246575342465
Hit Rate for k=100 : 0.12054794520547946
Hit Rate for k=300 : 0.3835616438356164
Hit Rate for k=500 : 0.5178082191780822
