This notebook is exactly the same as VAE1.ipynb, except that we have scaled the matrices before inputting them into the VAE. Also we have printed the decoded matrix in order to explicitly compare artist preferences with predicted preferences.

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from sklearn.preprocessing import LabelEncoder

In [2]:
user_data = pd.read_csv('../unpushed_work/last_fm_data/user_artists.dat', sep='\t')

user_interaction_counts = user_data.groupby('userID').size()
users_with_50_interactions = user_interaction_counts[user_interaction_counts >= 50].index
user_data_filtered = user_data[user_data['userID'].isin(users_with_50_interactions)]

unique_users = user_data_filtered['userID'].unique()

# Ensure that test users have at least 50 interactions in 'test_data'
# We need to carefully select 'test_users' to satisfy this condition

np.random.seed(42)

shuffled_users = np.random.permutation(unique_users)

train_users = []
test_users = []

# We'll collect test users until we have enough that have at least 50 interactions
for user in shuffled_users:
    user_data_temp = user_data_filtered[user_data_filtered['userID'] == user]
    if len(test_users) < int(0.2 * len(unique_users)):
        # Tentatively add to test_users
        test_users.append(user)
    else:
        train_users.append(user)

# Recreate test_data and train_data
train_data = user_data_filtered[user_data_filtered['userID'].isin(train_users)]
test_data = user_data_filtered[user_data_filtered['userID'].isin(test_users)]

# Now check that each user in test_data has 50 interactions
# Remove any users from test_users who don't meet this criterion
valid_test_users = []
for user in test_users:
    user_data_temp = test_data[test_data['userID'] == user]
    if len(user_data_temp) == 50:
        valid_test_users.append(user)

# Update test_users and test_data
test_users = valid_test_users
test_data = test_data[test_data['userID'].isin(test_users)]

# Update train_data to include any users removed from test_users
removed_test_users = set(shuffled_users) - set(train_users) - set(test_users)
if removed_test_users:
    train_users.extend(list(removed_test_users))
    train_data = user_data_filtered[user_data_filtered['userID'].isin(train_users)]

# Now proceed to split test_data into test_x and test_y
test_x = pd.DataFrame(columns=test_data.columns)
test_y = pd.DataFrame(columns=test_data.columns)

for user in test_users:
    user_data_temp = test_data[test_data['userID'] == user]
    user_data_shuffled = user_data_temp.sample(frac=1, random_state=42)
    user_test_x = user_data_shuffled.iloc[:25]
    user_test_y = user_data_shuffled.iloc[25:50]
    test_x = pd.concat([test_x, user_test_x], ignore_index=True)
    test_y = pd.concat([test_y, user_test_y], ignore_index=True)

In [3]:
# Encode artistIDs
# Essential since ML models require numerical input (also efficient)
artist_encoder = LabelEncoder()
all_artistIDs = user_data_filtered['artistID'].unique()
artist_encoder.fit(all_artistIDs)
num_artists = len(artist_encoder.classes_)

# Encode train users
train_user_encoder = LabelEncoder()
train_user_encoder.fit(train_users)
num_train_users = len(train_user_encoder.classes_)

# Encode test users
test_user_encoder = LabelEncoder()
test_user_encoder.fit(test_users)
num_test_users = len(test_user_encoder.classes_)

In [4]:
# Function to create user-item interaction matrix
def create_user_item_matrix(data, user_encoder, num_users):
    user_item_matrix = np.zeros((num_users, num_artists))
    for idx, row in data.iterrows():
        user_idx = user_encoder.transform([row['userID']])[0]
        artist_idx = artist_encoder.transform([row['artistID']])[0]
        weight = row['weight']
        user_item_matrix[user_idx, artist_idx] = weight
    return user_item_matrix

train_user_item_matrix = create_user_item_matrix(train_data, train_user_encoder, num_train_users)
test_x_user_item_matrix = create_user_item_matrix(test_x, test_user_encoder, num_test_users)
test_y_user_item_matrix = create_user_item_matrix(test_y, test_user_encoder, num_test_users)

# Scale data before inputting it into autoencoder
# Find the maximum and minimum values across all matrices for consistent scaling
max_value = max(train_user_item_matrix.max(), test_x_user_item_matrix.max(), test_y_user_item_matrix.max())
min_value = min(train_user_item_matrix.min(), test_x_user_item_matrix.min(), test_y_user_item_matrix.min())

# Define a function to normalize a matrix
def normalise(matrix, min_value, max_value):
    return (matrix - min_value) / (max_value - min_value)

# Normalize each matrix
train_user_item_matrix = normalise(train_user_item_matrix, min_value, max_value)
test_x_user_item_matrix = normalise(test_x_user_item_matrix, min_value, max_value)
test_y_user_item_matrix = normalise(test_y_user_item_matrix, min_value, max_value)


In [5]:
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # Encoder
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder
        self.fc_decode = nn.Linear(latent_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, input_dim)
        
    def encode(self, x):
        h = torch.relu(self.fc1(x))
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        logvar = torch.clamp(logvar, min=-10, max=10)
        return mu, logvar
    
    def reparameterise(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std) # Adds noise by sampling from standard normal dist.
        return mu + eps * std
    
    def decode(self, z):
        h = torch.relu(self.fc_decode(z))
        return torch.sigmoid(self.fc_out(h))
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterise(mu, logvar)
        return self.decode(z), mu, logvar

In [6]:
# Convert train data to tensor
train_tensor = torch.FloatTensor(train_user_item_matrix)

# Create DataLoader
batch_size = 64
train_dataset = torch.utils.data.TensorDataset(train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, optimizer, and loss function
input_dim = num_artists
hidden_dim = 256
latent_dim = 50

model = VAE(input_dim, hidden_dim, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_function(recon_x, x, mu, logvar):
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for data_batch in train_loader:
        data = data_batch[0]
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {train_loss / len(train_loader.dataset):.4f}")

Epoch 1, Loss: 2214.3641
Epoch 2, Loss: 221.0252
Epoch 3, Loss: 19.4951
Epoch 4, Loss: 10.9065
Epoch 5, Loss: 10.1564
Epoch 6, Loss: 9.6393
Epoch 7, Loss: 8.9696
Epoch 8, Loss: 8.2749
Epoch 9, Loss: 8.1322
Epoch 10, Loss: 7.6201


In [8]:
# Convert test data to tensor
test_x_tensor = torch.FloatTensor(test_x_user_item_matrix)
test_y_tensor = torch.FloatTensor(test_y_user_item_matrix)

# Get the model's predictions on test_x
model.eval()
with torch.no_grad():
    recon_test_x, _, _ = model(test_x_tensor) # Reconstructed predictions

# Compute MSE only on positions where test_y has interactions
test_y_mask = test_y_tensor > 0
predicted_ratings = recon_test_x[test_y_mask]
true_ratings = test_y_tensor[test_y_mask]

mse_loss = nn.functional.mse_loss(predicted_ratings, true_ratings)
print(f"Test MSE: {mse_loss.item():.4f}")

Test MSE: 0.0002


In [9]:
def get_top_10_artist_ids(userID, recon_test_x, test_y_tensor, test_user_encoder, artist_encoder):
    # Get the index of the user in the tensor
    if userID not in test_user_encoder.classes_:
        print("User ID not found in the test set.")
        return
    
    # Converts userID to its index
    user_idx = test_user_encoder.transform([userID])[0]
    
    # Get true ratings and predicted ratings for this user
    true_ratings = test_y_tensor[user_idx]
    predicted_ratings = recon_test_x[user_idx]
    
    # Get the indices of the top 10 true artists (by highest ratings)
    top_true_artist_indices = torch.topk(true_ratings, k=10).indices
    # Convert the indices back to artistIDs
    top_true_artists = artist_encoder.inverse_transform(top_true_artist_indices.cpu().numpy())
    
    # Get the indices of the top 10 predicted artists (by highest predicted ratings)
    top_predicted_artist_indices = torch.topk(predicted_ratings, k=10).indices
    top_predicted_artists = artist_encoder.inverse_transform(top_predicted_artist_indices.cpu().numpy())
    
    # Output the top 10 true and predicted artists
    print("Top 10 True Artists for User", userID, ":", top_true_artists)
    print("Top 10 Predicted Artists for User", userID, ":", top_predicted_artists)

# Example usage:
get_top_10_artist_ids(userID=123, recon_test_x=recon_test_x, test_y_tensor=test_y_tensor, 
                   test_user_encoder=test_user_encoder, artist_encoder=artist_encoder)

Top 10 True Artists for User 123 : [  88 1390 2610  768 2608   56 2824 3028 1546  366]
Top 10 Predicted Artists for User 123 : [ 3695 15400  9928  2782  2182  5286  4923 11906  8404 17039]


In [10]:
artist_ids = pd.read_csv('../unpushed_work/last_fm_data/artists.dat', sep='\t')

artist_id_to_name = pd.Series(artist_ids.name.values, index=artist_ids.id).to_dict()
name_to_artist_id = {v: k for k, v in artist_id_to_name.items()}

def get_top_10_artists(userID, recon_test_x, test_y_tensor, test_user_encoder, artist_encoder, artist_id_to_name):
    # Get the index of the user in the tensor
    if userID not in test_user_encoder.classes_:
        print("User ID not found in the test set.")
        return
    
    user_idx = test_user_encoder.transform([userID])[0]
    
    # Get true ratings and predicted ratings for this user
    true_ratings = test_y_tensor[user_idx]
    predicted_ratings = recon_test_x[user_idx]
    
    # Get the indices of the top 10 true artists (by highest ratings)
    top_true_artist_indices = torch.topk(true_ratings, k=10).indices
    # Convert the indices back to artistIDs
    top_true_artist_ids = artist_encoder.inverse_transform(top_true_artist_indices.cpu().numpy())
    top_true_artist_names = [artist_id_to_name.get(artist_id, "Unknown Artist") for artist_id in top_true_artist_ids]
    
    # Get the indices of the top 10 predicted artists (by highest predicted ratings)
    top_predicted_artist_indices = torch.topk(predicted_ratings, k=10).indices
    top_predicted_artist_ids = artist_encoder.inverse_transform(top_predicted_artist_indices.cpu().numpy())
    top_predicted_artist_names = [artist_id_to_name.get(artist_id, "Unknown Artist") for artist_id in top_predicted_artist_ids]
    
    # Output the top 10 true and predicted artist names
    print(f"Top 10 True Artists for User {userID}:")
    for idx, artist_name in enumerate(top_true_artist_names, start=1):
        print(f"{idx}. {artist_name}")
    
    print(f"\nTop 10 Recommended Artists for User {userID}:")
    for idx, artist_name in enumerate(top_predicted_artist_names, start=1):
        print(f"{idx}. {artist_name}")

get_top_10_artists(userID=1104, recon_test_x=recon_test_x, test_y_tensor=test_y_tensor, 
                   test_user_encoder=test_user_encoder, artist_encoder=artist_encoder, artist_id_to_name=artist_id_to_name)

Top 10 True Artists for User 1104:
1. Westlife
2. Björk
3. Prince
4. Madonna
5. Mariah Carey
6. Michael Jackson
7. Donna Summer
8. Air Supply
9. Meat Loaf
10. Kylie Minogue

Top 10 Recommended Artists for User 1104:
1. Anthony Callea
2. Ada Milea (cu Alexander Balanescu)
3. Edda
4. Burnt Fur
5. Aubrey Ashburn
6. NOMAK
7. Otis Rush
8. Placebo (Feat. David Bowie)
9. Eberhard Weber
10. WC


In [11]:
def recommendation_generator(artist_preferences):

    columns = ['userID', 'artistID', 'weight']
    df = pd.DataFrame(columns=columns)
    rows = []
    weight = 10
    for i in artist_preferences:
        artistID = name_to_artist_id.get(i, None)
        if artistID is None:
            print('The artist ', i, ' unfortunately does not appear in our records, please pick a different artist')
            return
        if artistID is not None:
            new_row = {'userID' : 2, 'artistID': artistID, 'weight' : weight}
            weight -= 1
            rows.append(new_row)
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    
    # Encode users
    artist_ids = pd.read_csv('../unpushed_work/last_fm_data/artists.dat', sep='\t')
    all_artist_ids = artist_ids.id.tolist()
    df_user_encoder = LabelEncoder()
    df_user_encoder.fit(all_artist_ids)
    num_df_users = len(df_user_encoder.classes_)
    df_matrix = create_user_item_matrix(df, df_user_encoder, num_df_users)

    # Scale data before inputting it into autoencoder
    max_value = df_matrix.max()
    min_value = df_matrix.min()
    # Define a function to normalize a matrix
    def normalise(matrix, min_value, max_value):
        return (matrix - min_value) / (max_value - min_value)

    # Normalize each matrix
    df_matrix = normalise(df_matrix, min_value, max_value)

    # Convert test data to tensor
    df_tensor = torch.FloatTensor(df_matrix)

    model.eval()
    with torch.no_grad():
        prediction, _, _ = model(df_tensor) # Reconstructed predictions
    
    prediction = prediction[2]
    
    # Get the indices of the top 10 predicted artists (by highest predicted ratings)
    top_predicted_artist_indices = torch.topk(prediction, k=10).indices
    top_predicted_artist_ids = artist_encoder.inverse_transform(top_predicted_artist_indices.cpu().numpy())
    top_predicted_artist_names = [artist_id_to_name.get(artist_id, "Unknown Artist") for artist_id in top_predicted_artist_ids]

    print(top_predicted_artist_names)

In [12]:
lucy_preferences = ['Amy Winehouse', 'The Strokes', 'Radiohead', 'Pink Floyd', 'Red Hot Chili Peppers', 'Bob Marley & The Wailers', 'The Beatles', 'Jimi Hendrix', 'Muse', 'Tame Impala'] # We assume this is list of artist names of length 10 ordered from highest to lowest preference

recommendation_generator(lucy_preferences)

['Burning the Masses', 'Barrelhouse', 'Bora Uzer', 'Tyler Hilton', 'Dancing Ghosts', 'Noon', 'Tunng', 'Brandy', 'Kathryn Williams', 'Freak Kitchen']
