In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

user_artists = pd.read_csv('../unpushed_work/last_fm_data/user_artists.dat', sep='\t')

# Hold out 20% of the users for testing and train on the remaining 80%.
# The split is derived from the user IDs themselves (taken in order of first
# appearance, which we assume is effectively random) instead of hard-coded row
# offsets, so every user's rows land entirely on one side of the split and the
# split stays correct if the data file changes.
TRAIN_USER_FRACTION = 0.8

all_user_ids = user_artists['userID'].unique()  # unique() preserves first-appearance order
n_train_users = int(len(all_user_ids) * TRAIN_USER_FRACTION)
is_train_row = user_artists['userID'].isin(all_user_ids[:n_train_users])

user_artists_train = user_artists[is_train_row]
user_artists_test = user_artists[~is_train_row]

# Every row must end up in exactly one of the two sets.
assert len(user_artists_train) + len(user_artists_test) == len(user_artists)

# Distinct user and artist IDs that occur in the training rows.
users = user_artists_train['userID'].unique()
artists = user_artists_train['artistID'].unique()

In [58]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# (and therefore the reported losses) are reproducible on Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# One row per training user, one column per artist; each cell holds the `weight`
# value for that (user, artist) pair, or 0 where the pair does not occur.
# NOTE: values=['weight'] (a list) produces two-level column labels of the form
# ('weight', artistID); later cells read the artist ID as element [1] of a label.
user_artist_matrix = user_artists_train.pivot(index='userID', columns='artistID', values=['weight']).fillna(0)

# The network reconstructs a user's full artist vector, so input and output widths match.
input_dim = user_artist_matrix.shape[1]

# Symmetric dense autoencoder: 128 -> 64 -> 32 (bottleneck) -> 64 -> 128.
# The output layer is linear because the targets are unbounded weights.
autoencoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(input_dim, activation='linear')
])

autoencoder.compile(optimizer='adam', loss='mse')

# Train the model to reproduce its own input. validation_split=0.1 holds out the
# last 10% of the rows (taken before shuffling) for the val_loss metric.
# NOTE(review): the weights are fed in unscaled and the recorded losses are in the
# tens of thousands -- consider rescaling (e.g. a log transform) before training.
# The returned History is kept so the per-epoch losses can be inspected or plotted.
training_history = autoencoder.fit(user_artist_matrix, user_artist_matrix, epochs=50, batch_size=256, validation_split=0.1)

Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 377ms/step - loss: 38973.8359 - val_loss: 88166.2734
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 128ms/step - loss: 40581.1602 - val_loss: 87520.9844
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - loss: 36479.5859 - val_loss: 86253.8516
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 128ms/step - loss: 42807.6602 - val_loss: 84821.1406
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 123ms/step - loss: 40228.9297 - val_loss: 83760.8594
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - loss: 40362.7930 - val_loss: 83806.7734
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 130ms/step - loss: 41824.8672 - val_loss: 83719.3906
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 178ms/step - loss: 35107.1602 - val_loss: 831

<keras.src.callbacks.history.History at 0x7f8e708d6900>

In [51]:
# Reconstruct every row of the training matrix with the trained autoencoder;
# the result has one predicted value per (user row, artist column).
predicted_preferences = autoencoder.predict(user_artist_matrix)

# Round each predicted value to the nearest whole number.
rounded_preferences = np.round(predicted_preferences, decimals=0)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [52]:
# Position of the largest rounded prediction in the first row of the matrix
# (the message below assumes that row belongs to user 2).
best_column = np.argmax(rounded_preferences[0])
print('highest predicted artist for user 2 is in column ', best_column, ' in the matrix')

highest predicted artist for user 2 is in column  45  in the matrix


In [67]:
# Compute the top-scoring column from the predictions instead of hard-coding a
# position copied from an earlier run's output, which goes stale whenever the
# model is retrained.
top_column_position = np.argmax(rounded_preferences[0])

# Column labels are (value name, artistID) pairs because the pivot was built with
# values=['weight'], so element [1] of the label is the artist ID.
top_artist_id = user_artist_matrix.columns[top_column_position][1]
print('highest predicted artist for user 2 has ID ', top_artist_id)
# In the recorded run this printed 51, which is promising: artist 51 is Duran Duran,
# and that was indeed this user's most listened-to artist.

highest predicted artist for user 2 has ID  51
