In [14]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import keras
from keras import backend as K
from keras import initializers
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Multiply, Dense
from keras.optimizers import Adam
from keras.regularizers import l2
from time import time









In [15]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

# Load the dataset
df_songs = pd.read_csv('processed_songs.csv')

# Filter users with more than 16 listened songs
song_user = df_songs.groupby('user_id')['song_id'].count()
song_ten_id = song_user[song_user > 16].index.to_list()
df_song_id_more_ten = df_songs[df_songs['user_id'].isin(song_ten_id)].reset_index(drop=True)

# Create user and song mappings
df_song_id_more_ten['user_index'] = df_song_id_more_ten['user_id'].astype('category').cat.codes
df_song_id_more_ten['song_index'] = df_song_id_more_ten['song_id'].astype('category').cat.codes

# Create training data
train_data = df_song_id_more_ten[['user_index', 'song_index', 'listen_count']]

# Get the number of unique users and songs
num_users = train_data['user_index'].nunique()
num_songs = train_data['song_index'].nunique()

# Create a sparse matrix
mat = sp.dok_matrix((num_users, num_songs), dtype=np.float32)
for _, row in train_data.iterrows():
    mat[row['user_index'], row['song_index']] = 1.0

In [27]:

def init_normal(shape, dtype=None, name=None):
    return initializers.RandomNormal(mean=0.0, stddev=0.01)(shape)

def get_model(num_playlists, num_items, latent_dim, regs=[0,0]):
    # Input variables
    playlist_input = Input(shape=(1,), dtype='int32', name = 'playlist_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_playlist = Embedding(input_dim = num_playlists, output_dim = latent_dim, name = 'playlist_embedding',
                                  embeddings_initializer = init_normal, embeddings_regularizer = l2(regs[0]))
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding',
                                  embeddings_initializer = init_normal, embeddings_regularizer = l2(regs[1]))   
    
    # Crucial to flatten an embedding vector!
    playlist_latent = Flatten()(MF_Embedding_playlist(playlist_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))
    
    # Element-wise product of playlist and item embeddings 
    predict_vector = Multiply()([playlist_latent, item_latent])
    
    # Final prediction layer
    #prediction = Lambda(lambda x: K.sigmoid(K.sum(x)), output_shape=(1,))(predict_vector)
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(predict_vector)
    
    model = Model(inputs=[playlist_input, item_input], 
                outputs=prediction)
    return model

def get_train_instances(train, num_negatives):
    playlist_input, item_input, labels = [],[],[]
    num_playlists = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        playlist_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u,j) in train.keys():
                j = np.random.randint(num_items)
            playlist_input.append(u)
            item_input.append(j)
            labels.append(0)
    return playlist_input, item_input, labels

# Specify hyperparameters
num_factors = 8
regs = [0,0]
num_negatives = 4
learner = 'adam'
lr = 0.001
epochs = 15
batch_size = 200
verbose = 1

# Save model
# model_out_file = './GMF_%d_%d.h5' %(num_factors, time())
    
# Loading data
train = mat
num_playlists, num_items = train.shape
print("Load data done")

# Build model
model = get_model(num_playlists, num_items, num_factors, regs)
model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
    
# Train model
for epoch in range(epochs):
    # Generate training instances
    playlist_input, item_input, labels = get_train_instances(train, num_negatives)

    # Training
    hist = model.fit([np.array(playlist_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     validation_split=0.20, batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
    print(hist.history)

Load data done


None
{'accuracy': [0.7996967434883118], 'loss': [0.5965136885643005], 'val_accuracy': [0.8000128865242004], 'val_loss': [0.5362681746482849]}
{'accuracy': [0.8075785636901855], 'loss': [0.4912120997905731], 'val_accuracy': [0.8002486228942871], 'val_loss': [0.5080084204673767]}
{'accuracy': [0.8175232410430908], 'loss': [0.4402247667312622], 'val_accuracy': [0.800141453742981], 'val_loss': [0.5079367160797119]}
{'accuracy': [0.8240923285484314], 'loss': [0.4005967974662781], 'val_accuracy': [0.8002057671546936], 'val_loss': [0.5088663697242737]}
{'accuracy': [0.8362445831298828], 'loss': [0.3570761978626251], 'val_accuracy': [0.8000128865242004], 'val_loss': [0.5076622366905212]}
{'accuracy': [0.8573877811431885], 'loss': [0.31315556168556213], 'val_accuracy': [0.7999699711799622], 'val_loss': [0.5051125288009644]}
{'accuracy': [0.8789811134338379], 'loss': [0.27643343806266785], 'val_accuracy': [0.8003129363059998], 'val_loss': [0.5033220648765564]}
{'accuracy': [0.8963629007339478], 