In [1]:
import numpy as np
import pandas as pd
import json
from numba import njit

import sqlalchemy as db

from time import time
from datetime import timedelta

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load functions
# Executes Functions.ipynb so that the helpers it defines become available here.
# The helpers called below that are neither defined nor imported in this notebook
# (make_R_list_sql, list_to_dict, format_new_R_list, make_Pval, MSE) presumably
# come from that notebook -- confirm there.
%run Functions.ipynb

# Load database

In [3]:
# SQLAlchemy URL of the SQLite database file, located one directory above the
# notebook's working directory.
db_path = "sqlite:///../MPD_sql.db"
engine = db.create_engine(db_path)
# Open connection handed to make_R_list_sql in the next cell.
# NOTE(review): the connection is never closed in the cells visible here --
# confirm whether later cells still need it, otherwise close it after loading.
conn = engine.connect()

In [4]:
# Read the playlist-track list from the database and time the load.
# pid_limit=None loads everything; for a quick smoke test pass a small limit
# instead, e.g. make_R_list_sql(conn, pid_limit=10000, progress=10).
time_0 = time()
R_list = make_R_list_sql(conn, pid_limit=None, progress=10)
time_end = time()

# Elapsed wall time as H:MM:SS.ss (the slice trims the last four microsecond digits)
elapsed = timedelta(seconds=time_end - time_0)
print(str(elapsed)[:-4])
print()

# Column 0 of R_list holds the playlist id of each row
num_playlists_0 = np.unique(R_list[:, 0]).size
print(f'We have {num_playlists_0} playlists')

1.51%: 3.10 sec
3.01%: 1.75 sec
4.52%: 1.53 sec
6.03%: 1.59 sec
7.54%: 1.46 sec
9.04%: 1.46 sec
10.55%: 1.50 sec
12.06%: 1.71 sec
13.57%: 1.47 sec
15.07%: 1.53 sec
16.58%: 1.51 sec
18.09%: 1.21 sec
19.59%: 1.64 sec
21.10%: 1.39 sec
22.61%: 1.58 sec
24.12%: 1.42 sec
25.62%: 1.58 sec
27.13%: 1.39 sec
28.64%: 1.66 sec
30.14%: 1.44 sec
31.65%: 1.46 sec
33.16%: 1.47 sec
34.67%: 1.21 sec
36.17%: 1.58 sec
37.68%: 1.40 sec
39.19%: 1.60 sec
40.70%: 1.45 sec
42.20%: 1.68 sec
43.71%: 1.41 sec
45.22%: 1.44 sec
46.72%: 1.39 sec
48.23%: 1.60 sec
49.74%: 1.46 sec
51.25%: 1.46 sec
52.75%: 1.60 sec
54.26%: 1.41 sec
55.77%: 1.39 sec
57.28%: 1.37 sec
58.78%: 1.59 sec
60.29%: 1.42 sec
61.80%: 1.41 sec
63.30%: 1.40 sec
64.81%: 1.41 sec
66.32%: 1.35 sec
67.83%: 1.56 sec
69.33%: 1.38 sec
70.84%: 1.59 sec
72.35%: 1.42 sec
73.85%: 1.41 sec
75.36%: 1.38 sec
76.87%: 1.37 sec
78.38%: 1.61 sec
79.88%: 1.49 sec
81.39%: 1.42 sec
82.90%: 1.18 sec
84.41%: 1.53 sec
85.91%: 1.60 sec
87.42%: 1.22 sec
88.93%: 1.53 sec
90.

Make train-validation-test split

In [5]:
# Fractions of the *whole* database reserved for validation and testing
val_size_abs = 0.15
test_size    = 0.15
shuffle = True

# The split below is drawn over np.arange(num_playlists_0), i.e. it assumes the
# pids stored in R_list are exactly 0, 1, ..., num_playlists_0-1. With integer
# pids and num_playlists_0 distinct values, checking min and max is sufficient.
# Fail loudly here rather than silently selecting pids that are not in R_list.
pid_column = R_list[:, 0]
assert pid_column.min() == 0 and pid_column.max() == num_playlists_0 - 1, \
    'pids in R_list are not the consecutive range 0..num_playlists_0-1'

# Note: the first split leaves (1-test_size) of the data in pid_train_full.
# We need to use val_size so that val_size*(1-test_size) = val_size_abs.
val_size = val_size_abs/(1-test_size)
pid_train_full, pid_test_0 = train_test_split(np.arange(num_playlists_0), test_size=test_size,
                                       shuffle=shuffle, random_state=11)
pid_train,      pid_val_0  = train_test_split(pid_train_full, test_size=val_size,
                                       shuffle=shuffle, random_state=11)

# The three sets must account for every playlist exactly once
assert len(pid_train) + len(pid_val_0) + len(pid_test_0) == num_playlists_0

# Show the first few entries of pid_train next to the reference values
expected_head = np.array([597571, 679779, 758502, 28851, 480077, 430444, 982068])
print('Verify the following two lists are equal')
print('to ensure we\'re always using the same train-val-test split')
print(pid_train[:7])
print(expected_head)

# Automate the comparison. The reference values presumably belong to the full
# database (pid_limit=None), so a mismatch is reported instead of raised to keep
# reduced runs usable.
if not np.array_equal(pid_train[:7], expected_head):
    print('WARNING: pid_train does not match the reference train-val-test split')

Verify the following two lists are equal
to ensure we're always using the same train-val-test split
[597571 679779 758502  28851 480077 430444 982068]
[597571 679779 758502  28851 480077 430444 982068]


Get the tracks present in the training set

In [6]:
# Keep only the rows of R_list whose playlist id (column 0) is in the training split
R_list_train = R_list[np.isin(R_list[:, 0], pid_train)]

# Distinct track ids (column 1) seen in training, and the tid -> index lookup
# built from them by list_to_dict (its second return value is not needed here)
tid_train = np.unique(R_list_train[:, 1])
tid_to_idx, _ = list_to_dict(tid_train)

num_playlists = pid_train.shape[0]
num_songs = tid_train.size

print('Number of training playlists:', num_playlists)
print('Number of training songs:', num_songs)

# R_list must stay alive: the validation/test cell below still filters it.

Number of training playlists: 700000
Number of training songs: 1903758


Process the validation/test sets

In [7]:
# Cap the number of validation / test playlists to keep evaluation times feasible
pid_val = pid_val_0[:5000]
pid_test = pid_test_0[:10000]

# Rows of R_list belonging to the selected playlists (column 0 is the pid)
R_list_val = R_list[np.isin(R_list[:, 0], pid_val)]
R_list_test = R_list[np.isin(R_list[:, 0], pid_test)]

# Distinct track ids (column 1) appearing in each subset
tid_val = np.unique(R_list_val[:, 1])
tid_test = np.unique(R_list_test[:, 1])

# format_new_R_list is used here to drop tracks that are unknown to the training
# set, re-label the rest with consecutive indices via tid_to_idx, and hand back
# the dicts mapping between the original pids and the new indexing.
# Only the test set keeps those dicts; for validation they are discarded.
R_list_val, R_idx_val, _, _ = format_new_R_list(R_list_val, tid_to_idx)
(R_list_test, R_idx_test,
 pid_to_idx_test, idx_to_pid_test) = format_new_R_list(R_list_test, tid_to_idx)

# Evaluate the performance of a trained model

Choose trained model

In [8]:
# Path (relative to the notebook) of the saved, already-trained Q matrix
file_name = 'models/Q_trained_full_20_feats.npy'
Q_trained = np.load(file_name)

# Number of latent features = number of columns of Q_trained.
# NOTE(review): the rows of Q_trained are presumably indexed by the training
# track index (tid_to_idx), i.e. Q_trained.shape[0] == num_songs -- confirm.
f = Q_trained.shape[1]

Generate a matrix $P_{\text{test}}$ so that $R_{\text{test}} \approx P_{\text{test}} \cdot Q_{\text{trained}}^T$, and compute $\text{MSE}_{\text{test}}$.

In [9]:
# Fit a P matrix for the test playlists against the fixed, trained Q and compute
# the MSE of the resulting reconstruction.
# No random P/Q initialisation is drawn here: make_Pval and MSE receive all of
# their inputs explicitly, so the two large (num_playlists x f) and
# (num_songs x f) random matrices were never read in this cell.
# The third argument of make_Pval (10**-3) is presumably a regularisation
# strength -- confirm in Functions.ipynb.
start_time = time()
P_test = make_Pval(R_idx_test, Q_trained, 10**-3)
mse_test = MSE(R_idx_test, P_test, Q_trained)
end_time = time()
print('Done: {:.3f} sec'.format(end_time-start_time))

print('MSE on test set: {:.5f}'.format(mse_test))

Done: 16.676 sec
MSE on test set: 0.03545


For reference, we also compute $\text{MSE}_{\text{val}}$.

In [10]:
# Draw random P and Q matrices (normal, mean 0, std 0.1).
# NOTE(review): neither matrix is read in this cell -- presumably a leftover
# from the training notebook; confirm nothing later depends on them.
P_initial = np.random.normal(loc=0, scale=0.1, size=(num_playlists, f))
Q_initial = np.random.normal(loc=0, scale=0.1, size=(num_songs, f))

# Fit a P matrix for the validation playlists against the trained Q and score it
start_time = time()
P_val = make_Pval(R_idx_val, Q_trained, 1e-3)
mse_val = MSE(R_idx_val, P_val, Q_trained)
end_time = time()

print('Done: {:.3f} sec'.format(end_time - start_time))

print('MSE on val set: {:.5f}'.format(mse_val))

Done: 3.806 sec
MSE on val set: 0.03642
