## Create user-song matrix

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
import pickle

In [5]:
# Assign every distinct user id a consecutive integer index and persist the
# mapping so later cells can translate ids into matrix rows.
user_column = pd.read_csv(
    './data/train_triplets.txt',
    sep='\t',
    header=None,
    usecols=[0],
    names=['user'],
)['user']
unique_users = user_column.unique()
users = dict(zip(unique_users, range(len(unique_users))))
with open('./data/users.pkl', 'wb') as users_file:
    pickle.dump(users, users_file)

In [18]:
# Assign every distinct song id a consecutive integer index and persist the
# mapping so later cells can translate ids into matrix columns.
song_frame = pd.read_csv(
    './data/train_triplets.txt',
    sep='\t',
    header=None,
    usecols=[1],
    names=['song'],
)
songs = {}
for song_index, song_id in enumerate(song_frame['song'].unique()):
    songs[song_id] = song_index
with open('./data/songs.pkl', 'wb') as songs_file:
    pickle.dump(songs, songs_file)

In [7]:
# Restore the id -> index dictionaries from their pickle files.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook.
with open('./data/users.pkl', 'rb') as users_file:
    users = pickle.load(users_file)

with open('./data/songs.pkl', 'rb') as songs_file:
    songs = pickle.load(songs_file)

In [12]:
# Stream the tab-separated triplets file (user, song, play count) line by line
# and collect coordinate lists for the sparse matrix:
#   row  -> user index (looked up in `users`)
#   col  -> song index (looked up in `songs`)
#   data -> play count parsed as int
row = []
col = []
data = []
with open('./data/train_triplets.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no triplet. pandas.read_csv skips blank
            # lines by default when the index dictionaries are built, so skip
            # them here as well instead of failing with KeyError on users[''].
            continue
        s = line.split('\t')
        row.append(users[s[0]])
        col.append(songs[s[1]])
        data.append(int(s[2]))

In [18]:
# Assemble the (users x songs) play-count matrix in CSR format from the
# coordinate lists and persist it for the factorisation step.
n_users, n_songs = len(users), len(songs)
echonest_data = csr_matrix((data, (row, col)), shape=(n_users, n_songs))
with open('./data/echonest_data.pkl', 'wb') as matrix_file:
    pickle.dump(echonest_data, matrix_file)

## Calculate the latent factors from the user-song matrix

In [1]:
import pickle
import wmf

In [3]:
# Read the pickled user-song matrix back into memory.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook.
with open('./data/echonest_data.pkl', 'rb') as matrix_file:
    data = pickle.load(matrix_file)
# Disabled alternative input:
# data = pd.read_pickle("data/test_matrix.pkl")

In [4]:
# Calculate latent factors.
# First turn the raw counts into a confidence matrix, then factorise it with
# ALS; the verbose log labels U as the user factors and V as the item factors.
S = wmf.log_surplus_confidence_matrix(data, alpha=2.0, epsilon=1e-6)
U, V = wmf.factorize(
    S,
    num_factors=40,
    lambda_reg=1e-5,
    num_iterations=10,
    init_std=0.01,
    verbose=True,
    dtype='float32',
    recompute_factors=wmf.recompute_factors_bias,
)

precompute transpose
  took 3.356 seconds
run ALS algorithm
  iteration 0
    recompute user factors U
    time since start: 134.064 seconds
    recompute item factors V
    time since start: 222.585 seconds
  iteration 1
    recompute user factors U
    time since start: 369.029 seconds
    recompute item factors V
    time since start: 466.506 seconds
  iteration 2
    recompute user factors U
    time since start: 601.748 seconds
    recompute item factors V
    time since start: 668.367 seconds
  iteration 3
    recompute user factors U
    time since start: 783.951 seconds
    recompute item factors V
    time since start: 854.010 seconds
  iteration 4
    recompute user factors U
    time since start: 974.096 seconds
    recompute item factors V
    time since start: 1051.623 seconds
  iteration 5
    recompute user factors U
    time since start: 1179.918 seconds
    recompute item factors V
    time since start: 1262.831 seconds
  iteration 6
    recompute user factors U
    ti

In [4]:
# Sanity check: display the dimensions of the item-factor matrix V returned by
# wmf.factorize (the second dimension should equal num_factors).
V.shape

(384546, 40)

In [None]:
# Reload the id -> index dictionaries so the factor rows can be matched back
# to the original user and song ids.
with open('./data/users.pkl', 'rb') as users_pickle:
    users = pickle.load(users_pickle)

with open('./data/songs.pkl', 'rb') as songs_pickle:
    songs = pickle.load(songs_pickle)

In [None]:
# Re-key the factor matrices as {row index: factor vector} dictionaries.
# The isinstance guards make this cell idempotent: enumerating an existing
# dict yields its keys, so an unguarded re-run would silently replace every
# factor vector with its own integer index.
if not isinstance(U, dict):
    U = dict(enumerate(U))
if not isinstance(V, dict):
    V = dict(enumerate(V))

In [None]:
# Swap each integer index for its latent factor vector, giving
# {user id: user factors} and {song id: song factors}.
# NOTE: `users` and `songs` are rebound in place, so run this cell only once
# after (re)loading the index dictionaries.
users = {user_id: U[row_index] for user_id, row_index in users.items()}
songs = {song_id: V[col_index] for song_id, col_index in songs.items()}

In [None]:
# Persist both latent-factor dictionaries (users first, then songs).
with open('./data/user_latent_factors.pkl', 'wb') as user_factors_file:
    pickle.dump(users, user_factors_file)

with open('./data/song_latent_factors.pkl', 'wb') as song_factors_file:
    pickle.dump(songs, song_factors_file)