# Matrix Factorization

In [335]:
import pandas as pd
from collections import defaultdict
import numpy as np
from sortedcontainers import SortedList, SortedDict, SortedSet

## Read in Data and Create Relevant Dictionaries

In [336]:
# Load the tab-separated triplets file; it has no header row, so the
# columns are addressed by integer position (0, 1, 2) in the cells below.
data = pd.read_csv(
    'kaggle_visible_evaluation_triplets.txt',
    sep='\t',
    header=None,
)

In [337]:
# Space-separated, header-less lookup file: column 0 becomes the dictionary
# key and column 1 the value (a later duplicate key overwrites an earlier one).
kaggle = pd.read_csv('kaggle_songs.txt', sep=' ', header=None)
song_to_kaggle = dict(zip(kaggle[0], kaggle[1]))

In [338]:
# user_rating_list[user_id] -> list of (song_id, rating) for that user
# song_rating_list[song_id] -> list of (user_id, rating) for that song
user_rating_list = defaultdict(list)
song_rating_list = defaultdict(list)

# Dense integer ids are handed out in order of first appearance.
id_to_user = dict()
user_to_id = dict()
id_to_song = dict()
song_to_id = dict()

# Running sum and count of all ratings; used later for the global mean.
total = 0
ratings = 0

# Columns are read by position: 0 = user, 1 = song, 2 = rating value.
for user, song, rating in zip(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2]):

    # Look the user up instead of comparing with the previous row, so a user
    # whose rows are not contiguous still maps to a single id.
    uid = user_to_id.get(user)
    if uid is None:
        uid = len(user_to_id)
        user_to_id[user] = uid
        id_to_user[uid] = user

    # Resolve the song id BEFORE recording the rating on the user side;
    # otherwise the user's entry would point at whichever song id was
    # created most recently rather than at this row's song.
    sid = song_to_id.get(song)
    if sid is None:
        sid = len(song_to_id)
        song_to_id[song] = sid
        id_to_song[sid] = song

    user_rating_list[uid].append((sid, rating))
    song_rating_list[sid].append((uid, rating))

    total += rating
    ratings += 1

## Parameters

In [359]:
# --- Tunable hyperparameters ---
T = 90    # number of training epochs
K = 30    # number of latent factors (cross validate)
reg = 9   # regularization strength (cross validate)

# --- Quantities derived from the loaded ratings ---
user_ct = len(user_rating_list)
song_ct = len(song_rating_list)
mu = total / ratings  # mean of all known ratings

In [354]:
# Fix the seed so the random initial factors are reproducible across runs.
np.random.seed(42)

# Latent factors start as standard-normal draws divided by K.
# U is drawn before S; swapping these two lines would change both matrices.
# U is indexed as U[user, factor], S as S[factor, song].
U = np.random.randn(user_ct, K) / K
S = np.random.randn(K, song_ct) / K
# Bias terms start at zero, one entry per user / per song.
U_bias = np.zeros(user_ct)
S_bias = np.zeros(song_ct)

## Matrix Factorization Functions

In [355]:
def recommend(user_ct, song_ct, user_rating_list, song_rating_list, K, U_bias, S_bias, mu, S, U, T):
    """Run ``T`` epochs of alternating updates and print a progress bar.

    Each epoch calls, in this order: ``update_u_bias``, ``update_u``,
    ``update_s_bias``, ``update_s``. The arrays are passed straight through
    to those helpers; this function itself returns nothing.

    Args:
        user_ct: Number of users to iterate over.
        song_ct: Number of songs to iterate over.
        user_rating_list: Mapping user id -> list of (song id, rating).
        song_rating_list: Mapping song id -> list of (user id, rating).
        K: Number of latent factors.
        U_bias: Per-user bias array.
        S_bias: Per-song bias array.
        mu: Global mean rating.
        S: Song factor matrix, indexed ``S[:, song]``.
        U: User factor matrix, indexed ``U[user, :]``.
        T: Number of epochs to run.
    """
    for completed in range(1, T + 1):
        update_u_bias(user_ct, user_rating_list, U, S, S_bias, mu, U_bias)
        update_u(user_ct, user_rating_list, K, U_bias, S_bias, mu, S, U)
        update_s_bias(song_ct, song_rating_list, U, S, U_bias, mu, S_bias)
        update_s(song_ct, song_rating_list, K, U, U_bias, S_bias, mu, S)

        # Redraw the 50-character bar in place (carriage return, no newline).
        fraction = completed / T
        bar = '=' * round(fraction * 50)
        print('|{0:50s}| {1:3f}%\r'.format(bar, fraction * 100), end='')

### Update Equations

In [356]:
def update_u(user_ct, user_rating_list, K, U_bias, S_bias, mu, S, U, regularization=None):
    """Re-solve each rated user's latent vector with the song factors fixed.

    For every user ``i`` that is a key of ``user_rating_list`` this solves

        (sum_j s_j s_j^T + lam * I) u_i = sum_j (r_ij - U_bias[i] - S_bias[j] - mu) s_j

    over the songs ``j`` rated by that user (``s_j = S[:, j]``) and stores
    the solution in ``U[i, :]``. Users that are not keys are left untouched.

    Args:
        user_ct: Users ``0 .. user_ct - 1`` are visited.
        user_rating_list: Mapping user id -> list of (song id, rating).
        K: Number of latent factors (size of the linear system).
        U_bias: Per-user bias array (read only).
        S_bias: Per-song bias array (read only).
        mu: Global mean rating.
        S: Song factor matrix, indexed ``S[:, song]`` (read only).
        U: User factor matrix, indexed ``U[user, :]``; modified in place.
        regularization: Ridge strength ``lam``. When ``None`` the
            module-level ``reg`` is used, which is what existing callers get.

    Raises:
        numpy.linalg.LinAlgError: If a user's system is singular (possible
            only when the regularization is zero).
    """
    lam = reg if regularization is None else regularization
    ridge = lam * np.eye(K)  # loop-invariant, built once

    for i in range(user_ct):
        if i not in user_rating_list:
            continue
        entries = user_rating_list[i]
        song_ids = np.array([j for j, _ in entries], dtype=np.intp)
        observed = np.array([r for _, r in entries], dtype=float)

        factors = S[:, song_ids]  # one column per rated song
        residual = observed - U_bias[i] - S_bias[song_ids] - mu
        # factors @ factors.T is the sum of outer products s_j s_j^T.
        U[i, :] = np.linalg.solve(factors @ factors.T + ridge, factors @ residual)
    
def update_s(song_ct, song_rating_list, K, U, U_bias, S_bias, mu, S, regularization=None):
    """Re-solve each rated song's latent vector with the user factors fixed.

    For every song ``j`` that is a key of ``song_rating_list`` this solves

        (sum_i u_i u_i^T + lam * I) s_j = sum_i (r_ij - U_bias[i] - S_bias[j] - mu) u_i

    over the users ``i`` who rated that song (``u_i = U[i, :]``) and stores
    the solution in ``S[:, j]``. Songs that are not keys are left untouched.

    Args:
        song_ct: Songs ``0 .. song_ct - 1`` are visited.
        song_rating_list: Mapping song id -> list of (user id, rating).
        K: Number of latent factors (size of the linear system).
        U: User factor matrix, indexed ``U[user, :]`` (read only).
        U_bias: Per-user bias array (read only).
        S_bias: Per-song bias array (read only).
        mu: Global mean rating.
        S: Song factor matrix, indexed ``S[:, song]``; modified in place.
        regularization: Ridge strength ``lam``. When ``None`` the
            module-level ``reg`` is used, which is what existing callers get.

    Raises:
        numpy.linalg.LinAlgError: If a song's system is singular (possible
            only when the regularization is zero).
    """
    lam = reg if regularization is None else regularization
    ridge = lam * np.eye(K)  # loop-invariant, built once

    for j in range(song_ct):
        if j not in song_rating_list:
            continue
        entries = song_rating_list[j]
        user_ids = np.array([i for i, _ in entries], dtype=np.intp)
        observed = np.array([r for _, r in entries], dtype=float)

        factors = U[user_ids, :]  # one row per rating user
        residual = observed - U_bias[user_ids] - S_bias[j] - mu
        # factors.T @ factors is the sum of outer products u_i u_i^T.
        S[:, j] = np.linalg.solve(factors.T @ factors + ridge, factors.T @ residual)
    
def update_u_bias(user_ct, user_rating_list, U, S, S_bias, mu, U_bias, regularization=None):
    """Recompute each rated user's bias from the current residuals.

    For every user ``i`` that is a key of ``user_rating_list``:

        U_bias[i] = sum_j (r_ij - U[i, :] . S[:, j] - S_bias[j] - mu) / (n_i + lam)

    where the sum runs over the ``n_i`` songs rated by that user. Users that
    are not keys are left untouched.

    Args:
        user_ct: Users ``0 .. user_ct - 1`` are visited.
        user_rating_list: Mapping user id -> list of (song id, rating).
        U: User factor matrix, indexed ``U[user, :]`` (read only).
        S: Song factor matrix, indexed ``S[:, song]`` (read only).
        S_bias: Per-song bias array (read only).
        mu: Global mean rating.
        U_bias: Per-user bias array; modified in place.
        regularization: Shrinkage term ``lam`` added to the rating count.
            When ``None`` the module-level ``reg`` is used, which is what
            existing callers get.
    """
    lam = reg if regularization is None else regularization

    for i in range(user_ct):
        if i not in user_rating_list:
            continue
        entries = user_rating_list[i]
        if not entries:
            # An empty residual sum gives a zero bias; set it directly so a
            # zero regularization cannot turn this into 0 / 0.
            U_bias[i] = 0.0
            continue
        song_ids = np.array([j for j, _ in entries], dtype=np.intp)
        observed = np.array([r for _, r in entries], dtype=float)

        predicted = U[i, :] @ S[:, song_ids]  # dot product with each rated song
        residual = observed - predicted - S_bias[song_ids] - mu
        U_bias[i] = residual.sum() / (len(entries) + lam)
    
def update_s_bias(song_ct, song_rating_list, U, S, U_bias, mu, S_bias, regularization=None):
    """Recompute each rated song's bias from the current residuals.

    For every song ``j`` that is a key of ``song_rating_list``:

        S_bias[j] = sum_i (r_ij - U[i, :] . S[:, j] - U_bias[i] - mu) / (n_j + lam)

    where the sum runs over the ``n_j`` users who rated that song. Songs that
    are not keys are left untouched.

    Args:
        song_ct: Songs ``0 .. song_ct - 1`` are visited.
        song_rating_list: Mapping song id -> list of (user id, rating).
        U: User factor matrix, indexed ``U[user, :]`` (read only).
        S: Song factor matrix, indexed ``S[:, song]`` (read only).
        U_bias: Per-user bias array (read only).
        mu: Global mean rating.
        S_bias: Per-song bias array; modified in place.
        regularization: Shrinkage term ``lam`` added to the rating count.
            When ``None`` the module-level ``reg`` is used, which is what
            existing callers get.
    """
    lam = reg if regularization is None else regularization

    for j in range(song_ct):
        if j not in song_rating_list:
            continue
        entries = song_rating_list[j]
        if not entries:
            # An empty residual sum gives a zero bias; set it directly so a
            # zero regularization cannot turn this into 0 / 0.
            S_bias[j] = 0.0
            continue
        user_ids = np.array([i for i, _ in entries], dtype=np.intp)
        observed = np.array([r for _, r in entries], dtype=float)

        predicted = U[user_ids, :] @ S[:, j]  # dot product with each rating user
        residual = observed - predicted - U_bias[user_ids] - mu
        S_bias[j] = residual.sum() / (len(entries) + lam)

In [360]:
import pickle

try:
    recommend(user_ct, song_ct, user_rating_list, song_rating_list, K, U_bias, S_bias, mu, S, U, T)
except KeyboardInterrupt:
    # Interrupting training is treated as "stop here": the interrupt is not
    # re-raised, and the arrays in their current state are pickled to disk.
    checkpoints = {
        "SaveU": U,
        "SaveS": S,
        "SaveU_bias": U_bias,
        "SaveS_bias": S_bias,
    }
    for file_name, array in checkpoints.items():
        # `with` closes each file even if pickle.dump raises part-way through.
        with open(file_name, 'wb') as file_object:
            pickle.dump(array, file_object)



## Export/Load Saved Models

In [264]:
# file_Name1 = "SaveU"
# file_Name2 = "SaveS"
# file_Name3 = "SaveU_bias"
# file_Name4 = "SaveS_bias"

# fileObject1 = open(file_Name1,'wb')
# fileObject2 = open(file_Name2,'wb')
# fileObject3 = open(file_Name3,'wb')
# fileObject4 = open(file_Name4,'wb')

# pickle.dump(U,fileObject1)
# pickle.dump(S,fileObject2)
# pickle.dump(U_bias,fileObject3)
# pickle.dump(S_bias,fileObject4)

# fileObject1.close()
# fileObject2.close()
# fileObject3.close()
# fileObject4.close()

In [322]:
# fileObject1 = open("SaveU",'rb')
# fileObject2 = open("SaveS",'rb')
# fileObject3 = open("SaveU_bias",'rb')
# fileObject4 = open("SaveS_bias",'rb')

# U = pickle.load(fileObject1)
# S = pickle.load(fileObject2)
# U_bias = pickle.load(fileObject3)
# S_bias = pickle.load(fileObject4)

# fileObject1.close()
# fileObject2.close()
# fileObject3.close()
# fileObject4.close()

## Make Recommendations

In [345]:
# First column of the tab-separated, header-less hidden-triplets file.
hidden_triplets = pd.read_csv('year1_valid_triplets_hidden.txt', sep='\t', header=None)
canonical_users_temp = np.asarray(hidden_triplets).T[0]

# Collapse runs of consecutive identical entries, keeping file order.
# Each entry is compared with the one before it; the first is compared with 0.
preceding = [0, *canonical_users_temp[:-1]]
canonical_users = [
    user
    for previous, user in zip(preceding, canonical_users_temp)
    if user != previous
]
last_user = canonical_users[-1] if canonical_users else 0

In [361]:
import operator

# Number of songs written per user line.
top_n = 500
user_total = len(canonical_users)
timect = 0

# Candidate songs are the keys of song_rating_list, in insertion order.
# Their factor columns and biases are gathered once, outside the user loop.
candidate_ids = np.fromiter(song_rating_list, dtype=np.intp)
candidate_factors = S[:, candidate_ids]
candidate_bias = S_bias[candidate_ids]

# `with` guarantees the file is closed even if a lookup below raises.
with open('submission.txt', 'w') as f:
    for timect, user in enumerate(canonical_users, start=1):
        user_id = user_to_id[user]

        # The trained prediction is mu + U_bias[u] + S_bias[s] + U[u] . S[:, s].
        # mu and U_bias[u] are constant for a given user and cannot change
        # the ordering, but the per-song bias can, so it is part of the score.
        scores = U[user_id] @ candidate_factors + candidate_bias

        # Stable sort on the negated scores: highest score first, ties kept
        # in candidate order.
        best = candidate_ids[np.argsort(-scores, kind='stable')[:top_n]]

        # Map internal song id -> song key -> Kaggle index.
        top = [str(song_to_kaggle[id_to_song[int(song_id)]]) for song_id in best]

        # Write line for that user
        f.write(' '.join(top) + '\n')

        if timect % 10 == 0:
            fraction = timect / user_total
            print('|{0:50s}| {1:3f}%\r'.format('=' * round(fraction * 50), fraction * 100), end='')

