In [117]:
import math
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
import sys,utils,models,time
import random

In [5]:
train_dir = "../train/"
test_dir = "../test/"
train_triplets_file=train_dir+"kaggle_visible_evaluation_triplets.txt"
test_triplets_file=train_dir+"year1_valid_triplets_hidden.txt"
user_file = train_dir+'kaggle_users.txt'
song_file = train_dir+'kaggle_songs.txt'

print "loading user-song matrix."    
user2id = {v:int(k) for k,v in enumerate(utils.load_users(user_file))}
song2id = {k:int(v) for k,v in utils.song_to_idx(song_file).items()}

s2u=utils.song_to_users(train_triplets_file,user2id,song2id)
print "Song:",len(song2id),"Listened Song:",len(s2u)
u2s=utils.user_to_songs(train_triplets_file,user2id,song2id)
print "User:",len(user2id),"Active User:",len(u2s)


loading user-song matrix.
Song: 386213 Listened Song: 163206
User: 110000 Active User: 110000


In [6]:
embeddings_file='../train/song_embedding_128.deepwalk'
model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

In [35]:
embeds = []
print u2s[0]
for s in u2s[0]:
     embeds.append(model.get_vector(str(s)))
user_vector = np.mean(embeds, axis=0)
model.similar_by_vector(user_vector, topn=10)

set([58821, 87433, 123630, 351764, 68212, 25150])


[(u'123630', 0.8817476630210876),
 (u'12985', 0.8367016911506653),
 (u'25150', 0.8291943073272705),
 (u'351764', 0.8251831531524658),
 (u'68212', 0.8052948713302612),
 (u'307202', 0.7937827706336975),
 (u'288653', 0.7857335805892944),
 (u'311604', 0.7789132595062256),
 (u'87433', 0.7662076354026794),
 (u'92547', 0.7626552581787109)]

In [67]:

def negative_sample(user, cnt):
    candidates = [s for s in s2u.keys() if s not in u2s[user]]
    prob = np.array([len(s2u[x]) for x in candidates])
    return np.random.choice(candidates,cnt,p=prob*1.0/sum(prob))

def get_embeddings(users):
    user_embeddings = []  # list
    user_action_embs = [] # list
    neg_embs = [] # list of list
    neg_cnt = 5
    for u in users:
        s_embs = {}
        for s in u2s[u]:
            if str(s) in model.vocab:
                s_embs[s] = model.get_vector(str(s))
        for s in s_embs:
            leave_one_out = [s_embs[si] for si in s_embs if si != s]
            user_embeddings.append(np.mean(leave_one_out, axis=0))
            user_action_embs.append(s_embs[s])
            user_neg_samples = []
            for neg in negative_sample(u, neg_cnt):
                if str(neg) in model.vocab:
                    user_neg_samples.append(model.get_vector(str(neg)))
            neg_embs.append(user_neg_samples)
    return user_embeddings, user_action_embs, neg_embs

In [73]:
user_embeddings, user_action_embs, negative_samples = get_embeddings([0,1,2,3,4,5])

In [122]:

class LTR(torch.nn.Module):
    def __init__(self, D_in, H=32):
        super(LTR, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, 1)

    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return torch.sigmoid(y_pred)

    def neg_loss(self, user_embedding, negative_embeddings):
        scores = []
        for ne in negative_embeddings:
            score = self.forward(torch.tensor(np.multiply(user_embedding , ne)))
            scores.append(score)
        return torch.mean(torch.tensor(scores))

    # users = list, user_actions = list, negatives = map
    def loss(self, user_embeddings, user_action_embs, negatives):
        loss = 0
        for i, user_embedding in enumerate(user_embeddings):
            pos_score = self.forward(torch.tensor(np.multiply(user_embedding,user_action_embs[i])))
            neg_score = self.neg_loss(user_embedding, negatives[i])
            loss = loss + neg_score-pos_score
        return loss

embedding_dim = 128
LTRmodel = LTR(embedding_dim)

In [126]:
num_users = len(u2s)
rand_indices = np.random.permutation(u2s.keys())
split_pos = int(num_users * 0.7)

train_users = rand_indices[:split_pos]
valid_users = rand_indices[split_pos:]

# model training

learning_rate = 1e-6
optimizer = torch.optim.SGD(LTRmodel.parameters(), lr=learning_rate)
times = []

In [127]:
num_epochs = 100
for epoch in range(num_epochs):
    start_time = time.time()
    batch_users = train_users[:16]
    user_embeddings,user_action_embs,negatives = get_embeddings(batch_users) # TODO
    loss = LTRmodel.loss(user_embeddings, user_action_embs, negatives)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    end_time = time.time()
    times.append(end_time-start_time)
    print epoch, times[-1] ,loss.item()
    random.shuffle(train_users)

0 39.7040579319 0.19899481535
1 34.4842088223 0.241581439972
2 37.550951004 0.167332708836
3 35.9972789288 0.415518283844
4 32.2171998024 0.173104763031
5 30.5230119228 -0.0439894795418
6 34.064964056 0.236192405224
7 33.5145881176 0.357805430889
8 31.7265470028 0.229601562023
9 30.2553269863 0.182775199413
10 23.354706049 0.165178060532
11 33.4721720219 0.447008311749
12 28.3315420151 0.284649252892
13 29.8890559673 0.221496403217
14 21.7300209999 0.377642571926
15 28.1625730991 0.246895551682
16 28.7426891327 0.135021924973
17 29.238106966 0.13764411211
18 26.2274849415 0.321985602379
19 29.1849589348 0.105818152428
20 36.2828021049 0.50313013792
21 23.138117075 0.0908598899841
22 26.2779741287 0.176581203938
23 22.0683288574 0.188812673092
24 28.0011918545 0.196877062321
25 36.2568790913 0.307262003422
26 36.4432480335 0.138478457928
27 29.0282199383 0.197724103928
28 22.2350361347 0.101239562035
29 32.3237440586 0.26782220602
30 26.3298380375 0.223353266716
31 31.4455490112 0.11789

-0.0006198287010192871