In [10]:
import numpy as np
from scipy.sparse.linalg import svds
#from alternating import optimize
from itertools import count
from scipy import sparse
from sklearn import linear_model
import time

In [15]:
datasetPath = '../../train_triplets.txt'
n =300000
b = 10
ignoreLessThan = 5
num_test = 200

# k is the num of freedom of Q's cols and P's rows
k = 30

# N is the num of iterations for the alternative optimization
N = 5

alpha = 0.5

In [2]:
# read first n triplets from the dataset
def readTriplets(datasetPath, n):
    row = []
    col = []
    data = []

    user_to_row = {}
    song_to_col = {}
    user_count = count()
    song_count = count()
    limitingCounter = 0
    with open(datasetPath) as f:
        for triplet in f:
            if limitingCounter ==n:
                break
            limitingCounter+=1

            uid, sid, play_count =triplet.split('\t')
            #print (uid, sid, play_count)
            if not uid in user_to_row:
                user_to_row[uid] = next(user_count)
            row.append(user_to_row[uid])
            if not sid in song_to_col:
                song_to_col[sid] = next(song_count)
            col.append(song_to_col[sid])        
            data.append(int(play_count))
    return row, col, data

$Preprocess 1:$

assign compress all entries in M into [i,i+1]

In [3]:
def resetData(data):
    data = np.floor(np.log(data)/np.log(2)+1)
    data[data>b]=b
    return data

$Preprocess 2:$

Remove invalid songs and users

In [4]:
# construct the M
# input: raw row,c col, data, and ignoreLessThan
# return csr sparse pruned M
def constructM(row,col,data,ignoreLessThan):
    M = sparse.csr_matrix((data, (row, col)))
    size = M.shape
    count = 0
    while(True):
        count += 1
        mask = M!=0
        dr = mask.sum(axis=1)
        M = M[np.where(dr>ignoreLessThan)[0],:]
        mask = M!=0
        dc = mask.sum(axis=0)
        M = M[:,np.where(dc>ignoreLessThan)[1]]
        if size == M.shape:
            break
        size = M.shape
    print (M.shape, count)
    return M

$Preprocess3 :$

Draw test data

In [5]:
#divide M into test and train datasets
# return test: first col of rows second col of cols and third col of data
#        train: same, train_M: csr sparse of trian M 
def divideM(M,num_test):
    # find out nonzeros of pruned M, and store the rows, cols, data in non0M
    row,col = sparse.csr_matrix.nonzero(M)
    data = M[row,col]
    data = np.asarray(data.tolist()[0])
    non0M = np.array([row,col,data]).T

    # randomly draw num_test nonzeros from M as test data
    draw = np.random.permutation(len(row))[:num_test]
    test = non0M[draw]

    # set train_M and assign all test data to 0s
    train = np.delete(non0M,draw,0)
    train_M = sparse.csr_matrix((train[:,2], (train[:,0], train[:,1])))
    print  (non0M.shape, test.shape, train.shape, train_M.shape, train_M[test[0,0],test[0,1]])
    return test,train,train_M

$Initialization :$

In [6]:
def initQP(train_M,k):
    U,s,V = sparse.linalg.svds(train_M,k=k)
    Q = U.dot(np.diag(s))
    P = np.dot(np.diag(s),V)
    print (Q.shape, P.shape)
    return Q,P

$AlternativeOptimization :$

In [7]:
def AlternativeOptimization(train_M,train,P,Q,N,alpha):
    COL = train_M.shape[1]
    ROW = train_M.shape[0]
    row = train[:,0].astype(int)
    col = train[:,1].astype(int)
    data = train[:,2]
    reg = linear_model.Ridge (alpha = alpha, fit_intercept=False)
    for n in range(N):
        for c in range(COL):
            temp = np.where(col==c)
            if len(temp[0])>0:
                X = Q[row[temp],:]
                y = data[temp]
                reg.fit(X,y)
                P[:,c] = reg.coef_
        QT = Q.T
        PT = P.T
        for r in range(ROW):
            temp = np.where(row==r)
            if len(temp[0])>0:
                X = PT[row[temp],:]
                y = data[temp]
                reg.fit(X,y)
                QT[:,r] = reg.coef_      
        P = PT.T
        Q = QT.T
    return Q,P

$PredictError :$

In [8]:
def predError(test,Q,P):
    row = test[:,0].astype(int)
    col = test[:,1].astype(int)
    data = test[:,2]
    error = 0
    for i in range(len(row)):
        error += (data[i]-np.dot(Q[row[i],:],P[:,col[i]]))**2
    return np.sqrt(error/len(row))

$RunTogether :$

In [16]:
row,col,data = readTriplets(datasetPath, n)
start_time = time.time()
data = resetData(data)
M = constructM(row,col,data,ignoreLessThan)
test,train,train_M = divideM(M,num_test)
Q,P = initQP(train_M,k)
Q,P = AlternativeOptimization(train_M,train,P,Q,N,alpha)
print (predError(test,Q,P))
print (time.time()-start_time)

((5693, 10966), 5)
((171225, 3), (200, 3), (171025, 3), (5693, 10966), 0.0)
((5693, 30), (30, 10966))
1.19976159561
39.5577721596
