In [1]:
import re
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
%matplotlib inline

# Fix the seed of NumPy's global RNG so the random draws made later in the
# notebook (np.random.random in PlsaRecc.init_random_para) are repeatable.
np.random.seed(0)

## PLSA model

In [2]:
class PlsaRecc(object):
    '''
    PLSA recommender fitted with the EM algorithm.

    z denotes the latent topic, u a user and i an item. Attributes:
        usr_itm_mat : rating/count matrix n(u,i), shape (n_usr, n_itm)
        prb_u_z     : p(z|u),   shape (n_usr, n_topic), rows sum to 1
        prb_z_i     : p(i|z),   shape (n_topic, n_itm), rows sum to 1
        prb_ui_z    : p(z|u,i), shape (n_usr, n_itm, n_topic)
        prb_u_i     : p(i|u),   shape (n_usr, n_itm), filled by cal_recc_score
    '''
    def __init__(self, n_topic, rate_mat):
        '''
        Constructor

        n_topic  -- number of latent topics z
        rate_mat -- 2-D user x item array of non-negative ratings/counts
        '''
        self.usr_itm_mat = np.asarray(rate_mat, dtype=float)
        self.n_usr = self.usr_itm_mat.shape[0]
        self.n_itm = self.usr_itm_mat.shape[1]
        self.n_topic = n_topic

        # The builtin float is used as dtype: the np.float alias was removed
        # from NumPy in 1.24.
        self.prb_ui_z = np.zeros([self.n_usr, self.n_itm, self.n_topic], dtype=float)
        self.prb_u_i = np.zeros([self.n_usr, self.n_itm], dtype=float)
        # randomly initialize parameters (sets prb_u_z and prb_z_i)
        self.init_random_para()

    @staticmethod
    def _normalize_rows(weights, fallback=None):
        '''
        Return a copy of weights with every row rescaled to sum to 1.

        Rows whose total is not positive cannot be normalized; they are taken
        from fallback (same shape as weights) when it is given, otherwise they
        become uniform. This keeps a zero total from producing NaN rows.
        '''
        totals = weights.sum(axis=1)
        positive = totals > 0
        if fallback is None:
            normalized = np.full(weights.shape, 1.0 / max(weights.shape[1], 1))
        else:
            normalized = np.array(fallback, dtype=float)
        normalized[positive] = weights[positive] / totals[positive][:, None]
        return normalized

    @staticmethod
    def _log(verbose, message):
        '''Print message when verbose is true.'''
        if verbose:
            # Single-argument form: identical output on Python 2 and 3.
            print(message)

    def init_random_para(self):
        '''
        Randomly initialize p(z|u) and p(i|z).

        Values are drawn from np.random.random (prb_u_z first, then prb_z_i)
        and each row is rescaled to sum to 1 so both are distributions.
        '''
        self.prb_u_z = self._normalize_rows(
            np.random.random(size=(self.n_usr, self.n_topic)))
        self.prb_z_i = self._normalize_rows(
            np.random.random(size=(self.n_topic, self.n_itm)))

    def run_em(self, max_iter=200, norm_check=False, verbose=True):
        '''
        Run max_iter EM iterations, updating prb_ui_z, prb_z_i and prb_u_z.

        max_iter   -- number of EM iterations
        norm_check -- when true, zero E-step normalizers are replaced by 1
                      so the division cannot produce NaN/inf
        verbose    -- when true (default), print progress messages

        E step: p(z|u,i) is proportional to p(z|u) * p(i|z).
        M step: p(i|z) is proportional to sum_u n(u,i) * p(z|u,i) and
                p(z|u) is proportional to sum_i n(u,i) * p(z|u,i).
        The M-step sums start from zero, so the previous parameters influence
        the new ones only through the posterior. A user or topic whose
        expected count is zero keeps its previous distribution.
        '''
        rate = np.asarray(self.usr_itm_mat, dtype=float)
        self._log(verbose, 'Begin EM Algorithm')
        for stp in range(max_iter):
            self._log(verbose, 'Iteration ' + str(stp+1))
            self._log(verbose, 'Begin E step')
            # unnormalized p(z|u,i): broadcast (u,1,z) * (1,i,z) in place
            np.multiply(self.prb_u_z[:, None, :], self.prb_z_i.T[None, :, :],
                        out=self.prb_ui_z)
            norm_arr = self.prb_ui_z.sum(axis=2)
            # normalize over topics
            if norm_check:
                self._log(verbose, 'Check normalize constants are non-zero')
                norm_arr[norm_arr == 0.0] = 1.0
            self.prb_ui_z /= norm_arr[:, :, None]
            self._log(verbose, 'E step done')
            self._log(verbose, 'Begin M step')
            # expected counts, summed over users / items respectively
            topic_item = np.einsum('uj,ujk->kj', rate, self.prb_ui_z)
            user_topic = np.einsum('uj,ujk->uk', rate, self.prb_ui_z)
            # calculate p(i|z), normalized over items
            self.prb_z_i = self._normalize_rows(topic_item, self.prb_z_i)
            # calculate p(z|u), normalized over topics
            self.prb_u_z = self._normalize_rows(user_topic, self.prb_u_z)
            self._log(verbose, '%s %s %s' % (np.sum(self.prb_u_z),
                                             np.sum(self.prb_z_i),
                                             np.sum(self.prb_ui_z)))
            self._log(verbose, 'M step done')

    def cal_recc_score(self):
        '''
        Calculate p(i|u) = sum_z p(z|u) * p(i|z).

        Stores the (n_usr, n_itm) result in prb_u_i and returns it.
        '''
        print('Calculate recommendation scores')
        self.prb_u_i = np.dot(self.prb_u_z, self.prb_z_i)
        return self.prb_u_i

    def save_format(self):
        '''
        Save prb_u_z and prb_z_i with np.save under the names 'mat_u_z' and
        'mat_z_i' (relative to the current working directory).
        '''
        np.save('mat_u_z', self.prb_u_z)
        np.save('mat_z_i', self.prb_z_i)

## Read Data

In [3]:
# Parse ../data/ratings.csv into:
#   rate  -- list of [user_id, item_id] integer pairs, one per data row
#   usr   -- dict keyed by the distinct user-id strings (values unused)
#   itm   -- dict keyed by the distinct item-id strings (values unused)
# Only the first two comma-separated columns of each row are used.
rate = []
usr = {}
itm = {}
# `with` closes the file even if a row fails to parse.
with open('../data/ratings.csv', 'r') as fp:
    # The first line is the column header: show it, do not parse it.
    header = next(fp, '')
    if header:
        print(header.rstrip('\r\n'))
    for line in fp:
        if not line.strip():
            continue  # ignore blank lines (e.g. a trailing empty line)
        # Strip only the line terminator; slicing off the last character
        # would eat a digit when the final line has no trailing newline.
        arr = line.rstrip('\r\n').split(',')
        usr[arr[0]] = 0
        itm[arr[1]] = 0
        rate.append([int(arr[0]), int(arr[1])])
n_usr = len(usr)
n_itm = len(itm)
print('%d %d %d' % (len(usr), len(itm), len(rate)))

uid,lid,_id

5797 5157 63481


## Training

In [None]:
# Build the binary user x item matrix from the [user_id, item_id] pairs in
# `rate`, then fit a 10-topic PLSA model with 20 EM iterations.
#
# Ids are mapped to dense row/column indices through their sorted order
# instead of using `id - 1` directly: with `id - 1`, an id of 0 silently
# wraps around to the last row/column and any gap in the ids raises
# IndexError. For ids that are exactly 1..n the resulting matrix is the same.
# Row r of the matrix belongs to user_ids[r], column c to item_ids[c].
user_ids = sorted(set(pair[0] for pair in rate))
item_ids = sorted(set(pair[1] for pair in rate))
usr_row = dict((uid, row) for row, uid in enumerate(user_ids))
itm_col = dict((iid, col) for col, iid in enumerate(item_ids))

rate_mat = np.zeros([len(user_ids), len(item_ids)])
for uid, iid in rate:
    rate_mat[usr_row[uid], itm_col[iid]] = 1.0
model = PlsaRecc(10, rate_mat)
model.run_em(20, True)

Begin EM Algorithm
Iteration 1
Begin E step


In [None]:
# Persist the fitted parameters: PlsaRecc.save_format calls np.save on
# prb_u_z and prb_z_i under the names 'mat_u_z' and 'mat_z_i', relative to
# the current working directory.
model.save_format()