In [243]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform # distance.euclidean(a,b)
import tensorflow as tf

In [244]:
df = pd.read_csv('data/대전시.csv', sep='\t', index_col=False)
df.columns

Index(['Comment', 'Member ID', 'Member Nickname', 'Rating',
       'Restaurant Address', 'Restaurant ID', 'Restaurant Latitude',
       'Restaurant Longitude', 'Restaurant Name', 'Time'],
      dtype='object')

In [245]:
# N = number of users
# I = number of locations
# Z = number of topic, we choose.
N = len(df['Member ID'].unique())
I = len(df['Restaurant ID'].unique())
Z = 5
print(N, I, Z)

1153 852 5


In [246]:
def initialize(N, Z, I):
    theta = np.random.rand(N, Z)
    col_sum = theta.sum(axis=0)
    theta = theta / col_sum
    
    phi = np.random.rand(Z, I)
    return [theta, phi]

In [247]:
psi = initialize(N, Z, I)
theta = psi[0]; phi = psi[1]
print(theta.shape)
print(phi.shape)

(1153, 5)
(5, 852)


In [248]:
def getDist(beta, location):
    I = len(location)
    L = np.array([[x[1], x[2]] for x in location])
    dist = squareform(pdist(np.exp(-0.5*beta*L)))
    loc_id = np.array([x[0] for x in location])
    res = pd.DataFrame(dist, columns=loc_id, index=loc_id)
    return res

In [249]:
# location distance calculation
# using getDist function
# the index and column of the getDist output: Restaurant ID
beta = 0.5
df_loc = df[['Restaurant ID', 'Restaurant Latitude', 'Restaurant Longitude']]
location = sorted(list(set([tuple(x) for x in df_loc.to_records(index=False)])))
df_dist = getDist(beta, location)
print(df_dist.shape)

(852, 852)


In [250]:
# make x_um. x_um: dictionary. 
# key: user ID, value: list containing the restaurant ID that user visited
df2 = df.sort_values('Member ID')
df_user = df2[['Member ID', 'Restaurant ID']]
# 만약에 한 유저가 한 가게에 두번 방문해서 두번 리뷰를 남긴 걸 무시하고 싶으면 list 전에 set 씌울 것
x_um = {}
for index, row in df_user.iterrows():
    if row['Member ID'] not in x_um:
        x_um[row['Member ID']] = [row['Restaurant ID']]
    else:
        x_um[row['Member ID']].append(row['Restaurant ID'])

In [251]:
# Use equation (2) P(i|z, R_u, phi)
# the i is x_um here.
# So, in M.py, this part is the test of p_i function
test = x_um[359]
print(test)
userdist = df_dist.loc[test, test]
userdistSum = userdist.sum(axis=1)
print(userdist)
print(userdistSum)

loc_id = np.array([x[0] for x in location])
phi = np.exp(phi)
phi = pd.DataFrame(phi, columns=loc_id)
print(phi)
user_phi = phi.loc[:, test]
print(user_phi)
Px_um = user_phi * userdistSum
print("RESULT")
print(Px_um)

# calculate P(x_um|z, R_u, phi)

[16922, 16912]
              16922         16912
16922  0.000000e+00  2.242773e-09
16912  2.242773e-09  0.000000e+00
16922    2.242773e-09
16912    2.242773e-09
dtype: float64
     16835     16836     16837     16838     16841     16843     16845   \
0  1.273405  2.304468  1.307277  1.796477  1.390040  1.452742  1.620620   
1  1.443135  1.062552  1.424344  1.657316  1.086140  1.654399  1.069540   
2  1.526686  1.069106  1.159535  1.974262  1.435548  2.507787  2.141534   
3  1.857135  2.206482  1.409781  1.553539  1.031233  1.829043  1.268479   
4  1.598145  1.590848  1.696139  1.521148  2.175894  1.426466  2.073625   

     16847     16848     16852     ...       325525    325566    327787  \
0  1.177861  2.006295  1.823291    ...     2.011964  2.176126  1.371892   
1  1.889244  2.645781  1.513704    ...     1.717100  1.226050  1.109749   
2  1.976692  1.356787  1.146819    ...     1.007207  1.198435  1.409810   
3  1.513087  2.661030  2.517812    ...     1.562501  1.433402  2.493647  

In [252]:
# E-step, calculate P(z|u, m; \hat{psi}) (posterior)
mem_id = df_user['Member ID'].unique()
theta = pd.DataFrame(theta, index=mem_id)

# theta_uz => theta_(x_um)z
# Select theta for user u
theta_user = theta.loc[359].as_matrix()
Px_um = Px_um.as_matrix()
theta_user = theta_user.reshape(1, -1)
res = theta_user.T * Px_um

In [256]:
print(res)
sumRes = res.sum(axis=0)
print(sumRes)
posterior = res / sumRes
print(posterior)

[[  2.65409231e-12   2.46208886e-12]
 [  2.72606160e-12   3.05113320e-12]
 [  8.06853240e-13   7.38642431e-13]
 [  1.28748518e-13   1.71678293e-13]
 [  7.99455225e-14   8.78031505e-14]]
[  6.39570119e-12   6.51134593e-12]
[[ 0.41498066  0.37812288]
 [ 0.42623342  0.46858718]
 [ 0.12615556  0.11343929]
 [ 0.02013048  0.02636602]
 [ 0.01249988  0.01348464]]


In [259]:
# optimize theta_uz 
# This test is for user 359. Not whole user

theta_hat_numer = posterior.sum(axis=1)
print(theta_hat_numer)
theta_hat_denom = theta_hat_numer.sum()
print(theta_hat_denom)
theta_hat = theta_hat_numer / theta_hat_denom
print(theta_hat)

[ 0.79310354  0.8948206   0.23959484  0.0464965   0.02598452]
2.0
[ 0.39655177  0.4474103   0.11979742  0.02324825  0.01299226]


In [269]:
#df_user
#location = sorted(list(set([tuple(x) for x in df_loc.to_records(index=False)])))
indices = [list(x) for x in df_user.to_records(index=False)]
Indices = tf.SparseTensor(indices=indices, values=tf.ones(len(indices), dtype=tf.float64), dense_shape=[N, I])
P_hat = tf.placeholder(tf.float64, shape=[Z, N, I])
Theta = tf.placeholder(tf.float64, shape=[N, Z])
Phi = tf.placeholder(tf.float64, shape=[Z, I])
dist = tf.placeholder(tf.float64, shape=[I, I])

front = tf.exp(Phi)
back = tf.sparse_tensor_dense_matmul(Indices, dist)
P_numer = tf.expand_dims(front, axis=1) * back
P_denom = tf.expand_dims(tf.reduce_sum(P_numer, axis=2), axis=2)
P = P_numer / P_denom
log_Theta = tf.expand_dims(tf.transpose(tf.log(Theta)), axis=2)
loglike = P_hat * log_Theta * P
Q = tf.reduce_sum(tf.sparse_tensor_dense_matmul(Indices, tf.transpose(tf.reshape(loglike, [-1, I]))))
Phi_grad = tf.gradients(Q, Phi)