In [1]:
import pandas as pd
import numpy as np
import scipy
# numpy version should be 1.13, because of the heaviside function
from scipy.stats import pearsonr, logistic
from scipy.special import expit
from scipy.spatial.distance import squareform
from scipy.optimize import minimize
import time
import pdb

In [2]:
print(np.__version__)

1.13.1


In [3]:
def load_data():
    df = pd.read_csv('data/Daejeon_dataset.csv', delimiter='\t', index_col=False)
    return df

In [4]:
df = load_data()
df.head()

Unnamed: 0,Comment,Member ID,Member Nickname,Rating,Restaurant Address,Restaurant ID,Restaurant Latitude,Restaurant Longitude,Restaurant Name,Restaurant code,Restaurant subcode,Time
0,대전 둔산동에 위치한 퓨전 양식집! 입구부터 내부까지 인테리어도 넘 예쁘고 좋았담 ...,361632,써머칭구,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-07-07 06:44:19
1,"음식도 너무 맛있고 분위기도 짱 좋아요!!\n세명이서 자몽리코타샐러드, 백합마스카포...",567494,쌤J,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-01-27 16:23:00
2,오랜만에 또 방문한 모모가든이에요. 개인적으로 대전 탑3안에 꼽을 것 같아요!\n저...,47875,준영,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2016-06-17 10:32:42
3,모모가든 정말 대전에서 가본 곳 중 최고였어요!!!! 일단 자몽 리코타 치즈 샐러드...,47875,준영,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2015-10-10 16:04:11
4,타임월드 근처에 있는 집이었다. 가게가 의외로 찾기 쉽지 않았는데 가보니 여기에 이...,533317,푱,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-07-11 15:56:40


In [5]:
def get_pref_mats(df):
    '''
    This function generates the check-in matrix and sentiment-preference matrix
    
    Input:
    df, dataFrame. Return from load_data function

    Output:
    1. pref_checkin, check-in preference matrix
    2. pref_sentiment, sentiment preference matrix
    '''
    mem_id = sorted(df['Member ID'].unique()); loc_id = sorted(df['Restaurant ID'].unique())
    pref_checkin = pd.DataFrame(0, index=mem_id, columns=loc_id)
    pref_sentiment = pd.DataFrame(0, index=mem_id, columns=loc_id)
    
    for index, row in df.iterrows():
        # make sentiment preference matrix
        member = row['Member ID']; restaurant = row['Restaurant ID']; rating = row['Rating']
        pref_sentiment.loc[member, restaurant] = rating
        checkin = pref_checkin.loc[member, restaurant]
        if checkin == 0:
            checkin = 1
        elif checkin > 0:
            checkin += 1

        if checkin >= 3:
            checkin = 3

        pref_checkin.loc[member, restaurant] = checkin
    
    pref_checkin = np.array(pref_checkin); pref_sentiment = np.array(pref_sentiment)
    return pref_checkin, pref_sentiment

In [6]:
pref_checkin, pref_sentiment = get_pref_mats(df)

In [7]:
def compute_pref_final(pref_checkin, pref_sentiment):
    '''
    This function calculates the final preference matrix. 
    Equation (1) from the paper
    
    Input:
    1. pref_checkin, check-in preference matrix
    2. pref_sentiment, sentiment preference matrix

    Output:
    pref_final, final preference matrix
    '''
    pref_final = pref_checkin - np.sign(pref_checkin - pref_sentiment) * np.heaviside(np.abs(pref_checkin - pref_sentiment)-2, 0.5)
    return pref_final

In [8]:
Z = 5
N = len(df['Member ID'].unique()); I = len(df['Restaurant ID'].unique())
pref_final = compute_pref_final(pref_checkin, pref_sentiment)
print(pref_final.shape)

(1171, 854)


In [9]:
def get_UV(N, I, Z):
    U = np.random.rand(N, Z)
    V = np.random.rand(Z, I)
    
    return U, V

In [10]:
def get_sim_u(pref_final):
    '''
    This function returns similarity of users

    Input: pref_final, which is the return matrix of the compute_pref_final function
    Output: similarity of users, N * N matrix
    '''
    N, _ = pref_final.shape
    sim_u = []
    for n in range(N):
        temp = []
        for i in range(N):
            temp.append(pearsonr(pref_final[n], pref_final[i])[0])
        sim_u.append(temp)

    sim_u = np.array(sim_u)
    return sim_u


In [11]:
def get_sim_v(df):
    '''
    For two venues, the similarity score is set to 1 if both venues have the same sub-category in Foursquare
    and set 0 if there is no overlapping sub-category

    If two restaurants have the same cuisine code, similarity score is set to 1, else 0.
    '''
    location = []
    for index, row in df.iterrows():
        tmp = [row['Restaurant ID'], row['Restaurant code']]
        if tmp not in location:
            location.append(tmp)

    sim_v = []
    I = len(location)
    
    for i in range(I):
        temp = []
        current_code = location[i][1]
        for j in range(I):
            if location[j][1] == current_code:
                temp.append(1)
            else:
                temp.append(0)
        sim_v.append(temp)
        
    sim_v = np.array(sim_v)
    return sim_v

In [12]:

def get_coefficient(pref_final, sim_u, sim_v, U, V):
    '''
    lambda_u = sigma^2_R / sigma^2_U
    lambda_v = sigma^2_R / sigma^2_V
    alpha = sigma^2_R / sigma^2_simU
    beta = sigma^2_R / sigma^2_simV
    '''
    var_R = np.var(pref_final)
    lambda_u = var_R / np.var(U)
    lambda_v = var_R / np.var(V)
    alpha = var_R / np.var(sim_u)
    beta = var_R / np.var(sim_v)
    
    return lambda_u, lambda_v, alpha, beta

In [28]:
def get_log_posterior(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta, N, I, Z):
    U = scipy.resize(U, (N, Z))
    V = scipy.resize(V, (Z, I))
    first_term = np.sum(pref_final - expit(U @ V))
    second_term = lambda_u * np.sum(U @ U.T) + lambda_v * np.sum(V @ V.T)
    third_term = alpha * np.sum((U - (sim_u @ U)) @ (U - (sim_u @ U)).T)
    fourth_term = beta * np.sum((V.T - (sim_v @ V.T)) @ (V.T - (sim_v @ V.T)).T)
    log_posterior = 0.5 * (first_term + second_term + third_term + fourth_term)
#     print("log_posterior:", log_posterior)
    return log_posterior

In [15]:
start = time.time()
U, V = get_UV(N, I, Z)
sim_u = get_sim_u(pref_final)
sim_v = get_sim_v(df)
lambda_u, lambda_v, alpha, beta = get_coefficient(pref_final, sim_u, sim_v, U, V)
end = time.time() - start
print(end)
print(sim_u)

74.52107787132263
[[ 1.         -0.00117233 -0.0016589  ..., -0.00117233 -0.00117233
  -0.00117233]
 [-0.00117233  1.          0.70669218 ..., -0.00117233 -0.00117233
  -0.00117233]
 [-0.0016589   0.70669218  1.         ..., -0.0016589  -0.0016589
  -0.0016589 ]
 ..., 
 [-0.00117233 -0.00117233 -0.0016589  ...,  1.         -0.00117233
  -0.00117233]
 [-0.00117233 -0.00117233 -0.0016589  ..., -0.00117233  1.         -0.00117233]
 [-0.00117233 -0.00117233 -0.0016589  ..., -0.00117233 -0.00117233  1.        ]]


In [16]:
log_posterior = get_log_posterior(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta)
print(log_posterior)

log posterior
948263720.437
948263720.437


In [29]:
def get_grad_u(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta, N, I, Z):
    '''
    A: U, B: V 
    '''
    U = scipy.resize(U, (N, Z))
    V = scipy.resize(V, (Z, I))
#     U = U.reshape(1171, 5)

#     pdb.set_trace()
    grad_u_first = (logistic.pdf(U @ V) * (expit(U @ V) - pref_final)) @ V.T
    grad_u_second = lambda_u * U + alpha * (U - sim_u @ U)
    grad_u_third = -alpha * (sim_u @ (U - sim_u @ U))
    grad_u = grad_u_first + grad_u_second + grad_u_third
    grad_u = np.ndarray.flatten(grad_u)
#     print("gradient_u:", grad_u)
    return grad_u

def get_grad_v(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta, N, I, Z):
#     U = U.reshape(1171, 5)

    U = scipy.resize(U, (N, Z))
    V = scipy.resize(V, (Z, I))
    grad_v_first = (logistic.pdf(U @ V) * (expit(U @ V)-pref_final)).T @ U
    grad_v_second = (lambda_v * V).T + beta * (V.T - sim_v @ V.T)
    grad_v_third = -beta * (sim_v @ (V.T - sim_v @ V.T))
    grad_v = grad_v_first + grad_v_second + grad_v_third
    grad_v = np.ndarray.flatten(grad_v)
#     print("gradient_v:", grad_v)
    return grad_v

In [34]:
cnt = 0
while True:
    cnt += 1
    print("---------------------------------------------------------------")
    print("%d th iteration" % cnt)
    print("---------------------------------------------------------------")
    u_res = minimize(get_log_posterior,
                     x0 = U, args = (V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta, N, I, Z),
                     jac = get_grad_u)

    v_res = minimize(get_log_posterior,
                     x0 = V, args = (U, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta, N, I, Z),
                     jac = get_grad_v)

    estimated_U = u_res.x.reshape(N, Z)
    estimated_V = v_res.x.reshape(Z, I)
    
    cond = np.sqrt(np.sum(np.square(U - estimated_U)) + np.sum(np.square(V - estimated_V)))
    print("condition value:", cond)
    print("U:", U)
    print("V:", V)
    print("estimated_U:", estimated_U)
    print("estimated_V:", estimated_V)
    condition = cond < 1e-01
    print("condition:", condition)

    if condition:
        break

    U, V = estimated_U, estimated_V

---------------------------------------------------------------
1 th iteration
---------------------------------------------------------------
condition value: 0.0
U: [[-0.75306978 -0.83532236 -0.92459367 -0.48295363 -0.78143274]
 [ 0.32773668  0.70252663  0.09724874  0.13565821  0.09000367]
 [-0.01041188  0.33742857 -0.18191843 -0.36660375 -0.13581435]
 ..., 
 [-0.6630746  -0.27726652 -0.80753745 -0.06012641 -0.40103471]
 [-0.99736775 -0.74537994 -1.01984131 -0.16698571 -0.56029732]
 [-0.0926893  -0.34548037 -0.07123231 -0.95459782 -0.28310296]]
V: [[ 0.16973245  0.12354289  0.01700917 ...,  0.32587061  0.80424156
   0.38910637]
 [ 0.41593245  0.20445953  0.80131759 ...,  0.57135617  0.0134101
   0.18418824]
 [ 0.16151429  0.64633566  0.37032339 ...,  0.3325219   0.61526965
   0.9062234 ]
 [ 0.62476066  0.26310118  0.07445813 ...,  0.47399694  0.31408562
   0.3650576 ]
 [ 0.75237087  0.92714424  0.78305655 ...,  0.6987696   0.80300382
   0.30874887]]
estimated_U: [[-0.75306978 -0.8353

In [24]:
# print(estimated_u.x.shape)
# print(estimated_v.x.shape)

# estimated_u = (estimated_u.x).reshape(1171, 5)
# estimated_v = (estimated_v.x).reshape(5, 854)
# print(estimated_u.shape)
# print(estimated_v.shape)
# print(pref_final)
# print(estimated_u @ estimated_v)
# print(estimated_)
print(U)
print(V)

[[-0.75306978 -0.83532236 -0.92459367 -0.48295363 -0.78143274]
 [ 0.32773668  0.70252663  0.09724874  0.13565821  0.09000367]
 [-0.01041188  0.33742857 -0.18191843 -0.36660375 -0.13581435]
 ..., 
 [-0.6630746  -0.27726652 -0.80753745 -0.06012641 -0.40103471]
 [-0.99736775 -0.74537994 -1.01984131 -0.16698571 -0.56029732]
 [-0.0926893  -0.34548037 -0.07123231 -0.95459782 -0.28310296]]
[[ 0.16973245  0.12354289  0.01700917 ...,  0.32587061  0.80424156
   0.38910637]
 [ 0.41593245  0.20445953  0.80131759 ...,  0.57135617  0.0134101
   0.18418824]
 [ 0.16151429  0.64633566  0.37032339 ...,  0.3325219   0.61526965
   0.9062234 ]
 [ 0.62476066  0.26310118  0.07445813 ...,  0.47399694  0.31408562
   0.3650576 ]
 [ 0.75237087  0.92714424  0.78305655 ...,  0.6987696   0.80300382
   0.30874887]]
