In [40]:
import pandas as pd
import numpy as np
import scipy
# numpy version should be 1.13, because of the heaviside function
from scipy.stats import pearsonr, logistic
from scipy.special import expit
from scipy.spatial.distance import squareform
from scipy.optimize import minimize
import time
import pdb

In [2]:
print(np.__version__)

1.13.1


In [3]:
def load_data():
    df = pd.read_csv('data/Daejeon_dataset.csv', delimiter='\t', index_col=False)
    return df

In [4]:
df = load_data()
df.head()

Unnamed: 0,Comment,Member ID,Member Nickname,Rating,Restaurant Address,Restaurant ID,Restaurant Latitude,Restaurant Longitude,Restaurant Name,Restaurant code,Restaurant subcode,Time
0,대전 둔산동에 위치한 퓨전 양식집! 입구부터 내부까지 인테리어도 넘 예쁘고 좋았담 ...,361632,써머칭구,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-07-07 06:44:19
1,"음식도 너무 맛있고 분위기도 짱 좋아요!!\n세명이서 자몽리코타샐러드, 백합마스카포...",567494,쌤J,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-01-27 16:23:00
2,오랜만에 또 방문한 모모가든이에요. 개인적으로 대전 탑3안에 꼽을 것 같아요!\n저...,47875,준영,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2016-06-17 10:32:42
3,모모가든 정말 대전에서 가본 곳 중 최고였어요!!!! 일단 자몽 리코타 치즈 샐러드...,47875,준영,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2015-10-10 16:04:11
4,타임월드 근처에 있는 집이었다. 가게가 의외로 찾기 쉽지 않았는데 가보니 여기에 이...,533317,푱,3,대전시 서구 둔산동 1081,188235,36.35188,127.37573,모모가든,4,16,2017-07-11 15:56:40


In [5]:
def get_pref_mats(df):
    '''
    This function generates the check-in matrix and sentiment-preference matrix
    
    Input:
    df, dataFrame. Return from load_data function

    Output:
    1. pref_checkin, check-in preference matrix
    2. pref_sentiment, sentiment preference matrix
    '''
    mem_id = sorted(df['Member ID'].unique()); loc_id = sorted(df['Restaurant ID'].unique())
    pref_checkin = pd.DataFrame(0, index=mem_id, columns=loc_id)
    pref_sentiment = pd.DataFrame(0, index=mem_id, columns=loc_id)
    
    for index, row in df.iterrows():
        # make sentiment preference matrix
        member = row['Member ID']; restaurant = row['Restaurant ID']; rating = row['Rating']
        pref_sentiment.loc[member, restaurant] = rating
        checkin = pref_checkin.loc[member, restaurant]
        if checkin == 0:
            checkin = 1
        elif checkin > 0:
            checkin += 1

        if checkin >= 3:
            checkin = 3

        pref_checkin.loc[member, restaurant] = checkin
    
    pref_checkin = np.array(pref_checkin); pref_sentiment = np.array(pref_sentiment)
    return pref_checkin, pref_sentiment

In [6]:
pref_checkin, pref_sentiment = get_pref_mats(df)

In [7]:
def compute_pref_final(pref_checkin, pref_sentiment):
    '''
    This function calculates the final preference matrix. 
    Equation (1) from the paper
    
    Input:
    1. pref_checkin, check-in preference matrix
    2. pref_sentiment, sentiment preference matrix

    Output:
    pref_final, final preference matrix
    '''
    pref_final = pref_checkin - np.sign(pref_checkin - pref_sentiment) * np.heaviside(np.abs(pref_checkin - pref_sentiment)-2, 0.5)
    return pref_final

In [8]:
Z = 5
N = len(df['Member ID'].unique()); I = len(df['Restaurant ID'].unique())
pref_final = compute_pref_final(pref_checkin, pref_sentiment)
print(pref_final.shape)

(1171, 854)


In [9]:
def get_UV(N, I, Z):
    U = np.random.rand(N, Z)
    V = np.random.rand(Z, I)
    
    return U, V

In [10]:
def get_sim_u(pref_final):
    '''
    This function returns similarity of users

    Input: pref_final, which is the return matrix of the compute_pref_final function
    Output: similarity of users, N * N matrix
    '''
    N, _ = pref_final.shape
    sim_u = []
    for n in range(N):
        temp = []
        for i in range(N):
            temp.append(pearsonr(pref_final[n], pref_final[i])[0])
        sim_u.append(temp)

    sim_u = np.array(sim_u)
    return sim_u


In [11]:
def get_sim_v(df):
    '''
    For two venues, the similarity score is set to 1 if both venues have the same sub-category in Foursquare
    and set 0 if there is no overlapping sub-category

    If two restaurants have the same cuisine code, similarity score is set to 1, else 0.
    '''
    location = []
    for index, row in df.iterrows():
        tmp = [row['Restaurant ID'], row['Restaurant code']]
        if tmp not in location:
            location.append(tmp)

    sim_v = []
    I = len(location)
    
    for i in range(I):
        temp = []
        current_code = location[i][1]
        for j in range(I):
            if location[j][1] == current_code:
                temp.append(1)
            else:
                temp.append(0)
        sim_v.append(temp)
        
    sim_v = np.array(sim_v)
    return sim_v

In [12]:

def get_coefficient(pref_final, sim_u, sim_v, U, V):
    '''
    lambda_u = sigma^2_R / sigma^2_U
    lambda_v = sigma^2_R / sigma^2_V
    alpha = sigma^2_R / sigma^2_simU
    beta = sigma^2_R / sigma^2_simV
    '''
    var_R = np.var(pref_final)
    lambda_u = var_R / np.var(U)
    lambda_v = var_R / np.var(V)
    alpha = var_R / np.var(sim_u)
    beta = var_R / np.var(sim_v)
    
    return lambda_u, lambda_v, alpha, beta

In [63]:
def get_log_posterior(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta):
    U = scipy.resize(U, (1171, 5))
    V = scipy.resize(V, (5, 854))
    print("log posterior")
    first_term = np.sum(pref_final - expit(U @ V))
    second_term = lambda_u * np.sum(U @ U.T) + lambda_v * np.sum(V @ V.T)
    third_term = alpha * np.sum((U - (sim_u @ U)) @ (U - (sim_u @ U)).T)
    fourth_term = beta * np.sum((V.T - (sim_v @ V.T)) @ (V.T - (sim_v @ V.T)).T)
    log_posterior = 0.5 * (first_term + second_term + third_term + fourth_term)
    print(log_posterior)
    return log_posterior

In [64]:
start = time.time()
U, V = get_UV(N, I, Z)
sim_u = get_sim_u(pref_final)
sim_v = get_sim_v(df)
lambda_u, lambda_v, alpha, beta = get_coefficient(pref_final, sim_u, sim_v, U, V)
end = time.time() - start
print(end)

73.90386700630188


In [65]:
print(lambda_u, lambda_v, alpha, beta)
print(sim_u.shape)
print(sim_v.shape)
print(U.shape)
print(V.shape)

0.0844959762728 0.0816335894613 0.409473051625 0.0374964183682
(1171, 1171)
(854, 854)
(1171, 5)
(5, 854)


In [66]:
log_posterior = get_log_posterior(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta)
print(log_posterior)

log posterior
964810831.687
964810831.687


In [75]:
def get_grad_u(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta):
    '''
    A: U, B: V 
    '''
    U = scipy.resize(U, (1171, 5))
    V = scipy.resize(V, (5, 854))
#     U = U.reshape(1171, 5)
    print("gradient")
#     pdb.set_trace()
    grad_u_first = (logistic.pdf(U @ V) * (expit(U @ V) - pref_final)) @ V.T
    grad_u_second = lambda_u * U + alpha * (U - sim_u @ U)
    grad_u_third = -alpha * (sim_u @ (U - sim_u @ U))
    grad_u = grad_u_first + grad_u_second + grad_u_third
    grad_u = np.ndarray.flatten(grad_u)
    print(grad_u)
    return grad_u

def get_grad_v(U, V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta):
#     U = U.reshape(1171, 5)
    print("gradient")
    U = scipy.resize(U, (1171, 5))
    V = scipy.resize(V, (5, 854))
    grad_v_first = (logistic.pdf(U @ V) * (expit(U @ V)-pref_final)).T @ U
    grad_v_second = (lambda_v * V).T + beta * (V.T - sim_v @ V.T)
    grad_v_third = -beta * (sim_v @ (V.T - sim_v @ V.T))
    grad_v = grad_v_first + grad_v_second + grad_v_third
    grad_v = np.ndarray.flatten(grad_v)
    print(grad_v)
    return grad_v

In [76]:
estimated_u = minimize(get_log_posterior, 
                  x0 = U,
                  args = (V, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta), jac=get_grad_u)
print("===================================================")
print("U estimation ended")
print("===================================================")
estimated_v = minimize(get_log_posterior, 
                  x0 = V,
                  args = (U, pref_final, sim_u, sim_v, lambda_u, lambda_v, alpha, beta), jac=get_grad_v)

gradient
[  97.57498662  112.17261354  106.85804245 ...,   41.27812162   41.11174915
   43.74704483]
log posterior
964810831.687
log posterior
943017866.704
gradient
[  96.46299789  110.91514677  105.72479099 ...,   41.76824502   41.62763777
   44.2684875 ]
log posterior
868237887.164
gradient
[  92.01460052  105.8849537   101.19130215 ...,   43.72867256   43.69110289
   46.35415362]
log posterior
671265934435.0
gradient
[-7447.25987641 -8359.4899833  -8018.48828249 ...,   596.24458778
   520.66404491   603.19409498]
log posterior
868070413.083
gradient


  return -x - 2. * sc.log1p(np.exp(-x))


[  91.99656944  105.85933527  101.16778789 ...,   43.76586323   43.72573254
   46.39053336]
log posterior
868070348.474
gradient
[  91.99656248  105.85932537  101.16777881 ...,   43.76587758   43.7257459
   46.39054739]
log posterior
164207794883.0
gradient
[-3702.84678244 -4151.10731103 -3983.91998433 ...,   294.96553811
   257.12024863   298.24711297]
log posterior
867898859.701
gradient
[  91.97804922  105.83304527  101.14364319 ...,   43.80398051   43.76122089
   46.42781364]
log posterior
867898793.578
gradient
[  91.97804207  105.83303513  101.14363387 ...,   43.80399521   43.76123457
   46.42782801]
log posterior
39574901790.5
gradient
[-1830.67797987 -2046.95839622 -1966.67650448 ...,   144.32905022
   125.35100706   145.77669589]
log posterior
867718254.273
gradient
[  91.95849644  105.80531539  101.11816066 ...,   43.84413169   43.79859835
   46.46707671]
log posterior
867718184.701
gradient
[  91.9584889   105.8053047   101.11815083 ...,   43.84414716   43.79861275
   46.467

In [88]:
# print(estimated_u.x.shape)
# print(estimated_v.x.shape)

# estimated_u = (estimated_u.x).reshape(1171, 5)
# estimated_v = (estimated_v.x).reshape(5, 854)
print(estimated_u.shape)
print(estimated_v.shape)
print(pref_final)
print(estimated_u @ estimated_v)
# print(estimated_)

(1171, 5)
(5, 854)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[[-1.90499125 -2.84163267 -2.17588484 ..., -1.33052188 -1.81154074
  -1.94077684]
 [ 1.56721535  2.16282554  1.67944642 ...,  1.04182514  1.17657953
   1.69225483]
 [ 0.52383018  1.02247596  0.48019874 ...,  0.56645277  0.92167598
   0.87087862]
 ..., 
 [-1.40833926 -1.89294611 -2.05066137 ..., -1.18159775 -1.17674293
  -1.34235526]
 [-1.24068427 -2.17670561 -1.32764476 ..., -0.99258282 -1.78451997
  -1.26993669]
 [-0.85971912 -1.2782225  -1.09717559 ..., -0.83877977 -0.91385161
  -1.12628605]]
