# Fit Exposure MF with exposure covariates (Location ExpoMF) to the Gowalla dataset

In [1]:
import glob
import os
# if you are using OPENBLAS, you might want to turn this option on. Otherwise, joblib might get stuck
# os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.sparse
import pandas as pd

In [2]:
import expomf_cov
import rec_eval

Change this to wherever you saved the processed data from [processGowalla.ipynb](processGowalla.ipynb)

In [3]:
# Directory holding the processed Gowalla files. Set the GOWALLA_DATA_ROOT
# environment variable to point somewhere else without editing this cell;
# when it is unset, the original hard-coded location is used.
DATA_ROOT = os.environ.get('GOWALLA_DATA_ROOT', '/home/waldorf/dawen.liang/gowalla_pro/')

In [4]:
# Read the user IDs and the venue IDs, one per line, with surrounding
# whitespace stripped. The order of each list is the order of the file.
with open(os.path.join(DATA_ROOT, 'unique_uid.txt'), 'r') as f:
    unique_uid = [line.strip() for line in f]

with open(os.path.join(DATA_ROOT, 'unique_sid.txt'), 'r') as f:
    unique_sid = [line.strip() for line in f]

In [5]:
# Dimensions of the user-by-item matrices. The items in this dataset are
# venues; they are counted under the name `n_songs`.
n_users, n_songs = len(unique_uid), len(unique_sid)

### Load the data and train the model

In [6]:
def load_data(csv_file, shape=(n_users, n_songs)):
    """Read a (uid, sid, rating) CSV file into a sparse user-by-item matrix.

    Parameters
    ----------
    csv_file : path of a CSV file with `uid`, `sid` and `rating` columns.
    shape : shape of the returned matrix. The default is captured when this
        cell is executed, so re-run the cell if `n_users` / `n_songs` change.

    Returns
    -------
    (matrix, rows, cols) : an int16 CSR matrix holding `rating` at position
        (uid, sid), together with the int32 arrays of row and column indices
        it was built from. scipy sums the values of repeated (uid, sid) pairs.
    """
    triplets = pd.read_csv(csv_file)
    rows = np.array(triplets['uid'], dtype=np.int32)
    cols = np.array(triplets['sid'], dtype=np.int32)
    ratings = triplets['rating']
    matrix = scipy.sparse.csr_matrix((ratings, (rows, cols)), dtype=np.int16, shape=shape)
    return matrix, rows, cols

In [7]:
# Training split: the sparse matrix plus the row/column index arrays it was built from.
train_csv = os.path.join(DATA_ROOT, 'train.num.csv')
train_data, rows, cols = load_data(train_csv)
# Binarize: every stored entry becomes 1, regardless of the original rating.
train_data.data = np.ones_like(train_data.data)

In [8]:
# Matrix shape, then the number of explicitly stored training entries.
# print(...) with one parenthesised argument prints the same text under
# Python 2 and is also valid Python 3, where the bare print statement is a
# SyntaxError.
print(train_data.shape)
print(train_data.data.shape)

(57629, 47198)
(804262,)


In [9]:
# Validation split, loaded the same way as the training split.
vad_csv = os.path.join(DATA_ROOT, 'vad.num.csv')
vad_data, rows_vad, cols_vad = load_data(vad_csv)
# Binarize: every stored entry becomes 1, regardless of the original rating.
vad_data.data = np.ones_like(vad_data.data)

In [10]:
# Held-out test split, loaded the same way as the training split.
test_csv = os.path.join(DATA_ROOT, 'test.num.csv')
test_data, rows_test, cols_test = load_data(test_csv)
# Binarize: every stored entry becomes 1, regardless of the original rating.
test_data.data = np.ones_like(test_data.data)

`feat_venue_locs.tsv` contains the location features (part of the [pre-processed data](http://dawenl.github.io/data/gowalla_pro.zip)), which are generated in the following way: 
- Run GMM (from [scikit-learn](http://scikit-learn.org/)) on all the venue locations.
- For each venue, take the expected cluster assignment as location features `pi`.

In [11]:
# Location feature table, read as float32. At this point its first column
# still carries the venue ID.
feat_path = os.path.join(DATA_ROOT, 'feat_venue_locs.tsv')
pi = np.loadtxt(feat_path, dtype='float32')

In [12]:
# Sanity check: row idx of the feature table must belong to venue
# unique_sid[idx]. The ID in the first column is formatted as an integer
# string before it is compared.
for idx, sid in enumerate(unique_sid):
    assert sid == "%d" % pi[idx, 0]

In [13]:
# The first column is the venue ID; it is not a feature, so drop it.
# NOTE: `pi` is reassigned from itself, so this cell is not idempotent:
# running it again without reloading `pi` strips one more column.
pi = pi[:, 1:]

In [15]:
# Settings for this run; each of them is forwarded to expomf_cov.ExpoMF below.
n_components = 100
max_iter = 20
max_epoch = 10
n_jobs = 20
# one regularisation weight, shared by lam_theta, lam_beta and lam_nu
lam = 1e-5
# here we use the best performing init_mu from per-item \mu_i experiment
init_mu = 0.01

# the output directory name records the main hyper-parameters of the run
save_dir = "Gowalla_Location_ExpoMF_params_K%d_lam%1.0E_initmu%1.0E_maxepoch%d" % (
    n_components, lam, init_mu, max_epoch)

coder = expomf_cov.ExpoMF(
    n_components=n_components,
    max_iter=max_iter,
    batch_size=1000,
    batch_sgd=10,
    max_epoch=max_epoch,
    init_std=0.01,
    n_jobs=n_jobs,
    random_state=98765,
    save_params=True,
    save_dir=save_dir,
    early_stopping=True,
    verbose=True,
    lam_y=1.,
    lam_theta=lam,
    lam_beta=lam,
    lam_nu=lam,
    init_mu=init_mu,
    learning_rate=.5)

In [16]:
# Fit on the binarized training matrix together with the location features `pi`.
# vad_data is presumably what the validation loss in the training log is computed on.
# NOTE(review): the meaning of batch_users and k is defined inside expomf_cov; confirm there.
coder.fit(train_data, pi, vad_data=vad_data, batch_users=5000, k=100)

ITERATION #0
	Updating user factors: time=178.12
	Updating item factors: time=190.42
	Updating user consideration factors...
		Epoch #0: initial validation loss = 3154.786
		Epoch #0: validation loss = 3133.727
		Epoch #1: initial validation loss = 3129.680
		Epoch #1: validation loss = 3129.385
		Epoch #2: initial validation loss = 3172.628
		Epoch #2: validation loss = 3173.474
		Epoch #3: initial validation loss = 3143.275
		Epoch #3: validation loss = 3143.484
		Epoch #4: initial validation loss = 3158.697
		Epoch #4: validation loss = 3159.602
		Epoch #5: initial validation loss = 3130.088
		Epoch #5: validation loss = 3130.258
		Epoch #6: initial validation loss = 3163.065
		Epoch #6: validation loss = 3163.948
		Epoch #7: initial validation loss = 3221.945
		Epoch #7: validation loss = 3222.674
		Epoch #8: initial validation loss = 3119.196
		Epoch #8: validation loss = 3119.460
		Epoch #9: initial validation loss = 3138.604
		Epoch #9: validation loss = 3138.755
	Updating user 

ExpoMF(batch_sgd=10, batch_size=1000, early_stopping=True, init_std=0.01,
    max_epoch=10, max_iter=20, n_components=100, n_jobs=20,
    random_state=98765,
    save_dir='Gowalla_Location_ExpoMF_params_K100_lam1E-05_initmu1E-02_maxepoch10',
    save_params=True, verbose=True)

It seems that the validation loss stops decreasing after a few epochs. However, we empirically found that it is still better to train for more epochs instead of stopping the SGD early.

## Evaluate the performance on the held-out test set

In [17]:
# Load the parameters written by the last training iteration.
# NOTE(review): assumes save_dir holds exactly one .npz file per iteration,
# numbered from 0, so that the last index is (file count - 1). Confirm in expomf_cov.
saved_files = glob.glob(os.path.join(save_dir, '*.npz'))
n_params = len(saved_files)

last_fname = 'ExpoMF_cov_K%d_mu%.1e_iter%d.npz' % (n_components, init_mu, n_params - 1)
params = np.load(os.path.join(save_dir, last_fname))
U, V = params['U'], params['V']
nu, alpha = params['nu'], params['alpha']

### Rank by $\mathbb{E}[y_{ui}] = \mu_{ui}\theta_u^\top\beta_i$

In [18]:
# Exposure-prior specification handed to the rec_eval metrics.
# NOTE(review): presumably rec_eval calls `func` with `params` to obtain mu_ui; confirm in rec_eval.
mu = {'params': [nu, pi, alpha], 'func': expomf_cov.get_mu}

# print(...) with one parenthesised argument prints the same text under
# Python 2 and is also valid Python 3, where the bare print statement is a
# SyntaxError.
print('Test Recall@20: %.4f' % rec_eval.recall_at_k(train_data, test_data, U, V, k=20, mu=mu, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(train_data, test_data, U, V, k=50, mu=mu, vad_data=vad_data))
print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(train_data, test_data, U, V, k=100, mu=mu, vad_data=vad_data))
print('Test MAP@100: %.4f' % rec_eval.map_at_k(train_data, test_data, U, V, k=100, mu=mu, vad_data=vad_data))

Test Recall@20: 0.1292
Test Recall@50: 0.1992
Test NDCG@100: 0.1252
Test MAP@100: 0.0478
