In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
from irt.evaluation import eval_utils
from irt.data import data_loader
import time
from irt.algorithms.spectral_estimator import spectral_estimate
from irt.algorithms import conditional_mle
from irt.algorithms import rasch_mml
from irt.algorithms import joint_mle
from irt.evaluation.eval_utils import log_likelihood_heldout, bayesian_auc, top_k_accuracy
import warnings
warnings.filterwarnings("ignore")
import torch as th
import os
from scipy.stats import norm

In [8]:
datasets = ["hetrec_2k", "ml_100k", "ml_1m", "each_movie"]

cmle_links = [
    "../experiment_results/may/may5/hetrec_2k_m=6829_119_0.8_CMLE.th",
    "../experiment_results/may/may5/ml_100k_m=2269_119_0.8_CMLE.th",
    "../experiment_results/may/may5/ml_1m_m=3260_119_0.8_CMLE.th",
    "../experiment_results/may/may5/each_movie_m=1327_119_0.8_CMLE.th"
]

prior_dist = [(0, 0.5), (0, 1.), (0, 1.5), (-1, 0.5), 
              (-1, 1.), (-1, 1.5), (1, 0.5), (1, 1.), 
              (1, 1.5), (-2, 0.5), (-2, 1), (-2, 1.5), 
              (2, 0.5), (2, 1.), (2, 1.5)]


In [9]:
seed = 119

test_loglik_arr = []

for i, dataset in enumerate(datasets):

    # Load data
    A, ratings =  getattr(data_loader, dataset)() # Use the default cutoff for each dataset
    
    cmle_link = cmle_links[i]
    cmle_res = th.load(cmle_link)
    
    est_cmle = cmle_res["est_cmle"]
    log_lik_wrong = cmle_res["loglik_cmle"]
    # Partition the data into train and test set using the global seed
    
    A_train_all, test_data = eval_utils.partition_data(A, p_train=0.8, p_test=0.2, seed=seed)
    binary_responses = eval_utils.extract_binary_responses(A_train_all)
    p_estimates_cmle = []
    loglik_cmle = []
    
    for mu, sigma in prior_dist:
        p_estimate_cmle = eval_utils.quadrature_p_response(est_cmle, sigma, mu)
        p_estimates_cmle.append(p_estimate_cmle)
        loglik_cmle.append(log_likelihood_heldout(p_estimate_cmle, binary_responses))

    mu_best_cmle, sigma_best_cmle = prior_dist[np.argmax(loglik_cmle)]
    p_estimate_cmle = p_estimates_cmle[np.argmax(loglik_cmle)]
    test_loglik_cmle = log_likelihood_heldout(p_estimate_cmle, test_data)
    test_loglik_arr.append(test_loglik_cmle)
    print(f"Dataset:{dataset}, w.length = {len(w)}, loglik (before) = {log_lik_wrong}, loglik correct = {test_loglik_cmle}")



Dataset:hetrec_2k, w.length = 1327, loglik (before) = -0.5622169237770317, loglik correct = -1.1193450374199119
Dataset:ml_100k, w.length = 1327, loglik (before) = -0.6251552834769624, loglik correct = -1.159089117252271
Dataset:ml_1m, w.length = 1327, loglik (before) = -0.6798305041936825, loglik correct = -1.1660113372541032
Dataset:each_movie, w.length = 1327, loglik (before) = -0.30508930284920566, loglik correct = -0.9460256371232257


In [10]:
test_loglik_arr

[-1.1193450374199119,
 -1.159089117252271,
 -1.1660113372541032,
 -0.9460256371232257]

In [11]:


for i, dataset in enumerate(datasets):    
    cmle_link = cmle_links[i]
    cmle_res = th.load(cmle_link)
    
    est_cmle = cmle_res["est_cmle"]
    cmle_res["loglik_cmle"] = test_loglik_arr[i]
    th.save(cmle_res, cmle_link)

