In [1]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `irt` package) are picked up live, without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from irt.evaluation import eval_utils
from irt.data import data_loader
import matplotlib.pyplot as plt
import time
import girth
# from girth import rasch_conditional
from irt.data.rasch import generate_data
from irt.algorithms.spectral_estimator import spectral_estimate, construct_markov_chain, construct_markov_chain_accelerated
from irt.algorithms import conditional_mle
from irt.algorithms import rasch_mml
from irt.evaluation.eval_utils import log_likelihood_heldout, bayesian_auc, pairwise_disagreement_error, top_k_accuracy
import warnings
# NOTE(review): with no category/module filter this silences *every* warning
# for the rest of the session, so any warnings raised by the estimators or by
# numpy will not be shown. Consider narrowing the filter once the noisy
# warnings are identified.
warnings.filterwarnings("ignore")

In [78]:
# Dataset selection: only the ml_1m loader is active in this run. It returns
# the response matrix together with a per-item ratings list.
cutoff = 25
A_ml_1m, ml_1m_ratings = data_loader.ml_1m(
    cutoff=cutoff,
    top_k_cutoff=100,
)

# Other loaders tried previously, left disabled:
#   A = data_loader.lsat()
#   A_lsat = data_loader.lsat()
#   A_ml_100k, ml_100k_ratings = data_loader.ml_100k(cutoff=cutoff)
#   A_hetrec_2k, hetrec_2k_ratings = data_loader.hetrec_2k(cutoff=50)
#   A_bx, bx_ratings = data_loader.bx_book(cutoff=200)
#   A_genome, genome_ratings = data_loader.book_genome(cutoff=200)
#   A_ml_20m, ml_20m_ratings = data_loader.ml_20m(cutoff=100)

In [79]:
# Sanity check: matrix dimensions on the first line, number of rating
# records on the second.
print(A_ml_1m.shape, len(ml_1m_ratings), sep="\n")

(2934, 6040)
2006


In [80]:

# Bind the loaded dataset to the generic names used by the cells below.
A = A_ml_1m
ratings = ml_1m_ratings

# Order the rating records by their second field, largest first. Each record
# is a 3-tuple whose first field is the item identifier.
sorted_ratings = list(ratings)
sorted_ratings.sort(key=lambda record: record[1], reverse=True)

# Reference ranking: item identifiers in that sorted order.
true_rank = [item_id for item_id, _second, _third in sorted_ratings]

# TODO: an unfinished note here read "Ignore items that have ..."; no such
# filtering is applied in this cell.


In [81]:
# Summary statistics of the rating counts among the first K ranked items.
K = 200
print(f"Data has shape {A.shape}")

# The third field of every record is that item's number of ratings.
all_num_ratings = [count for _, _, count in sorted_ratings[:K]]

# One line per statistic, in the order mean / max / median / min.
for stat_name, stat_fn in (
    ("mean", np.mean),
    ("max", np.max),
    ("median", np.median),
    ("min", np.min),
):
    print(f"{stat_name} num ratings in top {K}:", stat_fn(all_num_ratings))


Data has shape (2934, 6040)
mean num ratings in top 200: 864.76
max num ratings in top 200: 3428
median num ratings in top 200: 622.0
min num ratings in top 200: 104


In [82]:
# Held-out comparison of the marginal-MLE (MMLE) and spectral (ASE) estimators.
#
# For every fraction in `p_array`, the data is partitioned `n_trials` times
# into train/test. Each estimator is fitted on the leading `p_sub` share of
# the training columns and scored on the full test split: held-out
# log-likelihood, AUC, and top-K accuracy against `true_rank`. Fit time is
# recorded as well. Metrics are averaged over trials and printed per `p_sub`.

# ---- Configuration ---------------------------------------------------------
n_trials = 1
seed = 119
p_train = 0.8
sigma = 1                          # forwarded to bayesian_auc
K_array = [10, 25, 50, 200, 500]   # cut-offs used for top-K accuracy
p_array = [0.5, 0.75]              # fractions of training columns to keep
lambd_arr = [0.5, 1.]              # not read in this cell; kept in case another cell uses it
lambd = 1.                         # forwarded to spectral_estimate (hoisted: loop-invariant)

# One partition seed per trial, derived from a fixed master seed. The same
# seeds are reused for every p_sub, so all settings see identical splits.
np.random.seed(seed)
trial_seeds = np.random.randint(0, 9999, size=(n_trials,))

# Each trial contributes this share to the running means below.
trial_weight = 1. / n_trials

for p_sub in p_array:
    # Fresh accumulators for this setting.
    # NOTE: the CMLE and pairwise-disagreement ("Rank") accumulators are never
    # updated in this cell; they are printed as 0.0 placeholders only.
    auc_ase = auc_cmle = auc_mmle = 0.
    loglik_ase = loglik_cmle = loglik_mmle = 0.
    pd_ase = pd_cmle = pd_mmle = 0.
    time_ase = time_cmle = time_mmle = 0.
    top_K_ase = []
    top_K_mmle = []

    for i in range(n_trials):
        # Partition data
        all_train_data, test_data = eval_utils.partition_data(
            A, p_train=p_train, seed=trial_seeds[i]
        )

        # Keep only the leading p_sub fraction of the training columns.
        n_cols_kept = int(p_sub * all_train_data.shape[1])
        train_data = all_train_data[:, :n_cols_kept]

        # Conditional MLE (disabled).
        # NOTE(review): the disabled call referenced `data`, which this cell
        # does not define; presumably `train_data` was meant -- confirm before
        # re-enabling.
        # start = time.perf_counter()
        # est_cmle = conditional_mle.rasch_conditional(data, return_beta=True)
        # time_cmle += trial_weight * (time.perf_counter() - start)
        # loglik_cmle += trial_weight * log_likelihood_heldout(est_cmle, test_data)

        # Marginal MLE: only the fit itself is timed. perf_counter is a
        # monotonic clock, so the interval is unaffected by wall-clock changes.
        start = time.perf_counter()
        est_mmle = rasch_mml.rasch_mml(train_data, return_beta=True)
        time_mmle += trial_weight * (time.perf_counter() - start)

        loglik_mmle += trial_weight * log_likelihood_heldout(est_mmle, test_data)
        auc_mmle += trial_weight * bayesian_auc(est_mmle, test_data, sigma)
        est_rank_mmle = np.argsort(est_mmle)[::-1]  # indices by descending estimate
        top_K_mmle.append(
            [top_k_accuracy(true_rank, est_rank_mmle, k) for k in K_array]
        )

        # Accelerated spectral method: again only the fit is timed.
        start = time.perf_counter()
        est_ase = spectral_estimate(train_data, lambd=lambd, regularization="uniform")  # Note regularization
        time_ase += trial_weight * (time.perf_counter() - start)

        # NOTE(review): a third positional argument (2) is passed here but not
        # in the MMLE call above -- confirm the two log-likelihoods are
        # computed on a comparable basis.
        loglik_ase += trial_weight * log_likelihood_heldout(est_ase, test_data, 2)
        auc_ase += trial_weight * bayesian_auc(est_ase, test_data, sigma)
        est_rank_ase = np.argsort(est_ase)[::-1]
        top_K_ase.append(
            [top_k_accuracy(true_rank, est_rank_ase, k) for k in K_array]
        )

    # Rows are trials, columns follow K_array; averaged over trials below.
    top_K_ase = np.array(top_K_ase)
    top_K_mmle = np.array(top_K_mmle)

    print(
        f"p_sub: {p_sub}\n"
        f"Loglik: ASE={loglik_ase}, CMLE={loglik_cmle} MMLE={loglik_mmle}, \n"
        f"AUC: ASE={auc_ase}, CMLE={auc_cmle}, MMLE={auc_mmle}, \n"
        f"Rank: ASE={pd_ase}, CMLE={pd_cmle}, MMLE={pd_mmle}, \n"
        f"Top-K: ASE={np.mean(top_K_ase, 0)}, MMLE={np.mean(top_K_mmle, 0)}\n"
        f"Time: ASE={time_ase}, CMLE={time_cmle}, MMLE={time_mmle}"
    )

p_sub: 0.5
Loglik: ASE=-0.6307220307281772, CMLE=0.0 MMLE=-0.6330921665207141, 
AUC: ASE=0.699238860310662, CMLE=0.0, MMLE=0.7013380704243893, 
Rank: ASE=0.0, CMLE=0.0, MMLE=0.0, 
Top-K: ASE=[0.7   0.72  0.68  0.8   0.802], MMLE=[0.5   0.52  0.52  0.715 0.768]
Time: ASE=6.726557970046997, CMLE=0.0, MMLE=2.3011624813079834
p_sub: 0.75
Loglik: ASE=-0.6286397746971168, CMLE=0.0 MMLE=-0.6299505596007663, 
AUC: ASE=0.7006752322239423, CMLE=0.0, MMLE=0.7028008066056033, 
Rank: ASE=0.0, CMLE=0.0, MMLE=0.0, 
Top-K: ASE=[0.8   0.76  0.74  0.85  0.834], MMLE=[0.7   0.64  0.7   0.785 0.786]
Time: ASE=8.024992942810059, CMLE=0.0, MMLE=2.7260897159576416
