In [15]:
import time
import numpy as np
import matplotlib.pyplot as plt
from src import generator as gen
from src.estimators import SNNEstimator, RidgeEstimator, GapEstimator
from src.general_snn import general_snn
from src import anchor_matrix as am

# Tests

Evaluate the effects of different anchor matrix finding methods:
- baseline: finding the _best_ anchor matrix
- using multiple good anchor matrices together
- using an incomplete matrix and imputing the missing values with averages

Test on the following datasets:
- Recommendation system, limited MNAR (80x80)
- Recommendation system, limited MNAR (160x160)
- Recommendation system, general MNAR (80x80)
- Recommendation system, general MNAR (160x160)

Additional tests:
- when using multiple anchor matrices, how many estimates do we need?
- incomplete matrix: imputing with whole-matrix averages vs. submatrix averages

In [17]:
def run_limited_MNAR(rating_matrix, P, biclique_search, estimator, num_estimates=1, num_runs=1,
                     *, min_val=1, max_val=5, print_progress=True, seed=None):
    """Benchmark ``general_snn`` on a fixed rating matrix with sampled missingness.

    Every run draws an observation mask ``D`` element-wise from
    ``np.random.binomial(1, P)``, replaces the unobserved entries of a copy of
    ``rating_matrix`` with NaN, restores the matrix with ``general_snn`` and
    compares the restored matrix with ``rating_matrix`` over *all* entries
    (observed ones included).

    Args:
        rating_matrix: Ground-truth NumPy array. NaN is written into a copy of
            it, so it is expected to have a float dtype.
        P: Observation probabilities, passed as ``p`` to ``np.random.binomial``.
        biclique_search: Anchor-matrix search callable, forwarded unchanged to
            ``general_snn``.
        estimator: Object exposing ``prepare(Y, D)``; it is prepared on every
            run and then forwarded to ``general_snn``.
        num_estimates: Forwarded to ``general_snn``.
        num_runs: Number of independent mask draws to evaluate.
        min_val: Forwarded to ``general_snn`` (defaults to the previously
            hard-coded value 1).
        max_val: Forwarded to ``general_snn`` (defaults to the previously
            hard-coded value 5).
        print_progress: Forwarded to ``general_snn``; pass ``False`` to keep
            the notebook output quiet.
        seed: When not ``None``, NumPy's global RNG is seeded once before the
            first run so the sampled masks are reproducible. Components that
            do not draw from NumPy's global RNG are unaffected.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"time"``; each maps to a
        dict holding the ``'mean'`` and ``'std'`` of that quantity over the
        runs. ``"time"`` is in seconds and covers ``estimator.prepare`` plus
        the ``general_snn`` call.
    """
    if seed is not None:
        np.random.seed(seed)

    rmses, maes, durations = [], [], []
    for _ in range(num_runs):
        D = np.random.binomial(1, P)
        Y = rating_matrix.copy()
        Y[D == 0] = np.nan

        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        started = time.perf_counter()
        estimator.prepare(Y, D)
        Y_restored = general_snn(
            D, Y,
            estimator=estimator,
            biclique_search=biclique_search,
            num_estimates=num_estimates,
            min_val=min_val, max_val=max_val,
            print_progress=print_progress,
        )
        durations.append(time.perf_counter() - started)

        error = (rating_matrix - Y_restored).flatten()
        rmses.append(np.sqrt(np.mean(error ** 2)))
        maes.append(np.mean(np.abs(error)))

    return {
        "RMSE": {'mean': np.mean(rmses), 'std': np.std(rmses)},
        "MAE": {'mean': np.mean(maes), 'std': np.std(maes)},
        "time": {'mean': np.mean(durations), 'std': np.std(durations)}
    }

def run_general_MNAR(latent_movie_matrix, inv_scale, biclique_search, estimator, num_estimates=1, num_runs=1,
                     *, min_val=1, max_val=5, print_progress=True, seed=None):
    """Benchmark ``general_snn`` on freshly generated general-MNAR data.

    Every run asks ``gen.getRatingAndPropensityMatrix_general`` for a rating
    matrix and a propensity matrix ``P``, draws an observation mask ``D`` from
    ``np.random.binomial(1, P)``, replaces the unobserved entries of a copy of
    the rating matrix with NaN, restores it with ``general_snn`` and compares
    the restored matrix with the generated ratings over *all* entries.

    Args:
        latent_movie_matrix: Passed to the generator on the first run. The
            latent matrix returned by the generator is fed into the next run.
        inv_scale: Passed to the generator on every run.
        biclique_search: Anchor-matrix search callable, forwarded unchanged to
            ``general_snn``.
        estimator: Object exposing ``prepare(Y, D)``; it is prepared on every
            run and then forwarded to ``general_snn``.
        num_estimates: Forwarded to ``general_snn``.
        num_runs: Number of independently generated datasets to evaluate.
        min_val: Forwarded to ``general_snn`` (defaults to the previously
            hard-coded value 1).
        max_val: Forwarded to ``general_snn`` (defaults to the previously
            hard-coded value 5).
        print_progress: Forwarded to ``general_snn``; pass ``False`` to keep
            the notebook output quiet.
        seed: When not ``None``, NumPy's global RNG is seeded once before the
            first run. Only components drawing from NumPy's global RNG become
            reproducible. NOTE(review): whether the generator does so is not
            visible here — confirm in ``src.generator``.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"time"``; each maps to a
        dict holding the ``'mean'`` and ``'std'`` of that quantity over the
        runs. ``"time"`` is in seconds and covers ``estimator.prepare`` plus
        the ``general_snn`` call (data generation is excluded).
    """
    if seed is not None:
        np.random.seed(seed)

    rmses, maes, durations = [], [], []
    for _ in range(num_runs):
        # The returned latent matrix replaces the argument for the next run.
        rating_matrix, P, latent_movie_matrix = gen.getRatingAndPropensityMatrix_general(latent_movie_matrix, inv_scale)
        D = np.random.binomial(1, P) # not really needed as P[i,j] ∈ {0, 1}
        Y = rating_matrix.copy()
        Y[D == 0] = np.nan

        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        started = time.perf_counter()
        estimator.prepare(Y, D)
        Y_restored = general_snn(
            D, Y,
            estimator=estimator,
            biclique_search=biclique_search,
            num_estimates=num_estimates,
            min_val=min_val, max_val=max_val,
            print_progress=print_progress,
        )
        durations.append(time.perf_counter() - started)

        error = (rating_matrix - Y_restored).flatten()
        rmses.append(np.sqrt(np.mean(error ** 2)))
        maes.append(np.mean(np.abs(error)))

    return {
        "RMSE": {'mean': np.mean(rmses), 'std': np.std(rmses)},
        "MAE": {'mean': np.mean(maes), 'std': np.std(maes)},
        "time": {'mean': np.mean(durations), 'std': np.std(durations)}
    }


### Datasets:

In [3]:
# Limited-MNAR datasets: one rating matrix plus its propensity matrix per inv_scale.
# NOTE(review): the _80 / _160 suffixes presumably correspond to the 80x80 and
# 160x160 datasets (inv_scale=1 and inv_scale=0.5) — confirm in src.generator.
# No seed is passed to these two calls, unlike the general-MNAR calls below.
rating_matrix_80,  P_80  = gen.getRatingAndPropensityMatrix(inv_scale=1)
rating_matrix_160, P_160 = gen.getRatingAndPropensityMatrix(inv_scale=0.5)
# General-MNAR datasets: only the third return value (the latent movie matrix)
# is kept; run_general_MNAR regenerates ratings and propensities from it.
_, _, latent_movie_matrix_80  = gen.getRatingAndPropensityMatrix_general(inv_scale=1, seed=0)
_, _, latent_movie_matrix_160 = gen.getRatingAndPropensityMatrix_general(inv_scale=0.5, seed=0)

In [21]:
# Base estimators compared in the sweep. The ridge regularisation callback
# always returns 0.001; the SNN rank callback counts the entries of `s` that
# are >= 0.001.
estimators = [
    RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001),
    SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s >= 0.001)),
]
# Anchor-matrix search strategies under test (am.biclique_find is left out of
# this sweep).
biclique_methods = [
    am.biclique_random,
    am.whole_matrix,
]
num_runs = 1

for biclique_method in biclique_methods:
    # Only the random biclique search is asked for several estimates.
    if biclique_method == am.biclique_random:
        num_estimates = 5
    else:
        num_estimates = 1

    for estimator in estimators:
        # The whole-matrix strategy gets the estimator wrapped in a
        # GapEstimator that uses submatrix averages.
        if biclique_method == am.whole_matrix:
            est = GapEstimator(estimator, avg_base="submatrix")
        else:
            est = estimator

        print("\n", biclique_method.__name__, estimator.__class__.__name__)
        # Same trailing arguments for all four benchmark calls.
        shared_args = (biclique_method, est, num_estimates, num_runs)
        print(run_limited_MNAR(rating_matrix_80, P_80, *shared_args))
        print(run_limited_MNAR(rating_matrix_160, P_160, *shared_args))
        print(run_general_MNAR(latent_movie_matrix_80, 1, *shared_args))
        print(run_general_MNAR(latent_movie_matrix_160, 0.5, *shared_args))


 biclique_random RidgeEstimator
 80/80
{'RMSE': {'mean': 0.10361809910651641, 'std': 0.0}, 'MAE': {'mean': 0.04523245912113323, 'std': 0.0}, 'time': {'mean': 99.7135682106018, 'std': 0.0}}
 4/160

In [4]:
# Single general-MNAR run on latent_movie_matrix_80 with the am.biclique_find
# anchor search and an SNN estimator whose rank callback counts the entries of
# `s` that are >= 0.001.
snn_estimator = SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s >= 0.001))
run_general_MNAR(
    latent_movie_matrix_80,
    inv_scale=1,
    biclique_search=am.biclique_find,
    estimator=snn_estimator,
    num_estimates=1,
    num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.032498122274896575, 'std': 0.0},
 'MAE': {'mean': 0.00573244788127733, 'std': 0.0}}

In [5]:
# Single general-MNAR run on latent_movie_matrix_80 with the am.biclique_find
# anchor search and a ridge estimator whose regularisation callback always
# returns 0.001.
ridge_estimator = RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001)
run_general_MNAR(
    latent_movie_matrix_80,
    inv_scale=1,
    biclique_search=am.biclique_find,
    estimator=ridge_estimator,
    num_estimates=1,
    num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.02238232337683985, 'std': 0.0},
 'MAE': {'mean': 0.005750886642592301, 'std': 0.0}}

In [13]:
# Single general-MNAR run on latent_movie_matrix_80 with the am.whole_matrix
# strategy and a GapEstimator wrapping an SNN estimator, using avg_base="complete".
# Alternatives kept for reference:
#   estimator=RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001)
#   avg_base="submatrix"
gap_estimator = GapEstimator(
    estimator=SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s >= 0.001)),
    avg_base="complete",
)
run_general_MNAR(
    latent_movie_matrix_80,
    inv_scale=1,
    biclique_search=am.whole_matrix,
    estimator=gap_estimator,
    num_estimates=1,
    num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.20820581852734021, 'std': 0.0},
 'MAE': {'mean': 0.12328157387088565, 'std': 0.0}}