In [1]:
import time, json
import numpy as np
import matplotlib.pyplot as plt
from src import generator as gen
from src.estimators import SNNEstimator, RidgeEstimator, GapEstimator
from src.general_snn import general_snn
from src import anchor_matrix as am

# Tests

Evaluate the effects of different anchor matrix finding methods:
- baseline: finding the _best_ anchor matrix
- using multiple good anchor matrices together
- using a non-complete matrix and imputing the missing values with averages

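Test on the following datasets:
- Recommendation system, limited MNAR (80x80)
- Recommendation system, limited MNAR (100x100)
- Recommendation system, limited MNAR (160x160)
- Recommendation system, general MNAR (80x80)
- Recommendation system, general MNAR (100x100)
- Recommendation system, general MNAR (160x160)

The 160x160 datasets are skipped for the baseline (`biclique_find`) method.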

Additional tests:
- when using multiple anchor matrices, how many estimates do we need?
- non-complete matrix, using whole matrix vs submatrix averages

In [2]:
def run_limited_MNAR(rating_matrix, P, biclique_search, estimator, num_estimates=1, num_runs=1, seed=None):
    """Score matrix restoration on a fixed rating matrix with sampled observation masks.

    For each of ``num_runs`` runs an observation mask is drawn as
    ``Binomial(1, P)`` from a generator seeded with ``seed``, the unobserved
    entries of a copy of ``rating_matrix`` are set to NaN, ``estimator.prepare``
    is called on the masked data, and ``general_snn`` restores the matrix
    (values bounded by ``min_val=1`` / ``max_val=5``).

    Args:
        rating_matrix: ground-truth ratings (numpy array; it is not modified).
        P: observation probabilities, used as the ``p`` argument of the binomial draw.
        biclique_search: anchor-matrix search callable forwarded to ``general_snn``.
        estimator: object exposing ``prepare(Y, D)``; also forwarded to ``general_snn``.
        num_estimates: forwarded to ``general_snn``.
        num_runs: number of independent mask draws to evaluate.
        seed: seed for ``np.random.default_rng``.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"time"``; each maps to
        ``{'mean': ..., 'std': ...}`` taken over the runs. Errors are computed
        over all entries of the matrix; time is wall-clock seconds spent in
        ``prepare`` plus ``general_snn``.
    """
    rng = np.random.default_rng(seed)
    rmse_per_run, mae_per_run, seconds_per_run = [], [], []

    for _run in range(num_runs):
        # Sample which entries are observed and hide the rest.
        mask = rng.binomial(1, P)
        observed = rating_matrix.copy()
        observed[mask == 0] = np.nan

        # Only the estimator preparation and the restoration itself are timed.
        started = time.time()
        estimator.prepare(observed, mask)
        restored = general_snn(
            mask, observed,
            estimator=estimator,
            biclique_search=biclique_search,
            num_estimates=num_estimates,
            min_val=1, max_val=5,
            print_progress=True,
        )
        seconds_per_run.append(time.time() - started)

        residual = (rating_matrix - restored).flatten()
        rmse_per_run.append(np.sqrt(np.mean(residual ** 2)))
        mae_per_run.append(np.mean(np.abs(residual)))

    def _summary(samples):
        # Mean and (population) standard deviation across runs.
        return {'mean': np.mean(samples), 'std': np.std(samples)}

    return {
        "RMSE": _summary(rmse_per_run),
        "MAE": _summary(mae_per_run),
        "time": _summary(seconds_per_run),
    }

def run_general_MNAR(latent_movie_matrix, inv_scale, biclique_search, estimator, num_estimates=1, num_runs=1, seed=None):
    """Score matrix restoration on freshly generated general-MNAR data.

    Each run asks ``gen.getRatingAndPropensityMatrix_general`` for a new
    rating matrix and propensity matrix, samples the observation mask, hides
    the unobserved ratings with NaN, and restores the matrix with
    ``general_snn`` (values bounded by ``min_val=1`` / ``max_val=5``).

    All randomness in this function comes from one generator created from
    ``seed``: it is handed to the data generator and is also used to draw the
    observation mask, so a fixed ``seed`` fixes every draw made here.

    Args:
        latent_movie_matrix: forwarded to the data generator; the third value
            returned by the generator replaces it for the following run.
        inv_scale: forwarded to the data generator.
        biclique_search: anchor-matrix search callable forwarded to ``general_snn``.
        estimator: object exposing ``prepare(Y, D)``; also forwarded to ``general_snn``.
        num_estimates: forwarded to ``general_snn``.
        num_runs: number of generated datasets to evaluate.
        seed: seed for ``np.random.default_rng``.

    Returns:
        dict with keys ``"RMSE"``, ``"MAE"`` and ``"time"``; each maps to
        ``{'mean': ..., 'std': ...}`` taken over the runs. Errors are computed
        over all entries of the matrix; time is wall-clock seconds spent in
        ``prepare`` plus ``general_snn``.
    """
    rng = np.random.default_rng(seed)
    RMSEs = []
    MAEs = []
    Times = []
    for _ in range(num_runs):
        rating_matrix, P, latent_movie_matrix = gen.getRatingAndPropensityMatrix_general(latent_movie_matrix, inv_scale, seed=rng)
        # Draw the mask from the seeded generator rather than the global
        # np.random state, so that `seed` really controls the whole run.
        # (Original author's note: not really needed as P[i,j] ∈ {0, 1}.)
        D = rng.binomial(1, P)
        Y = rating_matrix.copy()
        Y[D == 0] = np.nan
        # Only the estimator preparation and the restoration itself are timed.
        rtime = time.time()
        estimator.prepare(Y, D)
        Y_restored = general_snn(
          D, Y,
          estimator=estimator,
          biclique_search=biclique_search,
          num_estimates=num_estimates,
          min_val=1, max_val=5,
          print_progress=True
        )
        Times.append(time.time() - rtime)
        Error = (rating_matrix - Y_restored).flatten()
        RMSEs.append(np.sqrt(np.mean(Error ** 2)))
        MAEs.append(np.mean(np.abs(Error)))
    return {
        "RMSE": {'mean': np.mean(RMSEs), 'std': np.std(RMSEs)},
        "MAE": {'mean': np.mean(MAEs), 'std': np.std(MAEs)},
        "time": {'mean': np.mean(Times), 'std': np.std(Times)}
    }


### Datasets:

In [3]:
# Limited-MNAR data: one (rating matrix, propensity matrix) pair per inv_scale.
# The numeric suffix is the dataset size tag used throughout the notebook
# (inv_scale 1 -> 80, 0.8 -> 100, 0.5 -> 160).
(
    (rating_matrix_80, P_80),
    (rating_matrix_100, P_100),
    (rating_matrix_160, P_160),
) = [gen.getRatingAndPropensityMatrix(inv_scale=scale) for scale in (1, 0.8, 0.5)]

# General-MNAR data: only the third returned value (the latent movie matrix)
# is kept; ratings and propensities are regenerated from it on every run.
(
    (_, _, latent_movie_matrix_80),
    (_, _, latent_movie_matrix_100),
    (_, _, latent_movie_matrix_160),
) = [gen.getRatingAndPropensityMatrix_general(inv_scale=scale, seed=0) for scale in (1, 0.8, 0.5)]

In [4]:
# Experiment 1: every anchor-matrix search method x every base estimator,
# evaluated on the limited-MNAR and general-MNAR datasets.
estimators = [
    RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001),
    SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s >= 0.001)),
]
biclique_methods = [am.biclique_find, am.biclique_random, am.whole_matrix]
num_runs = 1

# res1: method -> estimator -> dataset -> res
res1 = {
    method.__name__: {est.__class__.__name__: {} for est in estimators}
    for method in biclique_methods
}

for biclique_method in biclique_methods:
    m_name = biclique_method.__name__

    # Only the random search combines several estimates.
    if biclique_method == am.biclique_random:
        num_estimates = 5
    else:
        num_estimates = 1

    for estimator in estimators:
        est_name = estimator.__class__.__name__

        # The whole-matrix "search" gets the base estimator wrapped in a
        # GapEstimator; results are still filed under the base estimator's name.
        if biclique_method == am.whole_matrix:
            est = GapEstimator(estimator, avg_base="submatrix")
        else:
            est = estimator

        print("\n", m_name, est_name)

        limited_cases = [
            ("l080", rating_matrix_80, P_80),
            ("l100", rating_matrix_100, P_100),
        ]
        general_cases = [
            ("g080", latent_movie_matrix_80, 1),
            ("g100", latent_movie_matrix_100, 0.8),
        ]
        # The 160-size datasets are not run with biclique_find
        # (presumably too expensive -- TODO confirm).
        if biclique_method != am.biclique_find:
            limited_cases.append(("l160", rating_matrix_160, P_160))
            general_cases.append(("g160", latent_movie_matrix_160, 0.5))

        scores = res1[m_name][est_name]
        for tag, ratings, propensities in limited_cases:
            scores[tag] = run_limited_MNAR(
                ratings, propensities, biclique_method, est, num_estimates, num_runs, 0)
        for tag, latent, inv_scale in general_cases:
            scores[tag] = run_general_MNAR(
                latent, inv_scale, biclique_method, est, num_estimates, num_runs, 0)

# Persist the nested result dictionary for later analysis.
with open('data/res1.json', 'w') as outfile:
    json.dump(res1, outfile, indent=2)


 biclique_find RidgeEstimator
 80/80
 100/100
 80/80
 100/100

 biclique_find SNNEstimator
 80/80
 100/100
 80/80
 100/100

 biclique_random RidgeEstimator
 80/80
 100/100
 160/160
 80/80
 100/100
 160/160

 biclique_random SNNEstimator
 80/80
 100/100
 160/160
 80/80
 100/100
 160/160

 whole_matrix RidgeEstimator
 80/80
 100/100
 160/160
 80/80
 100/100
 160/160

 whole_matrix SNNEstimator
 80/80
 100/100
 160/160
 80/80
 100/100
 160/160


## 2. Additional Tests

### 2.1 How many estimates do we need?

In [5]:
# Experiment 2.1: effect of the number of estimates when anchor matrices are
# picked with am.biclique_random, using the ridge base estimator.
estimator = RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001)
biclique_method = am.biclique_random

num_runs = 5
num_estimates_vals = [1, 3, 5, 10, 20]
# res2: num_est -> dataset -> res
# (json.dump writes the integer num_est keys as strings.)
res2 = {}
for num_estimates in num_estimates_vals:
    res2[num_estimates] = {}
    print(f"Num estimates: {num_estimates}")
    # A fixed seed makes the runner's random generator reproducible and gives
    # every num_estimates value the same starting state, so differences between
    # settings are not driven by a different seed.
    # NOTE(review): am.biclique_random may use its own random source -- confirm
    # whether it needs seeding as well.
    res2[num_estimates]["l80"] = run_limited_MNAR(
        rating_matrix_80, P_80, biclique_method, estimator, num_estimates, num_runs, seed=0)
    res2[num_estimates]["g80"] = run_general_MNAR(
        latent_movie_matrix_80, 1, biclique_method, estimator, num_estimates, num_runs, seed=0)

# Persist the results for later analysis.
with open('data/res2.json', 'w') as outfile:
    json.dump(res2, outfile, indent=2)

Num estimates: 1
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
Num estimates: 3
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
Num estimates: 5
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
Num estimates: 10
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
Num estimates: 20
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80
 80/80


### 2.2 Whole matrix: complete-matrix vs. submatrix averages

In [9]:
# Experiment 2.2: with the whole matrix as anchor (am.whole_matrix), compare
# GapEstimator's two avg_base settings, "submatrix" and "complete".
base_estimators = [
    RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001),
    SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s >= 0.001)),
]

biclique_method = am.whole_matrix

num_runs = 3
avg_basis = ["submatrix", "complete"]
# res3: estimator -> avg_base -> dataset -> res
# Keys are built from base_estimators (the list defined in this cell and
# iterated below), so the cell does not depend on variables left over from
# other cells.
res3 = {
    est.__class__.__name__: {avg_base: {} for avg_base in avg_basis}
    for est in base_estimators
}
for base_estimator in base_estimators:
    for avg_base in avg_basis:
        est = GapEstimator(base_estimator, avg_base=avg_base)
        ename = base_estimator.__class__.__name__
        print("\n", ename, avg_base)
        # A fixed seed makes the runner's random generator reproducible and
        # gives both avg_base settings the same starting state.
        res3[ename][avg_base]["l80"] = run_limited_MNAR(
            rating_matrix_80, P_80, biclique_method, est, 1, num_runs, seed=0)
        res3[ename][avg_base]["l100"] = run_limited_MNAR(
            rating_matrix_100, P_100, biclique_method, est, 1, num_runs, seed=0)
        res3[ename][avg_base]["l160"] = run_limited_MNAR(
            rating_matrix_160, P_160, biclique_method, est, 1, num_runs, seed=0)
        res3[ename][avg_base]["g80"] = run_general_MNAR(
            latent_movie_matrix_80, 1, biclique_method, est, 1, num_runs, seed=0)
        res3[ename][avg_base]["g100"] = run_general_MNAR(
            latent_movie_matrix_100, 0.8, biclique_method, est, 1, num_runs, seed=0)
        res3[ename][avg_base]["g160"] = run_general_MNAR(
            latent_movie_matrix_160, 0.5, biclique_method, est, 1, num_runs, seed=0)

# Persist the results for later analysis.
with open('data/res3.json', 'w') as outfile:
    json.dump(res3, outfile, indent=2)


 RidgeEstimator submatrix
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160

 RidgeEstimator complete
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160

 SNNEstimator submatrix
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160

 SNNEstimator complete
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160
 80/80
 80/80
 80/80
 100/100
 100/100
 100/100
 160/160
 160/160
 160/160


## 3. Ad-hoc single-run checks

In [4]:
# Ad-hoc check: one general-MNAR run on the 80-size latent matrix using the
# am.biclique_find anchor search and the SNN estimator. The returned summary
# dict is displayed as the cell output.
# NOTE(review): the saved output of this cell has no "time" entry although
# run_general_MNAR returns one, so it appears to come from an older version of
# the function -- re-run after Restart & Run All.
run_general_MNAR(
  latent_movie_matrix_80,
  inv_scale=1,
  biclique_search=am.biclique_find,
  estimator=SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s>=0.001)),
  num_estimates=1,
  num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.032498122274896575, 'std': 0.0},
 'MAE': {'mean': 0.00573244788127733, 'std': 0.0}}

In [5]:
# Ad-hoc check: same single general-MNAR run on the 80-size latent matrix with
# the am.biclique_find anchor search, but using the ridge estimator.
# No seed is passed, so the result is not reproducible between executions.
# NOTE(review): the saved output of this cell has no "time" entry although
# run_general_MNAR returns one -- the output looks stale; re-run.
run_general_MNAR(
  latent_movie_matrix_80,
  inv_scale=1,
  biclique_search=am.biclique_find,
  estimator=RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001),
  num_estimates=1,
  num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.02238232337683985, 'std': 0.0},
 'MAE': {'mean': 0.005750886642592301, 'std': 0.0}}

In [13]:
# Ad-hoc check: one general-MNAR run on the 80-size latent matrix using the
# whole matrix as anchor (am.whole_matrix) and a GapEstimator that wraps the
# SNN estimator with avg_base="complete".
# The commented-out lines below are the alternative settings that were toggled
# by hand (ridge base estimator / avg_base="submatrix").
# NOTE(review): the saved output of this cell has no "time" entry although
# run_general_MNAR returns one -- the output looks stale; re-run.
run_general_MNAR(
  latent_movie_matrix_80,
  inv_scale=1,
  biclique_search=am.whole_matrix,
  estimator=GapEstimator(
    estimator=SNNEstimator(spectral_rank_fun=lambda s, m, n: np.sum(s>=0.001)),
    #estimator=RidgeEstimator(reg_alpha=lambda sz, ratio: 0.001),
    #avg_base="submatrix",
    avg_base="complete",
  ),
  num_estimates=1,
  num_runs=1,
)

 80/80


{'RMSE': {'mean': 0.20820581852734021, 'std': 0.0},
 'MAE': {'mean': 0.12328157387088565, 'std': 0.0}}