# Experiment: Reconstruction Quality of Sampled Signal

## Setup

In [None]:
# Make the project's `src` package importable and switch the working directory
# to the project root.
import sys
# NOTE(review): user-specific absolute path; adjust when running elsewhere.
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%cd $PROJECT_PATH

# Load the autoreload extension; mode 2 reloads modules before code is executed,
# so edits to the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#%pip install -q -r requirements.txt
#%pip install graphilp

In [None]:
import time
from tqdm import tqdm

from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
from sklearn.neighbors import kneighbors_graph

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
import src.utils.plotting as util_plt

## Load customer data (CAR)

In [None]:
# timeframe used by the queries below, as YYYY-MM-DD date strings
from_date, to_date = "2022-08-01", "2022-08-30"

In [None]:
# CAR data for the timeframe (where all customers gave feedback)
car_df = bq.join_car_nps(from_date, to_date)
# recommendation values (NPS data) for the same timeframe; they serve as the signal
answers_df = bq.nps_query_timeframe(from_date, to_date)
# split the answers by whether their client_id also occurs in CAR ...
in_car = answers_df.client_id.isin(car_df.client_id)
not_in_car_ids = answers_df.client_id[~in_car]
# ... and keep only those that can be assigned to a CAR customer
answers_df = answers_df[in_car]

In [None]:
# run the project's preprocessing on the CAR data and keep its two outputs
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
car_df = prep.car_df
client_ids = prep.client_ids
# optional: find the columns that still contain NaN values ...
#nan_cols = car_df.columns[car_df.isna().sum() > 0]
# ... and drop them for now
#car_df.drop(nan_cols, axis=1, inplace=True)

## Construct graph

In [None]:
# upper (directed) / lower (undirected) bound on graph density
d = 0.1
# one node per row of the preprocessed CAR data
n_nodes = len(car_df)
# number of neighbors per node that corresponds to the desired density
k = int(np.ceil(d * n_nodes))
t = time.perf_counter()
A = kneighbors_graph(
    car_df,
    n_neighbors=k,
    mode="connectivity",
    include_self=False,
    n_jobs=-1,
)
#A = A.maximum(A.T)  # symmetrize the matrix (but density can be significantly higher)
print(f"This took {time.perf_counter()-t:.3f} s")

In [None]:
# Density of the kNN graph in "undirected" mode. Half the number of stored
# entries of A is passed as the second argument (presumably the edge count;
# confirm against metrics.density).
metrics.density(n_nodes, A.getnnz()/2, mode="undirected")

In [None]:
# wrap the kNN connectivity matrix in the project's Graph class for faster
# neighborhood queries; `graph` is what the sampling and reconstruction cells use
graph = Graph(A)

## Signal smoothness

### Mask signal

In [None]:
def mask_signal(s, num_masked=None, index=None, val=0, seed=None):
    """
    Masks a signal vector either by setting random indices to a specified value (default: 0)
    or by setting a given list of indices to that value. The input is never modified.
    :param s: signal vector (NumPy array); a copy of it is masked and returned
    :param num_masked: number of randomly chosen entries to mask (only used if index is None)
    :param index: (optional) indices of entries that shall be masked; takes precedence
        over num_masked
    :param val: masking value
    :param seed: (optional) seed or numpy.random.Generator for the random choice; pass a
        fixed value to make the mask reproducible (default: fresh entropy on every call)
    :return: masked copy of the signal
    :raises ValueError: if index is None and num_masked is None or not greater than zero
    """
    masked_s = s.copy()

    # explicit indices win over random masking
    if index is not None:
        masked_s[index] = val
        return masked_s

    if num_masked is None or num_masked <= 0:
        raise ValueError("num_masked needs to be greater than zero")

    # draw distinct positions so that exactly num_masked entries are overwritten
    rng = np.random.default_rng(seed)
    rand_idx = rng.choice(len(s), num_masked, replace=False)
    masked_s[rand_idx] = val
    return masked_s

In [None]:
# signal: the answer values as an integer array
# NOTE(review): later cells treat s as one value per graph node (length n_nodes, same
# order as car_df) - confirm that answers_df is aligned with car_df
s = answers_df["answer_value"].values.astype(int)
# fraction of the n_nodes entries to hide
prct_masked = 0.1
num_masked = int(prct_masked * n_nodes)
# hidden entries are replaced by the placeholder -1
s_masked = mask_signal(s, num_masked=num_masked, val=-1)

### Compute sampling sets and reconstruct signal

In [None]:
from scipy import sparse

def reconstruct(graph, sampling_set, s, mu=0.01, adjacency=None, noise_std=0.01, seed=None):
    """
    Reconstructs a graph signal from noisy samples by solving the regularized system
    (diag(a) + mu * L) x = a * (s + noise), where a is the 0/1 indicator of the
    sampling set and L = diag(graph.deg) - adjacency. This is the same estimate as
    applying the matrix I - mu * inv(diag(a) + mu * L) @ L to the noisy signal, but it
    needs neither a dense n x n matrix nor an explicit inverse.
    :param graph: graph object; only its attribute deg (node degrees) is read
    :param sampling_set: iterable of node indices whose signal values are observed
    :param s: signal vector with one value per node
    :param mu: regularization weight of the Laplacian term
    :param adjacency: (optional) adjacency matrix; defaults to the notebook-level A
    :param noise_std: standard deviation of the Gaussian noise added to the signal
    :param seed: (optional) seed or numpy.random.Generator for the noise
    :return: reconstructed signal as a 1-D array of length n (same layout as s)
    :raises ValueError: if s does not have one entry per node
    """
    # imported locally so that the function does not depend on scipy.sparse.linalg
    # having been loaded as a side effect of another import
    from scipy.sparse.linalg import spsolve

    adj = A if adjacency is None else adjacency
    L = sparse.diags(graph.deg, 0) - adj
    n = L.shape[0]

    signal = np.asarray(s, dtype=float).ravel()
    if signal.shape[0] != n:
        raise ValueError(f"signal has {signal.shape[0]} entries but the graph has {n} nodes")

    # 0/1 indicator of the sampled nodes
    sampled = np.zeros(n, dtype=float)
    sampled[list(sampling_set)] = 1.0

    B = sparse.csc_matrix(sparse.diags(sampled, 0) + mu * L)

    # the noise is a flat vector like the signal; adding it to a column vector would
    # silently broadcast to an n x n matrix
    rng = np.random.default_rng(seed)
    observed = signal + noise_std * rng.normal(size=n)

    # only the sampled entries enter the right-hand side
    return spsolve(B, sampled * observed)

In [None]:
def mse(a, b):
    """
    Computes the mean squared error between two signals.
    :param a: first signal (array-like or scalar)
    :param b: second signal (array-like or scalar)
    :return: mean of the squared element-wise differences
    """
    x = np.asarray(a)
    y = np.asarray(b)
    # A flat vector and a column/row vector with the same number of entries would
    # broadcast to an n x n matrix and yield a meaningless error; drop the
    # length-1 axes so they are compared element by element instead.
    if x.shape != y.shape and x.size == y.size:
        x, y = np.squeeze(x), np.squeeze(y)
    return np.mean(np.square(x - y))

In [None]:
sampling_budgets = [20, 50, 100, 200]
# one sampling set per budget; only the first return value of bs_gda is kept
sets = [
    bs_gda(graph, budget, parallel=True)[0]
    for budget in tqdm(sampling_budgets)
]

In [None]:
# reconstruct the signal from every sampling set and record the error
# with respect to the full signal s
reconstr_errors = [
    mse(s, reconstruct(graph, sampling_set, s))
    for sampling_set in sets
]

In [None]:
def exact(B=None):
    """
    Computes the eigenvalue of smallest magnitude of a matrix with a sparse
    eigen-solver and prints it.
    :param B: (optional) matrix to analyze; it is passed to scipy's eigsh, which
        expects a real symmetric (or complex Hermitian) square matrix. If omitted,
        the notebook-level variable B is used.
    :return: the computed eigenvalue
    :raises NameError: if B is omitted and no notebook-level B exists
    """
    # imported locally so that the function does not depend on scipy.sparse.linalg
    # having been loaded as a side effect of another import
    from scipy.sparse.linalg import eigsh

    if B is None:
        # Assigning to B inside the function makes it a local name, so the global
        # has to be looked up explicitly.
        try:
            B = globals()["B"]
        except KeyError:
            raise NameError("exact() needs a matrix: pass B or define a global B") from None

    mat = sparse.csr_matrix(B)
    # which='SM' selects the eigenvalue of smallest magnitude
    se, _ = eigsh(mat, k=1, which='SM')
    print(f"smallest eigenvalue via eigen-decomposition: {se[0]}\n")
    return se[0]

In [None]:
# error curve over the sampling budgets; the trailing semicolon suppresses the
# textual output of the last call in the notebook
plt.plot(sampling_budgets, reconstr_errors)
plt.xlabel("Sampling budget")
plt.ylabel("Reconstruction MSE")
plt.title("Reconstruction errors for growing sampling budget");