# Experiment: Reconstruction Quality of Sampled Signal

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%cd $PROJECT_PATH

%load_ext autoreload
%autoreload 2

In [None]:
#!pip install -q -r requirements.txt
#!pip install mpire
#!pip install faiss-cpu==1.7.1 faiss-gpu==1.7.1

In [None]:
import time
from tqdm import tqdm

from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
import torch

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.db.zip_code_mapper import ZipCodeMapper
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
from src.graph import sample_evaluation
from src.graph.nearest_neighbors import NearestNeighbors
from src.gsp.reconstruction import reconstruct_signal, reconstruct_signal_direct, mse
import src.utils.plotting as util_plt
from src.utils import layout

## Build customer graph

### Load customer data and recommendation scores

In [None]:
# Timeframe of the queries below, given as "YYYY-MM-DD" strings.
from_date, to_date = "2023-01-01", "2023-01-30"

In [None]:
# read the CAR data (feedback score is known for all customers)
# NOTE(review): presumably this joins the CAR customer records with their NPS
# answers for the timeframe above -- inferred from the function name only,
# confirm in src.db.big_query
car_df = bq.join_car_nps(from_date, to_date)

### Preprocess the customer data

#### Remove unused features and impute missing values

In [None]:
# Run the project's Preprocessor on the raw CAR data and pick up its three
# outputs: the processed table, the client ids and the address zip codes.
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
car_df = prep.car_df
client_ids = prep.client_ids
adr_zips = prep.adr_zips

In [None]:
# take small sample
# Draw row positions without replacement. The size is capped at the number of
# available rows so that a short query result does not make np.random.choice
# raise ("cannot take a larger sample than population").
n_rows = car_df.shape[0]
sample = np.random.choice(n_rows, min(1000, n_rows), replace=False)
# `sample` holds positions, not index labels, so select with .iloc: this picks
# the same rows from all three objects even if their index is not 0..n-1.
# NOTE(review): assumes car_df, client_ids and adr_zips are row-aligned -- confirm in Preprocessor.
car_df = car_df.iloc[sample].reset_index(drop=True)
client_ids = client_ids.iloc[sample].reset_index(drop=True)
adr_zips = adr_zips.iloc[sample].reset_index(drop=True)

In [None]:
# display the (rows, columns) shape of the sampled customer table
car_df.shape

#### Remove unknown zip codes

In [None]:
# Mapper for zip_code -> (longitude, latitude).
zip_mapper = ZipCodeMapper()
# Customer zip codes as an integer-typed DataFrame.
adr_zip_df = pd.DataFrame(adr_zips, dtype=int)
# Boolean mask: True where the zip code is present in the mapper's index,
# i.e. where it can be mapped to coordinates.
known_zips = adr_zip_df["adr_zip"].isin(zip_mapper.zip_code_map.index)
# Filter all three objects with the same mask and renumber their rows.
car_df = car_df.loc[known_zips].reset_index(drop=True)
client_ids = client_ids.loc[known_zips].reset_index(drop=True)
adr_zips = adr_zip_df.loc[known_zips].reset_index(drop=True)
# Cell output: shape of the customer table after filtering.
car_df.shape

#### Map zip codes to (longitude, latitude) pairs

In [None]:
# Translate the customers' zip codes into coordinates.
coords = zip_mapper.map_zip_codes_to_coords(adr_zips)
# The zip code column is no longer needed; drop it in place so that only the
# latitude/longitude columns remain.
coords.drop(labels="adr_zip", axis="columns", inplace=True)

### Construct graph

In [None]:
# Customer table as a float32 tensor on the first CUDA device.
X = torch.tensor(
    np.ascontiguousarray(car_df.to_numpy()),
    dtype=torch.float32,
    device=torch.device("cuda:0"),
)
# k-nearest-neighbor search (k=50) on the GPU; only the second return value
# (the neighbor index) is kept. The search itself is timed and reported.
knn = NearestNeighbors(device="gpu")
t = time.perf_counter()
_, k_neighbors = knn.knn(X, k=50)
print(f"This took {time.perf_counter()-t:.3f} s")

In [None]:
# build adjacency matrix from neighborhood index
A = knn.to_adj_matrix(k_neighbors)
# number of graph nodes = number of rows of the adjacency matrix
n_nodes = A.shape[0]

In [None]:
# The number of stored entries of A is used as the edge count:
#   directed graph:   count_nonzero(A) >= n_edges >= count_nonzero(A)/2
#   undirected graph: count_nonzero(A)/2 = n_edges
n_edges = A.nnz
# Cell output: density of the graph, treated as directed.
metrics.density(n_nodes, n_edges, mode="directed")

In [None]:
# wrap graph for faster neighborhood queries
# (the cells below also take the graph Laplacian from this object via graph.laplacian())
graph = Graph(A)

## Signal reconstruction error

### Load recommendation scores (signal)

In [None]:
# read the recommendation values (NPS data) that we will use as signal
answers_df = bq.nps_query_timeframe(from_date, to_date)
# remove answers that cannot be assigned to a customer in CAR
answers_df = answers_df[answers_df.client_id.isin(client_ids)].reset_index(drop=True)
# The signal needs exactly one value per graph node, in node order. The nodes
# are the (randomly sampled) rows of car_df, identified by client_ids, whereas
# the query result arrives in its own order and may hold several answers for
# one customer. Therefore: keep one answer per customer and look the answers
# up in client_ids order.
# NOTE(review): the first answer of each customer is kept -- confirm this choice.
answer_by_client = (
    answers_df.drop_duplicates(subset="client_id")
    .set_index("client_id")
    .answer_value
)
node_answers = answer_by_client.reindex(np.asarray(client_ids))
# Fail loudly instead of reconstructing against a signal with gaps.
if node_answers.isna().any():
    raise ValueError(
        f"{int(node_answers.isna().sum())} graph nodes have no recommendation score"
    )
s = node_answers.to_numpy().astype(int)

In [None]:
# Plot the signal on a 50x20 inch canvas; the trailing semicolon suppresses
# the textual cell output of the plot call.
fig, ax = plt.subplots(figsize=(50, 20))
ax.plot(s);

### Test error of random sample

In [None]:
# Baseline: observe the signal on 100 distinct, randomly chosen nodes,
# reconstruct it from these samples and compare the result with the full signal.
rand_sample = np.random.choice(n_nodes, size=100, replace=False)
observed = s[rand_sample]
s_rec = reconstruct_signal(graph.laplacian(), rand_sample, observed)
rec_error = mse(s, s_rec)
print("MSE of random sample:", rec_error)

### Compute sampling sets and reconstruct signal

In [None]:
from bin.reconstruction_analysis import run_sampling_budget_experiment

# Sampling budgets to evaluate
# (alternative: four evenly spaced budgets between 100 and 200).
#sampling_budget = np.linspace(100, 200, 4)
sampling_budget = np.array([500, 600])
# Time the complete run, including the graph.laplacian() call; the elapsed
# seconds are the cell's output.
t = time.perf_counter()
run_sampling_budget_experiment(
    graph,
    graph.laplacian(),
    s,
    sampling_budget,
    "Customer",
    p_hops=6,
    parallel=True,
)
time.perf_counter() - t

In [None]:
from bin.reconstruction_analysis import run_p_hops_experiment

# Evaluate p_hops = 1, 2, ..., 12 at a fixed sampling budget of 100.
p_hops = np.arange(1, 13)
# Time the complete run, including the graph.laplacian() call; the elapsed
# seconds are the cell's output.
t = time.perf_counter()
run_p_hops_experiment(
    graph,
    graph.laplacian(),
    s,
    p_hops,
    "Customer",
    sampling_budget=100,
    parallel=True,
)
time.perf_counter() - t