# Experiment: Sampling Set Extension

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%cd $PROJECT_PATH

%load_ext autoreload
%autoreload 2

In [None]:
#!pip install -r requirements.txt
#!pip install --force-reinstall faiss-cpu==1.7.2
#!pip install --force-reinstall faiss-gpu==1.7.2

In [None]:
import time

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import scipy.linalg
import scipy.sparse
from sklearn.neighbors import kneighbors_graph
import torch

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.db.zip_code_mapper import ZipCodeMapper
from src.gershgorin.bs_gda import bs_gda, bs_gda_extension
from src.graph.graph import Graph
from src.graph import metrics
from src.graph import sample_evaluation
from src.graph.nearest_neighbors import NearestNeighbors
from src.gsp.reconstruction import reconstruct_signal, mse
import src.utils.plotting as util_plt
from src.utils import layout

## Load customer data (CAR & NPS)

In [None]:
# Query window (start date, end date) passed to the data-loading calls below.
from_date, to_date = "2023-01-01", "2023-01-30"

In [None]:
# load the ground truth: query the joined CAR/NPS data for the selected timeframe
car_df = bq.join_car_nps(from_date, to_date)

#### Removing unused columns

In [None]:
# Hand the loaded frame to the Preprocessor (verbose output disabled).
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
# Pull the three results off the preprocessor one at a time.
car_df = prep.car_df
client_ids = prep.client_ids
adr_zips = prep.adr_zips

#### Take a small sample

In [None]:
# Draw (at most) 2000 distinct row positions at random and keep only those rows.
# Capping at the number of available rows avoids a ValueError on small frames.
n_sample = min(2000, car_df.shape[0])
sample = np.random.choice(car_df.shape[0], n_sample, replace=False)
# `sample` holds row positions (0 .. n_rows-1), so select with .iloc; label-based
# lookup (.loc / []) is only equivalent when the index is a default RangeIndex.
car_df = car_df.iloc[sample].reset_index(drop=True)
client_ids = client_ids.iloc[sample].reset_index(drop=True)
adr_zips = adr_zips.iloc[sample].reset_index(drop=True)

#### Removing customers with unknown zip codes

In [None]:
# load mapper for zip_code -> (longitude, latitude)
# (its `zip_code_map.index` and `map_zip_codes_to_coords` are used in the next cells)
zip_mapper = ZipCodeMapper()

In [None]:
# Put the customers' zip codes into a DataFrame with string dtype.
adr_zip_df = pd.DataFrame(adr_zips, dtype=str)
# Boolean mask: True where the zip code appears in the mapper's index, i.e. can be mapped.
known_zips = adr_zip_df["adr_zip"].isin(zip_mapper.zip_code_map.index)
# Apply the same mask to all three objects and renumber the remaining rows from 0.
adr_zips = adr_zip_df.loc[known_zips].reset_index(drop=True)
car_df = car_df[known_zips].reset_index(drop=True)
client_ids = client_ids[known_zips].reset_index(drop=True)

In [None]:
# Map every zip code to its coordinates, then discard the zip code column
# (per the original note, latitude and longitude remain).
# drop() without inplace=True returns a new frame, so the object handed back by
# the mapper is not mutated and re-running the cell stays side-effect free.
coords = zip_mapper.map_zip_codes_to_coords(adr_zips).drop(columns="adr_zip")

In [None]:
# Fetch the recommendation values (NPS data) for the timeframe; they serve as the signal later on.
answers_df = bq.nps_query_timeframe(from_date, to_date)
# Keep only the survey answers whose client_id is among the retained client_ids.
is_known_client = answers_df.client_id.isin(client_ids)
answers_df = answers_df[is_known_client]

## Construct graph

In [None]:
# Copy the feature table into a contiguous array and move it to CUDA device 0 as float32.
features = np.ascontiguousarray(car_df.to_numpy())
X = torch.tensor(features, device=torch.device('cuda', 0), dtype=torch.float32)
# Run the k-nearest-neighbor search (k=50) and report how long it took.
knn = NearestNeighbors(device="gpu")
t = time.perf_counter()
_, k_neighbors = knn.knn(X, k=50)
elapsed = time.perf_counter() - t
print(f"This took {elapsed:.3f} s")

In [None]:
# Manual construction of the kNN adjacency matrix from the neighbor indices.
# (The next cell rebuilds `A` through `knn.to_adj_matrix`.)
# Bring the indices to a NumPy array on the host; assumes k_neighbors has one
# row of neighbor indices per node - TODO confirm against NearestNeighbors.knn.
if torch.is_tensor(k_neighbors):
    neighbors_idx = k_neighbors.cpu().numpy()
else:
    neighbors_idx = np.asarray(k_neighbors)
n_nodes = neighbors_idx.shape[0]
A = scipy.sparse.lil_matrix((n_nodes, n_nodes), dtype=np.int8)
# Row i gets a 1 in every column listed in neighbors_idx[i]. The row index is a
# column vector so that it broadcasts against the (n_nodes, k) column index.
A[np.arange(n_nodes)[:, np.newaxis], neighbors_idx] = 1

In [None]:
# build adjacency matrix from neighborhood index
A = knn.to_adj_matrix(k_neighbors)
# number of nodes = number of rows of the adjacency matrix
n_nodes = A.shape[0]

In [None]:
# The number of stored non-zero entries of A is used as the edge count:
# directed graph: count_nonzero(A) >= n_edges >= count_nonzero(A)/2
# undirected graph: count_nonzero(A)/2 = n_edges
n_edges = A.getnnz()
# compute graph density (treating the graph as directed); shown as the cell output
metrics.density(n_nodes, n_edges, mode="directed")

## Sampling set extension

### Pre-select sampling set

In [None]:
# Fixed example preselection, kept for reference:
#preselection = [36, 34, 54, 56]
# Draw 20 distinct node ids at random as the preselected sampling set.
preselection = np.random.choice(n_nodes, size=20, replace=False).tolist()

### Extend sampling set

In [None]:
# Maximum number of nodes to sample.
sampling_budget = 200
# Value passed as `p_hops` to the sampling routines.
p_hops = 12
# The budget has to leave room for nodes beyond the preselection.
assert len(preselection) < sampling_budget

In [None]:
# wrap graph with adjacency list for faster neighborhood queries
# (`graph` is what the sampling routines and the Laplacian computation below operate on)
graph = Graph(A)

In [None]:
# Extend the preselected nodes to a sampling set within the budget and time the call.
start = time.perf_counter()
sampling_set_extended, thres_extended = bs_gda_extension(
    graph,
    preselection,
    sampling_budget,
    p_hops=p_hops,
    parallel=True,
)
elapsed = time.perf_counter() - start
print(f"This took {elapsed:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set_extended))

In [None]:
#sampling_set_extended = list(np.random.choice(range(n_nodes), 100, replace=False))

In [None]:
# map node_ids to client_ids
pred_sampling_set = client_ids[sampling_set_extended]
# which customers should be sampled in addition to the preselection?
# pandas does not accept a set as an indexer (TypeError since pandas 2.0), so
# build a duplicate-free list that keeps the order of sampling_set_extended.
preselected_nodes = set(preselection)
additional_nodes = list(
    dict.fromkeys(node for node in sampling_set_extended if node not in preselected_nodes)
)
client_ids_to_sample = client_ids[additional_nodes]

In [None]:
#print("The following clients should be sampled in addition:\n", client_ids_to_sample.to_numpy())

### Evaluate results

#### Draw solution

In [None]:
# compute graph layout
# get dict of node positions
fixed_zip_pos = layout.pos_from_coords(coords)
# scatter customers in a circle around their zip code
# (radius=0.1 is presumably in the same units as the coordinates - TODO confirm)
pos = layout.circular_layout_around_zip_codes(fixed_zip_pos, radius=0.1)

In [None]:
# plot preselected nodes and those that should be sampled in addition
color_extended = util_plt.sample_to_node_color_vec(graph.num_nodes, sampling_set_extended)
color_preselected = util_plt.sample_to_node_color_vec(graph.num_nodes, preselection)
# color scheme: c[not_sampled] = 0, c[sampled] = 1, c[preselected] = 2
# NOTE(review): assumes both helpers return numeric arrays so that `+` adds
# element-wise (plain lists would be concatenated instead) - TODO confirm.
c = color_extended + color_preselected
# NetworkX 3.0 removed from_scipy_sparse_matrix; use from_scipy_sparse_array
# where it exists and fall back to the old name on older releases.
to_nx_graph = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
util_plt.draw_graph(to_nx_graph(A), pos, node_size=20, node_color=c, hide_edges=True, cmap=plt.cm.copper)

#### Analyse reconstruction error

In [None]:
# use recommendation values as signal
# NOTE(review): nothing here aligns the order/length of the answers with the
# graph's node ids - verify before using this signal for reconstruction.
s = answers_df.answer_value.to_numpy().astype(int)

In [None]:
# create a smooth signal
# compute eigenvalues and eigenvectors of the (dense) graph Laplacian
# NOTE(review): eigh assumes a symmetric matrix - confirm that graph.laplacian()
# is symmetric for this (kNN-based) graph.
lamda, V = scipy.linalg.eigh(np.array(graph.laplacian().todense().astype('float')))

# construct a smooth signal as a random combination of the k eigenvectors that
# belong to the k smallest eigenvalues (eigh returns eigenvalues in ascending order)
k = 5
gft_coeffs = np.random.normal(0, 10, k)
# result is a column vector of shape (n_nodes, 1)
s = V[:, :k] @ gft_coeffs[:, np.newaxis]

In [None]:
# reconstruct the original signal from the preselected signal
L = graph.laplacian()
t = time.perf_counter()
s_recon = reconstruct_signal(L, preselection, s[preselection])
# measure against `t` (started just above), not the `start` of an earlier cell
print(f"This took {time.perf_counter()-t:.3f} s")
print("MSE (preselection only)", mse(s, s_recon))

In [None]:
# reconstruct the original signal from the extended sampled signal
t = time.perf_counter()
s_recon = reconstruct_signal(L, sampling_set_extended, s[sampling_set_extended])
# measure against `t` (started just above), not the `start` of an earlier cell
print(f"This took {time.perf_counter()-t:.3f} s")
print("MSE (extended):", mse(s, s_recon))

#### Compare to sampling set without preselection

In [None]:
# Select a sampling set with the same budget but without any preselected nodes; time the call.
start = time.perf_counter()
sampling_set, thres = bs_gda(
    graph,
    sampling_budget,
    p_hops=p_hops,
    parallel=True,
)
elapsed = time.perf_counter() - start
print(f"This took {elapsed:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set))

In [None]:
# Evaluate the sampling set chosen by bs_gda in the previous cell.
# The random 100-node baseline below used to overwrite that result, so the
# reported error did not belong to the bs_gda selection; uncomment it only to
# evaluate a random baseline instead.
#sampling_set = np.random.choice(range(n_nodes), 100, replace=False).tolist()
s_recon = reconstruct_signal(L, sampling_set, s[sampling_set])
mse_ = mse(s, s_recon)
print("MSE without preselection:", mse_)