# Demo of the Graph Sampling Algorithm

## Setup

In [None]:
import sys
# Absolute path of the project checkout. This is machine-specific: adjust it when running elsewhere.
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
# Put the project root on the import path so that the `src.*` imports below can resolve.
sys.path.append(PROJECT_PATH)
# Switch the notebook's working directory to the project root.
%cd $PROJECT_PATH

# Enable IPython's autoreload extension (mode 2) so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import numpy as np
import pandas as pd
import scipy

import src.db.big_query as bq  # connection to the database
from src.db.preprocessing import Preprocessor  # data cleaning
from src.db import nps  # loading of NPS recommendation scores
from src.graph.graph import Graph  # wrapper class for the graph (stores an adjacency list for fast neighborhood queries)
from src.graph.nearest_neighbors import NearestNeighborGraph  # class for computing the adjacency matrix of a nearest neighbor graph
from src.gershgorin.bs_gda import bs_gda  # graph sampling method
from src.gsp import laplace_utils  # utility functions for working with the Laplacian
import src.utils.plotting as plt_util  # utility module for plotting

## Load the Data

In [None]:
# timeframe covered by the query (date strings in YYYY-MM-DD format)
from_date = "2023-03-01"
to_date = "2023-03-30"

# joint CAR/NPS data for the timeframe (feedback score is known for all customers)
car_df = bq.join_car_nps(from_date, to_date)

# clean the raw data (remove irrelevant features, impute missing values, normalize, etc.)
prep = Preprocessor(
    from_date,
    to_date,
    data=car_df,
    verbose=False,
)

# keep the preprocessed results as notebook-level variables for the following cells
car_df = prep.car_df
client_ids = prep.client_ids
adr_zips = prep.adr_zips

# display the dimensions of the preprocessed feature matrix
car_df.shape

- `car_df` contains the feature matrix. It will be used for all following computations.
- `client_ids` contains the customer ids of the graph nodes. This is used to map between node ids and client ids.
- `adr_zips` contains the zip codes of all customers in the graph.

In [None]:
# generate the graph signal from the NPS data of the customers in `client_ids` for the queried timeframe
# (presumably one feedback score per graph node, in node order — TODO confirm against nps.nps_signal)
s = nps.nps_signal(from_date, to_date, client_ids)

## Construct the Graph

We construct a similarity graph from the feature vectors. The best choice in experiments was the k-nearest neighbor (k-nn) graph with weights computed from a Gaussian kernel.

Steps in graph construction:

1. Compute the k-nn graph based on the feature matrix.
2. Compute Gaussian weights by passing the non-zero distances from step 1 through the Gaussian kernel
$$
    \exp\left(\frac{-dist^2}{\sigma^2}\right),
$$
    where $\sigma$ is chosen as the 25th percentile of the non-zero distances.
3. Since every node is the most similar to itself, we remove self-loops from the weight matrix.
4. One requirement for our graph sampling method is that the graph is undirected, i.e. the weight matrix is symmetric. Thus, we need to symmetrize the matrix.

Be **careful** with the choice of the number of neighbors. Too large a neighborhood leads to a dense graph on which the calculations are no longer feasible. Also, the signal variation increases with more edges.
A good value for $k$ lies in the range [5, 50].

In [None]:
# configure the nearest neighbor graph construction (30 neighbors per node)
knn = NearestNeighborGraph(n_neighbors=30)
# build the similarity graph from the feature matrix
A = knn.build(car_df)
# wrap the result in a Graph object for faster neighborhood queries
graph = Graph(A)

## Apply Graph Sampling

We select a subset of the nodes such that each sampled node is representative of the nodes in its p-hop neighborhood.

In [None]:
# select a sampling set with BS-GDA; the call returns the sample and a second value `vf`,
# described here as a validity flag
# NOTE(review): the original comment says a warning is "returned" for an invalid sample —
# presumably it is emitted rather than returned; confirm against bs_gda
# parameters
k = 500  # sample_size (maximum number of nodes to sample)
mu = 0.01  # regularization strength of the smoothness prior
eps = 1e-5  # precision (controls number of iterations of the binary search)
p_hops = 6  # size of the p-hop neighborhood to consider per node
parallel = True  # whether to use parallelization for execution speedup

sample, vf = bs_gda(graph, k, mu, eps, p_hops, parallel=parallel)

## Reconstruct the Original Signal

We reconstruct the signal $\hat x$ on the entire graph by solving a linear system that minimizes a regularized reconstruction error:
$$
    \hat x = \arg\min_x \|Hx - y\|_2^2 + \mu x^T L x,
$$
where $H$ is the sampling matrix (can be computed from the sampling set), $y$ is our observed signal, $\mu$ is the regularization strength and $L$ is the graph Laplacian.

In [None]:
# Reconstruct the signal on the whole graph from the values observed on the sampled nodes.
# NOTE(review): `reconstruction` is not imported in the import cell of this notebook —
# presumably `from src.gsp import reconstruction`; confirm the module path and add the import.
# NOTE(review): `mu` is not passed here, so the regularization strength presumably falls back
# to a default inside reconstruct_signal — verify.
s_rec = reconstruction.reconstruct_signal(graph.laplacian(), sample, s[sample])

## Evaluate the Reconstruction Quality

### Reconstruction Error

In [None]:
# Error metrics between the original signal `s` and its reconstruction `s_rec`;
# the cell displays them as a tuple in the order (mse, rmse, norm_error_norm).
# NOTE(review): `reconstruction` is not imported in the import cell of this notebook —
# confirm the module path and add the import.
reconstruction.mse(s, s_rec), reconstruction.rmse(s, s_rec), reconstruction.norm_error_norm(s, s_rec)

### Smoothness

In [None]:
# Compare the smoothness of the reconstructed signal with that of the original signal.
# `L` was not defined anywhere in this notebook, so a fresh run failed with a NameError;
# use the graph Laplacian obtained from the graph object, as in the reconstruction step.
L = graph.laplacian()
# displayed as a tuple: (value for the reconstruction, value for the original signal)
laplace_utils.norm_lap_quad_form(L, s_rec), laplace_utils.norm_lap_quad_form(L, s)

### Visualise the Signals

In [None]:
# draw the original signal first and the reconstruction on top, both slightly transparent
for signal, label in ((s, "Original"), (s_rec, "Reconstruction")):
    plt.plot(signal, alpha=0.8, label=label)
# the trailing semicolon suppresses the textual output of the legend call
plt.legend();