# Comparison of One Signal on Different Graphs

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%load_ext autoreload
%autoreload 2

In [None]:
%cd $PROJECT_PATH

In [None]:
#!pip install -q -r requirements.txt
#!pip install --force-reinstall faiss-gpu

In [None]:
import math
import time

from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
import sklearn
from scipy import sparse
from sklearn.neighbors import kneighbors_graph

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import gauss_similarity_graph as gauss, graph_builder, graph_tools, metrics
from src.gsp import fourier, laplace_utils, signal, reconstruction
import src.utils.plotting as plt_util
from src.utils import data_handler, layout
from src.eval.eval_reconstruction import run_sampling_budget_experiment, run_reconstruction_analysis_budget_avg

## k-NN

In [None]:
# Load the stored adjacency matrix of the k-nn graph and wrap it in a Graph object.
# IN_DIR and size are reused by later cells to build further file paths.
IN_DIR = "out/customer_analytics/2023-01"
size = "1000"  # name of the sub-folder; presumably the number of nodes -- TODO confirm
adj_matrix = sparse.load_npz(f"{IN_DIR}/graph/knn/{size}/adj_matrix.npz")
graph = Graph(adj_matrix)

In [None]:
# Load the signal and compute its graph Fourier transform (GFT) on the current graph.
#s = np.load(f"{IN_DIR}/signal/knn/{size}/gs1.npy")
s = np.load(f"{IN_DIR}/signal/{size}/nps.npy")
#s = signal.gs1(L, size=1) + signal.gauss_noise(size=1)
L = graph.laplacian()
# Dense eigendecomposition of the Laplacian: np.linalg.eigh returns the eigenvalues in
# ascending order and the matching eigenvectors as the columns of V.
lamda, V = np.linalg.eigh(np.array(L.todense().astype('float')))
s_hat = fourier.gft(V, s)  # GFT coefficients of s with respect to the eigenvectors in V

In [None]:
# Plot the spectral content of the signal; both plots receive the eigenvalues and the
# magnitudes of the GFT coefficients.
plt_util.plot_cdf_gft_energy(lamda, abs(s_hat))
plt_util.plot_spectral_domain(lamda, abs(s_hat))
#plt.savefig("out/spectral_properties_knn_1000_nps.png", dpi=400)

In [None]:
# smoothness of the signal on the graph; the value is shown as the cell output.
# Presumably a normalized Laplacian quadratic form, judging by the helper's name -- TODO confirm.
laplace_utils.norm_lap_quad_form(L, s)

In [None]:
# Draw the nodes at positions derived from their zip codes, colored by their signal value.
G = nx.Graph(adj_matrix)
adr_zips = data_handler.load_csv_to_list(f"{IN_DIR}/data/{size}/zip_codes.csv", dtype=str)
pos = layout.geo_layout(adr_zips)  # presumably maps zip codes to geographic coordinates -- TODO confirm
plt.figure(figsize=(4,4))
# edgelist=[] draws no edges at all, so only the colored nodes are visible
nx.draw(G, pos, node_size=30, node_color=s.squeeze(), edgelist=[], cmap="coolwarm")

In [None]:

# Run the sampling/reconstruction experiment for several sampling budgets and plot the errors.
sampling_budgets = [100, 200, 300, 400]
# NOTE(review): the positional arguments are presumably mu=0.01, eps=1e-5, p_hops=6,
# parallel=True -- confirm against the signature of run_sampling_budget_experiment.
# s_rec is indexed with [-1] in later cells, presumably the reconstruction for the last budget.
errors, _, smoothness, _, s_rec = run_sampling_budget_experiment(graph, s, sampling_budgets, 0.01, 1e-5, 6, True)
plt_util.plot_reconstruction_error(np.array(sampling_budgets), errors,
                                   "sampling budget", "Sampling budget", 
                                   graph.num_nodes, filepath=None)

In [None]:
# Binarize the signal: the positive class consists of the nodes whose value is below 3.
# Ground truth and reconstruction use the same threshold, so that non-integer values
# between 2 and 3 are not assigned to different classes on the two sides.
y_true = s < 3
y_pred = s_rec[-1] < 3
# Low reconstructed values indicate the positive class, so the negated reconstruction is
# used as the score. For strictly positive values it ranks the nodes exactly like 1/x,
# but it is also defined for 0 and keeps the ordering for non-positive values.
y_score = -s_rec[-1]

In [None]:
# Compare the binarized original signal with the binarized reconstruction, node by node.
plt.figure(figsize=(8,4))
plt.plot(y_true, label="Original")
plt.plot(y_pred, label="Reconstruction");
plt.legend()
#plt.savefig("out/reconstructed_bin_signal.pdf")

In [None]:
# Print the precision first and the recall second for the thresholded reconstruction.
for score_fn in (sklearn.metrics.precision_score, sklearn.metrics.recall_score):
    print(score_fn(y_true, y_pred))

In [None]:
# Precision-recall curve of the binarized signal, based on the continuous score y_score.
prec, recall, thresholds = sklearn.metrics.precision_recall_curve(y_true, y_score)
plt_util.plot_precision_recall_curve(prec, recall)
#plt.savefig("out/precision_recall_curve_dd_1000.pdf")

## Gauss

In [None]:
# Load the stored adjacency matrix of the Gauss similarity graph and wrap it in a Graph object.
# size is an int here (a string in the k-nn section); inside the f-strings both give the same path.
size = 1000
adj_matrix = sparse.load_npz(f"{IN_DIR}/graph/gauss/{size}/adj_matrix.npz")
graph = Graph(adj_matrix)

In [None]:
# Load the same signal file as in the k-nn section and compute its GFT on the Gauss graph.
#s = np.load(f"{IN_DIR}/signal/gauss/{size}/nps.npy")
s = np.load(f"{IN_DIR}/signal/{size}/nps.npy")
L = graph.laplacian()
# Dense eigendecomposition; np.linalg.eigh returns ascending eigenvalues and the
# matching eigenvectors as the columns of V.
lamda, V = np.linalg.eigh(np.array(L.todense().astype('float')))
s_hat = fourier.gft(V, s)

In [None]:
# Plot the spectral content of the signal on the Gauss graph.
# NOTE(review): the meaning of the third positional argument (True) is not visible here --
# check the signature of plot_cdf_gft_energy.
plt_util.plot_cdf_gft_energy(lamda, abs(s_hat), True)
plt_util.plot_spectral_domain(lamda, abs(s_hat))

In [None]:
# Draw the nodes at positions derived from their zip codes, colored by their signal value.
G = nx.Graph(adj_matrix)
adr_zips = data_handler.load_csv_to_list(f"{IN_DIR}/data/{size}/zip_codes.csv", dtype=str)
pos = layout.geo_layout(adr_zips)  # presumably maps zip codes to geographic coordinates -- TODO confirm
plt.figure(figsize=(4,4))
# edgelist=[] draws no edges at all, so only the colored nodes are visible
nx.draw(G, pos, node_size=30, node_color=s.squeeze(), edgelist=[], cmap="coolwarm")

In [None]:
# smoothness of the signal on the Gauss graph; the value is shown as the cell output.
# Presumably a normalized Laplacian quadratic form, judging by the helper's name -- TODO confirm.
laplace_utils.norm_lap_quad_form(L, s)

In [None]:
# Run the sampling/reconstruction experiment on the Gauss graph and plot the errors.
sampling_budgets = [100, 200, 300, 400]
# NOTE(review): the positional arguments are presumably mu=0.01, eps=1e-5, p_hops=12,
# parallel=False -- confirm against the signature of run_sampling_budget_experiment.
errors, _, smoothness, _, s_rec = run_sampling_budget_experiment(graph, s, sampling_budgets, 0.01, 1e-5, 12, False)
plt_util.plot_reconstruction_error(np.array(sampling_budgets), errors,
                                   "sampling budget", "Sampling budget", 
                                   graph.num_nodes, filepath=None)

In [None]:
# Binarize the original signal and the last reconstruction with the same threshold (< 3).
y_true = s < 3
y_pred = s_rec[-1] < 3
# fraction of nodes on which both binarizations agree; shown as the cell output
np.mean(y_true == y_pred)

In [None]:
# Same geographic node plot as for the original signal, but colored by the reconstruction s_rec[-1].
G = nx.Graph(adj_matrix)
adr_zips = data_handler.load_csv_to_list(f"{IN_DIR}/data/{size}/zip_codes.csv", dtype=str)
pos = layout.geo_layout(adr_zips)  # presumably maps zip codes to geographic coordinates -- TODO confirm
plt.figure(figsize=(4,4))
# edgelist=[] draws no edges at all, so only the colored nodes are visible
nx.draw(G, pos, node_size=30, node_color=s_rec[-1].squeeze(), edgelist=[], cmap="coolwarm")

In [None]:
# Print the precision first and the recall second for the thresholded reconstruction.
for score_fn in (sklearn.metrics.precision_score, sklearn.metrics.recall_score):
    print(score_fn(y_true, y_pred))

In [None]:
# The positive class is "value below 3" (y_true = s < 3), i.e. LOW reconstructed values
# indicate the positive class. precision_recall_curve expects a score that grows with the
# confidence in the positive class, so the reconstruction is negated before it is passed
# in; passing it unchanged would rank the nodes in the wrong direction.
prec, recall, _ = sklearn.metrics.precision_recall_curve(y_true, -s_rec[-1])
plt.plot(recall, prec)
plt.title("2-class precision-recall curve")
plt.xlabel("Recall")
plt.ylabel("Precision");

In [None]:
# Overlay the original signal and the last reconstruction, plotted over the node index.
plt.figure(figsize=(6,4))
plt.plot(s)
plt.plot(s_rec[-1]);
#plt.scatter(range(len(s)), s<3, c='r', marker="x")

## k-NN with Gauss kernel

In [None]:
from src.graph.nearest_neighbors import NearestNeighborGraph

# Build a nearest-neighbor graph with 50 neighbors directly from the stored feature table
# instead of loading a pre-computed adjacency matrix.
size = "500"
data = pd.read_parquet(f"{IN_DIR}/data/{size}/car.parquet")
adj_matrix = NearestNeighborGraph(n_neighbors=50).build(data)
graph = Graph(adj_matrix)

In [None]:
# Number of strictly positive entries of the adjacency matrix (shown as the cell output):
# the boolean mask selects the entries > 0, .A converts the selection to an ndarray.
adj_matrix[adj_matrix>0].A.size

In [None]:
# Generate a synthetic signal on the new graph and compute the Laplacian eigendecomposition.
#s = np.load(f"{IN_DIR}/signal/{size}/nps.npy")
L = graph.laplacian()
# Mean over axis 1 of 50 noisy signals -- presumably one signal per column, so that the
# result has one value per node; TODO confirm the shapes returned by gs1 and gauss_noise.
s = np.mean(signal.gs1(L, 50) + signal.gauss_noise(size=50), axis=1)
#s = signal.gs1(L, 1) + signal.gauss_noise(size=1)
# np.linalg.eigh returns ascending eigenvalues and the matching eigenvectors as the columns of V
lamda, V = np.linalg.eigh(np.array(L.todense().astype('float')))

In [None]:
# GFT coefficients of the synthetic signal with respect to the eigenvectors in V
s_hat = fourier.gft(V, s)

In [None]:
# NOTE(review): the signed coefficients are passed here, whereas the other calls to
# plot_spectral_domain in this file pass abs(s_hat) -- confirm that this is intended.
plt_util.plot_spectral_domain(lamda, s_hat)

In [None]:
# Magnitudes of the GFT coefficients, plotted over the index of the (ascending) eigenvalues.
plt.plot(abs(s_hat))
plt.xlabel(r"Eigenvalue index")
plt.ylabel("Frequency");

In [None]:
# smoothness of the synthetic signal on the graph; the value is shown as the cell output.
# Presumably a normalized Laplacian quadratic form, judging by the helper's name -- TODO confirm.
laplace_utils.norm_lap_quad_form(L, s)

In [None]:
# Three evenly spaced sampling budgets: 50, 175 and 300 nodes.
sampling_budgets = np.linspace(50, 300, 3).astype(int)
# NOTE(review): the positional arguments are presumably mu=0.01, eps=1e-5, p_hops=6,
# parallel=False -- confirm against the signature of run_sampling_budget_experiment.
errors, _, smoothness, _, s_rec = run_sampling_budget_experiment(graph, s, sampling_budgets, 0.01, 1e-5, 6, False)
plt_util.plot_reconstruction_error(np.array(sampling_budgets), errors,
                                   "sampling budget", "Sampling budget", 
                                   graph.num_nodes, filepath=None)

In [None]:
# Binarize the original signal and the last reconstruction with the same threshold (< 3).
y_true = s < 3
y_pred = s_rec[-1] < 3
# The mean of the element-wise agreement is the fraction of nodes classified identically,
# i.e. the accuracy -- not the precision, which the label claimed before.
print("Accuracy:", np.mean(y_true == y_pred))

In [None]:
# Overlay the original signal and the last reconstruction, plotted over the node index.
plt.plot(s)
plt.plot(s_rec[-1]);

## Remainder

In [None]:
# NOTE(review): a bare `return` at the top level of a cell is not valid Python
# ("'return' outside function"); the cell fails with a SyntaxError and none of the lines
# below are executed. Presumably a deliberate stop marker for "run all" -- confirm.
return
# Re-weight the stored k-nn adjacency matrix with a Gauss kernel.
adj_matrix = sparse.load_npz(f"{IN_DIR}/graph/knn/{size}/adj_matrix.npz")
adj_matrix = gauss.min_max_scale(adj_matrix)
dists = adj_matrix[adj_matrix.nonzero()].A1  # flat array of the non-zero entries
sigma = np.percentile(dists, 25)  # 25th percentile of the non-zero entries, passed to the kernel
# Presumably meant as an element-wise mask that keeps only the positive entries.
# NOTE(review): if gauss_kernel returns an np.matrix, `*` is a matrix product -- confirm.
adj_matrix = gauss.gauss_kernel(adj_matrix.todense(), sigma) * (adj_matrix.todense() > 0)
adj_matrix = sparse.csr_matrix(adj_matrix)
# NOTE(review): symmetrize_adj is neither defined nor imported in the visible part of this file.
adj_matrix = symmetrize_adj(adj_matrix)
graph = Graph(adj_matrix)

## Try out low-pass filtering

In [None]:
from src.gsp import filter_functions

# Low-pass filter the signal on the current graph.
# lamda is sorted ascending (np.linalg.eigh), so the cut-off is the 200th smallest eigenvalue.
cut_off = lamda[199]
s_filtered = filter_functions.apply_filter(L.todense(), s, lambda eigs: filter_functions.low_pass_filter(eigs, cut_off))
print("MSE between original and filtered:", reconstruction.mse(s, s_filtered.squeeze()))

In [None]:
# GFT of the filtered signal; this overwrites s_hat from the unfiltered signal.
s_hat = fourier.gft(V, s_filtered)
plt.plot(abs(s_hat));

In [None]:
# Plot the spectral content of the filtered signal.
# NOTE(review): the meaning of the third positional argument (False) is not visible here --
# check the signature of plot_cdf_gft_energy.
plt_util.plot_cdf_gft_energy(lamda, abs(s_hat), False)
plt_util.plot_spectral_domain(lamda, abs(s_hat))

In [None]:
# smoothness of the filtered signal; the value is shown as the cell output.
# Presumably a normalized Laplacian quadratic form, judging by the helper's name -- TODO confirm.
laplace_utils.norm_lap_quad_form(L, s_filtered)

In [None]:
# Draw the nodes at positions derived from their zip codes, colored by the filtered signal.
G = nx.Graph(adj_matrix)
adr_zips = data_handler.load_csv_to_list(f"{IN_DIR}/data/{size}/zip_codes.csv", dtype=str)
pos = layout.geo_layout(adr_zips)  # presumably maps zip codes to geographic coordinates -- TODO confirm
plt.figure(figsize=(4,4))
# edgelist=[] draws no edges at all, so only the colored nodes are visible
nx.draw(G, pos, node_size=30, node_color=s_filtered.squeeze(), edgelist=[], cmap="coolwarm")

In [None]:
# Run the sampling/reconstruction experiment on the filtered signal and plot the errors.
sampling_budgets = [100, 200, 300, 400]
# NOTE(review): the positional arguments are presumably mu=0.01, eps=1e-5, p_hops=6,
# parallel=False -- confirm against the signature of run_sampling_budget_experiment.
errors, _, smoothness, _, s_rec = run_sampling_budget_experiment(graph, s_filtered, sampling_budgets, 0.01, 1e-5, 6, False)
plt_util.plot_reconstruction_error(np.array(sampling_budgets), errors,
                                   "sampling budget", "Sampling budget", 
                                   graph.num_nodes, filepath=None)

In [None]:
# Binarize the signal and the last reconstruction with the same threshold (< 3).
# NOTE(review): the ground truth is taken from the unfiltered signal s, although the
# reconstruction was computed from s_filtered -- confirm that this is intended.
y_true = s < 3
y_pred = s_rec[-1] < 3
# The mean of the element-wise agreement is the fraction of nodes classified identically,
# i.e. the accuracy -- not the precision, which the label claimed before.
print("Accuracy:", np.mean(y_true == y_pred))

In [None]:
def run_reconstruction_analysis_budget(graph, signal_func, sampling_budgets: list, n_signals: int = 50, mu: float = 0.01,
                                eps: float = 1e-5, p_hops: int = 12, parallel: bool = False, runs: int = 5, seed: int = 0):
    """Average the reconstruction error over random signals for several sampling budgets.

    In every run, random signals and Gaussian noise are drawn. For every budget a
    sampling set is selected with ``bs_gda`` and each noisy signal is reconstructed
    from its values on that set.

    Args:
        graph: Graph object providing ``num_nodes`` and ``laplacian()``.
        signal_func: Callable invoked as ``signal_func(L, size=n_signals)``; the
            individual signals are read from the columns of its result.
        sampling_budgets: Budgets to evaluate. A value below 1 is interpreted as a
            fraction of the number of nodes, any other value as a node count.
        n_signals: Number of random signals and noise draws requested per run.
        mu: Passed to ``bs_gda`` and to ``reconstruction.reconstruct_signal``.
        eps: Passed to ``bs_gda``.
        p_hops: Passed to ``bs_gda``.
        parallel: Passed to ``bs_gda``.
        runs: Number of repetitions of the whole experiment.
        seed: Seed for NumPy's global random number generator.

    Returns:
        A pair ``(errors, thresholds)`` of arrays with one entry per sampling
        budget: the MSE and the ``bs_gda`` threshold, each averaged over the runs.
    """
    n_nodes = graph.num_nodes
    errors = np.zeros((runs, len(sampling_budgets)))
    thresholds = np.zeros_like(errors)
    # `seed` used to be accepted but ignored. Seeding once, before the first run, makes the
    # experiment repeatable while the individual runs still differ from each other.
    # NOTE(review): this only has an effect if signal_func and signal.gauss_noise draw from
    # NumPy's global generator -- confirm. It also resets that generator for the caller.
    np.random.seed(seed)
    print("Starting reconstruction analysis.")
    for run in range(runs):
        print("Run", run + 1)
        L = graph.laplacian()
        # sample random signals
        rand_signals = signal_func(L, size=n_signals)
        # generate random Gaussian noise
        gauss_noises = signal.gauss_noise(size=n_signals)
        for j, k in enumerate(sampling_budgets):
            # if budget is a percentage, multiply with number of nodes
            sampling_budget = math.floor(n_nodes * k) if k < 1 else k
            print(f"|\tK: {sampling_budget}")
            # compute the sampling set
            sampling_set, thres = bs_gda(graph, sampling_budget, mu, eps, p_hops, parallel)
            print(f"|\tsample: {len(sampling_set)} nodes, threshold: {thres}\n|")
            mses = []
            # NOTE(review): signals are taken from the columns of rand_signals, but the noise
            # is iterated along the first axis of gauss_noises, and every signal is combined
            # with every noise item -- confirm that the shapes match the intent.
            for s in rand_signals.T:
                for noise in gauss_noises:
                    # add Gaussian noise to the signal
                    s_noisy = s + noise
                    # reconstruct the signal from the noisy samples on the sampling set
                    s_recon = reconstruction.reconstruct_signal(L, sampling_set, s_noisy[sampling_set], mu)
                    # NOTE(review): the error is measured against the noisy signal, not
                    # against the clean signal s -- confirm that this is intended.
                    mses.append(reconstruction.mse(s_noisy, s_recon))
            # take the average MSE over the signals
            errors[run, j] = np.mean(mses)
            thresholds[run, j] = thres
    # take the average MSE over the runs
    return errors.mean(axis=0), thresholds.mean(axis=0)

In [None]:
# Evaluate the averaged reconstruction error for a few small sampling budgets and plot it.
#sampling_budgets = [50, 100, 200, 300, 400, 450]
sampling_budgets = [50, 100, 150]
errors, thres = run_reconstruction_analysis_budget(graph, signal.gs1, sampling_budgets, n_signals=20, mu=0.01, 
                                                   eps=1e-5, p_hops=6, parallel=False, runs=1, seed=1234)
# The keyword is `filepath`, as in every other call to plot_reconstruction_error in this
# file; the previous `out=None` did not match those calls.
plt_util.plot_reconstruction_error(np.array(sampling_budgets), errors, 
                                   "sampling budget", "Sampling budget", 
                                   graph.num_nodes, filepath=None)