# Application of Graph Sampling Set Selection to the CAR dataset

## Setup

In [None]:
# Make the project sources importable from this notebook.
import sys
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
# Load IPython's autoreload extension and switch it to mode 2
# (presumably so that edits to project modules are picked up without a kernel restart).
%load_ext autoreload
%autoreload 2

In [None]:
# Change the working directory to PROJECT_PATH so relative paths resolve from there.
%cd $PROJECT_PATH

In [None]:
# Install the packages listed in requirements.txt (relative to the current working directory).
%pip install -q -r requirements.txt

In [None]:
import time

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
from sklearn.neighbors import kneighbors_graph

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
import src.utils.plotting as util_plt

## Build customer graph

### Load the data

Earliest possible date is **2021-12-09** (occurrence of the first `answer_value`s in the NPS table).

In [None]:
# Date window and row limit handed to the Preprocessor; n_nodes is reused
# below as the number of graph nodes.
n_nodes = 10000
from_date, to_date = "2022-06-01", "2022-06-30"
prep = Preprocessor(from_date, to_date, limit=n_nodes, verbose=False)
car_df = prep.car_df
client_ids = prep.client_ids
# Columns that contain at least one missing value ...
nan_cols = car_df.columns[car_df.isna().any()]
# ... are removed (in place) for now.
car_df.drop(columns=nan_cols, inplace=True)

### Construct graph

In [None]:
# Desired graph density: an upper bound for the directed graph,
# a lower bound for the undirected (symmetrized) one.
d = 0.2
# Number of neighbors per node derived from the desired density.
k = np.ceil(n_nodes * d).astype(int)
t = time.perf_counter()
A = kneighbors_graph(
    car_df,
    n_neighbors=k,
    mode="connectivity",
    include_self=False,
    n_jobs=-1,
)
# NOTE: `A = A.maximum(A.T)` would symmetrize the matrix, but the density
# can then be significantly higher.
elapsed = time.perf_counter() - t
print(f"This took {elapsed:.3f} s")

In [None]:
# The edge count is taken from the number of stored entries of A.
#   directed graph:   count_nonzero(A) >= n_edges >= count_nonzero(A)/2
#   undirected graph: n_edges = count_nonzero(A)/2
n_edges = A.nnz
metrics.density(n_nodes, n_edges, mode="directed")

## Compute sampling set

In [None]:
# Select a sampling set on the customer graph with bs_gda and report the runtime.
# The budget is derived from n_nodes (the node count used to build the graph);
# the previous `n` is not defined anywhere in this notebook and raised a
# NameError on a fresh top-to-bottom run.
sampling_budget = int(0.1 * n_nodes)  # sample at most 10 % of all nodes
graph = Graph(A)
start = time.perf_counter()
# bs_gda returns a pair; only its first element (the sampling set) is used here.
sampling_set, _ = bs_gda(graph, sampling_budget, parallel=True)
print(f"This took {time.perf_counter()-start:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set))

In [None]:
# map node_id to client_id
# NOTE(review): assumes sampling_set holds positional node indices in a form that
# client_ids accepts as an indexer (e.g. a list or array) — TODO confirm.
pred_sampling_set = client_ids[sampling_set]