# Demo of Graph Sampling Set Selection on a Customer Graph

## Setup

In [None]:
import sys
sys.path.append("/home/christopher_orlowicz1_vodafone_c/gershgorin")
%load_ext autoreload
%autoreload 2

In [None]:
# Root directory of the project checkout on this machine.
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
# Change the notebook's working directory to the project root
# (IPython expands $PROJECT_PATH to the variable's value).
%cd $PROJECT_PATH

In [None]:
# Install the packages listed in requirements.txt (-q keeps pip's output quiet).
# The path is relative, so this relies on the current working directory.
%pip install -q -r requirements.txt

In [None]:
import time

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph.similarity_graph import SimilarityGraph
import src.utils.plotting as util_plt

In [None]:
# Project identifier; not referenced by the cells shown in this notebook.
# NOTE(review): presumably the Google Cloud project used for BigQuery access — confirm.
project_id = "vf-de-ca-lab"

## Build customer graph

In [None]:
# Date range and row limit handed to the Preprocessor.
from_date = "2022-06-01"
to_date = "2022-06-30"
n = 1000  # passed as `limit`; later cells also use it as the number of nodes
car_df = Preprocessor(from_date, to_date, limit=n, verbose=False).car_df

# Inspect the columns that contain missing values. Jupyter only renders the
# last expression of a cell, so the per-column NaN counts are printed
# explicitly instead of being silently discarded.
nan_cols = car_df.columns[car_df.isna().any()]
print(car_df[nan_cols].isna().sum())

# remove these columns for now
car_df.drop(nan_cols, axis=1, inplace=True)

In [None]:
# Build the customer similarity graph from the cleaned feature table.
density = 0.05  # desired density
k = np.ceil(density * n).astype(int)  # value handed to k_neighbors
similarity_graph = SimilarityGraph(car_df)
A, G = similarity_graph.k_neighbors(k)

# Plotting is restricted to small graphs; note that `pos` (the spring layout)
# is therefore only defined when n < 2000.
if n < 2000:
    util_plt.plot_adj_matrix(A, markersize=1, figsize=(5, 5))
    pos = nx.spring_layout(G)
    util_plt.draw_graph(G, pos, title="Customer graph")

## Apply Graph Sampling Set Selection (GSSS)

In [None]:
# Run bs_gda on the customer graph and report the wall-clock time of the call.
sampling_budget = int(0.1*n)  # budget is 10 % of n
adjacency = nx.adjacency_matrix(G)
graph = Graph(adjacency)

# Only the bs_gda call itself is timed; graph construction is excluded.
start = time.perf_counter()
sampling_set, _ = bs_gda(graph, sampling_budget, parallel=True)  # second return value is unused
end = time.perf_counter()

print(f"This took {end-start:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set))

In [None]:
# Draw the graph again with the sampled nodes highlighted (skipped for very large graphs).
if n < 10000:
    # The graph-building cell only computes the layout for n < 2000, so create
    # it here when it does not exist yet instead of failing with a NameError.
    if "pos" not in globals():
        pos = nx.spring_layout(G)
    # One entry per node, used as both marker size and colour value:
    # 1 for ordinary nodes, 70 for nodes in the sampling set. The vector is
    # sized from the graph itself in case it holds fewer than n nodes.
    c = np.ones(G.number_of_nodes())
    c[sampling_set] = 70
    # draw_graph lives in the plotting utilities imported as util_plt; the
    # bare name `draw_graph` is never imported in this notebook.
    util_plt.draw_graph(G, pos, node_size=c, node_color=c)

### Apply sampling centrality

In [None]:
from src.gsp.metrics import sampling_centrality, z_score

# Compute the sampling centrality on the customer graph, then its z-score.
# NOTE(review): the meaning of the positional arguments 100 and 0.2 is not
# visible in this notebook — confirm against src.gsp.metrics.
adjacency = nx.adjacency_matrix(G)
sc = sampling_centrality(G, adjacency, 100, 0.2)
z = z_score(sc)

In [None]:
# Scatter plot of the sampling centrality values `sc` against the index 0..n-1.
# The trailing semicolon suppresses the textual repr of the returned artist.
plt.scatter(range(n), sc);

In [None]:
# Scatter plot of the z-scores `z` against the index 0..n-1.
# The trailing semicolon suppresses the textual repr of the returned artist.
plt.scatter(range(n), z);