# Visualization of surveyed customers in a customer graph

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%load_ext autoreload
%autoreload 2

In [None]:
%cd $PROJECT_PATH

In [None]:
#!pip install -q -r requirements.txt
#!pip install faiss-cpu==1.7.1
#!pip install faiss-gpu==1.7.1
#!pip install mpire==2.6.0

In [None]:
import time

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import scipy
from sklearn.neighbors import kneighbors_graph
import torch

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.db.zip_code_mapper import ZipCodeMapper
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
from src.graph import sample_evaluation
from src.graph.nearest_neighbors import NearestNeighbors
import src.utils.plotting as util_plt
from src.utils import layout

## Build customer graph

### Load the data

The earliest possible date is **2021-12-09** (first occurrence of `answer_value`s in the NPS table).

In [None]:
# time frame of the data to load; both dates are passed unchanged to the
# BigQuery helpers and to the Preprocessor below
# NOTE(review): whether `to_date` is inclusive depends on those helpers — confirm
from_date = "2023-01-01"
to_date = "2023-01-30"

In [None]:
# draw a random sample of the CAR data for the chosen time frame
car_extra_df = bq.car_query_timeframe_sample(from_date, to_date, limit=1000)
# fetch all available feedback scores joined with the CAR data
car_gt_df = bq.join_car_nps(from_date, to_date)
# stack both DataFrames row-wise under a fresh 0..n-1 index
union = pd.concat([car_extra_df, car_gt_df], ignore_index=True)

In [None]:
# sanity check: display the shapes of both inputs and of their concatenation
car_extra_df.shape, car_gt_df.shape, union.shape

### Preprocessing

#### Removing unused features

In [None]:
# run the preprocessing for the selected time frame
# NOTE(review): `data=None` means the `union` frame built above is not handed to
# the Preprocessor — presumably it loads the data itself; confirm this is intended
prep = Preprocessor(from_date, to_date, data=None, verbose=False)
# pull out the customer table, the client ids and the zip codes used below
car_df, client_ids, adr_zips = prep.car_df, prep.client_ids, prep.adr_zips

#### Removing unknown zip codes

In [None]:
# load mapper for zip_code -> (longitude, latitude);
# the index of its `zip_code_map` is used below to detect unmappable zip codes
zip_mapper = ZipCodeMapper()

In [None]:
# load zip codes of customers as an integer-typed DataFrame
adr_zip_df = pd.DataFrame(adr_zips, dtype=int)
# boolean mask: True where the zip code is present in the mapper's index,
# i.e. unknown (unmappable) zip codes are marked False
known_zips = adr_zip_df.adr_zip.isin(zip_mapper.zip_code_map.index)
# debugging aid: uncomment to list the rows that are about to be dropped
#print(adr_zips.loc[~known_zips])
# apply the same mask to all three objects and renumber their rows from 0
# NOTE(review): assumes car_df and client_ids share the row index of adr_zips,
# otherwise the boolean mask would not line up — confirm
adr_zips = adr_zip_df.loc[known_zips].reset_index(drop=True)
car_df = car_df[known_zips].reset_index(drop=True)
client_ids = client_ids[known_zips].reset_index(drop=True)
# display the shape of the filtered customer table
car_df.shape

#### Mapping zip codes to (long, lat) pairs

In [None]:
# map the zip code of every remaining customer to coordinates
coords = zip_mapper.map_zip_codes_to_coords(adr_zips)
# remove the zip code column in place; the "long" and "lat" columns that are
# read for the graph layout further down are kept
coords.drop(columns="adr_zip", inplace=True)

### Graph construction

In [None]:
import faiss

def knn_multi_gpu(X, k):
    """Run an exhaustive k-nearest-neighbor search of X against itself with Faiss.

    The data is first moved to host memory, indexed with a flat L2 index that
    is distributed over all GPUs visible to Faiss, and then used as the query
    set as well.

    Args:
        X: Object offering ``.cpu()`` and a 2-D ``.shape`` (a torch tensor in
            this notebook); rows are data points, columns are features.
        k: Number of neighbors to retrieve per row.

    Returns:
        Tuple ``(distances, neighbor_indices)`` exactly as produced by the
        index's ``search`` call.

    Note:
        Because the queries are the indexed vectors themselves, each row's
        result will generally contain the row itself.
    """
    host_data = X.cpu()
    # exact (brute-force) L2 index with one dimension per feature column
    flat_index = faiss.IndexFlatL2(host_data.shape[1])
    # replicate/shard the CPU index across every available GPU
    multi_gpu_index = faiss.index_cpu_to_all_gpus(flat_index)
    # register all data points with the index
    multi_gpu_index.add(host_data)
    # query the index with the very same points
    distances, neighbor_indices = multi_gpu_index.search(host_data, k)
    return distances, neighbor_indices

In [None]:
# store data as float32 tensor on the first CUDA device
# NOTE(review): knn_multi_gpu() moves X back to the CPU right away, so the GPU
# copy only matters for the commented-out knn.knn() variant below
X = torch.tensor(np.ascontiguousarray(car_df.to_numpy()), device=torch.device('cuda', 0), dtype=torch.float32)
# compute k-nearest neighbor graph; the helper object is needed again
# afterwards to turn the neighbor indices into an adjacency matrix
knn = NearestNeighbors(device="gpu")
# start the wall-clock timer for the neighbor search
t = time.perf_counter()
# alternative search via the NearestNeighbors helper:
#_, k_neighbors = knn.knn(X, k=100)
# keep only the neighbor indices (100 per row); the distances are discarded
_, k_neighbors = knn_multi_gpu(X, k=100)
print(f"This took {time.perf_counter()-t:.3f} s")

In [None]:
# build adjacency matrix from neighborhood index
A = knn.to_adj_matrix(k_neighbors)
# one node per row of the adjacency matrix
n_nodes = A.shape[0]

In [None]:
# directed graph: count_nonzero(A) >= n_edges >= count_nonzero(A)/2
# undirected graph: count_nonzero(A)/2 = n_edges
# here every stored entry of A is counted as one directed edge
# NOTE(review): getnnz() suggests A is a SciPy sparse matrix; it counts stored
# entries, which would include explicit zeros and self-loops if present — confirm
n_edges = A.getnnz()
# display the density of the graph, treating it as directed
metrics.density(n_nodes, n_edges, mode="directed")

## Draw graph and highlight surveyed customers

In [None]:
# client ids of the customers for which a feedback score was loaded
surveyed_customers = car_gt_df["client_id"]
# node ids = row positions in `client_ids` whose id belongs to a surveyed customer
actual_sample = np.flatnonzero(client_ids.isin(surveyed_customers))

In [None]:
# use the geographical coordinates of the customers as fixed layout positions:
# one (longitude, latitude) tuple per row label of `coords`
fixed_zip_pos = {
    node: (row["long"], row["lat"])
    for node, row in coords.to_dict("index").items()
}
# scatter customers in a circle around their address
pos = layout.circular_layout_around_zip_codes(fixed_zip_pos, radius=0.1)

In [None]:
def scatter_graph(pos, node_size=10, node_color=None, figsize=(6, 6), title=None, cmap=plt.cm.viridis, filepath=None):
    """Scatter-plot the nodes of a graph and highlight the sampled ones.

    Only nodes are drawn (no edges). Nodes that are not sampled are drawn
    first in grey, sampled nodes are drawn on top in red so they stay visible.

    Args:
        pos: Mapping from node to an ``(x, y)`` pair. The iteration order of
            its values must match the order of ``node_color``.
        node_size: Marker size passed to ``plt.scatter`` as ``s``.
        node_color: Array-like with one entry per node; entries greater than
            zero mark sampled nodes. ``None`` (the default) means that no node
            is highlighted.
        figsize: Size of the newly created figure.
        title: Optional figure title.
        cmap: Accepted for backward compatibility only. Both groups are drawn
            in a fixed color, so a colormap has no effect.
        filepath: If given, the figure is saved to this path with 300 dpi.
    """
    # collect all positions as an (n_nodes, 2) array; reshape keeps the
    # two-column layout even when `pos` is empty
    xy = np.array(list(pos.values())).reshape(-1, 2)
    x, y = xy[:, 0], xy[:, 1]

    if node_color is None:
        # the original default crashed on `None > 0`; treat it as "nothing sampled"
        sampled = np.zeros(len(x), dtype=bool)
    else:
        # asarray also accepts plain lists/tuples, not only numpy arrays
        sampled = np.asarray(node_color) > 0

    plt.figure(figsize=figsize)
    # `cmap` is deliberately not forwarded: with a fixed color string Matplotlib
    # ignores it and only emits a warning
    # scatter not sampled nodes first
    plt.scatter(x[~sampled], y[~sampled], s=node_size, c='dimgrey')
    # then scatter sampled nodes so they end up on top
    plt.scatter(x[sampled], y[sampled], s=node_size, c='#E60000')
    plt.axis("off")
    if title is not None:
        plt.title(title)
    if filepath is not None:
        plt.savefig(filepath, dpi=300)

In [None]:
# plot all customers, highlight the surveyed ones and save the figure to disk
scatter_graph(
    pos,
    node_size=5,
    node_color=util_plt.sample_to_node_color_vec(n_nodes, actual_sample),
    filepath="graph_true_samples_8000000.png",
)