# Experiment: How do the sampled customers compare to the computed sampling set?

## Setup

In [None]:
import os
import sys

# Root of the project checkout. Defaults to the original location; set the
# GERSHGORIN_PROJECT_PATH environment variable to run the notebook elsewhere.
PROJECT_PATH = os.environ.get(
    "GERSHGORIN_PROJECT_PATH",
    "/home/christopher_orlowicz1_vodafone_c/gershgorin",
)
# Make the project's `src` package importable. The guard keeps the cell
# idempotent: re-running it does not append duplicate entries to sys.path.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Change the kernel's working directory to the project root
# (IPython expands $PROJECT_PATH from the Python variable).
%cd $PROJECT_PATH

In [None]:
# Install the dependencies listed in requirements.txt into the kernel's environment (-q: quiet output).
%pip install -q -r requirements.txt
# NOTE(review): graphilp is installed unpinned and outside requirements.txt —
# consider pinning a version and moving it into requirements.txt.
%pip install graphilp

In [None]:
import time
from tqdm import tqdm

from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
from sklearn.neighbors import kneighbors_graph

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
import src.utils.plotting as util_plt

## Build customer graph

### Load and preprocess the data

The earliest possible date is **2021-12-09** (first occurrence of an `answer_value` in the NPS table).

In [None]:
# Time frame of the experiment as ISO-formatted date strings (start, end).
# NOTE(review): August has 31 days — confirm that ending on 2022-08-30 is intended.
from_date, to_date = "2022-08-01", "2022-08-30"

In [None]:
# Fetch the raw inputs for the chosen time frame through the project's BigQuery helpers.
# NOTE(review): presumably car_df holds the customer (CAR) features joined with NPS
# and answers_df the NPS answers — confirm against src.db.big_query.
car_df = bq.join_car_nps(from_date, to_date)
answers_df = bq.nps_query_timeframe(from_date, to_date)

In [None]:
def postpaid(from_date, to_date, month: int):
    """Query the NPS respondents that belong to the "MMC" market segment.

    Joins the NPS base table with the segment table on ``client_id`` and keeps
    rows for touchpoint "Customer Base (All)" and question "NPS" that have a
    non-null client id and answer value, a contact date between ``from_date``
    and ``to_date`` (SQL ``BETWEEN``, i.e. both bounds included) and a segment
    row whose ``monat`` equals ``month``.

    Args:
        from_date: Lower bound of the contact date, bound as a DATE parameter.
        to_date: Upper bound of the contact date, bound as a DATE parameter.
        month: Value matched against ``seg.monat``, bound as an INTEGER
            parameter (this notebook passes the year and month of
            ``from_date`` as YYYYMM).

    Returns:
        A DataFrame with the distinct ``client_id`` / ``market`` pairs.
    """
    bq_client = bigquery.Client()
    sql = """
        SELECT DISTINCT nps.client_id, seg.market
        FROM 
            vf-de-datahub.vfde_dh_lake_dsl_customer_rawprepared_s.nps_cs_base AS nps,
            vf-de-datahub.vfde_dh_lake_mob_customer_rawprepared_s.seg_tn_all AS seg
        WHERE
            nps.client_id = seg.client_id
            AND nps.touchpoint_new = "Customer Base (All)"
            AND nps.question_name = "NPS"
            AND seg.market = "MMC"
            AND nps.client_id IS NOT NULL
            AND nps.answer_value IS NOT NULL
            AND nps.contactdate BETWEEN @from_date AND @to_date
            AND seg.monat = @month
        """
    # (name, BigQuery type, value) for every @placeholder used in the query.
    param_specs = (
        ("from_date", "DATE", from_date),
        ("to_date", "DATE", to_date),
        ("month", "INTEGER", month),
    )
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter(name, bq_type, value)
            for name, bq_type, value in param_specs
        ]
    )
    query_job = bq_client.query(sql, job_config=job_config)
    return query_job.to_dataframe()

In [None]:
# Keep only the answers given by postpaid customers (market = 'MMC').
# The segment month is the year and month of from_date concatenated into one integer.
year_str, month_str = from_date.split("-")[:2]
month = int(year_str + month_str)
postpaid_df = postpaid(from_date, to_date, month)
is_postpaid = answers_df.client_id.isin(postpaid_df.client_id)
answers_df = answers_df[is_postpaid]

In [None]:
# Ground-truth signal: the answers of postpaid customers that also appear in CAR.
# First collect the postpaid client ids missing from car_df, then drop their answers.
missing_from_car = ~postpaid_df.client_id.isin(car_df.client_id)
not_in_car_ids = postpaid_df.loc[missing_from_car, "client_id"]
s = answers_df[~answers_df.client_id.isin(not_in_car_ids)].copy()

In [None]:
# from_date / to_date are deliberately NOT redefined here: the preprocessing
# must use the same time frame as the queries that produced car_df, and a
# second copy of the constants could silently drift from the first.
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
# NOTE(review): car_df is rebound to the preprocessed frame, so re-running this
# cell feeds already-preprocessed data back into Preprocessor — re-run the
# loading cell first when iterating.
car_df, client_ids = prep.car_df, prep.client_ids
# columns that still contain at least one NaN
nan_cols = car_df.columns[car_df.isna().sum() > 0]
# remove these columns for now (in place, so prep.car_df sees the same change)
car_df.drop(nan_cols, axis=1, inplace=True)

### Construct graph

In [None]:
n = len(car_df)  # number of customers = number of graph nodes
d = 0.4  # bound on graph density
# Neighbours per node: ceil(d * n).
k = np.ceil(d * n).astype(int)
t = time.perf_counter()
A = kneighbors_graph(
    car_df,
    n_neighbors=k,
    mode="connectivity",
    include_self=False,
    n_jobs=-1,
)
elapsed = time.perf_counter() - t
# A is kept asymmetric on purpose: symmetrizing with A.maximum(A.T) would lose
# the density guarantee.
print(f"This took {elapsed} s")

In [None]:
# Density of the directed graph, computed from the node count and the number of stored entries in A.
metrics.density(n, A.nnz, mode="directed")

## Compute sampling set

In [None]:
# Wrap the adjacency matrix in the project's Graph class for faster neighborhood queries.
graph = Graph(A)

In [None]:
# Budget: at most 10% of the nodes may be sampled.
sampling_budget = int(0.1 * n)
start = time.perf_counter()
# Only the first element of the returned pair (the sampling set) is used.
sampling_set, _ = bs_gda(graph, sampling_budget, parallel=True)
elapsed = time.perf_counter() - start
print(f"This took {elapsed:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set))

In [None]:
# Map the sampled node ids back to client ids.
# NOTE(review): assumes client_ids supports index-array lookup (e.g. a NumPy array or
# pandas Index) and that position i corresponds to graph node i — confirm in Preprocessor.
pred_sampling_set = client_ids[sampling_set]

## Sampling set size for growing sampling budget

In [None]:
# Sweep over increasing budgets and record how many nodes bs_gda selects for each.
sampling_budgets = [30, 40, 50, 60, 70, 80, 90, 100, 150, 200]
set_sizes = []

# Loop-local names keep the kNN neighbour count `k` and the main `sampling_set`
# computed earlier intact, so cells run after this sweep still see those values.
for budget in tqdm(sampling_budgets):
    budget_sampling_set, _ = bs_gda(graph, budget, parallel=True)
    set_sizes.append(len(budget_sampling_set))

In [None]:
# Selected set size as a function of the sampling budget.
fig, ax = plt.subplots()
ax.plot(sampling_budgets, set_sizes)
ax.set_title("Sampling set size for growing budget")
ax.set_xlabel("Sampling budget")
ax.set_ylabel("Num. sampled nodes");

Budgets 10 and 20 are not part of the sweep above: for these budgets the set size is 0, because no valid sampling set could be computed with fewer than 30 nodes.