# Experiment: Sampling Set Extension

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%cd $PROJECT_PATH

%load_ext autoreload
%autoreload 2

In [None]:
#%pip install -q -r requirements.txt
#%pip install graphilp

In [None]:
import time

from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import networkx as nx
import numpy as np
import pandas as pd
import pandas_gbq
import scipy
from sklearn.neighbors import kneighbors_graph

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph import metrics
import src.utils.plotting as util_plt

## Load customer data (CAR)

In [None]:
# bounds of the timeframe to query (ISO dates, yyyy-mm-dd)
from_date, to_date = "2022-08-01", "2022-08-30"

In [None]:
# read the CAR (customer) data for the timeframe; judging by its name, `join_car_nps`
# presumably joins it with the NPS respondents -- TODO confirm in `src.db.big_query`
car_df = bq.join_car_nps(from_date, to_date)
# read the recommendation values (NPS answers) for the same timeframe;
# these are used as the signal
answers_df = bq.nps_query_timeframe(from_date, to_date)

In [None]:
# query postpaid customers only
def postpaid(from_date, to_date, month: int):
    """Fetch the NPS respondents that belong to the "MMC" market segment.

    Runs a parameterized BigQuery query that matches the NPS base table with
    the segment table on ``client_id`` and keeps the "Customer Base (All)" /
    "NPS" rows that have a non-null client id and answer and whose contact
    date lies between ``from_date`` and ``to_date``.

    Args:
        from_date: Lower bound for ``nps.contactdate``; bound as a DATE parameter.
        to_date: Upper bound for ``nps.contactdate``; bound as a DATE parameter.
        month: Value compared with ``seg.monat``; bound as an INTEGER parameter.

    Returns:
        The query result as a DataFrame holding the distinct
        ``client_id`` / ``market`` pairs.
    """
    bq_client = bigquery.Client()
    sql = """
        SELECT DISTINCT nps.client_id, seg.market
        FROM 
            vf-de-datahub.vfde_dh_lake_dsl_customer_rawprepared_s.nps_cs_base AS nps,
            vf-de-datahub.vfde_dh_lake_mob_customer_rawprepared_s.seg_tn_all AS seg
        WHERE
            nps.client_id = seg.client_id
            AND nps.touchpoint_new = "Customer Base (All)"
            AND nps.question_name = "NPS"
            AND seg.market = "MMC"
            AND nps.client_id IS NOT NULL
            AND nps.answer_value IS NOT NULL
            AND nps.contactdate BETWEEN @from_date AND @to_date
            AND seg.monat = @month
        """
    # values are passed as query parameters instead of being formatted into the SQL text
    params = [
        bigquery.ScalarQueryParameter("from_date", "DATE", from_date),
        bigquery.ScalarQueryParameter("to_date", "DATE", to_date),
        bigquery.ScalarQueryParameter("month", "INTEGER", month),
    ]
    job_config = bigquery.QueryJobConfig(query_parameters=params)
    query_job = bq_client.query(sql, job_config=job_config)
    return query_job.to_dataframe()

In [None]:
# keep only the answers given by postpaid customers (market = 'MMC')
year_and_month = from_date.split("-")[:2]
month = int("".join(year_and_month))  # yyyymm, e.g. "2022-08-01" -> 202208
postpaid_df = postpaid(from_date, to_date, month)
is_postpaid = answers_df["client_id"].isin(postpaid_df["client_id"])
answers_df = answers_df.loc[is_postpaid]

In [None]:
# build the ground-truth signal: drop every answer whose customer has no row in CAR
missing_from_car = ~postpaid_df["client_id"].isin(car_df["client_id"])
not_in_car_ids = postpaid_df.loc[missing_from_car, "client_id"]
answers_df = answers_df.loc[~answers_df["client_id"].isin(not_in_car_ids)]

In [None]:
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
car_df = prep.car_df
client_ids = prep.client_ids
# columns that still contain at least one missing value
nan_cols = car_df.columns[car_df.isna().any()]
# remove these columns for now (in place, so the frame held by `prep` changes as well)
car_df.drop(columns=nan_cols, inplace=True)

## Construct graph

In [None]:
n_nodes = len(car_df)
d = 0.2  # graph density bound: upper bound for the directed graph, lower bound for the undirected one
# number of neighbors per node that yields the desired density
k = np.ceil(n_nodes * d).astype(int)
t = time.perf_counter()
A = kneighbors_graph(
    car_df,
    n_neighbors=k,
    mode="connectivity",
    include_self=False,
    n_jobs=-1,
)
# symmetrize: keep an edge if it exists in either direction
# (this can push the density well above d)
A = A.maximum(A.transpose())
print(f"This took {time.perf_counter()-t:.3f} s")

In [None]:
# density of the symmetrized graph; A stores every undirected edge as two
# nonzeros (one per direction), hence the nonzero count is halved
metrics.density(n_nodes, n_edges=A.getnnz()/2, mode="undirected")

In [None]:
# wrap the adjacency matrix in the project's Graph class,
# which is used for faster neighborhood queries
graph = Graph(A)

## Sampling set extension

### Pre-select sampling set

In [None]:
# signal values: the NPS answers cast to integers
# NOTE(review): `s` follows the row order of answers_df; presumably it has to line up
# with the node order of the graph (client_ids) before it is used as a graph signal
# -- TODO confirm
s = answers_df.answer_value.values.astype(int)

### Extend sampling set

In [None]:
# number of nodes that may be sampled: 10% of all nodes, rounded down
sampling_budget = int(0.1*n_nodes)
start = time.perf_counter()
# bs_gda returns a pair, of which only the sampling set is needed here. The second
# value gets an explicit throwaway name instead of `_`: assigning `_` at notebook
# top level shadows IPython's "last output" variable for the rest of the session.
sampling_set, _bs_gda_extra = bs_gda(graph, sampling_budget, parallel=True)
print(f"This took {time.perf_counter()-start:.3f} s")
print("Budget:", sampling_budget)
print("Sampled nodes:", len(sampling_set))

In [None]:
# map node_id to client_id
# NOTE(review): assumes `client_ids` can be indexed positionally with the node ids in
# `sampling_set` (e.g. a NumPy array indexed with a list of ints); if it is a pandas
# Series with a non-default index this would select by label instead -- TODO confirm
pred_sampling_set = client_ids[sampling_set]