# Comparison of signal smoothness of customer features

We analyse the smoothness of individual customer features in the mobile postpaid data.

## Setup

In [None]:
import sys
PROJECT_PATH = "/home/christopher_orlowicz1_vodafone_c/gershgorin"
sys.path.append(PROJECT_PATH)
%cd $PROJECT_PATH

%load_ext autoreload
%autoreload 2

In [None]:
# Optional: uncomment to install the dependencies listed in requirements.txt.
#!pip install -q -r requirements.txt

In [None]:
import time

from tqdm import tqdm
from google.cloud import bigquery
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = "retina"
import numpy as np
import pandas as pd
import scipy

import src.db.big_query as bq
from src.db.preprocessing import Preprocessor
from src.gershgorin.bs_gda import bs_gda
from src.graph.graph import Graph
from src.graph.nearest_neighbors import NearestNeighborGraph
from src.gsp import fourier, laplace_utils, reconstruction, signal
import src.utils.plotting as plt_util
from src.utils import layout

## Read Data

In [None]:
# timeframe to query
# NOTE(review): March has 31 days — confirm whether 2023-03-31 is excluded on purpose.
from_date, to_date = "2023-03-01", "2023-03-30"

# load the CAR data joined with NPS (feedback score is known for all customers)
car_df = bq.join_car_nps(from_date, to_date)

# preprocess the raw frame and pull out the pieces used in the rest of the notebook
prep = Preprocessor(from_date, to_date, data=car_df, verbose=False)
car_df = prep.car_df
client_ids = prep.client_ids
adr_zips = prep.adr_zips

# show the size of the preprocessed frame
car_df.shape

In [None]:
# Build the nearest-neighbour graph from the preprocessed CAR data with n_neighbors=20.
# `A` is handed to Graph(...) in the next cell
# (presumably an adjacency matrix — TODO confirm the return type of build()).
A = NearestNeighborGraph(n_neighbors=20).build(car_df)

In [None]:
# wrap graph for faster neighborhood queries
graph = Graph(A)
# Graph Laplacian; reused by the smoothness computations in the cells below.
L = graph.laplacian()

## NPS Signal

In [None]:
# read the recommendation values (NPS data) that we will use as signal
answers_df = bq.nps_query_timeframe(from_date, to_date)
# remove answers that cannot be assigned to a customer in CAR
answers_df = answers_df[answers_df.client_id.isin(client_ids)].reset_index(drop=True)
s = answers_df.answer_value.values.astype(int)

### Smoothness

In [None]:
# Smoothness of the NPS signal on the graph, displayed as a tuple:
# (lap_quad_form, norm_lap_quad_form) — going by the helper names, the Laplacian
# quadratic form and its normalised variant.
laplace_utils.lap_quad_form(L, s), laplace_utils.norm_lap_quad_form(L, s)

## Spectral properties (optional)

In [None]:
# takes a while since eigendecomposition is costly
# Densify the Laplacian as float exactly once: np.asarray(..., dtype=float) casts only
# when needed, avoiding the extra full copies of the dense n x n matrix that
# np.array(L.toarray().astype('float')) created.
# eigh returns the eigenvalues in ascending order (lamda) and the matching
# eigenvectors as the columns of V.
lamda, V = np.linalg.eigh(np.asarray(L.toarray(), dtype=float))
# graph Fourier transform of the current signal `s` with respect to the eigenbasis V
s_hat = fourier.gft(V, s)

In [None]:
# Both plots take the eigenvalues `lamda` and the magnitudes of the GFT coefficients.
plt_util.plot_cdf_gft_energy(lamda, abs(s_hat))
plt_util.plot_spectral_domain(lamda, abs(s_hat))

## Features

Compare the features by sorting them according to their (normalised) smoothness.

In [None]:
# number of features to show in the plot (also used in the title and the file name)
top_k = 50
# one graph signal per column of the CAR frame
signals = [car_df[feature].to_numpy() for feature in car_df.columns]
smoothness = np.array([laplace_utils.norm_lap_quad_form(L, s) for s in signals])
# np.argsort sorts ascending, so the features with the smallest value come first
# (presumably lower means smoother — confirm in laplace_utils).
sorted_idx = np.argsort(smoothness)
top_smoothness = smoothness[sorted_idx][:top_k]
top_features = car_df.columns[sorted_idx][:top_k]

plt.figure(figsize=(20, 8))
plt.plot(top_features, top_smoothness, marker='o')
plt.title(f"Top {top_k} customer features sorted by smoothness")
plt.ylabel("Smoothness")
plt.xticks(rotation=30, ha='right')
# Derive the file name from top_k so it always matches the plotted content
# (identical to the previous hard-coded name for top_k = 50).
plt.savefig(f"out/top_{top_k}_features_smoothness.pdf", bbox_inches='tight')

## Analyse smoothness of most important factors for recognizing deep detractors

Another team found the following features to be important (the list is not complete):
- YEAR_OF_BIRTH
- NBR_CANC_REQ
- TOT_IVR_DURATION
- REMAINING_DAYS
- MONTHS_SINCE_ACT
- TIME_FIRST_QUEUE_WAIT
- DATA_VOL_TOT_AVG
- NBR_DISTINCT_TEAMS

Let's see whether they are easy or difficult to reconstruct.

In [None]:
# Feature names to analyse; they are lower-cased before being used as car_df columns
# in the next cell.
# NOTE(review): this array differs from the bullet list in the markdown above —
# REMAINING_DAYS is not included here, and the list spells DATA_VOL_TOT_AVG while this
# array uses DATA_VOL_TOTAL_AVG. Confirm which names match the car_df columns.
important_features = np.array(["YEAR_OF_BIRTH", "NBR_CANC_REQ", "TOT_IVR_DURATION", 
                      "MONTHS_SINCE_ACT", "TIME_FIRST_QUEUE_WAIT", "DATA_VOL_TOTAL_AVG", "NBR_DISTINCT_TEAMS"])

In [None]:
# car_df columns are addressed by the lower-cased feature names
signals = [car_df[name.lower()].to_numpy() for name in important_features]
# normalised smoothness value of every selected feature
smoothness = np.array(
    [laplace_utils.norm_lap_quad_form(L, feature_signal) for feature_signal in signals]
)
# ascending order of the smoothness values
sorted_idx = smoothness.argsort()
sorted_smoothness, sorted_features = smoothness[sorted_idx], important_features[sorted_idx]

plt.figure(figsize=(6, 4))
plt.plot(sorted_features, sorted_smoothness, marker='o')
plt.title("Important features for DD detection sorted by smoothness")
plt.ylabel("Smoothness")
plt.xticks(rotation=30, ha='right')
plt.savefig("out/most_important_features_smoothness.pdf", bbox_inches='tight')

## Can we reconstruct one of them?

In [None]:
# budget passed to bs_gda (presumably the number of nodes to sample — TODO confirm)
k = 500
# feature whose values we try to reconstruct
feature = "tot_ivr_duration"
# NOTE: this rebinds `s`, which held the NPS signal in the cells above
s = car_df[feature].to_numpy()
# only the first return value (the sampled node set) is used afterwards
sample, _ = bs_gda(graph, k, p_hops=4, parallel=True)

In [None]:
# Reconstruct the signal from the values at the sampled nodes only.
s_rec = reconstruction.reconstruct_signal(graph.laplacian(), sample, s[sample])
# Error between the original signal and its reconstruction.
mse = reconstruction.mse(s, s_rec)
# Label and value are separate positional arguments; `"RMSE": value` inside the call
# is not valid Python and made the whole cell fail with a SyntaxError.
print("MSE:", mse, "RMSE:", np.sqrt(mse))

## Compare to really smooth signal (bandlimited + noise)

In [None]:
# Generate the reference signal with signal.gs1 on the same Laplacian.
# NOTE(review): the meaning of the second argument `1` is not visible here —
# confirm in src.gsp.signal.
s_gs1 = signal.gs1(L, 1)
# display the generated signal
s_gs1

In [None]:
# Same two smoothness measures as for the NPS signal, now for the generated signal,
# so the values can be compared directly.
laplace_utils.lap_quad_form(L, s_gs1), laplace_utils.norm_lap_quad_form(L, s_gs1)