# Global Settings and Imports

In [None]:
# Reload imported modules automatically before each cell runs, so edits to project code are picked up.
%reload_ext autoreload
%autoreload 2

import os
import sys

# Put the parent directory first on the import path so packages located there can be imported.
sys.path.insert(0, '..')

# Not referenced by name in the visible cells — presumably imported for its import-time side effects; TODO confirm.
import seq_rec.utils.custom_logging

In [None]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

%load_ext tensorboard
from tensorboard.plugins import projector

In [None]:
import seq_rec.io as io
import seq_rec.utils as utils
from seq_rec.models.deeprec.ann_helper import ANNHelper

# Parameters

In [None]:
# Seed passed to the test-data loader.
RANDOM_SEED = 13

# Path handed to `utils.load_cfg` to build the config object.
HYDRA_CONFIG_PATH = '../seq_rec/conf/'

# Config object

In [None]:
# Load the config object from HYDRA_CONFIG_PATH; later cells read `cfg.env.io.training` and `cfg.input_prep`.
cfg = utils.load_cfg(HYDRA_CONFIG_PATH)

# Download data

In [None]:
# Load the 'test' part of the data described by `cfg.env.io.training`.
test_ds = io.load_training_data(
    cfg.env.io.training,
    part='test',
    random_seed=RANDOM_SEED,
    download=True,
)

In [None]:
# Apply the `cfg.input_prep` preparation to the loaded test data; every later cell reads from this dataset.
test_prep_ds = io.prep_training_data(test_ds, cfg.input_prep)

## Load model

### Download model from GCS

In [None]:
# Local path the SavedModel is loaded from in the next cell.
SAVED_MODEL_PATH = 'models/seq-rec-model-v020'
# Also exported as an environment variable — presumably so a shell command can read it
# (no such command is visible in this notebook; TODO confirm).
os.environ['SAVED_MODEL_PATH'] = SAVED_MODEL_PATH

In [None]:
# Load the SavedModel from SAVED_MODEL_PATH; its serving signatures are looked up in the prediction cells.
loaded = tf.saved_model.load(SAVED_MODEL_PATH)

# Make predictions

In [None]:
# Turn the prepared test dataset into the keyword arguments passed to the serving signature.
input_serving = ANNHelper.prepare_batch_input(test_prep_ds)
# Signature named 'k_10' — presumably returns 10 recommendations per example; TODO confirm.
serving_fn = loaded.signatures['k_10']

In [None]:
# Run the serving signature on the prepared input; the result is read through its
# 'merchant_id' and 'scores' entries when the evaluation frame is built.
recommendations = serving_fn(**input_serving)

In [None]:
# Collect the 'target_merchant_id' of every element of the prepared test dataset, in iteration order.
_target_ds = test_prep_ds.map(lambda example: example['target_merchant_id'])
labels = list(_target_ds.as_numpy_iterator())

## Collect predictions into a dataframe

In [None]:
# Input features copied into the evaluation frame; each becomes a column
# named 'input_eval_<feature>', in this order.
_INPUT_FEATURES = [
    'context_merchants',
    'context_search_terms',
    'recent_txn_merchants',
    'context_merchants_time_recency',
    'context_search_terms_time_recency',
    'recent_txn_time_recency',
    'cnt_not_null_context_search_term',
    'cnt_not_null_context_merchant',
    'cnt_not_null_recent_txn',
]

# Read all input features in ONE pass over the dataset instead of one full
# pass per feature: this is cheaper and guarantees that the input columns of
# a given row all come from the same dataset element.
_input_rows = list(
    test_prep_ds
    .map(lambda x: {name: x[name] for name in _INPUT_FEATURES})
    .as_numpy_iterator()
)

# One row per test example: the label, the model's recommendations and
# scores, and the input features the model saw.
eval_df = pd.DataFrame(
    data=dict(
        target=labels,
        predictions=recommendations['merchant_id'].numpy().tolist(),
        score=recommendations['scores'].numpy().tolist(),
        **{
            'input_eval_' + name: [row[name] for row in _input_rows]
            for name in _INPUT_FEATURES
        },
    )
)
eval_df

In [None]:
# Columns holding one scalar per example; they must not be exploded.
_not_a_list_cols = [
    'input_eval_cnt_not_null_context_search_term',
    'input_eval_cnt_not_null_context_merchant',
    'input_eval_cnt_not_null_recent_txn',
]
# Everything else except the label is exploded together.
cols_to_explode = [
    col for col in eval_df.columns
    if col != 'target' and col not in _not_a_list_cols
]
# After reset_index the 'index' column identifies the original example.
# `match` flags a row whose prediction equals the label; `hit` is the number
# of matching rows of that example, repeated on each of its rows.
eval_df_explode = (
    eval_df
    .explode(cols_to_explode)
    .reset_index()
    .assign(match=lambda frame: frame['target'] == frame['predictions'])
    .assign(hit=lambda frame: frame.groupby('index')['match'].transform('sum'))
)

In [None]:
# Display the exploded evaluation frame for a visual check.
eval_df_explode

In [None]:
# Number of matching predictions per original example (keyed by the 'index' column).
_matches_per_instance = eval_df_explode.groupby('index')['match'].sum()

# Examples with exactly one matching prediction / with none.
hit_instances = _matches_per_instance[_matches_per_instance == 1].index
nonhit_instances = _matches_per_instance[_matches_per_instance == 0].index

#### Sample non-hit instances to inspect

In [None]:
# Draw up to 5 distinct non-hit examples to inspect.
# - seeded from RANDOM_SEED so re-running the notebook shows the same examples;
# - replace=False so the same example is never drawn twice;
# - size is capped so fewer than 5 (or zero) non-hit examples does not raise.
_rng = np.random.default_rng(RANDOM_SEED)
samples = _rng.choice(
    nonhit_instances,
    size=min(5, len(nonhit_instances)),
    replace=False,
)

eval_df_explode.loc[lambda df: df['index'].isin(samples)]

# Analyze

## Aggregate by instance

In [None]:
def _decode_utf8(values):
    """Return ``values`` with every bytes element decoded as UTF-8."""
    return values.apply(lambda raw: raw.decode('utf-8'))


def _as_label(name):
    """Build an ``assign`` callable that decodes column ``name`` and maps 'NULL' to NaN."""
    return lambda frame: _decode_utf8(frame[name]).replace('NULL', np.nan)


def _as_recency(name):
    """Build an ``assign`` callable that decodes column ``name``, casts to float and maps 0 to NaN."""
    return lambda frame: _decode_utf8(frame[name]).astype(float).replace(0, np.nan)


_label_cols = [
    'input_eval_context_merchants',
    'input_eval_context_search_terms',
    'input_eval_recent_txn_merchants',
]
_recency_cols = [
    'input_eval_context_merchants_time_recency',
    'input_eval_context_search_terms_time_recency',
    'input_eval_recent_txn_time_recency',
]
_cleaners = {name: _as_label(name) for name in _label_cols}
_cleaners.update({name: _as_recency(name) for name in _recency_cols})

# Clean the exploded columns, then add a key that crosses the three
# not-null counts, formatted as '<n>search_<n>merchant_<n>txn'.
eval_df_analyze = (
    eval_df_explode
    .assign(**_cleaners)
    .assign(
        cnt_context_search_term_cross_cnt_merchant_cross_cnt_txn=lambda frame: (
            frame['input_eval_cnt_not_null_context_search_term'].astype(str) + "search" + "_"
            + frame['input_eval_cnt_not_null_context_merchant'].astype(str) + "merchant" + "_"
            + frame['input_eval_cnt_not_null_recent_txn'].astype(str) + "txn"
        )
    )
)
eval_df_analyze

In [None]:
# Statistics computed for the score and for each recency column.
_SPREAD = ['mean', 'max', 'min']

# Column -> aggregations; one output row per original example.
_per_instance_spec = {
    'target': ['first'],
    'score': list(_SPREAD),
    'input_eval_context_merchants': ['nunique'],
    'input_eval_context_search_terms': ['nunique'],
    'input_eval_recent_txn_merchants': ['nunique'],
    'input_eval_context_merchants_time_recency': list(_SPREAD),
    'input_eval_context_search_terms_time_recency': list(_SPREAD),
    'input_eval_recent_txn_time_recency': list(_SPREAD),
    'input_eval_cnt_not_null_context_search_term': ['first'],
    'input_eval_cnt_not_null_context_merchant': ['first'],
    'input_eval_cnt_not_null_recent_txn': ['first'],
    'cnt_context_search_term_cross_cnt_merchant_cross_cnt_txn': ['first'],
    'hit': ['max'],
}
_per_instance = eval_df_analyze.groupby('index').agg(_per_instance_spec)
# Flatten the (column, aggregation) pairs into '<column>__<aggregation>' names.
_per_instance.columns = _per_instance.columns.map('__'.join)

# 0/1 flags describing which kinds of input the example had.
eval_df_analyze_agg_by_prediction = _per_instance.assign(
    no_context=lambda agg: (
        (agg['input_eval_context_merchants__nunique'] == 0)
        & (agg['input_eval_context_search_terms__nunique'] == 0)
    ).astype(int),
    no_recent_txn=lambda agg: (agg['input_eval_recent_txn_merchants__nunique'] == 0).astype(int),
    only_txn=lambda agg: ((agg['no_context'] == 1) & (agg['no_recent_txn'] == 0)).astype(int),
    only_context=lambda agg: ((agg['no_context'] == 0) & (agg['no_recent_txn'] == 1)).astype(int),
)

## Compare the data

In [None]:
!pip install sweetviz

In [None]:
import sweetviz as sv

In [None]:
# Split the per-example frame by its 'hit__max' value: 1 -> hit, 0 -> non-hit.
_hit_flag = eval_df_analyze_agg_by_prediction['hit__max']
hit_df = eval_df_analyze_agg_by_prediction.loc[_hit_flag == 1]
nonhit_df = eval_df_analyze_agg_by_prediction.loc[_hit_flag == 0]

In [None]:
# Build a sweetviz comparison report of the hit examples against the non-hit examples.
hit_compare_report = sv.compare([hit_df, 'Hit Data'], [nonhit_df, 'Non-Hit Data'])

In [None]:
# Show the comparison report inside the notebook.
hit_compare_report.show_notebook()

#### Target merchant

In [None]:
# Per target merchant: number of hits, number of examples and the average
# not-null input counts.
_by_target = (
    eval_df_analyze_agg_by_prediction
    .groupby('target__first')
    .agg({
        'hit__max': ['sum', 'count'],
        'input_eval_cnt_not_null_context_search_term__first': ['mean'],
        'input_eval_cnt_not_null_context_merchant__first': ['mean'],
        'input_eval_cnt_not_null_recent_txn__first': ['mean'],
    })
)
# Flatten the (column, aggregation) pairs into '<column>__<aggregation>' names.
_by_target.columns = _by_target.columns.map('__'.join)

# hit_rate = hits / examples for that target merchant.
target_compare_df = _by_target.assign(
    hit_rate=lambda agg: agg['hit__max__sum'].div(agg['hit__max__count'])
)

In [None]:
print('Lowest hit-rate target merchants')
# Keep targets with at least 10 examples; lowest hit rate first, ties broken by larger count.
(
    target_compare_df
    .loc[lambda agg: agg['hit__max__count'] >= 10]
    .sort_values(['hit_rate', 'hit__max__count'], ascending=[True, False])
    .head(10)
)

In [None]:
print('Highest hit-rate target merchants')
# Keep targets with at least 10 examples; highest hit rate first, ties broken by larger count.
(
    target_compare_df
    .loc[lambda agg: agg['hit__max__count'] >= 10]
    .sort_values(['hit_rate', 'hit__max__count'], ascending=[False, False])
    .head(10)
)

#### Hit rate w.r.t. the number of context search terms, context merchants and recent transactions

In [None]:
print("Percentage of hit by number of context search terms")
# Mean of 'hit__max' and number of examples per count of not-null context search terms.
(
    eval_df_analyze_agg_by_prediction
    .groupby('input_eval_cnt_not_null_context_search_term__first')
    ['hit__max'].agg(['mean', 'count'])
)

In [None]:
print("Percentage of hit by number of context merchants")
# Mean of 'hit__max' and number of examples per count of not-null context merchants.
(
    eval_df_analyze_agg_by_prediction
    .groupby('input_eval_cnt_not_null_context_merchant__first')
    ['hit__max'].agg(['mean', 'count'])
)

In [None]:
print("Percentage of hit by number of recent txn")
# Mean of 'hit__max' and number of examples per count of not-null recent transactions.
(
    eval_df_analyze_agg_by_prediction
    .groupby('input_eval_cnt_not_null_recent_txn__first')
    ['hit__max'].agg(['mean', 'count'])
)

In [None]:
print("Percentage of hit by number of context search terms cross number of context merchants cross number of recent txn order by hit rate (mean)")
# Mean of 'hit__max' and number of examples per crossed-count key.
_cross_stats = (
    eval_df_analyze_agg_by_prediction
    .groupby('cnt_context_search_term_cross_cnt_merchant_cross_cnt_txn__first')['hit__max']
    .agg(['mean', 'count'])
)
# Keep keys with more than 30 examples, highest mean first.
_cross_stats[_cross_stats['count'] > 30].sort_values('mean', ascending=False)

In [None]:
print("Percentage of hit by number of context search terms cross number of context merchants cross number of recent txn order by frequency (count)")
# Mean of 'hit__max' and number of examples per crossed-count key.
_cross_stats = (
    eval_df_analyze_agg_by_prediction
    .groupby('cnt_context_search_term_cross_cnt_merchant_cross_cnt_txn__first')['hit__max']
    .agg(['mean', 'count'])
)
# Keep keys with more than 30 examples, most frequent first.
_cross_stats[_cross_stats['count'] > 30].sort_values('count', ascending=False)

In [None]:
print("Percentage of hit by whether containing only recent txn")
# Mean of 'hit__max' and number of examples for each value of the 'only_txn' flag.
eval_df_analyze_agg_by_prediction.groupby('only_txn')['hit__max'].agg(['mean', 'count'])

In [None]:
print("Percentage of hit by whether containing any context")
# Mean of 'hit__max' and number of examples for each value of the 'no_context' flag (1 = no context at all).
eval_df_analyze_agg_by_prediction.groupby('no_context')['hit__max'].agg(['mean', 'count'])

In [None]:
print("Percentage of hit by whether containing only context")
# Mean of 'hit__max' and number of examples for each value of the 'only_context' flag.
eval_df_analyze_agg_by_prediction.groupby('only_context')['hit__max'].agg(['mean', 'count'])

In [None]:
print("Percentage of hit by number of context search terms cross number of context merchants cross merchant click recency order by hit rate (mean)")
# The per-example frame has no 'cnt_context_search_term_cross_cnt_merchant__first'
# column (only the three-way '..._cross_cnt_txn__first' key is ever built), so
# grouping on it raised a KeyError. Grouping on the two underlying count columns
# gives the intended search-term x merchant cross.
# Note: examples whose minimum merchant recency is NaN are left out, because
# groupby drops NaN keys by default.
(
    eval_df_analyze_agg_by_prediction
    .groupby([
        'input_eval_cnt_not_null_context_search_term__first',
        'input_eval_cnt_not_null_context_merchant__first',
        'input_eval_context_merchants_time_recency__min',
    ])
    ['hit__max'].agg(['mean', 'count'])
    # Keep groups with more than 30 examples, highest hit rate first.
    .loc[lambda df: df['count'].gt(30)]
    .sort_values(['mean'], ascending=[False])
)

# Embeddings analysis

In [None]:
# Directory passed to `projector.visualize_embeddings` for the merchant-embedding projector config.
log_dir = 'logs/embeddings/merchants/'

In [None]:
# `visualize_embeddings` writes the projector config file into `log_dir`;
# create the directory first so a fresh checkout does not fail on a missing path.
os.makedirs(log_dir, exist_ok=True)

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Labels file for the embedding rows — presumably resolved relative to `log_dir`; TODO confirm.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
%tensorboard --logdir /home/jupyter/seq-rec/notebooks/logs/embeddings/merchants/

# Archive 

In [None]:
# Share of each target merchant among hit examples and among non-hit examples.
_hit_share = hit_df['target__first'].value_counts(normalize=True).to_frame('perc_hit')
_nonhit_share = nonhit_df['target__first'].value_counts(normalize=True).to_frame('perc_nonhit')

# Keep merchants present on both sides and sort by (hit share - non-hit share), ascending.
target_compare_df = (
    pd.concat([_hit_share, _nonhit_share], axis=1)
    .assign(diff_hit_minus_nonhit=lambda shares: shares['perc_hit'] - shares['perc_nonhit'])
    .dropna()
    .sort_values(['diff_hit_minus_nonhit'])
)

In [None]:
# `hit_analysis_report` is never defined in this notebook (NameError); the
# sweetviz report created earlier is named `hit_compare_report`.
hit_compare_report.show_notebook()

In [None]:
# First 10 rows of the ascending sort: merchants whose non-hit share most exceeds their hit share.
target_compare_df.head(10)

In [None]:
# Last 10 rows of the ascending sort: merchants whose hit share most exceeds their non-hit share.
target_compare_df.tail(10)