In [30]:
import sys
# add the parent directory to sys.path — presumably so the project-local `recommendation` package imported below can be found
sys.path.append('../')

import numpy as np
from sklearn.metrics import confusion_matrix
import functools
import json
import os
from itertools import starmap
from multiprocessing.pool import Pool
from typing import List, Tuple

import luigi
import numpy as np
import pandas as pd
from tqdm import tqdm

from recommendation.fairness_metrics import calculate_fairness_metrics
from recommendation.files import get_test_set_predictions_path
from recommendation.offpolicy_metrics import eval_IPS, eval_CIPS, eval_SNIPS, eval_doubly_robust
from recommendation.rank_metrics import average_precision, precision_at_k, ndcg_at_k, prediction_coverage_at_k, \
    personalization_at_k
from recommendation.task.model.base import BaseEvaluationTask
from recommendation.utils import parallel_literal_eval

In [2]:
def _create_relevance_list(sorted_actions: List[int], expected_action: int, reward: int) -> List[int]:
    if reward == 1:
        return [1 if action == expected_action else 0 for action in sorted_actions]
    else:
        return [0 for _ in sorted_actions]


def _ps_policy_eval(relevance_list: List[int], prob_actions: List[float]) -> List[float]:
    return np.sum(np.array(relevance_list) * np.array(prob_actions[:len(relevance_list)])).tolist()


def _get_rhat_scores(relevance_list: List[int], action_scores: List[float]) -> List[float]:
    return np.sum(np.array(relevance_list) * np.array(action_scores[:len(relevance_list)])).tolist()


def _get_rhat_rewards(relevance_list: List[int]) -> float:
    return relevance_list[0]


In [25]:
# Load the test-set predictions of one evaluated run and keep only clicked rows.
# NOTE(review): the run directory is an absolute, machine-specific path.
run_dir = "/media/workspace/DeepFood/deep-reco-gym/output/interaction/TrivagoLogisticModelInteraction/results/TrivagoLogisticModelInteraction_selu____epsilon_greedy_58274b531d"
predictions_csv = get_test_set_predictions_path(run_dir)

df: pd.DataFrame = pd.read_csv(predictions_csv)
clicked_mask = df["clicked"] == 1
df = df[clicked_mask]
df.head()

Unnamed: 0,timestamp,timestamp_diff,step,user_idx,session_idx,sum_action_item_before,action_type_item_idx,action_type_idx,list_action_type_idx,list_reference_search_for_poi,...,user_view,hist_views,ps,n_users,n_items,vocab_size,window_hist_size,sorted_actions,prob_actions,action_scores
1,1541461727,1541461708,2.0,3674,3219,0.0,809,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,51.0,2.0,0.039216,4429,2192,241,10,"[1048, 1113, 809, 235, 109, 1088, 125, 97, 32,...","[0.004, 0.904, 0.004, 0.004, 0.004, 0.004, 0.0...","[0.03921429067850113, 0.03646798059344292, 0.0..."
31,1541461746,3082923435,3.0,3674,3219,1.0,798,2,"[0, 0, 0, 0, 0, 0, 0, 0, 2, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,76.0,4.0,0.052632,4429,2192,241,10,"[809, 625, 798, 1113, 97, 1048, 238, 1145, 32,...","[0.904, 0.004, 0.004, 0.004, 0.004, 0.004, 0.0...","[0.9070658683776855, 0.27450406551361084, 0.16..."
51,1541461753,21580456031,15.0,2474,472,0.0,1735,2,"[2, 2, 3, 3, 3, 4, 2, 2, 2, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,176.0,2.0,0.011364,4429,2192,241,10,"[733, 497, 750, 964, 209, 408, 1800, 284, 1465...","[0.904, 0.004, 0.004, 0.004, 0.004, 0.004, 0.0...","[0.004293902311474085, 0.002859285566955805, 0..."
81,1541461759,29287773125,20.0,1373,1572,19.0,221,2,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",...,26.0,2.0,0.076923,4429,2192,241,10,"[221, 1048, 809, 565, 104, 128, 1088, 30, 1034...","[0.904, 0.004, 0.004, 0.004, 0.004, 0.004, 0.0...","[0.993881344795227, 0.5346676707267761, 0.1745..."
97,1541461770,163394825917,107.0,1914,3635,16.0,616,2,"[5, 5, 5, 5, 5, 10, 5, 5, 5, 5]","[0, 0, 0, 0, 0, 2, 0, 0, 0, 0]",...,176.0,1.0,0.005682,4429,2192,241,10,"[616, 565, 124, 6, 374, 597, 221, 37, 367, 352...","[0.904, 0.004, 0.004, 0.004, 0.004, 0.004, 0.0...","[0.7017820477485657, 0.6362565755844116, 0.135..."


In [26]:
# The list-valued columns are read from CSV as strings; parse them back into
# Python objects (same order as before: sorted_actions, then action_scores).
for list_column in ("sorted_actions", "action_scores"):
    df[list_column] = parallel_literal_eval(df[list_column])

# The chosen action is the first entry of each row's ranked action list.
df["action"] = df["sorted_actions"].apply(lambda ranking: ranking[0])


100%|██████████| 1826/1826 [00:00<00:00, 2922090.46it/s]
100%|██████████| 1826/1826 [00:00<00:00, 2526822.54it/s]


In [27]:
# Display the column index of `df` to see which fields are available.
df.columns

Index(['timestamp', 'timestamp_diff', 'step', 'user_idx', 'session_idx',
       'sum_action_item_before', 'action_type_item_idx', 'action_type_idx',
       'list_action_type_idx', 'list_reference_search_for_poi',
       'list_reference_change_of_sort_order',
       'list_reference_search_for_destination',
       'list_reference_filter_selection',
       'list_reference_interaction_item_image_idx',
       'list_reference_interaction_item_rating_idx',
       'list_reference_clickout_item_idx',
       'list_reference_interaction_item_deals_idx',
       'list_reference_search_for_item_idx',
       'list_reference_interaction_item_info_idx', 'list_current_filters',
       'platform_idx', 'device_idx', 'current_filters', 'impressions',
       'prices', 'clicked', 'list_mean_price', 'impressions_popularity',
       'pos_item_idx', 'item_idx', 'price', 'view', 'is_first_in_impression',
       'first_item_idx', 'popularity_item_idx', 'diff_price', 'user_view',
       'hist_views', 'ps', 'n_user

In [28]:
# Narrow `df` down to the three columns compared in the cells below.
env_columns = ['is_first_in_impression', 'item_idx', 'action']
df_env = df[env_columns]
df_env

Unnamed: 0,is_first_in_impression,item_idx,action
1,0.0,809,1048
31,1.0,798,809
51,1.0,1735,733
81,1.0,221,221
97,0.0,616,616
...,...,...,...
44540,0.0,43,971
44566,0.0,603,49
44595,0.0,1265,167
44603,0.0,19,19


In [29]:
# Count non-null `item_idx` / `action` values for each value of `is_first_in_impression`.
df_env.groupby(['is_first_in_impression']).count()

Unnamed: 0_level_0,item_idx,action
is_first_in_impression,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1282,1282
1.0,544,544


In [36]:
# Restrict to rows whose `is_first_in_impression` flag equals 1.0.
first_flag_mask = df_env["is_first_in_impression"] == 1.0
df_env = df_env[first_flag_mask]

# Compare the logged item (`item_idx`) against the model's top action (`action`).
y_true = df_env["item_idx"]
y_prediction = df_env["action"]
cnf_matrix = confusion_matrix(y_true, y_prediction)
print(cnf_matrix)

[[12  0  0 ...  0  0  0]
 [ 0  3  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ...  1  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  1]]


In [53]:
# Per-class (one-vs-rest) rates derived from `cnf_matrix`.
# `cnf_matrix` comes from sklearn's confusion_matrix(y_true, y_prediction),
# which puts true labels on rows and predicted labels on columns.
diag_counts = np.diag(cnf_matrix)

# NOTE: despite the names, these are the number of correct predictions
# (diagonal) and incorrect predictions (everything off the diagonal).
num_positives = np.sum(diag_counts)
num_negatives = np.sum(cnf_matrix) - num_positives

fp = cnf_matrix.sum(axis=0) - diag_counts  # column total minus the diagonal
fn = cnf_matrix.sum(axis=1) - diag_counts  # row total minus the diagonal
tp = diag_counts
tn = cnf_matrix.sum() - (fp + fn + tp)

fp, fn, tp, tn = (counts.astype(float) for counts in (fp, fn, tp, tn))

# Every class has the same total: the number of evaluated rows.
total = tp + fp + fn + tn

# A class that never occurs as a true label and/or as a prediction yields 0/0.
# The resulting NaN is intentional (later cells aggregate with np.nanmean), so
# silence the RuntimeWarning instead of letting it leak into the cell output.
with np.errstate(divide="ignore", invalid="ignore"):
    # Sensitivity, hit rate, recall, or true positive rate
    tpr = tp / (tp + fn)
    # Specificity or true negative rate
    tnr = tn / (tn + fp)
    # Precision or positive predictive value
    ppv = tp / (tp + fp)
    # Negative predictive value
    npv = tn / (tn + fn)
    # Fall out or false positive rate
    fpr = fp / (fp + tn)
    # False negative rate
    fnr = fn / (tp + fn)
    # False discovery rate
    fdr = fp / (tp + fp)

# Positive rate: share of rows predicted as this class
pr = (tp + fp) / total
# Negative rate: share of rows not predicted as this class
nr = (tn + fn) / total

# Overall accuracy
acc = (tp + tn) / total

# Balanced Accuracy (BA); NaN wherever tpr or tnr is NaN
bacc = (tpr + tnr) / 2

  from ipykernel import kernelapp as app


In [54]:
# Display the per-class balanced accuracy; NaN marks classes where tpr or tnr was 0/0.
bacc

array([0.82192649, 0.8       , 0.66666667, 0.75      , 0.74907749,
       0.625     , 0.49907919, 0.5       , 0.49723757,        nan,
       1.        , 0.5       , 0.9787037 , 0.75      , 0.6875    ,
       0.49907749, 0.4907919 , 0.5       , 0.5       , 0.61017653,
       0.75      , 0.75      , 0.5       , 0.49907919, 0.74815498,
       0.99723247, 0.5       , 1.        , 0.49539595, 0.5       ,
       0.58333333, 0.66666667, 0.5       , 0.49907919, 1.        ,
       0.49907749, 0.83333333, 0.49815838, 1.        , 0.74815498,
       1.        , 1.        , 0.61111111, 0.49907919, 0.63542555,
       0.75      , 0.49815838, 0.5       , 0.5       , 0.58054523,
       0.85341846, 1.        , 1.        , 0.87313433, 0.49907919,
       0.75      , 0.5       , 0.49907919, 0.89907236,        nan,
       0.75      , 0.875     , 1.        , 0.99815838,        nan,
       0.49631676, 0.75      ,        nan, 0.5       , 0.5       ,
              nan, 1.        , 0.5       , 0.74907407, 0.5    

In [55]:
# Summary: number of classes, NaN-ignoring means of the per-class rates,
# and the correct / incorrect prediction counts.
per_class_rates = {
    "false_positive_rate": fpr,
    "false_negative_rate": fnr,
    "true_positive_rate": tpr,
    "true_negative_rate": tnr,
    "positive_rate": pr,
    "negative_rate": nr,
}

summary = {"class": len(acc)}
for rate_name, rate_values in per_class_rates.items():
    summary[rate_name] = np.nanmean(rate_values)
summary["num_positives"] = num_positives
summary["num_negatives"] = num_negatives
summary["num_total"] = num_positives + num_negatives
summary

{'class': 235,
 'false_positive_rate': 0.002414991161385277,
 'false_negative_rate': 0.6286218962384765,
 'true_positive_rate': 0.37137810376152347,
 'true_negative_rate': 0.9975850088386147,
 'positive_rate': 0.004255319148936169,
 'negative_rate': 0.9957446808510638,
 'num_positives': 237,
 'num_negatives': 307,
 'num_total': 544}

In [44]:
# Mean of the per-class accuracy array. The array is named `acc` (lowercase);
# `ACC` is not defined by any cell, so the old spelling raised NameError on a
# fresh kernel.
np.mean(acc)

0.9951971214017522