In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

## Check model performance across months

In [None]:
# Read the per-month evaluation-metric CSVs (files suffixed jan2020 / feb2020 / mar2020)
# from the single-model evaluation output directory.
results_dir_fp = '/home/cdsw/project_mck_dmp/data/reseller/07_model_output/single/evaluation'
results_jan, results_feb, results_mar = (
    pd.read_csv(f'{results_dir_fp}/ra_mck_int_single_model_evaluation_metrics_jan2020.csv'),
    pd.read_csv(f'{results_dir_fp}/ra_mck_int_single_model_evaluation_metrics_feb2020.csv'),
    pd.read_csv(f'{results_dir_fp}/ra_mck_int_single_model_evaluation_metrics_mar2020.csv'),
)

In [None]:
# Display the evaluation-metrics frame read from the jan2020 CSV.
results_jan

In [None]:
# Display the evaluation-metrics frame read from the feb2020 CSV.
results_feb

In [None]:
# Display the evaluation-metrics frame read from the mar2020 CSV.
results_mar

## Check performance percentiles of outlets and cashflows across months

In [None]:
# Read the per-month performance-percentile CSVs (jan2020 / feb2020 / mar2020,
# "7consecutivedayszero" variant) from the single-model performance output directory.
percentile_dir_fp = '/home/cdsw/project_mck_dmp/data/reseller/07_model_output/single/performance'
percentiles_jan, percentiles_feb, percentiles_mar = (
    pd.read_csv(f'{percentile_dir_fp}/ra_mck_int_single_model_performance_percentiles_jan2020_7consecutivedayszero.csv'),
    pd.read_csv(f'{percentile_dir_fp}/ra_mck_int_single_model_performance_percentiles_feb2020_7consecutivedayszero.csv'),
    pd.read_csv(f'{percentile_dir_fp}/ra_mck_int_single_model_performance_percentiles_mar2020_7consecutivedayszero.csv'),
)

In [None]:
# Preview the first rows of the jan2020 percentile frame.
percentiles_jan.head()

In [None]:
# Preview the first rows of the feb2020 percentile frame.
percentiles_feb.head()

In [None]:
# Preview the first rows of the mar2020 percentile frame.
percentiles_mar.head()

In [None]:
# Align the three monthly frames on outlet_id and keep only outlets present in
# every month (inner join). Column suffixes mark the month: _1 = jan, _2 = feb, _3 = mar.
cols_to_rename = ['ground_truth', 'performance_percentile', 'training_ground_truths_list_length']
prepared_jan = percentiles_jan.set_index('outlet_id')[cols_to_rename].add_suffix('_1')
prepared_feb = percentiles_feb.set_index('outlet_id')[cols_to_rename].add_suffix('_2')
prepared_mar = percentiles_mar.set_index('outlet_id')[cols_to_rename].add_suffix('_3')
prepared_joined = pd.concat(
    [prepared_jan, prepared_feb, prepared_mar],
    axis=1,
    join='inner',
).sort_index(axis=1)  # sort columns by name so the same metric sits side by side
print(prepared_joined.shape)
prepared_joined.head()

In [None]:
# Per-outlet unweighted average of the three monthly performance percentiles.
# A NaN in any month propagates to the mean.
prepared_joined['mean_percentile'] = (
    prepared_joined['performance_percentile_1']
    .add(prepared_joined['performance_percentile_2'])
    .add(prepared_joined['performance_percentile_3'])
    .div(3)
)

In [None]:
# Share of outlets whose percentile in every one of the three months differs from
# the outlet's mean percentile by at most `pct` (absolute difference).
pct = 5
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2', 'performance_percentile_3']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)

# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Share of outlets whose percentile in every one of the three months differs from
# the outlet's mean percentile by at most `pct` (absolute difference).
pct = 10
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2', 'performance_percentile_3']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)

# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Share of outlets whose percentile in every one of the three months differs from
# the outlet's mean percentile by at most `pct` (absolute difference).
pct = 20
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2', 'performance_percentile_3']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)

# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

## Check performance percentiles of outlets and potential gap across months

In [None]:
# Read the k-fold performance-percentile CSVs for jan2020 and feb2020.
# NOTE: this rebinds percentile_dir_fp, percentiles_jan and percentiles_feb, which
# earlier cells pointed at the single-model outputs; percentiles_mar is not
# reassigned here and still refers to the single-model frame.
percentile_dir_fp = '/home/cdsw/project_mck_dmp/data/reseller/07_model_output/kfold/performance'
percentiles_jan, percentiles_feb = (
    pd.read_csv(f'{percentile_dir_fp}/ra_mck_int_kfold_performance_percentiles_jan2020.csv'),
    pd.read_csv(f'{percentile_dir_fp}/ra_mck_int_kfold_performance_percentiles_feb2020.csv'),
)

In [None]:
# Preview the first rows of the k-fold jan2020 percentile frame.
percentiles_jan.head()

In [None]:
# Preview the first rows of the k-fold feb2020 percentile frame.
percentiles_feb.head()

In [None]:
# Align the two monthly k-fold frames on outlet_id and keep only outlets present in
# both months (inner join). Column suffixes mark the month: _1 = jan, _2 = feb.
# The raw training_ground_truths_list column is carried along for the quantile analysis.
cols_to_rename = ['ground_truth', 'performance_percentile', 'training_ground_truths_list_length', 'training_ground_truths_list']
prepared_jan = percentiles_jan.set_index('outlet_id')[cols_to_rename].add_suffix('_1')
prepared_feb = percentiles_feb.set_index('outlet_id')[cols_to_rename].add_suffix('_2')
prepared_joined = pd.concat(
    [prepared_jan, prepared_feb],
    axis=1,
    join='inner',
).sort_index(axis=1)  # sort columns by name so the same metric sits side by side
print(prepared_joined.shape)
prepared_joined.head()

In [None]:
# Per-outlet unweighted average of the two monthly performance percentiles.
# A NaN in either month propagates to the mean.
prepared_joined['mean_percentile'] = (
    prepared_joined['performance_percentile_1']
    .add(prepared_joined['performance_percentile_2'])
    .div(2)
)

In [None]:
# Share of outlets whose percentile in both months differs from the outlet's mean
# percentile by at most `pct` (absolute difference). With only two months, each
# deviation is half the month-to-month gap (up to float rounding).
pct = 5
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)

# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Share of outlets whose percentile in both months differs from the outlet's mean
# percentile by at most `pct` (absolute difference). With only two months, each
# deviation is half the month-to-month gap (up to float rounding).
pct = 10
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)
# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Share of outlets whose percentile in both months differs from the outlet's mean
# percentile by at most `pct` (absolute difference). With only two months, each
# deviation is half the month-to-month gap (up to float rounding).
pct = 15
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)
# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Share of outlets whose percentile in both months differs from the outlet's mean
# percentile by at most `pct` (absolute difference). With only two months, each
# deviation is half the month-to-month gap (up to float rounding).
pct = 20
prepared_joined['within_pct'] = (
    prepared_joined[['performance_percentile_1', 'performance_percentile_2']]
    .sub(prepared_joined['mean_percentile'], axis=0)  # row-wise deviation from the mean
    .abs()
    .le(pct)
    .all(axis=1)
)
# normalize=True reports fractions of outlets rather than raw counts.
prepared_joined['within_pct'].value_counts(normalize=True)

In [None]:
# Preview the joined two-month frame, now including mean_percentile and the
# within_pct flag from the most recently executed threshold cell.
prepared_joined.head()

In [None]:
# Quantile levels (in percent) computed from each outlet's training ground-truth list.
percentiles = [90, 95, 100]
print(prepared_joined.shape)
# Work on a fixed-seed 5% sample of the joined outlets.
sampled = prepared_joined.sample(frac=0.05, random_state=15)
print(sampled.shape)

# The ground-truth lists are held as strings (read from CSV); parse them into Python lists.
sampled['training_ground_truths_list_1'] = sampled['training_ground_truths_list_1'].map(literal_eval)
sampled['training_ground_truths_list_2'] = sampled['training_ground_truths_list_2'].map(literal_eval)

# For each quantile level: per-month quantile of the list, the absolute change
# (month 2 minus month 1) and the percent change relative to month 1.
for percentile in percentiles:
    cashflow_col_1 = f'{percentile}th_percentile_cashflow_1'
    cashflow_col_2 = f'{percentile}th_percentile_cashflow_2'

    sampled[cashflow_col_1] = sampled['training_ground_truths_list_1'].map(lambda lst: np.quantile(lst, percentile / 100))
    sampled[cashflow_col_2] = sampled['training_ground_truths_list_2'].map(lambda lst: np.quantile(lst, percentile / 100))
    sampled[f'{percentile}th_percentile_cashflow_diff'] = sampled[cashflow_col_2] - sampled[cashflow_col_1]
    # Percent change = 100 * (new - old) / old. The subtraction must be parenthesised:
    # without it `100 * new - old` is evaluated first, which is not a percent change.
    # NOTE: a month-1 value of 0 produces inf/NaN here rather than raising.
    sampled[f'{percentile}th_percentile_cashflow_diff_pct'] = (
        100 * (sampled[cashflow_col_2] - sampled[cashflow_col_1]) / sampled[cashflow_col_1]
    )

sampled.head()

In [None]:
# Preview the first rows of the sampled frame (same call that ends the previous cell).
sampled.head()

In [None]:
# Summary statistics of the month-2-minus-month-1 difference in the 100th-percentile cashflow.
sampled['100th_percentile_cashflow_diff'].describe()

In [None]:
# Print the 5th, 10th, ..., 95th quantiles of the 100th-percentile cashflow
# difference, one per line.
print(
    *(
        f"{i} percentile: {sampled['100th_percentile_cashflow_diff'].quantile(i/100)}"
        for i in range(5, 100, 5)
    ),
    sep="\n",
)

In [None]:
# Histogram of the month-to-month change (month 2 minus month 1) in each sampled
# outlet's 100th-percentile cashflow, i.e. the potential gap of the same outlet.
# Expectation: a roughly normal distribution centered around 0.
sampled['100th_percentile_cashflow_diff'].hist()

In [None]:
# Summary statistics of the 100th_percentile_cashflow_diff_pct column.
sampled['100th_percentile_cashflow_diff_pct'].describe()

In [None]:
# Print the 5th, 10th, ..., 95th quantiles of the 100th_percentile_cashflow_diff_pct
# column, one per line.
print(
    *(
        f"{i} percentile: {sampled['100th_percentile_cashflow_diff_pct'].quantile(i/100)}"
        for i in range(5, 100, 5)
    ),
    sep="\n",
)