Formal analysis of statistical significance.

In [5]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
%matplotlib inline
from IPython import display

import random
from sklearn.metrics import accuracy_score, log_loss, mean_absolute_error, mean_squared_error, brier_score_loss
from sklearn.metrics import precision_score, recall_score, roc_auc_score, balanced_accuracy_score

from scipy.stats import probplot, t, wilcoxon

from algorithms import *
from utils import *
from KMPE import *
from NN_functions import *

from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [6]:
def normalize_by_max(x, y):
    """Return the element-wise difference ``x - y`` divided by ``max(x, y)``.

    Parameters
    ----------
    x, y : array-like
        Values to compare; they are converted to float arrays and combined
        element-wise.

    Returns
    -------
    numpy.ndarray
        ``(x - y) / max(x, y)`` for every pair. Pairs with ``x == y`` yield
        ``0.0`` even when both values are zero, where the plain division
        would be ``0 / 0`` and produce NaN.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    diff = x - y
    # The division may hit a zero denominator; silence the warning locally
    # instead of relying on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = diff / np.maximum(x, y)
    # A tie is a zero difference regardless of scale; this only changes the
    # result for the 0/0 case (NaN inputs still propagate as NaN).
    return np.where(diff == 0, 0.0, ratio)

def wilcoxon_n(x, y, **kwargs):
    """Run ``scipy.stats.wilcoxon`` on the max-normalised differences of x and y.

    The two samples are first reduced to a single array by
    ``normalize_by_max(x, y)``; any extra keyword arguments are passed
    through to ``wilcoxon`` unchanged.
    """
    scaled_diff = normalize_by_max(x, y)
    return wilcoxon(scaled_diff, **kwargs)

In [8]:
# Hypotheses:
# - DEDPUL (proposed) outperforms EN and KM (state-of-the-art) on Mixture Proportions Estimation
# - DEDPUL (proposed) outperforms EN and nnRE (state-of-the-art) on Positive-Unlabeled Classification
# - Proposed procedure of hyperparameter tuning improves DEDPUL performance
# - DEDPUL is robust to choice of hyperparameters
# - Proposed modification of nnRE improves its performance on Positive-Unlabeled Classification
# - Proposed modifications of EN improve its performance on Mixture Proportions Estimation

### alphastar data

In [23]:
# Load the alpha* results table (semicolon-separated, decimal commas) and
# index it by its six key columns.
res_pivot_alpha = pd.read_csv(
    'raw_data//alpha_synth_raw.csv', sep=';', decimal=','
).set_index(['dataset', 'ds', 'dmu', 'alpha', 'cons_alpha', 'random_state'])

## zero data

In [25]:
# Load the "zero" results table (semicolon-separated, decimal commas) and
# index it by the same six key columns as the alpha* table.
res_zero = pd.read_csv(
    'raw_data//alpha_synth_zero_raw.csv', sep=';', decimal=','
).set_index(['dataset', 'ds', 'dmu', 'alpha', 'cons_alpha', 'random_state'])

### alpha data

In [28]:
# Load the alpha results table from alpha_real_raw.csv (semicolon-separated,
# decimal commas) and index it by dataset, alpha and random_state.
res_pivot_alpha_uci = pd.read_csv(
    'raw_data//alpha_real_raw.csv', sep=';', decimal=','
).set_index(['dataset', 'alpha', 'random_state'])

### mae data

In [31]:
# Load the MAE results table (semicolon-separated, decimal commas) and index
# it by its six key columns.
res_pivot_mae = pd.read_csv(
    'raw_data//mae_synth_raw.csv', sep=';', decimal=','
).set_index(['dataset', 'ds', 'dmu', 'alpha', 'cons_alpha', 'random_state'])

### accuracy data

In [32]:
# Load the accuracy results table (semicolon-separated, decimal commas) and
# index it by dataset, alpha and random_state.
res_pivot_acc = pd.read_csv(
    'raw_data//acc_real_raw.csv', sep=';', decimal=','
).set_index(['dataset', 'alpha', 'random_state'])

### DEDPUL (proposed) outperforms KM (state-of-the-art) and EN on Mixture Proportions Estimation

In [35]:
print('alpha star')
# Test the 'dedpul' column of the alpha* table against each other estimator
# column in turn; the cell displays one WilcoxonResult per comparison.
tuple(
    wilcoxon_n(res_pivot_alpha['dedpul'].values, res_pivot_alpha[rival].values)
    for rival in ('KM_2', 'e1_en', 'e3_en', 'em_en', 'baseline_dedpul', 'random_dedpul')
)

alpha star


(WilcoxonResult(statistic=57384.0, pvalue=0.14001351392237046),
 WilcoxonResult(statistic=757.0, pvalue=1.7104825588751473e-81),
 WilcoxonResult(statistic=35273.0, pvalue=6.404626360432766e-17),
 WilcoxonResult(statistic=40923.0, pvalue=1.892967819776349e-11),
 WilcoxonResult(statistic=27603.0, pvalue=2.726192579653819e-26),
 WilcoxonResult(statistic=53986.0, pvalue=0.04166447958885208))

In [36]:
# Repeat the dedpul vs KM_2 test on the alpha* table after dropping the rows
# where dataset == 'normal' and dmu is 1 or 2.
# The flattened frame and the filtered subset are built once and shared by
# both operands instead of recomputing reset_index() for every use.
alpha_flat = res_pivot_alpha.reset_index()
alpha_kept = alpha_flat[(alpha_flat['dataset'] != 'normal')
                        | ((alpha_flat['dmu'] != 1) & (alpha_flat['dmu'] != 2))]
wilcoxon_n(alpha_kept['dedpul'].values, alpha_kept['KM_2'].values)

WilcoxonResult(statistic=30133.0, pvalue=3.0962921098951316e-05)

dedpul outperforms all EN and baseline_dedpul; inconclusive for KM, but KM performs better on unreliable N(1, 1) and N(2, 1), and if those are excluded, DEDPUL outperforms KM

In [37]:
print('alpha')
# Test the 'dedpul' column of res_pivot_alpha_uci against each other
# estimator column in turn; one WilcoxonResult per comparison is displayed.
tuple(
    wilcoxon_n(res_pivot_alpha_uci['dedpul'].values, res_pivot_alpha_uci[rival].values)
    for rival in ('KM_2', 'e1_en', 'e3_en', 'em_en', 'baseline_dedpul', 'random_dedpul')
)

alpha


(WilcoxonResult(statistic=21579.5, pvalue=1.291862011533854e-58),
 WilcoxonResult(statistic=3829.0, pvalue=2.8539393823792606e-91),
 WilcoxonResult(statistic=10525.0, pvalue=2.1523111321731132e-78),
 WilcoxonResult(statistic=25888.5, pvalue=1.753769388178255e-51),
 WilcoxonResult(statistic=27338.0, pvalue=2.957653786474387e-49),
 WilcoxonResult(statistic=79611.5, pvalue=0.0679519827947525))

dedpul outperforms KM, all EN, and baseline_dedpul

Conclusion: dedpul outperforms everything in MPE, except random_dedpul, which performs indistinguishably

### Proposed modification of EN improves its performance on Mixture Proportions Estimation

In [38]:
print('alpha star')
# Pairwise tests between the three EN columns of the alpha* table.
tuple(
    wilcoxon_n(res_pivot_alpha[first].values, res_pivot_alpha[second].values)
    for first, second in [('e3_en', 'e1_en'), ('em_en', 'e1_en'), ('em_en', 'e3_en')]
)

alpha star


(WilcoxonResult(statistic=16695.0, pvalue=1.3141443263091014e-45),
 WilcoxonResult(statistic=8159.0, pvalue=1.0420785034169878e-63),
 WilcoxonResult(statistic=41792.0, pvalue=1.6918201009192741e-10))

In [39]:
print('alpha')
wilcoxon_n(res_pivot_alpha_uci['e3_en'].values, res_pivot_alpha_uci['e1_en'].values), \
wilcoxon_n(res_pivot_alpha_uci['em_en'].values, res_pivot_alpha_uci['e1_en'].values), \
wilcoxon_n(res_pivot_alpha_uci['em_en'].values, res_pivot_alpha_uci['e3_en'].values)

alpha


(WilcoxonResult(statistic=76163.0, pvalue=0.0009924722708245708),
 WilcoxonResult(statistic=36016.0, pvalue=5.56981417231352e-37),
 WilcoxonResult(statistic=54855.5, pvalue=9.683064760928584e-17))

Conclusion: em outperforms (?) e3 outperforms e1

### DEDPUL (proposed) outperforms EN and nnRE (state-of-the-art) on Positive-Unlabeled Classification

In [40]:
print('mae')
# Test the 'dedpul' column of the MAE table against each other classifier
# column in turn; one WilcoxonResult per comparison is displayed.
tuple(
    wilcoxon_n(res_pivot_mae['dedpul'].values, res_pivot_mae[rival].values)
    for rival in ('sigmoid_nnre', 'brier_nnre', 'en', 'random_dedpul')
)

mae


(WilcoxonResult(statistic=5101.0, pvalue=6.476274886799269e-70),
 WilcoxonResult(statistic=49267.0, pvalue=0.00014219403356468044),
 WilcoxonResult(statistic=48490.5, pvalue=2.198300433686926e-05),
 WilcoxonResult(statistic=21268.0, pvalue=9.796868023663647e-36))

In [41]:
# Repeat the dedpul vs en and dedpul vs brier_nnre tests on the MAE table
# after dropping the rows where dataset == 'normal' and dmu is 1 or 2.
# The flattened frame and the filtered subset are built once and shared by
# all operands instead of recomputing reset_index() for every use.
mae_flat = res_pivot_mae.reset_index()
mae_kept = mae_flat[(mae_flat['dataset'] != 'normal')
                    | ((mae_flat['dmu'] != 1) & (mae_flat['dmu'] != 2))]
tuple(
    wilcoxon_n(mae_kept['dedpul'].values, mae_kept[rival].values)
    for rival in ('en', 'brier_nnre')
)

(WilcoxonResult(statistic=20964.0, pvalue=3.3780882164118944e-16),
 WilcoxonResult(statistic=20377.0, pvalue=1.0070106196053973e-16))

dedpul outperforms sigmoid_nnre; inconclusive for EN and brier_nnre, but they outperform DEDPUL on unreliable N(1, 1) and N(2, 1), and if those are excluded, DEDPUL outperforms EN and brier_nnre

In [42]:
print('accuracy')
# Tests on the accuracy table involving 'dedpul'. The operand order of every
# pair is kept exactly as listed (brier_nnre comes first in the second pair).
tuple(
    wilcoxon_n(res_pivot_acc[first].values, res_pivot_acc[second].values)
    for first, second in [
        ('dedpul', 'sigmoid_nnre'),
        ('brier_nnre', 'dedpul'),
        ('dedpul', 'en'),
        ('dedpul', 'random_dedpul'),
    ]
)

accuracy


(WilcoxonResult(statistic=53883.5, pvalue=1.7638644851509278e-10),
 WilcoxonResult(statistic=70956.0, pvalue=0.02448820050726236),
 WilcoxonResult(statistic=20388.0, pvalue=1.059047312889711e-55),
 WilcoxonResult(statistic=37662.5, pvalue=1.0464645164573003e-07))

(8) dedpul outperforms en and sigmoid_nnre; inconclusive for brier_nnre

Conclusion: dedpul outperforms EN and sigmoid nnre and performs indistinguishably from brier nnre

### Proposed modification of nnRE improves its performance on Positive-Unlabeled Classification

In [43]:
print('mae')
# Test brier_nnre and en against sigmoid_nnre on the MAE table.
tuple(
    wilcoxon_n(res_pivot_mae[candidate].values, res_pivot_mae['sigmoid_nnre'].values)
    for candidate in ('brier_nnre', 'en')
)

mae


(WilcoxonResult(statistic=16636.0, pvalue=1.0123054923175336e-45),
 WilcoxonResult(statistic=19697.0, pvalue=2.9904187092357425e-40))

In [44]:
print('accuracy')
# Test brier_nnre against sigmoid_nnre on the accuracy table.
wilcoxon_n(*(res_pivot_acc[method].values for method in ('brier_nnre', 'sigmoid_nnre')))

accuracy


WilcoxonResult(statistic=31435.0, pvalue=1.367973471914338e-29)

conclusion: brier_nnre outperforms sigmoid_nnre

### Proposed procedure of hyperparameter tuning improves DEDPUL performance
### DEDPUL is robust to the choice of hyperparameters

In [45]:
# Test 'dedpul' against 'random_dedpul' on each of the four result tables
# (alpha*, alpha, MAE, accuracy), in that order.
tuple(
    wilcoxon_n(table['dedpul'].values, table['random_dedpul'].values)
    for table in (res_pivot_alpha, res_pivot_alpha_uci, res_pivot_mae, res_pivot_acc)
)

(WilcoxonResult(statistic=53986.0, pvalue=0.04166447958885208),
 WilcoxonResult(statistic=79611.5, pvalue=0.0679519827947525),
 WilcoxonResult(statistic=21268.0, pvalue=9.796868023663647e-36),
 WilcoxonResult(statistic=37662.5, pvalue=1.0464645164573003e-07))

In [46]:
# Test 'random_dedpul' against two groups of competitors on the four result
# tables. The empty string is kept as a visual separator between the groups
# in the displayed tuple.
tuple(
    wilcoxon_n(table['random_dedpul'].values, table[rival].values)
    for table, rival in [
        (res_pivot_alpha, 'KM_2'),
        (res_pivot_alpha_uci, 'KM_2'),
        (res_pivot_mae, 'brier_nnre'),
        (res_pivot_acc, 'brier_nnre'),
    ]
) + ('',) + tuple(
    wilcoxon_n(table['random_dedpul'].values, table[rival].values)
    for table, rival in [
        (res_pivot_alpha, 'em_en'),
        (res_pivot_alpha_uci, 'em_en'),
        (res_pivot_mae, 'en'),
        (res_pivot_acc, 'en'),
    ]
)

(WilcoxonResult(statistic=62162.0, pvalue=0.8860999645853749),
 WilcoxonResult(statistic=24513.0, pvalue=7.37324991318434e-54),
 WilcoxonResult(statistic=54363.0, pvalue=0.010586689779794529),
 WilcoxonResult(statistic=64747.0, pvalue=1.814738205562641e-05),
 '',
 WilcoxonResult(statistic=43814.0, pvalue=5.897211615589959e-09),
 WilcoxonResult(statistic=28006.0, pvalue=1.829195161478817e-48),
 WilcoxonResult(statistic=58953.0, pvalue=0.2882988598123275),
 WilcoxonResult(statistic=30029.0, pvalue=9.172837930755226e-42))

In [47]:
# Repeat three random_dedpul comparisons after dropping the rows where
# dataset == 'normal' and dmu is 1 or 2: vs KM_2 on the alpha* table, and
# vs brier_nnre and en on the MAE table.
# Each flattened frame and filtered subset is built once instead of
# recomputing reset_index() for every use.
alpha_flat = res_pivot_alpha.reset_index()
alpha_kept = alpha_flat[(alpha_flat['dataset'] != 'normal')
                        | ((alpha_flat['dmu'] != 1) & (alpha_flat['dmu'] != 2))]

mae_flat = res_pivot_mae.reset_index()
mae_kept = mae_flat[(mae_flat['dataset'] != 'normal')
                    | ((mae_flat['dmu'] != 1) & (mae_flat['dmu'] != 2))]

# All three results sit in one parenthesised tuple. A backslash continuation
# followed by a blank line ends the expression early, so only the tests after
# the blank line would be displayed.
(
    wilcoxon_n(alpha_kept['random_dedpul'].values, alpha_kept['KM_2'].values),
    wilcoxon_n(mae_kept['random_dedpul'].values, mae_kept['brier_nnre'].values),
    wilcoxon_n(mae_kept['random_dedpul'].values, mae_kept['en'].values),
)

(WilcoxonResult(statistic=32344.0, pvalue=0.0008018300983706339),
 WilcoxonResult(statistic=35314.0, pvalue=0.03859108446287726),
 WilcoxonResult(statistic=31250.0, pvalue=0.0001750122716522826))

Tuning works: tuned DEDPUL outperforms random on mae (even with excluded N(1, 1), N(2, 1)) and acc

DEDPUL is robust: tuned vs random does not influence alpha significantly; for a random choice of parameters all previous findings and comparisons hold, except that on mae DEDPUL is still indistinguishable from EN and brier even when N(1, 1) and N(2, 1) are dropped, and on acc random_dedpul is outperformed by brier