In [None]:
import itertools as it
import numpy as np
import pandas as pd
from scipy.stats import kruskal, friedmanchisquare, mannwhitneyu, rankdata, wilcoxon
from statsmodels.sandbox.stats.multicomp import multipletests


def scores_table(datasets, estimators, scores, stds=None,
                 greater_is_better=True, method='average'):
    """ Scores table.

        Builds a table where each row represents a dataset and each column
        represents an estimator. Every cell is the formatted text
        ``'<score> [±<std>] (<rank>)'``, where the rank is computed within
        the dataset (row) across estimators. A final ``'rank mean'`` row
        holds the mean rank of each estimator, rounded to 4 decimals.

        Parameters
        ----------
        datasets: array-like
                  List of dataset names.
        estimators: array-like
                    List of estimator names.
        scores: array-like
                Matrix of scores where each column represents a model. Any
                2-D array-like (nested lists included) is accepted.
        stds: array_like, default=None
              Matrix of standard deviations where each column represents a
              model.
        greater_is_better: boolean, default=True
                           Whether a greater score is better (score) or worse
                           (loss).
        method: {'average', 'min', 'max', 'dense', 'ordinal'}, default='average'
                Method used to solve ties.

        Returns
        -------
        table: pandas.DataFrame
               Table of mean and standard deviation of each estimator-dataset
               pair. A ranking of estimators is also generated.
    """
    # Coerce once so that nested lists work with negation and [i, j] indexing.
    scores = np.asarray(scores, dtype=float)
    if stds is not None:
        stds = np.asarray(stds, dtype=float)

    # rankdata assigns rank 1 to the smallest value, so flip the sign when a
    # greater score should receive the best (lowest) rank.
    signed = -scores if greater_is_better else scores
    ranks = np.asarray([rankdata(row, method=method) for row in signed])

    # Build the final strings up front instead of writing text into a float
    # DataFrame cell by cell (an incompatible-dtype assignment in pandas).
    cells = np.empty(scores.shape, dtype=object)
    for (i, j), score in np.ndenumerate(scores):
        cell = '{0:.2f}'.format(score)
        if stds is not None:
            cell += ' ±{0:.2f}'.format(stds[i, j])
        cells[i, j] = cell + ' ({0:.1f})'.format(ranks[i, j])

    table = pd.DataFrame(data=cells, index=datasets, columns=estimators,
                         dtype=object)
    table.loc['rank mean'] = np.around(np.mean(ranks, axis=0), decimals=4)
    return table


def hypotheses_table(samples, models, alpha=0.05, multitest=None,
                     test='wilcoxon', correction=None, multitest_args=dict(),
                     test_args=dict()):
    """ Hypotheses table.

        Builds hypothesis tables with a selected test and correction. An
        optional omnibus test over all models is run first; pairwise tests
        between every two models follow unless the omnibus null hypothesis
        is not rejected.

        Parameters
        ----------
        samples: array-like
                 Matrix of samples where each column represent a model.
        models: array-like
                Model names.
        alpha: float in [0, 1], default=0.05
               Significance level.
        multitest: {'kruskal', 'friedmanchisquare'}
                   default=None
                   Ranking multitest used.
        test: {'mannwhitneyu', 'wilcoxon'},
              default='wilcoxon'
              Ranking test used.
        correction: {'bonferroni', 'sidak', 'holm-sidak', 'holm',
                     'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by', 'fdr_tsbh',
                     'fdr_tsbky'},
              default=None
              Method used to adjust the p-values.
        multitest_args: dict
                        Optional ranking multitest arguments.
        test_args: dict
                   Optional ranking test arguments.

        Returns
        -------
        multitest_table: pandas.DataFrame or None
                         Table of p-value and rejection/non-rejection for the
                         multitest hypothesis. None when no multitest is
                         requested.
        test_table: pandas.DataFrame or None
               Table of p-values and rejection/non-rejection for each test
               hypothesis. None when the multitest p-value is greater than
               alpha, in which case no pairwise test is run.
    """
    # Column slicing and transposition below need a real array.
    samples = np.asarray(samples)
    versus = list(it.combinations(range(len(models)), 2))
    comparisons = [models[a] + " vs " + models[b] for a, b in versus]
    multitests = {'kruskal': kruskal, 'friedmanchisquare': friedmanchisquare}
    tests = {'mannwhitneyu': mannwhitneyu, 'wilcoxon': wilcoxon}
    columns = ['p-value', 'Hypothesis']

    multitest_table = None
    if multitest is not None:
        # Each model is a column, and the omnibus tests take one positional
        # argument per group, so unpack the transposed matrix (unpacking
        # `samples` itself would compare the rows instead of the models).
        statistic, pvalue = multitests[multitest](*samples.T,
                                                  **multitest_args)
        reject = 'Rejected' if pvalue <= alpha else 'Not rejected'
        multitest_table = pd.DataFrame([['{0:.2f}'.format(pvalue), reject]],
                                       index=[multitest], columns=columns,
                                       dtype=object)
        if pvalue > alpha:
            return multitest_table, None

    pvalues = [tests[test](samples[:, a], samples[:, b], **test_args)[1]
               for a, b in versus]
    if correction is not None:
        # multipletests returns boolean decisions and adjusted p-values.
        rejected, pvalues, alphac_sidak, alphac_bonf = multipletests(
            pvalues, alpha, method=correction)
    else:
        rejected = [pvalue <= alpha for pvalue in pvalues]
    # Use the same labels whether or not a correction was applied.
    labels = ['Rejected' if flag else 'Not rejected' for flag in rejected]

    test_table = pd.DataFrame({'p-value': ['{0:.2f}'.format(pvalue)
                                           for pvalue in pvalues],
                               'Hypothesis': labels},
                              index=comparisons, columns=columns,
                              dtype=object)
    return multitest_table, test_table


In [None]:
import numpy as np

from skdatasets.utils._scores import scores_table, hypotheses_table


# Shared fixtures for the tests below: row i of `scores` is passed as the
# scores of datasets[i], and column j as the scores of estimators[j].
datasets = [
    'a4a', 'a8a', 'combined', 'dna',
    'ijcnn1', 'letter', 'pendigits', 'satimage',
    'shuttle', 'usps', 'w7a', 'w8a',
]
estimators = ['LogisticRegression'] + ['MLPClassifier{0}'.format(k)
                                       for k in range(6)]
scores = np.array([
    [89.79, 89.78, 89.76, 89.88, 89.85, 89.91, 89.93],  # a4a
    [90.73, 90.73, 90.73, 90.85, 90.83, 90.81, 90.80],  # a8a
    [92.36, 92.31, 94.58, 94.82, 94.84, 94.92, 94.89],  # combined
    [99.28, 99.27, 99.28, 99.26, 99.27, 99.25, 99.25],  # dna
    [91.34, 91.34, 99.29, 99.33, 99.34, 99.53, 99.54],  # ijcnn1
    [98.07, 98.04, 99.94, 99.95, 99.96, 99.96, 99.95],  # letter
    [99.17, 99.08, 99.87, 99.87, 99.88, 99.90, 99.89],  # pendigits
    [96.67, 96.28, 98.84, 98.87, 98.90, 98.87, 98.92],  # satimage
    [95.85, 92.83, 99.88, 99.93, 99.96, 99.98, 99.99],  # shuttle
    [99.12, 99.11, 99.65, 99.58, 99.58, 99.65, 99.60],  # usps
    [95.93, 95.40, 94.58, 96.31, 96.34, 96.58, 96.50],  # w7a
    [95.80, 95.99, 95.35, 96.20, 96.22, 96.36, 96.71],  # w8a
])


def test_scores_table():
    """Tests scores table, with and without standard deviations."""
    n_estimators = len(estimators)
    for stds in (None, scores/10.0):
        table = scores_table(datasets, estimators, scores, stds=stds)
        # One row per dataset plus the trailing summary row.
        assert list(table.index) == list(datasets) + ['rank mean']
        assert list(table.columns) == list(estimators)
        # With the default 'average' tie method the ranks of every dataset
        # sum to 1 + 2 + ... + k, hence so do the per-estimator mean ranks
        # (up to the 4-decimal rounding applied to that row).
        rank_total = sum(float(r) for r in table.loc['rank mean'])
        assert abs(rank_total - n_estimators * (n_estimators + 1) / 2.0) < 1e-3


def test_hypotheses_table():
    """Tests hypotheses table for every multitest/test/correction combo."""
    corrections = (None, 'bonferroni', 'sidak', 'holm-sidak', 'holm',
                   'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by',
                   'fdr_tsbh', 'fdr_tsbky')
    n_pairs = len(estimators) * (len(estimators) - 1) // 2
    for multitest in ('kruskal', 'friedmanchisquare', None):
        for test in ('mannwhitneyu', 'wilcoxon'):
            for correction in corrections:
                multitest_table, test_table = hypotheses_table(
                    scores, estimators, multitest=multitest, test=test,
                    correction=correction)
                if multitest is None:
                    # Without an omnibus test the pairwise table is always
                    # produced and no multitest table exists.
                    assert multitest_table is None
                    assert test_table is not None
                else:
                    assert list(multitest_table.index) == [multitest]
                # The pairwise table may be None when the omnibus null
                # hypothesis is not rejected; otherwise it has one row per
                # unordered pair of estimators.
                if test_table is not None:
                    assert len(test_table) == n_pairs
                    assert list(test_table.columns) == ['p-value',
                                                        'Hypothesis']
