In [1]:
def data_file(algorithm, extraction, similarity, components):
    return 'data/classification/class-' + algorithm + '-' + \
                extraction + '-' + similarity + '-' + components + '.json'

In [2]:
import json

def get_sample(file_name):
    sample = []
    with open(file_name, 'r') as f:  
        for line in f:
            line_clean = line.rstrip()
            if line_clean:
                sample.append(json.loads(line_clean)['test_median'])
    if len(sample) < 10:
        print(len(sample))
    return sample

In [3]:
algorithm = 'LDA'
extraction = 'pca'
similarity = 'jaccard'
components = '3'

get_sample(data_file(algorithm, extraction, similarity, components))

[0.8200692041522492,
 0.8166089965397924,
 0.8200692041522492,
 0.8131487889273357,
 0.8200692041522492,
 0.8373702422145328,
 0.8304498269896193,
 0.8131487889273357,
 0.8131487889273357,
 0.8166089965397924]

In [4]:
import skgof
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.stats import rv_continuous

class EmpiricalDistribution(rv_continuous):
    def __init__(self, data):
        self.ecdf = ECDF(data)
        
    def cdf(self, x):
        return self.ecdf(x)

  from .cvmdist import cvm_unif


In [5]:
def von_mises_test(sample_1, sample_2):
    return skgof.cvm_test(sample_1, EmpiricalDistribution(sample_2)).pvalue

check that group 1 algorithms (PCA + Jaccard + all components + tutti algorithms except DT) are indistinguishable

In [6]:
components = ['2', '3', '5', '10', '30']

In [7]:
import itertools as it

[(c1, c2, von_mises_test(get_sample(data_file('LDA', 'pca', 'jaccard', c1)),
                get_sample(data_file('LDA', 'pca', 'jaccard', c2))))
 for c1, c2 in it.combinations(components, 2)]

[('2', '3', 0.10797455696350311),
 ('2', '5', 0.01659957900426101),
 ('2', '10', 0.039283571682110385),
 ('2', '30', 0.3293211390888068),
 ('3', '5', 0.00013900471814665138),
 ('3', '10', 0.007011937396518175),
 ('3', '30', 0.1405866088767651),
 ('5', '10', 0.32932113908880656),
 ('5', '30', 0.0025735918355593856),
 ('10', '30', 0.005470878899666465)]

In [8]:
files_g1 = [data_file(a, 'pca', 'jaccard', c) for a in ['LDA', 'Random_forest', 'Naive_Bayes', 'MLP']
                                           for c in ['2', '3', '5', '10', '30']]

In [9]:
import pandas as pd

pvals = pd.DataFrame([(f1[26:-5], f2[26:-5], von_mises_test(get_sample(f1), get_sample(f2)))
                      for f1, f2 in it.combinations(files_g1, 2)],
                    columns=['sample_1', 'sample_2', 'p_value'])

In [10]:
len(pvals[pvals.p_value<0.01]) / len(pvals)

0.19473684210526315

In [11]:
(len(pvals[pvals.p_value<0.01]), len(pvals))

(37, 190)

In [12]:
pvals[pvals.p_value<0.01]

Unnamed: 0,sample_1,sample_2,p_value
9,LDA-pca-jaccard-2,Naive_Bayes-pca-jaccard-2,0.007935
19,LDA-pca-jaccard-3,LDA-pca-jaccard-5,0.000139
20,LDA-pca-jaccard-3,LDA-pca-jaccard-10,0.007012
27,LDA-pca-jaccard-3,Naive_Bayes-pca-jaccard-2,0.000139
38,LDA-pca-jaccard-5,LDA-pca-jaccard-30,0.002574
41,LDA-pca-jaccard-5,Random_forest-pca-jaccard-5,0.00277
43,LDA-pca-jaccard-5,Random_forest-pca-jaccard-30,0.001935
46,LDA-pca-jaccard-5,Naive_Bayes-pca-jaccard-5,-6e-06
49,LDA-pca-jaccard-5,MLP-pca-jaccard-2,0.002265
51,LDA-pca-jaccard-5,MLP-pca-jaccard-5,0.000398


check that group 1 and group 3 are radically different

In [13]:
files_g3 = [data_file('LDA', 'pca', 'hamming', c) for c in ['2', '3', '5', '10', '30']]

In [14]:
pvals = pd.DataFrame([(f1[26:-5], f2[26:-5], von_mises_test(get_sample(f1), get_sample(f2)))
                      for f1 in files_g1 for f2 in files_g3],
                    columns=['sample_1', 'sample_2', 'p_value'])
len(pvals[pvals.p_value < 0.05]) / len(pvals)

1.0

check that group 2 and group 3 are radically different

In [15]:
files_g2 = [data_file('Decision_tree', 'pca', 'jaccard', c) for c in ['2', '3', '5', '10', '30']] + \
           [data_file(a, 'pca', 'levenshtein', c)
            for a in ['LDA', 'Random_forest', 'MLP', 'Naive_Bayes', 'Decision_tree']
            for c in ['2', '3', '5', '10', '30']] + \
           [data_file(a, 'pca', 'hamming', c)
            for a in ['Random_forest', 'MLP', 'Naive_Bayes', 'Decision_tree']
            for c in ['2', '3', '5', '10', '30']]

In [16]:
pvals = pd.DataFrame([(f1[26:-5], f2[26:-5], von_mises_test(get_sample(f1), get_sample(f2)))
                      for f1 in files_g2 for f2 in files_g3],
                    columns=['sample_1', 'sample_2', 'p_value'])
len(pvals[pvals.p_value < 0.05]) / len(pvals)

1.0

check that group 1 and group 2 are radically different

In [17]:
pvals = pd.DataFrame([(f1[26:-5], f2[26:-5], von_mises_test(get_sample(f1), get_sample(f2)))
                      for f1 in files_g1 for f2 in files_g2],
                    columns=['sample_1', 'sample_2', 'p_value'])
len(pvals[pvals.p_value < 0.05]) / len(pvals)

1.0

check that group 1 and group 4 are radically different

In [18]:
files_g4 = [data_file(a, 'tsne', m, c)
            for a in ['LDA', 'Random_forest', 'MLP', 'Naive_Bayes', 'Decision_tree']
            for m in ['jaccard', 'levenshtein', 'hamming']
            for c in ['2', '3', '5', '10', '30']]

In [19]:
pvals = pd.DataFrame([(f1[26:-5], f2[26:-5], von_mises_test(get_sample(f1), get_sample(f2)))
                      for f1 in files_g1 for f2 in files_g4],
                    columns=['sample_1', 'sample_2', 'p_value'])
len(pvals[pvals.p_value < 0.05]) / len(pvals)

1.0