In [15]:
import mmh3
import re
from collections import defaultdict

import numpy as np
from scipy.stats import chisquare

In [16]:
def chisquare_test(sample, power=1.16, alpha=1, skip=50):
    unique, counts = np.unique(sample, return_counts=True)
    counts[::-1].sort()

    frequencies = np.array([alpha / np.power(i, power) for i in range(1, len(counts) + 1)])
    zipf_counts = np.ceil(frequencies)

    chi = chisquare(counts[skip:], zipf_counts[skip:])
    return chi[1]

In [17]:
def hash_distributed_test(test, sample, paral_size=10):
    distributed_sample = defaultdict(list)
    for el in sample:
        distributed_sample[mmh3.hash(str('qwerty' + el)) % paral_size].append(el)

    print('############################')
    print('PARTITIONING ', paral_size)
    for ds in distributed_sample:
        print('PARTITION ', ds)
        data = distributed_sample[ds]

        power, alpha = parameter_estimation(data)
        print(power, alpha)
        p_value = test(data, power, alpha)

        print('P VALUE', p_value)
        print('#######')

In [18]:
def parameter_estimation(sample):
    unique, counts = np.unique(sample, return_counts=True)
    counts = -np.sort(-counts)
    start = 10
    counts = counts[np.where(counts >= 5)]
    counts = counts[start:]
    m = len(counts)
    logf = np.log(counts)
    logn = np.log(np.arange(start=1, stop=m + 1, step=1) + start)
    s = (m * np.dot(logf, logn) / np.sum(logn) - np.sum(logf)) / (np.sum(logn) - m * np.dot(logn, logn) / np.sum(logn))
    alpha = np.exp((s * np.dot(logn, logn) + np.dot(logn, logf)) / np.sum(logn))
    return (s, alpha)

## Test war and peace

In [19]:
file = open('war_and_peace.txt', 'r')
text = file.read().lower()
words = re.sub('\W', ' ', text).split()[:500000]

tests = {'chi': chisquare_test}
paral_sizes = range(1, 5)
for name in tests:
    for ps in paral_sizes:
        hash_distributed_test(tests[name], words, paral_size=ps)

############################
PARTITIONING  1
PARTITION  0
1.2227028082024947 222358.59781881908
P VALUE 1.0
#######
############################
PARTITIONING  2
PARTITION  0
1.2077008964973277 83434.67620023729
P VALUE 1.0
#######
PARTITION  1
1.2505044772585165 119453.87371271935
P VALUE 1.0
#######
############################
PARTITIONING  3
PARTITION  1
1.2354748609715989 67851.730673141
P VALUE 1.0
#######
PARTITION  0
1.197603246287541 46202.83444819046
P VALUE 1.0
#######
PARTITION  2
1.2701013591275672 79214.29765518052
P VALUE 1.0
#######
############################
PARTITIONING  4
PARTITION  2
1.2067635690946057 36416.36998633147
P VALUE 1.0
#######
PARTITION  1
1.2428042282523715 48128.99709212303
P VALUE 1.0
#######
PARTITION  0
1.2259311576547987 40191.783390759614
P VALUE 1.0
#######
PARTITION  3
1.2774181699879095 59525.340219050195
P VALUE 1.0
#######


## Test lognormal

In [20]:
sample = np.random.lognormal(mean=0.3, sigma=2, size=100000)
sample = np.array(sample, dtype=np.int)
s, alpha = parameter_estimation(sample)
print(s, alpha)
print(chisquare_test(sample, s, alpha))

1.9278371106435572 142204.5470359915
1.0


## Test exponential

In [24]:
sample = np.random.exponential(scale=1.0 / 0.125, size=100000)
sample = np.array(sample, dtype=np.int)
s, alpha = parameter_estimation(sample)
print(s, alpha)
print(chisquare_test(sample, s, alpha))

3.895775352728193 110649152.47327146
6.057928228323907e-09
