## Import estimators

In [13]:
from pydistinct.stats_estimators import *
from pydistinct.ensemble_estimators import median_estimator

# Demonstration of technique

### Sample 500 values from a population of 1000 integers. 
### Can the estimators correctly compute the population size of 1000 integers?

In [14]:
from pydistinct.sampling import sample_uniform, sample_gaussian, sample_zipf
uniform = sample_uniform(n_distinct_integers=1000, sample_size=500, seed=1337) 
print("Ground truth : {}".format(uniform["ground_truth"]))
print("sampled_values : ", uniform["sample"])

Ground truth : 1000
sampled_values :  [152 190 861 233 168 474 680 347 883  83  85  73 266 391 439 347  24 665
 130 796  94 775  23 371 963 618 873 204 834  61 728 660 751 338 880 604
 436 826 196 405 521 585 574 584 635 860 650 709 736 994 333 903 197 371
 502 943 760 789 531 795 708 618 675 427 709 764 521 335 999 132 857 606
 649 304 840 901 740 787 190 522 498  93 334 819  77 945 302 948 441 499
 674 430 769  50  74  85 512 302 739 433 924 509 200  62 277 631 470 551
 109 692  65 529 501 211 646 746 986 945 115  94 627 728 340 647 651 238
 395 635 436 878 379 401 348 412 772 618 269 494 656   1 461   8 436 716
 322 295 674 809 244 253 702 477 394 965   8 290 247 392 608  11 699 776
 983 196 444 396 902 747 336 159 511 170 661 530 449 728 472  67 773 133
 362 583 178 231  32 222 563 525 591 763 792 906 246 123 477 769 294 338
 979 702  92   6 291 658 635 948 364 338 191 578 852 520 517 376   8 444
 621 151 557 263 379 234 237 607 101 741 131 688 243 711 258 270 223 209
 571 857 401 

In [15]:
# Using the estimators
print(median_estimator(uniform["sample"])) # generally the best estimator
print(bootstrap_estimator(uniform["sample"]))
print(horvitz_thompson_estimator(uniform["sample"]))
print(smoothed_jackknife_estimator(uniform["sample"]))

1013.1954292072004
518.4906064599444
585.7879884021419
1027.0415022416053


### You can also use a dictionary of attribute counts to do computation
#### This is a key value pair representation of a sample with the key as the element and the value as the count of the element

In [17]:
from pydistinct.utils import _get_attribute_counts
attrs = _get_attribute_counts(uniform["sample"]) # we convert our pervious sample into this representation
print(attrs)

{1: 1, 2: 1, 5: 1, 6: 2, 8: 3, 9: 1, 11: 2, 15: 1, 20: 1, 23: 1, 24: 3, 28: 1, 32: 1, 38: 1, 44: 1, 49: 1, 50: 1, 59: 1, 61: 2, 62: 1, 65: 1, 67: 1, 69: 2, 73: 1, 74: 1, 75: 1, 77: 2, 80: 1, 83: 2, 85: 2, 86: 1, 89: 1, 92: 1, 93: 2, 94: 2, 98: 1, 99: 1, 101: 1, 102: 1, 109: 1, 110: 1, 114: 1, 115: 1, 118: 1, 119: 1, 123: 1, 126: 1, 130: 1, 131: 1, 132: 2, 133: 1, 136: 1, 147: 1, 151: 1, 152: 1, 155: 1, 158: 1, 159: 1, 162: 1, 164: 1, 166: 2, 168: 3, 170: 1, 178: 1, 186: 1, 190: 2, 191: 1, 195: 1, 196: 2, 197: 1, 200: 1, 202: 1, 204: 2, 206: 1, 207: 1, 209: 1, 211: 1, 215: 1, 217: 1, 221: 1, 222: 1, 223: 1, 228: 1, 230: 1, 231: 2, 233: 1, 234: 1, 236: 1, 237: 1, 238: 1, 241: 1, 243: 1, 244: 1, 246: 1, 247: 1, 252: 1, 253: 2, 254: 1, 255: 1, 258: 2, 263: 1, 266: 2, 269: 1, 270: 1, 272: 1, 273: 1, 275: 1, 277: 1, 279: 1, 280: 1, 290: 2, 291: 1, 294: 1, 295: 1, 297: 1, 298: 1, 299: 1, 300: 1, 302: 2, 304: 1, 305: 1, 306: 1, 309: 2, 314: 1, 320: 1, 322: 3, 323: 1, 328: 1, 329: 1, 330: 1, 33

In [19]:
print(median_estimator(attributes=attrs))

1013.1954292072004


### Using this representation, the keys can also be strings (or any hashable type in fact!)

In [21]:
ecological_sample = {"Carica papaya":1, "Quercus robur":2, "Mangifera indica":1}
print(median_estimator(attributes=ecological_sample)) 

4.99810792149732


# A bootstrap module is also included to generate CIs

In [22]:
from pydistinct.bootstrap import bootstrap
print(bootstrap(sequence=uniform["sample"],num_iterations=1000,iteration_batch_size=10,stat_func=median_estimator,alpha=0.05,is_pivotal=False))
print(bootstrap(attributes=attrs,num_iterations=1000,iteration_batch_size=10,stat_func=smoothed_jackknife_estimator,alpha=0.05,is_pivotal=False))


1013.1954292072005    (934.8627672053022, 1104.5740587473167)
1027.0415022416055    (932.6184802582554, 1126.6746717399876)


# bootstrap is less precise with skewed distributions

In [26]:
zipf = sample_zipf(alpha=1.3, population_size=1000, sample_size=500,seed=42) # sample from a zipf function (power law distribution)
print("ground truth : {}".format(zipf["ground_truth"]))
print(bootstrap(sequence=zipf["sample"],num_iterations=1000,stat_func=median_estimator,alpha=0.01))
print(bootstrap(sequence=zipf["sample"],num_iterations=1000,stat_func=smoothed_jackknife_estimator,alpha=0.01))


ground truth : 271
214.46501159599572    (187.71267008800743, 242.3747134876277)
205.7244030361701    (173.80846047257614, 234.03305325955245)


# including estimate of population size helps some estimators

In [27]:
gaussian = sample_gaussian(population_size=1000,sample_size=500,seed=42) # gaussian distribution centered at 0
print(gaussian["ground_truth"])
smoothed_jackknife_estimator(gaussian["sample"])
smoothed_jackknife_estimator(uniform["sample"],pop_estimator = lambda x : x * 2) # provide ratio of sample size to population

555


587.1545210657686

### You will need to use a label encoder to convert strings to integers, or use the value counter method

In [44]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
list_of_ips = """179.19.165.217,15.141.192.215,65.156.245.224,171.73.0.169,95.183.21.88,84.119.201.97,232.52.161.226,118.99.4.106,0.99.41.93,173.45.123.174,69.220.150.128,235.156.19.169,79.227.254.214,93.178.54.167,226.87.216.61,74.43.145.145,156.57.241.163,12.44.146.41,238.105.101.194,203.193.162.41,223.25.188.118,215.111.199.42,132.248.243.114,149.101.122.98,51.239.27.250,35.173.103.49,58.116.247.108,21.116.7.150,153.100.51.4,243.137.224.170,126.184.75.197,29.245.116.141,237.131.63.224,146.36.175.68,99.50.10.31,109.132.57.250,62.146.84.20,185.78.102.242,6.105.8.196,111.211.181.59,207.136.25.59,20.116.67.5,166.8.62.156,113.216.130.70,30.73.41.49,142.228.167.130,0.151.229.196,16.200.30.176,184.145.250.129,126.217.154.100,111.174.3.27,65.71.251.18,85.147.90.152,130.199.145.224,92.115.252.102,8.148.186.216,185.19.212.230,156.33.194.112,84.102.210.151,178.237.250.83,9.77.121.141,223.5.19.148,163.223.153.88,87.54.252.73,228.104.137.210,100.141.179.225,224.131.156.179,161.104.31.123,81.34.254.250,26.108.24.224,67.221.12.230,39.113.138.146,180.195.127.46,126.50.81.204,190.72.235.169,237.62.4.70,53.117.207.31,120.147.83.139,137.32.206.228,224.166.97.9,110.56.231.156,255.193.121.255,160.27.240.209,226.57.97.150,235.225.141.163,200.190.233.232,215.54.119.237,13.70.135.183,80.168.51.63,52.171.6.158,2.204.160.199,37.129.36.208,215.237.58.79,96.162.168.223,8.226.217.124,188.90.218.223,144.129.245.195,110.247.229.69,253.29.160.67,63.148.35.47"""
list_of_ips = list_of_ips.split(",")
print(list_of_ips[:5])
sequence = le.fit_transform(list_of_ips)
print(sequence[:5])
smoothed_jackknife_estimator(sequence)

['179.19.165.217', '15.141.192.215', '65.156.245.224', '171.73.0.169', '95.183.21.88']
[35 23 80 32 97]


2184

### Alternatively, you can also use a value counter to use strings as input :

In [46]:
from collections import Counter
z = ['blue', 'red', 'blue', 'yellow', 'blue', 'red']
attrs = dict(Counter(z))
median_estimator(attributes=attrs)

3.4691572359396434