In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
# Read the comma-separated table "heaps.dat". The file has no header row, so
# pandas labels the columns with the integers 0, 1, ...
df = pd.read_csv("heaps.dat", header=None, sep=',')
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
# Column 0 copied into a float NumPy array; print its length (the row count).
# Note: the following cell rebinds `size` to the column's native-dtype values.
size = df.loc[:, 0].to_numpy(dtype=float, copy=True)
print(size.shape[0])

In [None]:
# Column 0 -> `size` (plotted below as "total counts per file"),
# column 1 -> `diffwords` (plotted below as "#different words per file").
size, diffwords = (df.loc[:, col].to_numpy() for col in (0, 1))

In [None]:
# Histogram of `size` with raw counts (density=False) in 15 bins.
fig, ax = plt.subplots()
ax.hist(size, bins=15, density=False, color='blue', label='files')
ax.set_title("realization size distribution", fontsize=18)
ax.set_xlabel("total counts per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the realization-size histogram from the previous cell.
fig.savefig("sizeDistr.pdf")

In [None]:
# Histogram of `diffwords` with raw counts (density=False) in 15 bins.
fig, ax = plt.subplots()
ax.hist(diffwords, bins=15, density=False, color='orange', label='files')
ax.set_title("vocabulary size distribution", fontsize=18)
ax.set_xlabel("#different words per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the vocabulary-size histogram from the previous cell.
fig.savefig("diffwordsDistr.pdf")

In [None]:
# Scatter of distinct words against realization size, with the mean number of
# distinct words per size bin drawn on top as horizontal red segments.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(size, diffwords, label='samples')
ax.set_xlabel("Realization size", fontsize=16)
ax.set_ylabel("#different words", fontsize=16)

# Bin edges: np.linspace's default of 50 equally spaced points on [0, 1.5e8].
# Points with size above 1.5e8 therefore do not enter the binned mean, even
# though the x-axis below extends to 2e8.
bin_means, bin_edges, binnumber = stats.binned_statistic(
    x=size,
    values=diffwords,
    statistic='mean',
    bins=np.linspace(0, 1.5e8),
)
# One segment per bin, spanning from the bin's left edge to its right edge.
ax.hlines(y=bin_means, xmin=bin_edges[:-1], xmax=bin_edges[1:],
          colors='r', lw=5, label='binned average')

# Axes are linear; log scaling was tried and left disabled.
ax.set_xlim(0, 2e8)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the size-vs-distinct-words scatter from the previous cell.
fig.savefig("heaps.pdf")

# Null model

In [None]:
# Read the comma-separated table "heaps_null.dat". The file has no header row,
# so pandas labels the columns with the integers 0, 1, ...
df_null = pd.read_csv("heaps_null.dat", header=None, sep=',')
# Show the first five rows as the cell output.
df_null.head(n=5)

In [None]:
# Same column layout as the first table: column 0 -> `size_null`,
# column 1 -> `diffwords_null`.
size_null, diffwords_null = (df_null.loc[:, col].to_numpy() for col in (0, 1))

In [None]:
# Number of rows in the null-model table (displayed as the cell output).
df_null.shape[0]

In [None]:
# Realization-size histogram of the files (filled, blue) with the null-model
# sample drawn over it as a red step outline; each call uses bins=15.
fig, ax = plt.subplots()
ax.hist(size, bins=15, density=False, color='blue', label='files')
ax.hist(size_null, bins=15, density=False, histtype='step', color='red',
        lw=2, label='sampling')
ax.set_title("realization size distribution", fontsize=18)
ax.set_xlabel("total counts per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the files-vs-sampling size histogram from the previous cell.
fig.savefig("sizeDistr_null.pdf")

In [None]:
# Vocabulary-size histogram of the files (filled, orange) with the null-model
# sample drawn over it as a red step outline; each call uses bins=25.
fig, ax = plt.subplots()
ax.hist(diffwords, bins=25, density=False, color='orange', label='files')
ax.hist(diffwords_null, bins=25, density=False, histtype='step', color='red',
        lw=2, label='sampling')
ax.set_title("vocabulary size distribution", fontsize=18)
ax.set_xlabel("#different words per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18, loc='upper left')
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the files-vs-sampling vocabulary histogram from the previous cell.
fig.savefig("diffwordsDistr_null.pdf")

In [None]:
# Scatter of distinct words against realization size for the files and for the
# null-model sample, with the per-bin mean of the files drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(size, diffwords, label='samples')
ax.scatter(size_null, diffwords_null, label='sampling')
ax.set_xlabel("Realization size", fontsize=16)
ax.set_ylabel("#different words", fontsize=16)

# Binned mean of the files only (the null sample is not binned here).
# Bin edges: np.linspace's default of 50 equally spaced points on [0, 1.5e8].
# `bin_means` is read again by the Taylor cell further down.
bin_means, bin_edges, binnumber = stats.binned_statistic(
    x=size,
    values=diffwords,
    statistic='mean',
    bins=np.linspace(0, 1.5e8),
)
# One segment per bin, spanning from the bin's left edge to its right edge.
ax.hlines(y=bin_means, xmin=bin_edges[:-1], xmax=bin_edges[1:],
          colors='r', lw=5, label='binned average')

# Axes are linear; log scaling was tried and left disabled.
ax.set_xlim(0, 2e8)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF. When the notebook is run
# top to bottom, that is the files-vs-sampling scatter from the previous cell.
fig.savefig("heaps_null.pdf")

## Taylor's law (variance vs. mean of the vocabulary size)

In [None]:
# Variance (np.var) of `diffwords` within each size bin. The bin edges are the
# same np.linspace(0, 1.5e8) grid used for the binned mean, so entry i of
# `bin_vars` and entry i of `bin_means` describe the same bin.
# Element 0 of the result is the per-bin statistic; edges and bin numbers are
# not needed here.
bin_vars = stats.binned_statistic(
    x=size,
    values=diffwords,
    statistic=np.var,
    bins=np.linspace(0, 1.5e8),
)[0]

In [None]:
# Per-bin variance of the vocabulary size (`bin_vars`) plotted against the
# per-bin mean (`bin_means`); the figure is shown and then written to
# "heapsTaylor.pdf".
fig = plt.figure(figsize=(15, 5))
plt.scatter(bin_means, bin_vars)

# Smallest and largest bin mean, ignoring NaN entries.
mean_lo, mean_hi = np.nanmin(bin_means), np.nanmax(bin_means)

# Unit-step grid over the observed means, kept for an optional reference
# curve such as plt.plot(x, x*x). It is no longer used for the axis limits,
# because it is empty when mean_lo == mean_hi and its last point falls short
# of mean_hi.
x = np.arange(mean_lo, mean_hi)

# Pad the data range: +/- 1e4 on the variance axis, +/- 5e2 on the mean axis.
plt.ylim((np.nanmin(bin_vars) - 1e4, np.nanmax(bin_vars) + 1e4))
plt.xlim((mean_lo - 5e2, mean_hi + 5e2))

plt.xlabel("$<h>$", fontsize=16)
# Raw string: "\s" is not a valid Python escape sequence, and a non-raw
# literal containing it triggers a warning on recent Python versions.
plt.ylabel(r"$\sigma_h^2$", fontsize=16)
plt.show()
fig.savefig("heapsTaylor.pdf")