In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
# Load the comma-separated table. header=None means the first line of the file
# is read as data and the columns are labelled with the integers 0, 1, ...
df = pd.read_csv("heaps.dat", header=None, sep=",")
df.head(n=5)

In [None]:
# Column 0 of the full table as a float array, followed by a quick row count.
# Note: `size` is assigned again (without the row labelled 0) in the next cell.
size = np.array(df[0], dtype=float)
print(size.shape[0])

In [None]:
# Per-file series used by the plots below: column 0 is plotted as
# "total counts per file", column 1 as "#different words per file".
# .loc slices by label, so both series start at the row labelled 1.
# NOTE(review): why the row labelled 0 is left out is not visible here — confirm.
size = np.array(list(df.loc[1:, 0].values))
diffwords = np.array(list(df.loc[1:, 1].values))

In [None]:
# Histogram of the per-file total counts: 15 bins, raw counts (not densities).
fig, ax = plt.subplots()
ax.hist(size, bins=15, density=False, color='blue', label='data')
ax.set_title("realization size distribution", fontsize=18)
ax.set_xlabel("total counts per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="sizeDistr.pdf")

In [None]:
# Histogram of the number of distinct words per file: 15 bins, raw counts.
fig, ax = plt.subplots()
ax.hist(diffwords, bins=15, density=False, color='orange', label='data')
ax.set_title("vocabulary size distribution", fontsize=18)
ax.set_xlabel("#different words per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="diffwordsDistr.pdf")

In [None]:
# Mean number of distinct words inside each of 20 bins of realization size.
bin_means, bin_edges, binnumber = stats.binned_statistic(
    size, diffwords, statistic='mean', bins=20
)

# Scatter of vocabulary size against realization size, with the binned mean
# drawn as one horizontal red segment per bin (left edge -> right edge).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(size, diffwords, label='data')
ax.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='r', lw=5, label='binned average')
ax.set_xlabel("Realization size", fontsize=16)
ax.set_ylabel("#different words", fontsize=16)
# Optional log-log view (disabled):
# ax.set_xscale('log')
# ax.set_yscale('log')
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="heaps.pdf")

# Null model

In [None]:
# Load the null-model table; as with the data file, header=None makes the
# first line a data row and labels the columns 0, 1, ...
df_null = pd.read_csv("heaps_null.dat", header=None, sep=",")
df_null.head(n=5)

In [None]:
# Null-model counterparts of `size` / `diffwords`: columns 0 and 1 starting at
# the row labelled 1 (.loc slices by label).
# NOTE(review): why the row labelled 0 is left out is not visible here — confirm.
size_null = np.array(list(df_null.loc[1:, 0].values))
diffwords_null = np.array(list(df_null.loc[1:, 1].values))

In [None]:
# Overlay of the realization-size histograms for data (filled) and null model
# (red outline). Both use 15 bins over the same fixed range (0, 600000), so
# the bin edges coincide; values outside that range are not counted.
fig, ax = plt.subplots()
ax.hist(size, bins=15, range=(0,600000), density=False, color='blue', label='data')
ax.hist(size_null, bins=15, range=(0,600000), density=False,
        histtype='step', color='red', lw=2, label='null')
ax.set_title("realization size distribution", fontsize=18)
ax.set_xlabel("total counts per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="sizeDistr_null.pdf")

In [None]:
# Overlay of the vocabulary-size histograms for data (filled) and null model
# (red outline).
fig = plt.figure()
# Both histograms must share the same bin edges for their counts to be
# comparable bin by bin. Passing bins=25 to each call separately would derive
# the edges from each sample's own min/max, so compute 25 common bins over the
# combined range instead.
shared_edges = np.histogram_bin_edges(np.concatenate([diffwords, diffwords_null]), bins=25)
plt.hist(diffwords, density=False, color='orange', bins=shared_edges, label='data')
plt.hist(diffwords_null, density=False, histtype='step', color='red', lw=2, bins=shared_edges, label='null')
plt.title("vocabulary size distribution", fontsize=18)
plt.xlabel("#different words per file", fontsize=16)
plt.ylabel("#", fontsize=16)
plt.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="diffwordsDistr_null.pdf")

In [None]:
# Binned mean of the *data* only (20 bins of realization size); the null model
# is shown as a scatter without its own binned average.
bin_means, bin_edges, binnumber = stats.binned_statistic(
    size, diffwords, statistic='mean', bins=20
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(size, diffwords, label='data')
ax.scatter(size_null, diffwords_null, label='null')
# One horizontal red segment per bin, spanning left edge -> right edge.
ax.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='r', lw=5, label='binned average')
ax.set_xlabel("Realization size", fontsize=16)
ax.set_ylabel("#different words", fontsize=16)
# Binned average of the null model (disabled):
# bin_means_null, bin_edges_null, binnumber_null = stats.binned_statistic(size_null, diffwords_null, statistic='mean', bins=20)
# ax.hlines(bin_means_null, bin_edges_null[:-1], bin_edges_null[1:], colors='r', lw=5, label='binned average')
# Optional log-log view (disabled):
# ax.set_xscale('log')
# ax.set_yscale('log')
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="heaps_null.pdf")

### Note
`vocabulary_size_null.dat` **must** be sorted.

## Null and full data

In [None]:
# Load the "lin" table; header=None makes the first line a data row and labels
# the columns 0, 1, ...
df_lin = pd.read_csv("heaps_lin.dat", header=None, sep=",")
df_lin.head(n=5)

In [None]:
# size_lin:      column 0 without its first entry, keeping only values >= 0.
# diffwords_lin: column 1 without its last entry,  keeping only values >= 0.
# NOTE(review): the two columns are sliced with a one-row offset and filtered
# independently, so element i of one array is not guaranteed to belong with
# element i of the other (nor are the lengths guaranteed to match) when they
# are later plotted against each other — confirm this is intended.
size_lin = np.array([v for v in df_lin.loc[:, 0].values[1:] if v >= 0])
diffwords_lin = np.array([v for v in df_lin.loc[:, 1].values[:-1] if v >= 0])

In [None]:
# Histogram settings: 15 bins over 0..600000, widened on each side by
# 0.5/600000; values outside this range are not counted.
bins = 15
rang = (0 - 0.5/600000, 600000 + 0.5/600000)

# Outline histogram of the "lin" realization sizes (raw counts).
fig, ax = plt.subplots()
ax.hist(size_lin, bins=bins, range=rang, density=False,
        histtype='step', color='green', lw=2, label='lin')
ax.set_title("realization size distribution", fontsize=18)
ax.set_xlabel("total counts per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="sizeDistr_lin.pdf")

In [None]:
# Histogram of the "lin" vocabulary sizes: 15 bins, raw counts.
fig, ax = plt.subplots()
ax.hist(diffwords_lin, bins=15, density=False, color='green', label='lin')
ax.set_title("vocabulary size distribution", fontsize=18)
ax.set_xlabel("#different words per file", fontsize=16)
ax.set_ylabel("#", fontsize=16)
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="diffwordsDistr_lin.pdf")

In [None]:
# Binned mean of the data (20 bins of realization size). It is computed here
# but not drawn: the hlines calls below are disabled.
bin_means, bin_edges, binnumber = stats.binned_statistic(
    size, diffwords, statistic='mean', bins=20
)

# Vocabulary size against realization size for all three sets on one axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(size, diffwords, label='data')
ax.scatter(size_null, diffwords_null, label='null')
ax.scatter(size_lin, diffwords_lin, label='lin')
ax.set_xlabel("Realization size", fontsize=16)
ax.set_ylabel("#different words", fontsize=16)
# Binned averages (disabled):
# ax.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='r', lw=5, label='binned average')
# bin_means_null, bin_edges_null, binnumber_null = stats.binned_statistic(size_null, diffwords_null, statistic='mean', bins=20)
# ax.hlines(bin_means_null, bin_edges_null[:-1], bin_edges_null[1:], colors='r', lw=5, label='binned average')
# Optional log-log view (disabled):
# ax.set_xscale('log')
# ax.set_yscale('log')
ax.legend(fontsize=18)
plt.show()

In [None]:
# Export the figure currently bound to the notebook-global `fig`.
fig.savefig(fname="heaps_all.pdf")

In [None]:
# 50 evenly spaced values from 0 to 900000, both endpoints included.
lin = np.linspace(start=0, stop=900000, num=50)

In [None]:
# Wrap the grid of sizes in a single-column DataFrame and preview it.
voclin = pd.DataFrame(lin)
voclin.head(n=5)

In [None]:
# Write the size grid as plain text: one value per line, no header row and no
# index column. `header` and `index` are boolean options, so pass False
# explicitly rather than relying on None being treated as falsy.
voclin.to_csv("vocabulary_size_lin.dat", header=False, index=False)