Bootstraps samples from the large samples of *-heit*, *-nis*, and *-schaft*.

(This code is clunky, and the helper functions are largely unnecessary.
I rewrote this script for bootstrapping the later samples; that sleeker code can be found at `../4_applicability/gen_bootstrap_samples.ipynb`.)

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
import math

In [2]:
# The three suffixes under study; also used as the outer keys when the
# per-suffix samples are stacked below.
SFXS = ['heit', 'nis', 'schaft']

# Load the large sample for each suffix.
heit = pd.read_csv('../1_data/large_samples/heit.csv')
nis = pd.read_csv('../1_data/large_samples/nis.csv')
schaft = pd.read_csv('../1_data/large_samples/schaft.csv')

# Stack the three samples into one frame. The concat keys become index level 0
# (renamed to 'sfx') and each row's index within its own sample becomes
# index level 1 (renamed to 'orig_idx').
all_sfxs_df = (
    pd.concat([heit, nis, schaft], keys=SFXS)
    .reset_index()
    .rename(columns={'level_0': 'sfx', 'level_1': 'orig_idx'})
)

# Sampling mode for the bootstrap: True = with replacement, False = without.
W_REPL = True

In [4]:
# Report how many tokens (rows) each large sample contains.
for label, sample in (
    ('Number of -heit tokens:\t\t', heit),
    ('Number of -nis tokens:\t\t', nis),
    ('Number of -schaft tokens:\t', schaft),
):
    print(label, len(sample))

Number of -heit tokens:		 1706488
Number of -nis tokens:		 589279
Number of -schaft tokens:	 795870


In [3]:
def draw_random_sample(df, size, replace=None):
    """
    Draws a random sample of rows from the given df.

    Args:
        df: pandas df containing corpus query results
        size: integer, size of desired sample
        replace: whether to sample with replacement; if None (the default),
            the module-level flag W_REPL is used
    Returns:
        pandas df with 'size' rows and a fresh 0..size-1 index. The index
        the sampled rows had in 'df' is kept as a column; for an unnamed
        index that column is called 'orig_idx'.
    """
    if replace is None:
        replace = W_REPL

    # Draw row *positions*, not index labels: the rows are looked up with
    # .iloc below, which is positional, so this also works when df does not
    # have a default 0..n-1 index.
    positions = np.random.choice(len(df), size=size, replace=replace)
    return df.iloc[positions].reset_index().rename(columns={'index': 'orig_idx'})


def get_sample_freqdist(df):
    """
    Counts the number of tokens of each type in the given sample.

    Args:
        df: pandas df containing corpus query results, with types
            in column 'lemma'
    Returns:
        pandas df with columns type, n_tokens, rank, one row per type,
        ordered from most to least frequent; rank is the 1-based row
        position in that ordering
    """

    counts = df['lemma'].value_counts()

    # Build the frame from the index and values of the counts directly rather
    # than via reset_index().rename(...): the column names that reset_index()
    # yields for a value_counts() result differ between pandas versions.
    freqdist = pd.DataFrame({
        'type': counts.index.to_numpy(),
        'n_tokens': counts.to_numpy(),
    })
    freqdist['rank'] = range(1, len(freqdist) + 1)
    return freqdist


def get_sample_entropies(freqdist, base=2):
    """
    Calculates the entropy and scaled entropy of a frequency distribution.

    Arg:
        freqdist: pandas df, output of get_sample_freqdist()
        base: base of the logarithm to use in the computation, default 2
            (entropy in bits); None means the natural logarithm
    Returns:
        A tuple of floats: raw entropy in the given base, and the entropy
        scaled to [0,1]. With fewer than two types the scaling is undefined
        and the scaled entropy is returned as 0.0.
    """

    ent = entropy(freqdist['n_tokens'], base=base)

    # With fewer than two categories there is no maximum entropy to divide by
    # (log(1) == 0, log(0) is undefined).
    n_types = len(freqdist)
    if n_types < 2:
        return ent, 0.0

    # Scale by the max possible entropy for this number of categories, i.e.
    # the log of the number of categories, taken in the SAME base as 'ent'.
    max_ent = math.log(n_types) if base is None else math.log(n_types, base)
    return ent, ent / max_ent


def get_sample_hapaxes(freqdist):
    """
    Counts the hapaxes (types with exactly one token) in a frequency distribution.

    Arg:
        freqdist: pandas df, output of get_sample_freqdist()
    Returns:
        A tuple of numbers: number of hapaxes (int), proportion
        hapaxes/types (float). For an empty freqdist the proportion is
        returned as 0.0.
    """
    num_types = len(freqdist)
    num_hapaxes = int((freqdist['n_tokens'] == 1).sum())

    # No types at all: avoid dividing by zero.
    if num_types == 0:
        return num_hapaxes, 0.0

    propn_hapaxes = num_hapaxes / num_types
    return num_hapaxes, propn_hapaxes


# samp = draw_random_sample(nis, 100)
# samp_fd = get_sample_freqdist(samp)
# get_sample_entropies(samp_fd)
# get_sample_hapaxes(samp_fd)

Now we set a range of sample sizes we want to generate and the number of iterations we want (i.e., number of times we want to generate a sample of each size).

In [4]:
# Number of samples to draw per (suffix, sample size) combination.
NUM_ITER = 500

# Sample sizes to draw, in 1-2-5 steps per decade.
SIZES = [
    10, 20, 50,
    100, 200, 500,
    1_000, 2_000, 5_000,
    10_000, 20_000, 50_000,
    100_000, 200_000, 500_000,
]

# Collects one frequency-distribution df per generated sample.
FREQDIST_LIST = []

In [None]:
import os

# Derive the output file name from the run settings, so that a run with a
# different NUM_ITER or W_REPL cannot overwrite, or be mislabelled as, the
# results of another run. With NUM_ITER = 500 and W_REPL = True this is
# 'iterdata/freqdist_iter_500_wrepl.csv'.
out_path = 'iterdata/freqdist_iter_{}{}.csv'.format(NUM_ITER, '_wrepl' if W_REPL else '')

# Make sure the output directory exists *before* the long-running loop, so
# that a missing directory does not cost the whole run at the final write.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

for curr_sfx in SFXS:
    
    print('\nSUFFIX:', curr_sfx)

    # Subset full df to get only the data for the current suffix.
    curr_sfx_df = all_sfxs_df[all_sfxs_df.sfx == curr_sfx].reset_index(drop=True)

    for iter_idx in range(1, NUM_ITER+1):
        
        for size in SIZES:
            
            # Draw a random sample from curr_sfx_df of size 'size'.
            curr_rd_samp = draw_random_sample(curr_sfx_df, size)
            
            # Compute the sample's frequency distribution (one row per type).
            curr_freqdist = get_sample_freqdist(curr_rd_samp)
            
            # Tag the freqdist with the suffix, iteration and sample size it
            # belongs to, fix the column order, and append to FREQDIST_LIST.
            curr_freqdist['iter'] = iter_idx
            curr_freqdist['suffix'] = curr_sfx
            curr_freqdist['sample_size'] = size
            curr_freqdist = curr_freqdist[['suffix', 'iter', 'sample_size', 'type', 'n_tokens', 'rank']]
            FREQDIST_LIST.append(curr_freqdist)
        
        # Progress report every 50 iterations.
        if iter_idx % 50 == 0:
            print('  Done iter', iter_idx)
    
# Use pd.concat on FREQDIST_LIST, since it's a list of dfs.
pd.concat(FREQDIST_LIST).to_csv(out_path, index=False)

# (The file `iterdata/freqdist_iter_500.csv` is the output of this same code with W_REPL = False.)