Bootstraps samples from the full samples of 35 German suffixes from DECOW16B (synchronic) and RIDGES (diachronic).
Computes their entropy and frequency distributions. Saves outfiles in `iterdata/`.

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
import os

# Input locations, relative to the directory this notebook is run from.
PATH_TO_COW_SAMPLES = '../1_data/35_samples/7_analysis_samples/'
PATH_TO_RIDGES_SAMPLES = '../1_data/ridges_samples/'

# DECOW suffix names are derived from the sample file names ('<sfx>_sample.csv').
# Only files that match this pattern are used, so stray directory entries
# (e.g. '.DS_Store', '.ipynb_checkpoints') do not turn into bogus suffixes.
# The list is sorted because os.listdir returns entries in arbitrary order.
COW_SAMPLE_ENDING = '_sample.csv'
SFXS = sorted(
    fn[:-len(COW_SAMPLE_ENDING)]
    for fn in os.listdir(PATH_TO_COW_SAMPLES)
    if fn.endswith(COW_SAMPLE_ENDING)
)

# RIDGES suffixes (one '<sfx>.csv' file each) and the periods to sample per suffix.
R_SFXS = ['er', 'heit', 'ung']
R_PERS = ['1482-1549', '1550-1649', '1650-1749', '1750-1849', '1850-1914']

# Number of bootstrap iterations per sample and subsample size.
NUM_ITER = 100
# Subsample sizes as fractions of the full sample: 1, 1/2, 1/4, 1/8, 1/16.
SIZE_FACTORS = [2**-idx for idx in range(5)]

# Whether subsamples are drawn with replacement.
W_REPL = True

# Seed NumPy's global RNG (used by np.random.choice below) so that
# Restart Kernel -> Run All reproduces the same subsamples.
SEED = 42
np.random.seed(SEED)

# DECOW data

In [4]:
# Bootstrap loop for the DECOW samples.
# For every suffix, NUM_ITER times, a subsample is drawn at each size in SIZE_FACTORS;
# its type frequency distribution and Shannon entropy (base 2) are collected and
# written to CSV at the end of the cell.

# Output names carry the '_wrepl' marker only when sampling with replacement, so a
# run with W_REPL = False does not overwrite the with-replacement files.
out_dir = 'iterdata'
out_suffix = '_wrepl' if W_REPL else ''
# Create the output directory up front rather than failing after the long loop.
os.makedirs(out_dir, exist_ok=True)

FREQDIST_LIST = []
ENTROPY_LIST = []

for sfx in SFXS:
    print('\n'+sfx, '='*30)

    # Read in sample.
    curr_sfx_df = pd.read_csv(PATH_TO_COW_SAMPLES + sfx + '_sample.csv')

    # Subsample sizes for the current sample, one per entry in SIZE_FACTORS
    # (the full size, then successively halved), rounded up to whole tokens.
    sizes = [int(np.ceil(len(curr_sfx_df) * factor)) for factor in SIZE_FACTORS]

    for iter_idx in range(1, NUM_ITER+1):

        for size, factor in zip(sizes, SIZE_FACTORS):

            # Draw 'size' random row positions from curr_sfx_df.
            # Positions (not index labels) are drawn because they are consumed by .iloc.
            rd_idcs = np.random.choice(len(curr_sfx_df), size = size, replace = W_REPL)
            curr_subset = curr_sfx_df.iloc[rd_idcs]

            # Compute the frequency distribution of the types in this subsample.
            # The frame is built explicitly from the value_counts result because the
            # column names produced by value_counts().reset_index() differ between
            # pandas versions (pandas >= 2.0 yields 'lemma'/'count').
            counts = curr_subset['lemma'].value_counts()
            freq_df = pd.DataFrame({'type': counts.index, 'n_tokens': counts.values})
            # value_counts sorts by descending frequency, so row order is the rank.
            freq_df['rank'] = list(range(1, len(freq_df)+1))
            freq_df['iter'] = iter_idx
            freq_df['sfx'] = sfx
            freq_df['sample_size'] = size
            freq_df['factor'] = factor

            # Shannon entropy (in bits) of the token counts; scipy normalises them to probabilities.
            ent = entropy(freq_df['n_tokens'], base = 2)

            # Append this information to FREQDIST_LIST and ENTROPY_LIST for export.
            FREQDIST_LIST.append(freq_df)
            ENTROPY_LIST.append( {'iter':iter_idx, 'sfx':sfx, 'sample_size':size, 'factor':factor, 'entropy':ent, 'n_types':len(freq_df)} )

        if iter_idx % 20 == 0:
            print('Done iteration', iter_idx)

# Use pd.concat on FREQDIST_LIST, since it's a list of dfs.
pd.concat(FREQDIST_LIST)[['sfx','iter','factor','sample_size','rank','type','n_tokens']].to_csv(os.path.join(out_dir, 'freqdist_iter' + out_suffix + '.csv'), index=False)

# Can just use pd.DataFrame on ENTROPY_LIST, since it's a list of dicts.
pd.DataFrame(ENTROPY_LIST)[['sfx','iter','factor','sample_size','n_types','entropy']].to_csv(os.path.join(out_dir, 'entropy_iter' + out_suffix + '.csv'), index=False)


Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done iteration 100

Done iteration 20
Done iteration 40
Done iteration 60
Done iteration 80
Done it

# RIDGES data

In [5]:
# Bootstrap loop for the RIDGES samples.
# Same procedure as for DECOW, but run separately for every period in R_PERS
# within each suffix in R_SFXS.

# Output names carry the '_wrepl' marker only when sampling with replacement, so a
# run with W_REPL = False does not overwrite the with-replacement files.
out_dir = 'iterdata'
out_suffix = '_wrepl' if W_REPL else ''
# Create the output directory up front rather than failing after the long loop.
os.makedirs(out_dir, exist_ok=True)

R_FREQDIST_LIST = []
R_ENTROPY_LIST = []

for sfx in R_SFXS:
    print('\n'+sfx, '='*30)

    # Read in the type frequency table for this suffix.
    curr_freq_table = pd.read_csv(PATH_TO_RIDGES_SAMPLES + sfx + '.csv')

    # First we have to transform the RIDGES data into the format we expect:
    # rather than a type frequency distribution, a sample in which each type
    # is actually contained the given number of times.
    perlist = curr_freq_table.period.repeat(curr_freq_table.frequency)
    lemlist = curr_freq_table.lemma.repeat(curr_freq_table.frequency)
    curr_sfx_df = pd.DataFrame({'period': perlist, 'lemma': lemlist})

    for per in R_PERS:

        curr_per_df = curr_sfx_df[curr_sfx_df.period == per].reset_index(drop=True)

        # Nothing to resample if this suffix has no tokens in this period.
        if curr_per_df.empty:
            print('Skipping', per, '- no tokens for', sfx)
            continue

        # Subsample sizes for the current sample, one per entry in SIZE_FACTORS
        # (the full size, then successively halved), rounded up to whole tokens.
        sizes = [int(np.ceil(len(curr_per_df) * factor)) for factor in SIZE_FACTORS]

        for iter_idx in range(1, NUM_ITER+1):

            for size, factor in zip(sizes, SIZE_FACTORS):

                # Draw 'size' random row positions from curr_per_df.
                # Positions (not index labels) are drawn because they are consumed by .iloc.
                rd_idcs = np.random.choice(len(curr_per_df), size = size, replace = W_REPL)
                curr_subset = curr_per_df.iloc[rd_idcs]

                # Compute the frequency distribution of the types in this subsample.
                # The frame is built explicitly from the value_counts result because the
                # column names produced by value_counts().reset_index() differ between
                # pandas versions (pandas >= 2.0 yields 'lemma'/'count').
                counts = curr_subset['lemma'].value_counts()
                freq_df = pd.DataFrame({'type': counts.index, 'n_tokens': counts.values})
                # value_counts sorts by descending frequency, so row order is the rank.
                freq_df['rank'] = list(range(1, len(freq_df)+1))
                freq_df['iter'] = iter_idx
                freq_df['sfx'] = sfx
                freq_df['sample_size'] = size
                freq_df['period'] = per
                freq_df['factor'] = factor

                # Shannon entropy (in bits) of the token counts; scipy normalises them to probabilities.
                ent = entropy(freq_df['n_tokens'], base = 2)

                # Append this information to R_FREQDIST_LIST and R_ENTROPY_LIST for export.
                R_FREQDIST_LIST.append(freq_df)
                R_ENTROPY_LIST.append( {'iter':iter_idx, 'sfx':sfx, 'sample_size':size, 'factor':factor, 'entropy':ent, 'period':per, 'n_types':len(freq_df)} )

            if iter_idx % 20 == 0:
                print('Done', per, '- iteration', iter_idx)

# Use pd.concat on R_FREQDIST_LIST, since it's a list of dfs.
pd.concat(R_FREQDIST_LIST)[['sfx','iter','period','factor','sample_size','rank','type','n_tokens']].to_csv(os.path.join(out_dir, 'ridges_freqdist_iter' + out_suffix + '.csv'), index=False)

# Can just use pd.DataFrame on R_ENTROPY_LIST, since it's a list of dicts.
pd.DataFrame(R_ENTROPY_LIST)[['sfx','iter','period','factor','sample_size','n_types','entropy']].to_csv(os.path.join(out_dir, 'ridges_entropy_iter' + out_suffix + '.csv'), index=False)


Done 1482-1549 - iteration 20
Done 1482-1549 - iteration 40
Done 1482-1549 - iteration 60
Done 1482-1549 - iteration 80
Done 1482-1549 - iteration 100
Done 1550-1649 - iteration 20
Done 1550-1649 - iteration 40
Done 1550-1649 - iteration 60
Done 1550-1649 - iteration 80
Done 1550-1649 - iteration 100
Done 1650-1749 - iteration 20
Done 1650-1749 - iteration 40
Done 1650-1749 - iteration 60
Done 1650-1749 - iteration 80
Done 1650-1749 - iteration 100
Done 1750-1849 - iteration 20
Done 1750-1849 - iteration 40
Done 1750-1849 - iteration 60
Done 1750-1849 - iteration 80
Done 1750-1849 - iteration 100
Done 1850-1914 - iteration 20
Done 1850-1914 - iteration 40
Done 1850-1914 - iteration 60
Done 1850-1914 - iteration 80
Done 1850-1914 - iteration 100

Done 1482-1549 - iteration 20
Done 1482-1549 - iteration 40
Done 1482-1549 - iteration 60
Done 1482-1549 - iteration 80
Done 1482-1549 - iteration 100
Done 1550-1649 - iteration 20
Done 1550-1649 - iteration 40
Done 1550-1649 - iteration 60
Do

Files in `iterdata/` whose names do not end in `_wrepl` were also generated with this script, but with `W_REPL = False`.