The variables computed in this script:

**Frequency ratio:** Compute the base-derivation frequency ratio for each pair and then get the average for each lemma.
(If the derivation frequency in the column `lemma_freq` is 0, replace with 1, so that (a) the math doesn't break and (b) because the lemma is only in the sample because it did appear in DECOW16B, and the frequency counts diverge because they are from the smaller DECOW16A-NANO.)

**Semantic relatedness:** The idea here is to take the average of the semantic relatedness probabilities from DErivBase for each suffix.
To identify the base-derivation pairs that are given in DErivBase, we first extract and organise all the rows that contain derivations with the suffix we care about using `get_sfx_rows()`, and then we use the base candidates generated by `backformer_two` to select only those rows that contain actual bases for the given derivations.

**Junctural phonotactics:** The token-based probability of the juncture (bigraph) at the morpheme boundary in the derived words showing up in German simplexes.

**Entropy:** The dependent variable of the analysis, the measure of productivity. Larger values indicate a more evenly-spread-out distribution, which is a sign of a word formation pattern's productivity.

**Other properties of the sample:** Number of tokens (i.e., sample size), number of types.

In [1]:
import os
import pandas as pd
import backformer_two as b
from scipy.stats import entropy

# Read in the files we'll need.
RATIO_FILES = os.listdir('../../1_data/35_samples/6_backform_base_cutoff/')    # freq of bases and derivations
PROBS = pd.read_csv('DErivBase-v2.0-probabilities.txt', sep=' ', header=None)  # prob of sem relatedness
JUNC_PROBS = pd.read_csv('../simplexes/junctures_tokenbased.csv')    # probs of junctural bigraphs

# Extract the list of suffixes.
SFXS = [fn.split('_')[0] for fn in RATIO_FILES]

In [2]:
# Define some things that are needed for translating between my data's format and the suffixes in DErivBase,
# and for dealing with the data in DErivBase's format.

# We need a list that contains all the forms of each suffix that appear in the DErivBase data.
# All are the same as their values in SFXS, except heit -> heit/keit, eV -> e, eA -> e, and itaet -> ität.
SFX_LISTS = [['age'], ['ament'], ['and'], ['ant'], ['anz'], ['ateur'], ['ation'], ['ator'], ['atur'], ['e'], ['el'], 
             ['ement'], ['end'], ['ent'], ['enz'], ['er'], ['eur'], ['e'], ['heit', 'keit'], ['ie'], ['iker'], 
             ['ikum'], ['ik'], ['iment'], ['ismus'], ['ist'], ['ität'], ['iteur'], ['ition'], ['itur'], ['ium'], 
             ['ling'], ['nis'], ['schaft'], ['ung']]

# Also need a list that distinguishes the -eA and -eV derivations based on the POS of their base.
# Can leave the base POSs of all the other suffixes underspecified, since none of the others are syncretic.
BASE_POS = [None, None, None, None, None, None, None, None, None, 'A', None, None, None, None, None, None, None, 'V', 
            None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


def semrel_get_sfx_rows(prob_df, sfx_list, base_pos):
    """
    Subsets and rearranges the DErivBase probabilities data to contain all derivations with the given
    suffix in one column and all potential bases in another.
    
    Arg:
        prob_df: pandas df containing the DErivBase v2.0 probabilities
        sfx_list: list with string elements containign forms of the suffix
        base_pos: either None or string representing POS of base
    Returns:
        Pandas df, subset + rearrangement of prob_df.
    """
    assert isinstance(sfx_list, list), 'Suffixes must be passed in as lists, e.g. ["ung"]'
    
    sfx_regex = '|'.join([sfx+'_N' for sfx in sfx_list]) if len(sfx_list) > 1 else sfx_list[0]+'_N'
    
    # It's not the case in prob_df that all the derivs are in one col and all the bases are in the other.
    # So select from each col individually, reorder the df where deriv is in the second column, and concat.
    
    # If base_pos is None, then we can select all given bases.
    # If it's not None (i.e., for -e), then we need to make sure that the bases are labelled with the correct POS.
    if base_pos == None:
        sfx_in_col0 = prob_df[ prob_df[0].str.contains(sfx_regex, regex=True) ].rename(columns={0:'sfx', 1:'notsfx', 2:'prob'})
        sfx_in_col1 = prob_df[ prob_df[1].str.contains(sfx_regex, regex=True) ].rename(columns={0:'notsfx', 1:'sfx', 2:'prob'})
    else:
        sfx_in_col0 = prob_df[ (prob_df[0].str.contains(sfx_regex, regex=True)) & (prob_df[1].str.contains('_' + base_pos, regex=True)) ].rename(columns={0:'sfx', 1:'notsfx', 2:'prob'})
        sfx_in_col1 = prob_df[ (prob_df[1].str.contains(sfx_regex, regex=True)) & (prob_df[0].str.contains('_' + base_pos, regex=True)) ].rename(columns={0:'notsfx', 1:'sfx', 2:'prob'})
    sfx_df = pd.concat([sfx_in_col0, sfx_in_col1[['sfx', 'notsfx', 'prob']]])
    
    # Remove the rows where both columns contain sfx_regex, since these are pairs like Aufrichtigkeit/Unaufrichtigkeit.
    # It will be simpler if only the 'heit' column contains derivations, and we don't care about these prefixes right now 
    # anyway. Reset_index() needed for merging with 
    sfx_df = sfx_df[~sfx_df.notsfx.str.contains(sfx_regex, regex=True)].reset_index(drop=True)
    
    # Split deriv and other into separate columns for the word and the POS. 
    # The column containing the derivations must be called 'lemma' for Backformer to work.
    deriv_df = pd.DataFrame(sfx_df.sfx.str.split('_').tolist(), columns = ['lemma', 'lemma_pos'])
    other_df = pd.DataFrame(sfx_df.notsfx.str.split('_').tolist(), columns = ['other', 'other_pos'])
    sfx_df = pd.concat([sfx_df, deriv_df, other_df], axis=1).drop(columns=['sfx', 'notsfx'])
    
    return sfx_df

In [3]:
# Define list that will iteratively gain a dictionary with the data for each suffix in the for loop below.
VARS_LIST = []

for idx in range(len(SFXS)):
    curr_sfx = SFXS[idx]

    # ===== Frequency ratios =====
    
    ratio_df = pd.read_csv('../../1_data/35_samples/6_backform_base_cutoff/' + RATIO_FILES[idx])
    
    # We're only interested in the subset with true_base == 1.
    ratio_df = ratio_df[ratio_df.true_base == 1]
    
    # Compute freq ratio, replacing a lemma_freq of 0 with 1 to avoid divide-by-zero issues. 
    ratio_df['freq_ratio'] = pd.np.where(ratio_df.lemma_freq == 0,
                                       ratio_df.base_freq,  # base_freq/1 = base_freq
                                       ratio_df.base_freq / ratio_df.lemma_freq)
    ratio_df['log_freq_ratio'] = pd.np.log(ratio_df.freq_ratio)
    
    
    # ===== Semantic relatedness =====
    
    sfx_list = SFX_LISTS[idx]
    base_pos = BASE_POS[idx]
    bf_sfx = '-e' if curr_sfx in ['-eA', '-eV'] else curr_sfx  # The form required for backformer.
    
    # Get all probabilities of relatedness for the current suffix.
    curr_probs = semrel_get_sfx_rows(PROBS, sfx_list, base_pos)
    
    # Get the base candidates from Backformer.
    base_cands = b.get_bases_no_cleanup(curr_probs, bf_sfx)
    
    # We want to see if any of the values in columns beginning with 'base_cand' are the same as the value in 'other'.
    # Use df.values to get an array of the values in each row of base_cand cols, zip tog with value of 'other', and compare.
    base_cand_colnames = base_cands.columns[base_cands.columns.str.startswith('base_cand')]
    base_cands['other_in_cand'] = [x[0] in x[1] for x in zip(base_cands['other'], base_cands[base_cand_colnames].values)]
    
    # Only select those pairs whose bases are among the candidates. This allows one derivative to appear with multiple
    # bases, but that's OK, since sometimes it's not so clear which of clearly related bases is The True one (maybe there
    # is no true one).
    bases_df = base_cands[base_cands['other_in_cand']]
    bases_df = bases_df.drop(columns = base_cand_colnames).drop(columns=['other_in_cand']).reset_index(drop=True)
    
    # ===== Junctural probabilities =====
    
    len_sfx = len(curr_sfx) - 1
    junc_df = pd.read_csv('../../1_data/35_samples/7_analysis_samples/' + curr_sfx + '_sample.csv')
        
    # Identify the bigraph that spans the juncture for each lemma.
    junc_df['bigraph'] = [x[-(len_sfx+1):-(len_sfx-1)] for x in junc_df.lemma]
    
    # Merge with JUNC_PROBS, and give probabilty of 0 to any junctures that don't appear in simplexes at all.
    junc_df = pd.merge(junc_df, JUNC_PROBS[['bigraph', 'propn']], how='left', on='bigraph')
    junc_df['junc_prob'] = junc_df['propn'].fillna(0)
    junc_df = junc_df.drop(columns=['propn']).rename(columns={'bigraph':'juncture'})
    
    # ===== Shannon entropy =====

    sample_df = pd.read_csv('../../1_data/35_samples/7_analysis_samples/' + curr_sfx + '_sample.csv')
    ent = entropy(sample_df.lemma.value_counts().values, base = 2)
    
    # ===== Putting it all together =====
    
    VARS_LIST.append({'sfx': curr_sfx, 
                      'mean_freq_ratio': ratio_df.freq_ratio.mean(), 
                      'mean_log_freq_ratio': ratio_df.log_freq_ratio.mean(),
                      'semrel_prob': bases_df.prob.mean(),
                      'mean_junc_prob': junc_df.junc_prob.mean(),
                      'n_tokens': len(sample_df), 
                      'n_types': len(sample_df.lemma.unique()),
                      'entropy': ent
                     })

VARS_DF = pd.DataFrame(VARS_LIST)
VARS_DF

Unnamed: 0,sfx,mean_freq_ratio,mean_log_freq_ratio,semrel_prob,mean_junc_prob,n_tokens,n_types,entropy
0,-age,240.157098,0.901831,0.748042,0.001762,1403,14,1.901844
1,-ament,0.059939,-4.03237,0.69349,0.006011,4452,3,1.147413
2,-and,0.947942,-0.811246,0.833429,0.002903,37,4,1.283551
3,-ant,73.311989,1.715324,0.804665,0.003625,8137,66,4.044952
4,-anz,42.976578,0.789614,0.857974,0.003383,5300,16,2.313807
5,-ateur,877.290151,4.903979,0.864684,0.003089,13671,71,2.905207
6,-ation,2.693713,-1.218215,0.935052,0.002737,15824,221,5.371834
7,-ator,84.806951,1.72735,0.800832,0.002619,16585,119,4.792289
8,-atur,66.806245,1.324651,0.768693,0.003395,4826,16,2.392126
9,-eA,34.653455,1.799621,0.910419,0.011912,754,33,4.241452


In [4]:
VARS_DF.to_csv('sfx_data.csv', index=False)