Computes the frequency, and via frequency the probability, of all bigraphs that appear in monomorphemic German words.

In [1]:
import pandas as pd

In [2]:
def get_bigraph_freq(df, tokenbased=True):
    """
    Count the bigraphs (adjacent two-character sequences) found in the lemmas of df.

    Args:
        df: pandas dataframe with the column lemma and, when tokenbased=True, the
            column freq (subset of simplex_filtered3.csv). Any other columns,
            e.g. POS, are ignored.
        tokenbased: bool indicating whether each bigraph occurrence is weighted by
            the freq value of the lemma it comes from (default: True). If False,
            every occurrence counts as 1 and the freq column is not read.
    Returns:
        Pandas df with the columns bigraph, freq (summed count per bigraph), and
        propn (freq divided by the total freq of all bigraphs in that df), with one
        row per distinct bigraph, sorted by bigraph. If df yields no bigraphs at
        all, an empty df with these three columns is returned.
    """

    lemmas = df['lemma']
    # One weight per lemma: its token frequency, or a constant 1 for type-based counts.
    weights = df['freq'] if tokenbased else [1] * len(lemmas)

    records = []
    for lemma, weight in zip(lemmas, weights):
        # Missing lemmas (NaN/None, e.g. a cell that read_csv parsed as NA)
        # have no characters to pair up, so skip them instead of crashing.
        if pd.isna(lemma):
            continue

        # Pair each character with its successor to get the two-character strings.
        # Lemmas shorter than two characters contribute nothing.
        records.extend((first + second, weight) for first, second in zip(lemma, lemma[1:]))

    # Without any bigraph there is nothing to group; return an empty table
    # that still has the expected columns.
    if not records:
        return pd.DataFrame(columns=['bigraph', 'freq', 'propn'])

    # Sum the counts for each bigraph, then add each bigraph's share of the total.
    bigraphs_df = pd.DataFrame(records, columns=['bigraph', 'freq'])
    bigraphs_df = bigraphs_df.groupby(['bigraph']).sum().reset_index()
    bigraphs_df['propn'] = bigraphs_df['freq'] / bigraphs_df['freq'].sum()

    return bigraphs_df

In [2]:
# Load the filtered simplex lemma table and fold every lemma to lower case,
# so that bigraphs differing only in capitalisation are counted together.
simplex_df = pd.read_csv('outfiles/simplex_filtered3.csv').assign(
    lemma=lambda table: table['lemma'].str.lower()
)

# Token-based bigraph frequencies over all POSs, saved without the row index.
get_bigraph_freq(simplex_df).to_csv(
    'junc_data/junctures_tokenbased.csv',
    index=False,
)