In [1]:
import html
import re

import pandas as pd

In [2]:
def get_ngram_tokens(tokens, n=1):
    '''Create a list of ngram token sequences from a list of single tokens
    
    Args:
       tokens -- a list of tokens
       n -- the number of tokens to group by (default n=1); must be >= 1
       
    Return:
       a list of ngram tokens, each one being n consecutive tokens joined by
       a single space. For n=1 the input list itself is returned unchanged.
       The list is empty when there are fewer than n tokens.

    Raises:
       ValueError -- if n is smaller than 1
    '''
    # n <= 0 used to slip through and yield meaningless output (e.g. [''])
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))

    if n == 1:
        return tokens

    # the last complete ngram starts at index len(tokens) - n;
    # range() is empty when the text is shorter than n
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [3]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''Normalize a text string and split it into whitespace-delimited tokens
    
    Args:
        text        -- the string to tokenize
        lowercase   -- when True the text is lowercased before splitting
        strip_chars -- characters that are deleted from the text before splitting
        
    Returns:
        a list of tokens
    '''
    normalized = text.lower() if lowercase else text

    # translation table that maps every character of strip_chars to "delete"
    deletions = str.maketrans('', '', strip_chars)

    return normalized.translate(deletions).split()

In [4]:
def make_kwic(kw, text, win=4):
    '''Build keyword-in-context (KWIC) lines for a tokenized text
    
    Args:
        kw   -- keyword; every token equal to this string produces one line
        text -- the text as a list of tokens
        win  -- number of context tokens kept on each side (default win=4)
        
    Return:
        list of lines of form [ [left context words], kw, [right context words]];
        a context shorter than win is padded with empty strings (at the start
        of the left context, at the end of the right context)
    '''
    lines = []
    for idx, token in enumerate(text):
        if token == kw:
            before = text[max(0, idx - win):idx]
            # slicing past the end of the list is clamped automatically
            after = text[idx + 1:idx + win + 1]

            # a non-positive repeat count produces no padding at all
            before = [''] * (win - len(before)) + before
            after = after + [''] * (win - len(after))

            lines.append([before, token, after])

    return lines

In [5]:
def print_kwic(kwic, win=None):
    '''A basic print function for a KWIC object
    
    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- if None then use all words provided in context (taken from the
                length of the first line's left context) otherwise limit the
                context shown on each side to win words
        
    Prints one line per KWIC entry. The left context is right-aligned to the
    width of the longest left context that is actually displayed, so the
    keywords line up in a column. Nothing is printed for an empty kwic.
    '''
    
    if not kwic:
        return
    
    if win is None:
        win = len(kwic[0][0])

    rows = []
    for line in kwic:
        # win <= 0 is handled explicitly: line[0][-0:] would select the WHOLE
        # left context instead of nothing
        left_str = ' '.join(line[0][-win:]) if win > 0 else ''
        right_str = ' '.join(line[2][:win]) if win > 0 else ''
        rows.append((left_str, line[1], right_str))

    # measure the truncated left contexts, not the full ones, so that a small
    # win does not leave a wide band of unused padding
    max_left = max(len(left_str) for left_str, _, _ in rows)
    
    for left_str, kw, right_str in rows:
        print("{: >{}}  {}  {}".format(left_str, max_left, kw, right_str))

In [6]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments 
    
    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- one positional argument or a list/tuple of them, each of the
                  form side-pos, e.g. L1, R3, L4 (default: None). L1 is the
                  token directly left of the keyword, R1 the one directly right.
    
    Returns:
        kwic sorted for each positional argument in reverse, i.e. ['R1','L1'] sorts first by L1 and then R1.
        The list is sorted in place and the same list object is returned.
        Terms that are malformed or point outside the context window of the
        first line are skipped. The order argument itself is never modified.
    '''
    if order is None or not kwic:
        return kwic
   
    # context widths are taken from the first line
    left_win = len(kwic[0][0])
    right_win = len(kwic[0][2])

    # work on a copy: reversing the caller's list in place would change the
    # result of a second call made with the same list
    terms = [order] if isinstance(order, str) else list(order)
    
    for sort_term in reversed(terms):
        parsed = re.fullmatch(r'([LR])(\d+)', sort_term)
        if parsed is None:
            continue

        side, pos = parsed.group(1), int(parsed.group(2))
        if not 1 <= pos <= (left_win if side == 'L' else right_win):
            continue

        if side == 'L':
            # left context is stored in reading order, so L1 is its LAST element
            part, idx = 0, left_win - pos
        else:
            part, idx = 2, pos - 1

        # list.sort is stable, so earlier passes survive as tie-breakers
        kwic.sort(key=lambda line, part=part, idx=idx: line[part][idx])
    return kwic

In [7]:
from IPython.display import display, HTML
import math

In [8]:
def plot_keyitems(df, num=10, c1='red', c2='blue', corpusA='corpus A', corpusB='corpus B'):
    '''create a horizontal bar plot of top/bottom N items in a keyness table
    
    Args:
        df - a data frame created by calculate_keyness with cols: item, keyness
        num - the number of top and bottom ranked items to include
        c1/c2 - color for the bars of the top / bottom items
        corpusA/corpusB - labels/names of corpora used in the annotations
        
    Returns:
        the object returned by the pandas plot call (a matplotlib Axes)
    '''
    top_items = df.head(num)
    bottom_items = df.tail(num)

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    tb_df = pd.concat([top_items, bottom_items])

    # half an inch per requested rank (same as before for multiples of ten),
    # with a floor so that num < 10 no longer yields a zero-height figure
    yh = max(2, num / 2)

    # use the real slice lengths: df may hold fewer than num rows
    n_top, n_bottom = len(top_items), len(bottom_items)
    colors = [c1] * n_top + [c2] * n_bottom
    
    ax = tb_df.set_index('item')['keyness'].plot(kind='barh', zorder=2,
                                        figsize=(8, yh),
                                        color=colors, alpha=0.5, width=0.75)
    
    # Despine
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    
    # Draw vertical axis lines
    for tick in ax.get_xticks():
        ax.axvline(x=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

    ax.set_xlabel("Keyness", labelpad=20, weight='bold', size=12)

    # Set y-axis label
    ax.set_ylabel("")

    # labels are placed (in data coordinates) beside the middle of each group
    ax.annotate(f'Distinctive items\nin {corpusB}', (10, n_top + n_bottom / 2), color=c2)
    ax.annotate(f'Distinctive items\nin {corpusA}', (-10, n_top / 2), ha='right', color=c1)

    return ax

In [9]:
def show_keyitems(df, n=20, c1='red', c2='blue', corpusA='corpus A', corpusB='corpus B'):
    '''display top/bottom n items from a keyness analysis table side by side
    
    Args:
        df - a data frame created by calculate_keyness with cols: item, keyness
        n - the number of top and bottom ranked items to include
        c1/c2 - text color for the top / bottom items
        corpusA/corpusB - headings shown above the top / bottom column
    
    Returns:
        None - the two column HTML table is rendered through IPython's display.
        Each item's font size is 5*ln(|keyness|) px, with a small floor so the
        item stays legible.
    '''
   
    template = '''
        <div style=' float:left; width: 40%; text-align: center'>
        <h3>{}</h3>
        {}</div>
       <div style='width: 40%; padding-left: 20px; float: left; '>
       <h3 style="text-align: center">{}</h3>
        {}</div>
    '''


    idiv = '''
            <div style="font-size: {}px; color: {}; margin-bottom: 2px; float: left; 
            margin: 10px; padding: 2px; background-color: #f7f7f7; border-radius: 6px">
            {}</div>
            '''

    # smallest font size emitted; values from calculate_keyness with its
    # default threshold already map to something larger than this
    min_px = 6

    def render(rows, color):
        cells = []
        for item, kness in rows:
            magnitude = abs(kness)
            # log is undefined at 0 and non-positive for |k| <= 1, which would
            # either raise or produce an invalid font size
            size = 5 * math.log(magnitude) if magnitude > 0 else min_px
            size = max(size, min_px)
            # items come from corpus text, so escape them before embedding
            cells.append(idiv.format(size, color, html.escape(str(item))))
        return '\n'.join(cells)
    
    top = df[['item', 'keyness']].head(n).values
    bottom = df[['item', 'keyness']].tail(n).values

    top_str = render(top, c1)
    bottom_str = render(bottom, c2)
    
    display(HTML(
        template.format(corpusA, top_str, corpusB, bottom_str)
    ))

In [10]:
def calculate_keyness(fdist1, fdist2, fthreshold=5, keyness_threshold=6.6, top=100, print_table=True):
    '''Compute log-likelihood keyness for the items of fdist1 against fdist2
    
    Args:
        fdist1            -- mapping item -> frequency for corpus A
        fdist2            -- mapping item -> frequency for corpus B (reference)
        fthreshold        -- minimum frequency an item must have in BOTH
                             distributions to be scored (default 5)
        keyness_threshold -- only items whose keyness is strictly greater than
                             this value are kept (default 6.6)
        top               -- maximum number of rows printed/returned (default 100)
        print_table       -- if True print a plain text table and return None,
                             otherwise return the data frame (default True)
    
    Returns:
        None when print_table is True; otherwise a pandas DataFrame with the
        columns item, freq, ref_freq, keyness sorted by descending keyness and
        limited to the first `top` rows. The frame is empty (but still has all
        four columns) when no item passes the thresholds.
    '''
    columns = ['item', 'freq', 'ref_freq', 'keyness']
    
    c1size = sum(fdist1.values())
    c2size = sum(fdist2.values())

    kdata = []
    
    for item, freq in fdist1.items():
        if freq<fthreshold:
            continue
            
        ref_freq = fdist2.get(item,0)
        
        if ref_freq<fthreshold:
            continue
        
        keyness = log_likelihood(freq, c1size, ref_freq, c2size)
        
        # NOTE(review): log_likelihood returns a signed score, so this filter
        # drops every negative keyness value, while plot_keyitems and
        # show_keyitems treat the tail of the table as the corpus B side -
        # confirm whether abs(keyness) was intended here
        if keyness>keyness_threshold:
            kdata.append({'item': item, 'freq': freq, 'ref_freq': ref_freq, 'keyness': keyness})
    
    # passing columns explicitly keeps the frame well-formed when kdata is
    # empty; selecting the columns afterwards raised KeyError in that case
    kdf = pd.DataFrame(kdata, columns=columns)
    
    kdf=kdf.sort_values('keyness', ascending=False)
    
    if not print_table:
        return kdf[:top]
    
    template = "{: <25}{: <10}{: <10}{:0.3f}"
    
    header = "{: <25}{: <10}{: <10}{}".format('WORD', 'Corpus A', 'Corpus B', 'Keyness')
    
    print("{}\n{}".format(header, "="*len(header)))
    
    for item, freq, ref_freq, keyness in kdf[:top].values:
        print(template.format(item, freq, ref_freq, keyness))

In [11]:
def log_likelihood(item_A_freq, corpus_A_size, item_B_freq, corpus_B_size):
    '''Signed log-likelihood (G2) score for one item in two corpora
    
    Args:
        item_A_freq   -- frequency of the item in corpus A
        corpus_A_size -- total size of corpus A
        item_B_freq   -- frequency of the item in corpus B
        corpus_B_size -- total size of corpus B
    
    Returns:
        G2 = 2 * sum(O * ln(O / E)) over both corpora, where E is the frequency
        expected if the item were spread proportionally to corpus size. The
        result is positive when the relative frequency in corpus A is greater
        than or equal to that in corpus B, negative otherwise.
    
    Raises:
        ZeroDivisionError -- if a corpus size is 0
    '''
    total_freq = item_A_freq + item_B_freq
    total_size = corpus_A_size + corpus_B_size

    E1 = corpus_A_size * total_freq / total_size
    E2 = corpus_B_size * total_freq / total_size

    def contribution(observed, expected):
        # O * ln(O / E) tends to 0 as O -> 0, so an unseen item adds nothing;
        # evaluating math.log(0) directly would raise ValueError
        if observed <= 0:
            return 0.0
        return observed * math.log(observed / expected)

    G2 = 2 * (contribution(item_A_freq, E1) + contribution(item_B_freq, E2))
    
    sign = 1 if (item_A_freq / corpus_A_size) >= (item_B_freq / corpus_B_size) else -1
    
    return sign*G2