In [1]:
# # import re
# from collections import defaultdict, Counter
# # from itertools import groupby
# # from nltk import flatten
# # from nltk.util import trigrams as get_trigrams
# from mwu_measures.preprocessing_corpus import clean_bnc_line, preprocess_bnc
# # import mwu_measures
# from mwu_measures.processing_corpus import Corpus
import mwu_measures
from mwu_measures.compute_functions import min_max_norm
from mwu_measures import compute_functions
from mwu_measures import processing_corpus
# from mwu_measures.mwu_functions import get_association, get_entropy_dif
from collections import defaultdict, Counter
# from nltk import FreqDist
import numpy as np
import pandas as pd

In [2]:
def get_dispersion(ngram_freq, token_freq, corpus_proportions):
    """
    Measures how an ngram is spread over the sub-corpora.

    Every per-corpus count is turned into a share of the ngram's total
    frequency, lined up against the overall corpus table, and the two
    columns are compared with ``compute_functions.get_kld``.
    :param ngram_freq: Mapping of corpus -> frequency of the ngram in
        that corpus.
    :param token_freq: Total frequency of the ngram; the divisor that
        turns the per-corpus counts into proportions.
    :param corpus_proportions: Table with a 'corpus' column and a 'freq'
        column. Presumably 'freq' already holds each corpus' share of
        the whole corpus -- TODO confirm against the Corpus class.
    :returns: The value ``get_kld`` returns for the two vectors.
    """
    observed = pd.DataFrame(
        [(name, count / token_freq) for name, count in ngram_freq.items()],
        columns=['corpus', 'ngram_prop'],
    )
    # Left join on the corpus table so that every corpus keeps a row; a
    # corpus in which the ngram never occurs ends up with a proportion of 0.
    aligned = pd.merge(corpus_proportions, observed, on='corpus', how='left')
    aligned = aligned.fillna(0)
    return compute_functions.get_kld(
        aligned['ngram_prop'].values,
        aligned['freq'].values,
    )

def get_association(part_1, part_2, token_freq, unigram_frequencies, fw_dist, n_trigrams=None):
    """
    Scores the association between the two parts of an ngram in both
    directions.

    For each direction the conditional probability of one part given the
    other is compared with the marginal probability of the predicted part,
    using ``compute_functions.get_kld`` on the two-outcome distributions
    ``[p, 1 - p]``.
    :param part_1: First part of the ngram. A string for bigrams, a tuple
        (the leading word pair) for trigrams.
    :param part_2: Last component of the ngram; looked up as a unigram.
    :param token_freq: Token frequency of the whole ngram.
    :param unigram_frequencies: Unigram counts (a Counter-like object
        offering ``.get`` and ``.total``).
    :param fw_dist: Counter of everything that follows ``part_1``; its
        total is used as the frequency of ``part_1``.
    :param n_trigrams: Total number of trigrams. Only read when
        ``part_1`` is a tuple.
    :return: A tuple, (association_forward, association_backward).
    """
    freq_first = fw_dist.total()
    # The second part is a single word for bigrams and trigrams alike, so
    # its frequency always comes from the unigram counts (0 when absent).
    freq_second = unigram_frequencies.get(part_2, 0)

    # Conditional probabilities: second given first, and first given second.
    p_second_given_first = token_freq / freq_first
    p_first_given_second = token_freq / freq_second

    # The frequency of a tuple first part was obtained by summing trigram
    # frequencies, so its marginal probability is taken over trigrams too.
    first_denominator = (
        n_trigrams if isinstance(part_1, tuple) else unigram_frequencies.total()
    )
    p_first = freq_first / first_denominator
    p_second = freq_second / unigram_frequencies.total()

    forward = compute_functions.get_kld(
        np.array([p_second_given_first, 1 - p_second_given_first]),
        np.array([p_second, 1 - p_second]),
    )
    backward = compute_functions.get_kld(
        np.array([p_first_given_second, 1 - p_first_given_second]),
        np.array([p_first, 1 - p_first]),
    )
    return forward, backward

def get_entropy_dif(ngram_1_freqs, ngram_2):
    """
    Computes how much the entropy of a slot changes when the target
    component is removed from that slot's frequency distribution.
    :param ngram_1_freqs: Frequency distribution of the slot: a mapping
        of item -> frequency that supports ``copy``, ``pop`` and
        ``values``. It is not modified.
    :param ngram_2: The item to remove from the distribution. It must be
        one of the keys of ``ngram_1_freqs``.
    :returns: The difference, as counterfactual - actual. Scalar.
    """
    def _entropy_of(freqs):
        return compute_functions.get_entropy(np.array(list(freqs.values())))

    actual = _entropy_of(ngram_1_freqs)
    # Work on a copy so that the caller's distribution stays intact.
    without_target = ngram_1_freqs.copy()
    without_target.pop(ngram_2)
    return _entropy_of(without_target) - actual
    
def get_ngram_scores(ngram, corpus, verbose=False):
    """
    Function for computing the MWU measures for a target ngram.
    :param ngram: A string with the ngram to be analyzed: two or three
        components separated by single spaces.
    :param corpus: Corpus object. It must provide
        ``get_fw_distribution(ngram)`` and ``get_bw_distribution(ngram)``
        (each a mapping of corpus -> Counter), plus the attributes
        ``corpus_proportions``, ``total_unigrams`` and ``n_trigrams``.
    :param verbose: Currently unused; kept so existing calls keep working.
    :returns: A dictionary with all MWU measures obtained:
        Token frequency, dispersion, type frequency for each slot,
        entropy difference for each slot, both directions of
        association. None if the ngram does not occur in the corpus.
    :raises ValueError: If the ngram is not a bigram or a trigram.
    """
    comps = ngram.split(' ')
    # Validate the length before anything else. The ngram is treated as two
    # parts: for trigrams the first part is the leading word pair.
    if len(comps) == 2:
        part_1, part_2 = comps
        first_label = part_1
    elif len(comps) == 3:
        part_1, part_2 = (comps[0], comps[1]), comps[2]
        first_label = ' '.join(part_1)
    else:
        raise ValueError('Error! ngram length not supported')

    fw_dist = corpus.get_fw_distribution(ngram)
    bw_dist = corpus.get_bw_distribution(ngram)

    # Frequency of the ngram in each corpus: how often the last component
    # follows the first part there.
    ngram_freq = {
        this_corpus: freq.get(part_2, 0) for this_corpus, freq in fw_dist.items()
    }

    # Token frequency
    token_freq = sum(ngram_freq.values())
    if token_freq == 0:
        print(f'<<{ngram}>> is not in the corpus')
        return None

    # Dispersion
    corpus_proportions = corpus.corpus_proportions
    dispersion = get_dispersion(ngram_freq, token_freq, corpus_proportions)

    # Total frequencies: collapse the per-corpus counters into one.
    fw_total = sum(fw_dist.values(), Counter())
    bw_total = sum(bw_dist.values(), Counter())

    # Type frequencies
    typef_1 = len(bw_total)
    typef_2 = len(fw_total)

    # Entropy
    slot1_diff = get_entropy_dif(bw_total, part_1)
    slot2_diff = get_entropy_dif(fw_total, part_2)

    # Association
    unigram_dict = corpus.total_unigrams
    n_trigrams = corpus.n_trigrams
    assoc_f, assoc_b = get_association(
        part_1, part_2, token_freq, unigram_dict, fw_total, n_trigrams
    )

    return {
        'ngram': tuple(comps),
        'first': first_label,
        'second': part_2,
        'token_freq': token_freq,
        'dispersion': dispersion,
        'type_1': typef_1,
        'type_2': typef_2,
        'entropy_1': slot1_diff,
        'entropy_2': slot2_diff,
        'assoc_f': assoc_f,
        'assoc_b': assoc_b
        }

In [3]:
# Process 'small_corpus.txt' as the 'bnc' corpus. With verbose=True the call
# prints a running "N lines processed" count while it works.
this_corpus = processing_corpus.process_corpus(
    corpus_name='bnc',
    corpus_dir='small_corpus.txt',
    verbose=True,
    chunk_size=1000000,
)

8928 lines processed
16500 lines processed
25755 lines processed
34338 lines processed
42803 lines processed
52217 lines processed
61689 lines processed
71326 lines processed
80991 lines processed
90366 lines processed
98040 lines processed
105999 lines processed
114462 lines processed
123188 lines processed
130358 lines processed
137924 lines processed
147611 lines processed
157164 lines processed
168734 lines processed
178303 lines processed
188473 lines processed
196901 lines processed
206573 lines processed
220860 lines processed
237194 lines processed
250060 lines processed
261943 lines processed
270684 lines processed
278913 lines processed
287810 lines processed
296127 lines processed
301046 lines processed


In [95]:
# Trigram maxima: the highest token frequency of any trigram, and the highest
# type frequency of each slot (slot 1 = distinct (ug_1, ug_2) pairs before a
# given ug_3, slot 2 = distinct ug_3 after a given (ug_1, ug_2) pair).
# The pair is joined with a space before counting distinct values: without a
# separator, different pairs such as ('ab', 'c') and ('a', 'bc') collapse into
# the same string and are counted as a single type.
# NOTE(review): assumes tokens never contain a space (ngrams are split on ' '
# elsewhere in this file) -- confirm against the preprocessing.
this_corpus.corpus_conn.execute("""
    WITH trigram_totals AS (
        SELECT ug_1, ug_2, ug_3, SUM(freq) as freq
        FROM trigram_db
        GROUP BY ug_1, ug_2, ug_3
    ),
    token_frequency AS (
        SELECT 
            max(freq) AS max_token_trigram
        FROM trigram_totals
    ),
    type_1 AS (
        SELECT max(typef_1) as max_type1_trigram
        FROM (
            SELECT ug_3, count( distinct concat(ug_1, ' ', ug_2) ) AS typef_1
            FROM trigram_totals
            GROUP BY ug_3
        )
    ),
    type_2 AS (
    SELECT max(typef_2) AS max_type2_trigram
    FROM (
        SELECT ug_1, ug_2, count( distinct ug_3 ) AS typef_2
        FROM trigram_totals
        GROUP BY ug_1, ug_2
        )
    )
    SELECT token_frequency.max_token_trigram, type_1.max_type1_trigram, type_2.max_type2_trigram
    FROM token_frequency, type_1, type_2
""").fetch_df()



Unnamed: 0,max_token_trigram,max_type1_trigram,max_type2_trigram
0,1778.0,115430,10735


In [96]:
# Bigram maxima: the highest token frequency of any bigram, and the highest
# type frequency of each slot (slot 1 = distinct ug_1 before a given ug_2,
# slot 2 = distinct ug_2 after a given ug_1).
# Bigram counts are not read from a table of their own: they are obtained by
# summing trigram_db frequencies over the first two positions (ug_1, ug_2).
this_corpus.corpus_conn.execute("""
    WITH bigram_totals AS (
        SELECT ug_1, ug_2, SUM(freq) as freq
        FROM trigram_db
        GROUP BY ug_1, ug_2
        ),
    token_frequency AS (
        SELECT 
            max(freq) AS max_token_bigram
        FROM bigram_totals
        ),
    type_1 AS (
        SELECT max(typef_1) AS max_type1_bigram
        FROM (
            SELECT ug_2, count( distinct ug_1 ) AS typef_1
            FROM bigram_totals
            GROUP BY ug_2
            )
        ),
    type_2 AS (
        SELECT max(typef_2) AS max_type2_bigram
        FROM (
            SELECT ug_1, count( distinct ug_2 ) AS typef_2
            FROM bigram_totals
            GROUP BY ug_1
            )
        )
    SELECT token_frequency.max_token_bigram, type_1.max_type1_bigram, type_2.max_type2_bigram
    FROM token_frequency, type_1, type_2
""").fetch_df()

Unnamed: 0,max_token_bigram,max_type1_bigram,max_type2_bigram
0,37846.0,27844,33770
