In [1]:
import mwu_measures
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import numpy as np
from importlib import reload
# Re-import the package and its `mwu_functions` submodule so that edits made to
# the source files are picked up without restarting the kernel.
# NOTE(review): the package is reloaded before the submodule. If the package's
# __init__ re-exports names with `from .mwu_functions import ...`, those names
# may still point at the pre-reload objects -- confirm the intended order.
mwu_measures = reload(mwu_measures)
mwu_measures.mwu_functions = reload(mwu_measures.mwu_functions)

First, you need to run the corpus processing function. Before trying other data, we can use the (hard-coded) synthetic corpus in Gries.

In [2]:
# Run corpus processing on the package's built-in test corpus
# (test_corpus=True), so no corpus file is needed for this step.
mwu_measures.process_corpus(test_corpus=True)

This is what a preprocessed corpus should look like.

In [3]:
# Show the processed TRIGRAM_BW table as an array of (key, nested counts) pairs.
# This dumps the whole structure, which is only practical for the tiny test corpus.
np.array(list(mwu_measures.processing_corpus.TRIGRAM_BW.items()))

array([['A',
        defaultdict(<function make_bigram_dict at 0x0000019FA49E49A0>, {'d': defaultdict(<class 'collections.Counter'>, {'a': Counter({'START': 1})}), 'c': defaultdict(<class 'collections.Counter'>, {'d': Counter({'a': 1}), 'h': Counter({'g': 1}), 'a': Counter({'o': 1}), 'c': Counter({'a': 1})}), 'b': defaultdict(<class 'collections.Counter'>, {'c': Counter({'d': 1, 'h': 1, 'c': 1}), 'e': Counter({'b': 1}), 'z': Counter({'y': 1})}), 'e': defaultdict(<class 'collections.Counter'>, {'b': Counter({'c': 1})}), 'f': defaultdict(<class 'collections.Counter'>, {'b': Counter({'e': 1})}), 'g': defaultdict(<class 'collections.Counter'>, {'f': Counter({'b': 1})}), 'h': defaultdict(<class 'collections.Counter'>, {'g': Counter({'f': 1})}), 'i': defaultdict(<class 'collections.Counter'>, {'b': Counter({'c': 1})}), 'j': defaultdict(<class 'collections.Counter'>, {'i': Counter({'b': 1})}), 'k': defaultdict(<class 'collections.Counter'>, {'j': Counter({'i': 1})}), 'a': defaultdict(<class '

We can then easily compare the results from Gries' paper. These are the bigrams in tables 3 and 4. Note that entropy_2 in table 4 uses a different calculation, and is not supposed to match with the paper.
Also, because the author reports 1 - dispersion, I'll print it like that.

In [4]:
# Score three bigrams from the test corpus so they can be compared with the paper.
x = mwu_measures.get_mwu_scores(['b d', 'c b', 'a c']) # TODO: Token, type, and dispersion working. Entropy and association are bad.
# Flip dispersion to 1 - dispersion so the column is printed the way the paper reports it.
x['dispersion'] = 1 - x['dispersion']
x

Unnamed: 0,ngram,first,second,token_freq,dispersion,type_1,type_2,entropy_1,entropy_2,assoc_f,assoc_b
0,b d,b,d,3,0.199452,6,9,0.069372,0.029215,0.156592,0.261216
1,c b,c,b,5,0.811623,8,4,0.095055,0.225603,0.638064,0.42119
2,a c,a,c,2,0.564654,6,4,0.002592,0.039036,0.332089,0.251953


Now we can use real data and a real corpus. I used the BNC corpus because it's what I have at hand. This is currently the only corpus supported, but I'll add others soon. You have to get your own copy of the BNC.

In [5]:
# Process the BNC from a local file. 'bnc_tokenized.txt' is a relative path, so
# the file must be in the working directory.
# NOTE(review): chunk_size presumably controls how much of the corpus is
# processed per batch -- confirm the unit against process_corpus.
mwu_measures.process_corpus('bnc', 'bnc_tokenized.txt', chunk_size=100000, verbose=False)

Merging....


In [11]:
# Diagnostic only: report the memory footprint of the TRIGRAM_FW table after
# processing the BNC. asizeof.asizeof returns the size in bytes, including
# referenced objects.
# NOTE(review): pympler is a third-party package imported only in this cell;
# consider moving the import to the first cell or dropping this check.
from pympler import asizeof
asizeof.asizeof(mwu_measures.processing_corpus.TRIGRAM_FW)

7951633576

We can take the multiword units from Muraki et al., 2022 (provided in the directory; originally from https://osf.io/ksypa/). For now, we can only use the bigrams and trigrams. All n-grams not occurring in the BNC will be skipped.

In [None]:
# Load the rated expressions and keep only the two- and three-word ones.
mwu_examples = pd.read_csv('MultiwordExpression_Concreteness_Ratings.csv')
# Number of whitespace-separated tokens per expression. The vectorised .str
# methods leave missing values as NaN instead of raising, and the filter below
# then drops those rows.
mwu_examples['length'] = mwu_examples['Expression'].str.split().str.len()
# .copy() makes the filtered frame independent of the full table, so the column
# assignment on the next line cannot raise SettingWithCopyWarning.
mwu_examples = mwu_examples.loc[mwu_examples['length'].isin([2, 3])].copy()
mwu_examples['Expression'] = mwu_examples['Expression'].str.lower()
print(f'Number of possible bigrams and trigrams: {len(mwu_examples)}')

In [None]:
mwu_examples

In [None]:
# Score a random subset of up to 1000 expressions. The fixed random_state keeps
# the subset (and therefore the timings and results) identical across re-runs,
# and the min() guard avoids a ValueError when fewer than 1000 rows are available.
mwu_sample = mwu_examples['Expression'].sample(n=min(1000, len(mwu_examples)), random_state=42)
mwu_scores = mwu_measures.get_mwu_scores(mwu_sample, normalize=False, parallel=False, verbose=True)
# Notice: very slow now for some ngrams. E.g., 'meted out'. Memory leak??

In [None]:
# Score the first 40 expressions without normalisation, this time with parallel=True.
first_40 = mwu_examples['Expression'].iloc[:40]
mwu_scores = mwu_measures.get_mwu_scores(first_40, normalize=False, parallel=True)

In [None]:
# Score the first 100 expressions with normalisation and progress tracking on.
# With normalize=True the result is later indexed by the 'normalized' key.
first_100 = mwu_examples['Expression'].iloc[:100]
mwu_scores = mwu_measures.get_mwu_scores(
    first_100,
    normalize=True,
    entropy_limits=[-0.1, 0.1],
    scale_entropy=True,
    verbose=False,
    track_progress=True,
)
# TODO: this could very easily be parallel https://dask.pydata.org/en/latest/
# TODO: https://superfastpython.com/learning-paths/

On my test computer, this took around 6 minutes, including the normalization step. On my laptop, it was more like 15. We can see how many we had to skip because they're not in the corpus.

In [None]:
# The f-string is delimited with double quotes so that the single-quoted
# 'normalized' key inside the braces does not end the literal early. Reusing
# the same quote character inside an f-string is a SyntaxError before Python 3.12.
print(f"Ngrams that occur in BNC: {len(mwu_scores['normalized'])}")

Let's do something very sloppy just as an illustration: relationship between concreteness and the MWU measures?

In [None]:
# Keep only the rated expressions that received scores, then attach the
# normalised measures to each of them by matching Expression against ngram.
normalized_scores = mwu_scores['normalized']
in_corpus = mwu_examples['Expression'].isin(normalized_scores['ngram'])
mwu_examples_filter = mwu_examples.loc[in_corpus]
concreteness_mwu = mwu_examples_filter.merge(
    normalized_scores,
    how='left',
    left_on='Expression',
    right_on='ngram',
)


In [None]:
# Drop every row that has a missing value in any column, so the regression and
# the score averages below only see complete rows.
concreteness_mwu = concreteness_mwu.dropna()

In [None]:

# Build the response (y) and design matrix (X) from the formula: Mean_C is
# regressed on the eight MWU measures. return_type='dataframe' keeps the column
# names, so they appear in the summary table.
y, X = dmatrices('Mean_C ~ token_freq + dispersion + type_1 + type_2 + entropy_1 + entropy_2 + assoc_f + assoc_b', data=concreteness_mwu, return_type='dataframe')
# Ordinary least squares fit; the summary lists the coefficient estimates and fit statistics.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

Nice! We can also take an MWU score based on this. First we can take an average, and compare it with a weighted average. This will be part of the package shortly.

In [None]:
# Unweighted MWU score: the plain row-wise mean of the eight measures.
score_columns = [
    'token_freq',
    'dispersion',
    'type_1',
    'type_2',
    'entropy_1',
    'entropy_2',
    'assoc_f',
    'assoc_b',
]
only_scores = concreteness_mwu[score_columns]
concreteness_mwu['mwu_score'] = only_scores.mean(axis='columns')

In [None]:
# Two weighted MWU scores. Each weight list is positional: one weight per
# column of only_scores, in that frame's column order.
weighting_schemes = {
    'mwu_weighted_1': [0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    'mwu_weighted_2': [0.1, 0.3, 0.05, 0.05, 0.2, 0.2, 0.05, 0.05],
}
for score_name, weights in weighting_schemes.items():
    # Bind the current weights as a default argument so each row function
    # uses the scheme of its own iteration.
    concreteness_mwu[score_name] = only_scores.apply(
        lambda row, w=weights: np.average(row, weights=w),
        axis=1,
    )

In [None]:
import seaborn as sns

In [None]:
# Scatter plot of Mean_C against the unweighted MWU score, with the fitted
# regression line drawn in red.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_score",
    y="Mean_C",
    line_kws=dict(color="red"),
)

In [None]:
# Scatter plot of Mean_C against the first weighted MWU score, with the fitted
# regression line drawn in red.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_1",
    y="Mean_C",
    line_kws=dict(color="red"),
)

In [None]:
# Scatter plot of Mean_C against the second weighted MWU score, with the fitted
# regression line drawn in red.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_2",
    y="Mean_C",
    line_kws=dict(color="red"),
)

Not the most interesting relationship, but it's a living. There you go!