In [4]:
import mwu_measures
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import numpy as np
from importlib import reload
# Re-execute the already-imported package, then its `mwu_functions` submodule,
# and rebind both names to the reloaded module objects.
# Presumably a development convenience so that on-disk edits to the package are
# picked up without restarting the kernel -- confirm before removing.
mwu_measures = reload(mwu_measures)
mwu_measures.mwu_functions = reload(mwu_measures.mwu_functions)

First, you need to run the corpus processing function. Before trying other data, we can use the (hard-coded) synthetic corpus in Gries.

In [5]:
# Process the hard-coded synthetic test corpus (selected by test_corpus=True)
# before any scores are requested.
mwu_measures.process_corpus(test_corpus=True)

This is what a preprocessed corpus should look like.

In [9]:
# Show the per-part unigram counts as a (part, Counter) object array.
unigram_frequencies = mwu_measures.processing_corpus.UNIGRAM_FREQUENCIES_PC
np.array(list(unigram_frequencies.items()))

array([['A',
        Counter({'a': 5, 'b': 5, 'c': 4, 'z': 2, 'n': 2, 'q': 2, 'r': 2, 'd': 1, 'e': 1, 'f': 1, 'g': 1, 'h': 1, 'i': 1, 'j': 1, 'k': 1, 'y': 1, 'o': 1, 'p': 1, 'x': 1})],
       ['B',
        Counter({'b': 5, 'd': 4, 'x': 3, 'y': 2, 'c': 2, 'p': 2, 'e': 2, 'j': 2, 'q': 2, 'z': 2, 'i': 1, 'g': 1, 'n': 1, 'k': 1, 'r': 1, 'f': 1, 'o': 1})],
       ['C',
        Counter({'g': 5, 'j': 4, 'k': 4, 'r': 3, 'b': 3, 'd': 3, 'h': 3, 'o': 2, 'c': 2, 'f': 2, 'i': 1, 'e': 1, 'a': 1})]],
      dtype=object)

We can now easily compare our results with those in Gries' paper. These are the bigrams in Tables 3 and 4. Note that entropy_2 in Table 4 uses a different calculation and is not expected to match the paper.
Also, because the author reports 1 - dispersion, I'll print it that way.

In [13]:
# TODO: discrepancies with TYPE1 and ENTROPY, but in 'b d'. Why?
x = mwu_measures.get_mwu_scores(['b d', 'c b', 'a c'])
# The paper reports 1 - dispersion, so flip the column before displaying it.
x = x.assign(dispersion=1 - x['dispersion'])
x

Unnamed: 0,ngram,first,second,token_freq,dispersion,type_1,type_2,entropy_1,entropy_2,assoc_f,assoc_b
0,b d,b,d,2,0.199126,5,9,0.083484,0.015141,0.043196,0.074341
1,c b,c,b,5,0.811873,8,4,0.0,0.0,0.62043,0.404903
2,a c,a,c,2,0.565102,6,4,0.0,0.0,0.316464,0.239136


In [12]:
# Same three bigrams as in the serial run, scored with parallel=True.
y = mwu_measures.get_mwu_scores(['b d', 'c b', 'a c'], parallel=True)
# The paper reports 1 - dispersion, so flip the column before displaying it.
y = y.assign(dispersion=1 - y['dispersion'])
y

Number of cores in use: 31


[Parallel(n_jobs=31)]: Using backend LokyBackend with 31 concurrent workers.


[Parallel(n_jobs=31)]: Done   1 out of  31 | elapsed:    0.6s remaining:   18.4s
[Parallel(n_jobs=31)]: Done   2 out of  31 | elapsed:    0.6s remaining:    9.2s
[Parallel(n_jobs=31)]: Done   3 out of  31 | elapsed:    0.6s remaining:    6.0s
[Parallel(n_jobs=31)]: Done   4 out of  31 | elapsed:    0.7s remaining:    4.9s
[Parallel(n_jobs=31)]: Done   5 out of  31 | elapsed:    0.7s remaining:    3.9s
[Parallel(n_jobs=31)]: Done   6 out of  31 | elapsed:    0.8s remaining:    3.2s
[Parallel(n_jobs=31)]: Done   7 out of  31 | elapsed:    0.8s remaining:    2.7s
[Parallel(n_jobs=31)]: Done   8 out of  31 | elapsed:    0.8s remaining:    2.3s
[Parallel(n_jobs=31)]: Done   9 out of  31 | elapsed:    0.8s remaining:    1.9s
[Parallel(n_jobs=31)]: Done  10 out of  31 | elapsed:    0.8s remaining:    1.7s
[Parallel(n_jobs=31)]: Done  11 out of  31 | elapsed:    0.8s remaining:    1.5s
[Parallel(n_jobs=31)]: Done  12 out of  31 | elapsed:    0.8s remaining:    1.3s
[Parallel(n_jobs=31)]: Done 

Unnamed: 0,ngram,first,second,token_freq,dispersion,type_1,type_2,entropy_1,entropy_2,assoc_f,assoc_b
0,b d,b,d,2,0.199126,5,9,0.083484,0.015141,0.043196,0.074341
1,c b,c,b,5,0.811873,8,4,0.0,0.0,0.62043,0.404903
2,a c,a,c,2,0.565102,6,4,0.0,0.0,0.316464,0.239136


Now we can use real data and a real corpus. I used the BNC corpus because it's what I have at hand. This is currently the only corpus supported, but I'll add others soon. You have to get your own copy of the BNC.

In [8]:
# Process the BNC sample file in chunks of 100,000, without verbose output.
mwu_measures.process_corpus(
    'bnc',
    'small_corpus.txt',
    chunk_size=100_000,
    verbose=False,
)

We can take the multiword units from Muraki et al., 2022 (provided in the directory), from here: https://osf.io/ksypa/. For now, we can only use the bigrams. All bigrams not occurring in the BNC will be skipped.

In [None]:
# Load the multiword-expression concreteness ratings and keep only the
# two-word expressions (bigrams), lower-cased for matching.
mwu_examples = pd.read_csv('MultiwordExpression_Concreteness_Ratings.csv')
# Number of whitespace-separated tokens per expression (vectorised; a missing
# expression yields NaN instead of raising inside a per-row lambda).
mwu_examples['length'] = mwu_examples['Expression'].str.split().str.len()
# .copy() detaches the filtered rows from the full frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
mwu_examples = mwu_examples.loc[mwu_examples['length'] == 2].copy()
mwu_examples['Expression'] = mwu_examples['Expression'].str.lower()
print(f'Number of possible bigrams: {len(mwu_examples)}')

In [10]:
# Score the first 40 bigrams (by position), without normalization.
mwu_scores = mwu_measures.get_mwu_scores(
    mwu_examples['Expression'].iloc[:40],
    normalize=False,
)

In [None]:
# Same 40 bigrams as the serial call, this time with parallel=True.
mwu_scores = mwu_measures.get_mwu_scores(
    mwu_examples['Expression'].iloc[:40],
    normalize=False,
    parallel=True,
)

  return bound(*args, **kwds)


In [None]:
# TODO: this could very easily be parallel https://dask.pydata.org/en/latest/
# TODO: https://superfastpython.com/learning-paths/
# Score the first 100 bigrams (by position) with normalization enabled.
mwu_scores = mwu_measures.get_mwu_scores(
    mwu_examples['Expression'].iloc[:100],
    normalize=True,
    entropy_limits=[-0.1, 0.1],
    scale_entropy=True,
    verbose=False,
    track_progress=True,
)

On my test computer, this took around 6 minutes, including the normalization step. On my laptop, it was more like 15. We can see how many we had to skip because they're not in the corpus.

In [None]:
# Report how many of the requested bigrams were found in the corpus.
# The outer quotes are double so the inner ['normalized'] key can use single
# quotes: reusing the same quote type inside an f-string is a SyntaxError on
# Python versions before 3.12.
print(f"Ngrams that occur in BNC: {len(mwu_scores['normalized'])}")

Let's do something very sloppy just as an illustration: relationship between concreteness and the MWU measures?

In [None]:
# Keep only the rated expressions that received scores, then attach the
# normalized scores to their ratings.
scored_ngrams = list(mwu_scores['normalized']['ngram'])
has_scores = mwu_examples['Expression'].isin(scored_ngrams)
mwu_examples_filter = mwu_examples.loc[has_scores]
concreteness_mwu = pd.merge(
    mwu_examples_filter,
    mwu_scores['normalized'],
    how='left',
    left_on='Expression',
    right_on='ngram',
)


In [None]:
# Keep only complete rows: with its default arguments, DataFrame.dropna()
# removes every row that contains at least one missing value.
concreteness_mwu = concreteness_mwu.dropna()

In [None]:

# Regress mean concreteness on all eight MWU measures with ordinary least squares.
ols_formula = 'Mean_C ~ token_freq + dispersion + type_1 + type_2 + entropy_1 + entropy_2 + assoc_f + assoc_b'
y, X = dmatrices(ols_formula, data=concreteness_mwu, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

Nice! We can also take an MWU score based on this. First we can take an average, and compare it with a weighted average. This will be part of the package shortly.

In [None]:
# Unweighted MWU score: the row-wise mean of the eight measures.
score_columns = [
    'token_freq', 'dispersion',
    'type_1', 'type_2',
    'entropy_1', 'entropy_2',
    'assoc_f', 'assoc_b',
]
only_scores = concreteness_mwu[score_columns]
concreteness_mwu['mwu_score'] = only_scores.mean(axis=1)

In [None]:
# Two illustrative weightings of the eight measures, given in the column
# order of `only_scores`.
weights_1 = [0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
weights_2 = [0.1, 0.3, 0.05, 0.05, 0.2, 0.2, 0.05, 0.05]

concreteness_mwu['mwu_weighted_1'] = only_scores.apply(
    lambda row: np.average(row, weights=weights_1), axis=1
)
concreteness_mwu['mwu_weighted_2'] = only_scores.apply(
    lambda row: np.average(row, weights=weights_2), axis=1
)

In [None]:
import seaborn as sns

In [None]:
# Concreteness against the unweighted MWU score, with a red regression line.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_score",
    y="Mean_C",
    line_kws=dict(color="red"),
)

In [None]:
# Concreteness against the first weighted MWU score, with a red regression line.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_1",
    y="Mean_C",
    line_kws=dict(color="red"),
)

In [None]:
# Concreteness against the second weighted MWU score, with a red regression line.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_2",
    y="Mean_C",
    line_kws=dict(color="red"),
)

Not the most interesting relationship, but it's a living. There you go!