In [1]:
import mwu_measures
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
import numpy as np
from importlib import reload

First, you need to run the corpus processing function. Before trying other data, we can use the (hard-coded) synthetic corpus from Gries' paper.

In [2]:
# Build the corpus object from the package's hard-coded synthetic test corpus
# (test_corpus=True), so no external corpus file is needed for this first check.
this_corpus = mwu_measures.process_corpus(test_corpus=True)

This is what a preprocessed corpus should look like.

In [3]:
# np.array(list(mwu_measures.processing_corpus.TRIGRAM_BW.items()))

We can then easily compare our results with those in Gries' paper. These are the bigrams in tables 3 and 4. Note that entropy_2 in table 4 uses a different calculation, and is not expected to match the paper.
Also, because the author reports 1 - dispersion, I'll print it like that.

In [4]:
# Bigrams from the synthetic corpus that we want to check against the paper.
gries_bigrams = ['b d', 'c b', 'a c']

# TODO: Token, type, and dispersion working. Entropy and association are bad.
x = mwu_measures.get_mwu_scores(gries_bigrams, this_corpus, normalize=False)

# Show 1 - dispersion so the column is directly comparable with the paper.
x['dispersion'] = 1 - x['dispersion']
x

Unnamed: 0,ngram,first,second,token_freq,dispersion,type_1,type_2,entropy_1,entropy_2,assoc_f,assoc_b,length
0,b d,b,d,3.0,0.199452,6,9,0.069372,0.029215,0.187932,0.285743,2
1,c b,c,b,5.0,0.811623,8,4,0.095055,0.225603,0.638064,0.42119,2
2,a c,a,c,2.0,0.564654,6,4,0.002592,0.039036,0.447279,0.292054,2


Now we can use real data and a real corpus. I used the BNC corpus because it's what I have at hand. This is currently the only corpus supported, but I'll add others soon. You have to get your own copy of the BNC.

In [5]:
# Process the tokenized BNC file; verbose=True prints progress while it runs.
bnc_corpus = mwu_measures.process_corpus(
    'bnc',
    'bnc_tokenized.txt',
    chunk_size=1_000_000,
    verbose=True,
)

7861 lines processed
17472 lines processed
26167 lines processed
35242 lines processed
44540 lines processed
53276 lines processed
61333 lines processed
71963 lines processed
79980 lines processed
88075 lines processed
97773 lines processed
104399 lines processed
112940 lines processed
123967 lines processed
133172 lines processed
142715 lines processed
151652 lines processed
162124 lines processed
170961 lines processed
178355 lines processed
186228 lines processed
194144 lines processed
201324 lines processed
210071 lines processed
216833 lines processed
223346 lines processed
229476 lines processed
235960 lines processed
244327 lines processed
251145 lines processed
257443 lines processed
264972 lines processed
272894 lines processed
280105 lines processed
287629 lines processed
297045 lines processed
304479 lines processed
314475 lines processed
323284 lines processed
331039 lines processed
340255 lines processed
350474 lines processed
359253 lines processed
368884 lines processed


We can take the multiword units from Muraki et al., 2022 (provided in the directory), from here: https://osf.io/ksypa/. For now, we can only use the bigrams and trigrams. All expressions not occurring in the BNC will be skipped.

In [6]:
# Load the multiword-expression concreteness ratings.
mwu_examples = pd.read_csv('MultiwordExpression_Concreteness_Ratings.csv')

# Number of whitespace-separated words in each expression.
mwu_examples['length'] = mwu_examples['Expression'].str.split().str.len()

# Keep bigrams and trigrams only. The explicit .copy() makes the subset an
# independent frame, so the column assignment below does not write into a
# slice of the full table (which raises pandas' SettingWithCopyWarning).
mwu_examples = mwu_examples.loc[mwu_examples['length'].isin([2, 3])].copy()

# Lowercase the expressions before they are looked up in the corpus.
mwu_examples['Expression'] = mwu_examples['Expression'].str.lower()
print(f'Number of possible bigrams and trigrams: {len(mwu_examples)}')

Number of possible bigrams and trigrams: 57213


In [12]:
# Score a random sample of 1000 expressions. random_state pins the sample so
# that rerunning the notebook scores the same expressions; without it every
# run draws a different sample and the downstream results are not reproducible.
mwu_scores = mwu_measures.get_mwu_scores(
    mwu_examples.sample(1000, random_state=42)['Expression'],
    bnc_corpus,
    normalize=True,
    parallel=False,
    verbose=True,
    track_progress=True,
)
# Notice: very slow now for some ngrams. E.g., 'meted out'. Memory leak??

shoe bomb
<<shoe bomb >> is not in the corpus
grain silo
proof spirit
<<proof spirit >> is not in the corpus
component part
small talk
industrial archaeology
assembly line
bathing tub
<<bathing tub >> is not in the corpus
universal value
tip over
tinned meat
lock down
chamber organ
<<chamber organ >> is not in the corpus
public holiday
finger spelling
well man clinic
<<well man clinic>> is not in the corpus
personal relation
speech production
martial law
madison square garden
bar stool
call forwarding
bad cheque
slip out
spot company
<<spot company >> is not in the corpus
transit passenger
<<transit passenger >> is not in the corpus
rights issue
cauchy sequence
<<cauchy sequence >> is not in the corpus
cigarette filter
the penny drops
tap in
piece of eight
<<piece of eight>> is not in the corpus
overstep the mark
blood test
maitres d'hotel
<<maitres d'hotel >> is not in the corpus
fall forward
private practice
scrubbing pad
<<scrubbing pad >> is not in the corpus
signal/noise ratio
<<s

On my test computer, this took around 6 minutes, including the normalization step. On my laptop, it was more like 10. We can see how many we had to skip because they're not in the corpus.

In [17]:
# Look up the normalized measures for a single expression.
normalized_scores = mwu_scores['normalized']
normalized_scores.loc[normalized_scores['ngram'] == 'press pack']

Unnamed: 0,ngram,first,second,token_freq,dispersion,type_1,type_2,entropy_1,entropy_2,assoc_f,assoc_b,length
550,press pack,press,pack,0.153575,0.251707,0.440514,0.379441,0.5,0.507987,0.00229,0.00729,2


In [14]:
# Double quotes on the outside: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"Ngrams that occur in BNC: {len(mwu_scores['normalized'])}")

Ngrams that occur in BNC: 675


Let's do something very sloppy just as an illustration: relationship between concreteness and the MWU measures?

In [None]:
# Restrict the ratings to expressions that received scores, then attach the
# normalized measures to each rated expression.
scored_ngrams = list(mwu_scores['normalized']['ngram'])
is_scored = mwu_examples['Expression'].isin(scored_ngrams)
mwu_examples_filter = mwu_examples.loc[is_scored]
concreteness_mwu = pd.merge(
    mwu_examples_filter,
    mwu_scores['normalized'],
    how='left',
    left_on='Expression',
    right_on='ngram',
)


In [None]:
# Discard every row that has a missing value in any column.
concreteness_mwu = concreteness_mwu.dropna(axis=0, how='any')

In [None]:

# Regress mean concreteness on the eight MWU measures with ordinary least squares.
ols_formula = 'Mean_C ~ token_freq + dispersion + type_1 + type_2 + entropy_1 + entropy_2 + assoc_f + assoc_b'
y, X = dmatrices(
    ols_formula,
    data=concreteness_mwu,
    return_type='dataframe',
)
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

Nice! We can also take an MWU score based on this. First we can take an average, and compare it with a weighted average. This will be part of the package shortly.

In [None]:
# The eight measure columns that feed the composite score.
measure_columns = [
    'token_freq', 'dispersion',
    'type_1', 'type_2',
    'entropy_1', 'entropy_2',
    'assoc_f', 'assoc_b',
]
only_scores = concreteness_mwu[measure_columns]

# Unweighted composite: the plain mean of the measures for each expression.
concreteness_mwu['mwu_score'] = only_scores.mean(axis='columns')

In [None]:
def weighted_row_average(scores, weights):
    """Return the weighted average of each row of `scores`.

    `weights` is passed to `np.average` for every row, so it must contain one
    weight per column of `scores`, in column order.
    """
    return scores.apply(lambda row: np.average(row, weights=weights), axis=1)


# One weight per column of only_scores, in column order.
weights_1 = [0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
weights_2 = [0.1, 0.3, 0.05, 0.05, 0.2, 0.2, 0.05, 0.05]

concreteness_mwu['mwu_weighted_1'] = weighted_row_average(only_scores, weights_1)
concreteness_mwu['mwu_weighted_2'] = weighted_row_average(only_scores, weights_2)

In [None]:
import seaborn as sns

In [None]:
# Scatter plot with a fitted regression line: unweighted MWU score vs. mean concreteness.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_score",
    y="Mean_C",
    line_kws={"color": "red"},
)

In [None]:
# Scatter plot with a fitted regression line: first weighted MWU score vs. mean concreteness.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_1",
    y="Mean_C",
    line_kws={"color": "red"},
)

In [None]:
# Scatter plot with a fitted regression line: second weighted MWU score vs. mean concreteness.
sns.regplot(
    data=concreteness_mwu,
    x="mwu_weighted_2",
    y="Mean_C",
    line_kws={"color": "red"},
)

Not the most interesting relationship, but it's a living. There you go!