# Build the curated set of multi-word expressions (MWEs) used in creating our corpora

Using both the extrinsic Specialist Lexicon and an intrinsic pointwise mutual information (PMI) approach, we built a curated set of MWEs that we used as a processing step in all of the corpora. MWEs were joined using an underscore (`_`) between terms.

In [None]:
import re, pickle, nltk, sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import *
from nltk.metrics.association import QuadgramAssocMeasures
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

from flashtext import KeywordProcessor
import seaborn as sns
import matplotlib.pyplot as plt

import time

In [None]:
# use spacy as a word tokenizer
import spacy

def use_spacy(s):
    """Tokenize the string ``s`` with the module-level spaCy pipeline ``nlp``.

    Returns a list with the ``text`` of every token, in the order spaCy
    produced them.
    """
    return [token.text for token in nlp(s)]

# Load the spaCy "en" pipeline with the parser, tagger and NER components
# disabled; use_spacy() only reads the token texts.
nlp = spacy.load(
    "en",
    disable=['parser', 'tagger', 'ner'],
)

## load preprocessed text

In [None]:
# load the pre-processed text from the PubMed Open Access Case Reports (non-tokenized)
# NOTE(review): the pickle path is an empty string here; it must be filled in before running.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
with open("", 'rb') as text_file:
    # the context manager closes the file handle once the pickle has been read
    dict_text = pickle.load(text_file)
# join the values of dict_text into one space-separated corpus string
all_text = ' '.join(dict_text.values())

In [None]:
# load the pre-processed text (tokenized)
# NOTE(review): the pickle path is an empty string here; it must be filled in before running.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
with open("", 'rb') as tokenized_file:
    # the context manager closes the file handle once the pickle has been read
    dict_document_tokenized = pickle.load(tokenized_file)
# one entry per value of the dict (iterated below as a sequence of tokens)
all_text_tokenized_list = list(dict_document_tokenized.values())
# flatten the per-document token sequences into a single corpus-wide token list
all_tokenized_joined = [token for tokens in all_text_tokenized_list for token in tokens]
print('Corpus contains', len(all_tokenized_joined), 'tokens.')

## specialist lexicon

In [None]:
# load and process the Specialist Lexicon
with open('reference_files/nlm_specialist_lexicon.txt') as lexicon_file:
    # the context manager closes the lexicon file once it has been read
    nlm_entries_all = lexicon_file.read().splitlines()
# Keep the value of every line starting with "{base=" or "spelling_variant=".
# Split on the first "=" only, so a term that itself contains "=" is kept whole.
nlm_all_terms = [
    line.split('=', 1)[1]
    for line in nlm_entries_all
    if line.startswith(('{base=', 'spelling_variant='))
]
# a term is a multi-word expression when NLTK tokenizes it into more than one token
nlm_mwes = [term.lower() for term in nlm_all_terms if len(word_tokenize(term)) > 1]
print('Found',len(nlm_mwes),'MWEs from the Specialist Lexicon including spelling variants.')

In [None]:
# Register every Specialist Lexicon MWE with a flashtext KeywordProcessor so
# the corpus text can be scanned for them.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(keyword_list=nlm_mwes)

In [None]:
# Extract the registered keywords from the full corpus text and de-duplicate
# them (the resulting list order is whatever the set yields).
filtered_nlm_mwes = list(
    set(
        keyword_processor.extract_keywords(all_text)
    )
)

In [None]:
# report how many distinct lexicon MWEs were extracted from the corpus text
print(
    'There are',
    len(filtered_nlm_mwes),
    'Specialist Lexicon MWEs in the corpus.',
)

## identify multiword phrases - bigrams and trigrams

In [None]:
# Score bigrams of the pooled corpus by PMI, after dropping bigrams that
# occur fewer than 10 times.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_words(all_tokenized_joined)
bigram_finder.apply_freq_filter(min_freq=10)
all_bigrams_by_pmi = bigram_finder.score_ngrams(score_fn=bigram_measures.pmi)

In [None]:
# Count, for every bigram, the number of documents in which it occurs.
bigram_doc_counter = defaultdict(int)

start_time = time.time()
n_docs = len(all_text_tokenized_list)

for i, doc in enumerate(all_text_tokenized_list):

    # progress indicator, rewritten in place via the trailing carriage return
    if i % 100 == 0:
        sys.stdout.write('completed %d of %d documents \r' % (i, n_docs))

    # Only the distinct bigrams of this document are needed, and the finder's
    # n-gram frequency table (`ngram_fd`) already holds each of them once, so
    # there is no need to score and sort them by PMI just to drop the scores.
    # A separate name keeps the corpus-level `bigram_finder` intact.
    doc_bigram_finder = BigramCollocationFinder.from_words(doc)

    for bigram in doc_bigram_finder.ngram_fd:
        bigram_doc_counter[bigram] += 1

print("bigram identification --- %s seconds ---" % (time.time() - start_time))

In [None]:
# Keep bigrams whose document count exceeds 9, then restrict the PMI-scored
# list (order preserved) to those bigrams.
bigrams_in_more_10_docs = {
    ngram: doc_count
    for ngram, doc_count in bigram_doc_counter.items()
    if doc_count > 9
}
all_bigrams_filtered = [
    scored
    for scored in all_bigrams_by_pmi
    if scored[0] in bigrams_in_more_10_docs
]

In [None]:
# peek at the first 100 (bigram, score) entries
all_bigrams_filtered[:100]

In [None]:
# Score trigrams of the pooled corpus by PMI, after dropping trigrams that
# occur fewer than 10 times.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
trigram_finder = TrigramCollocationFinder.from_words(all_tokenized_joined)
trigram_finder.apply_freq_filter(min_freq=10)
all_trigrams_by_pmi = trigram_finder.score_ngrams(score_fn=trigram_measures.pmi)

In [None]:
# Count, for every trigram, the number of documents in which it occurs.
trigram_doc_counter = defaultdict(int)

start_time = time.time()
n_docs = len(all_text_tokenized_list)

for i, doc in enumerate(all_text_tokenized_list):

    # progress indicator, rewritten in place via the trailing carriage return
    if i % 100 == 0:
        sys.stdout.write('completed %d of %d documents \r' % (i, n_docs))

    # Only the distinct trigrams of this document are needed, and the finder's
    # n-gram frequency table (`ngram_fd`) already holds each of them once, so
    # there is no need to score and sort them by PMI just to drop the scores.
    # A separate name keeps the corpus-level `trigram_finder` intact.
    doc_trigram_finder = TrigramCollocationFinder.from_words(doc)

    for trigram in doc_trigram_finder.ngram_fd:
        trigram_doc_counter[trigram] += 1

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Keep trigrams whose document count exceeds 9, then restrict the PMI-scored
# list (order preserved) to those trigrams.
trigrams_in_more_10_docs = {
    ngram: doc_count
    for ngram, doc_count in trigram_doc_counter.items()
    if doc_count > 9
}
all_trigrams_filtered = [
    scored
    for scored in all_trigrams_by_pmi
    if scored[0] in trigrams_in_more_10_docs
]

In [None]:
# peek at the first 100 (trigram, score) entries
all_trigrams_filtered[:100]

In [None]:
# summary counts of the n-grams that survived the corpus and document filters
print('Found bigrams:', len(all_bigrams_filtered))
print('Found trigrams:', len(all_trigrams_filtered))
print(
    'Total n-grams:',
    sum(len(ngrams) for ngrams in (all_bigrams_filtered, all_trigrams_filtered)),
)

## analysis of identified MWEs

In [None]:
# Rebuild the keyword_processor so it holds only the Specialist Lexicon MWEs
# that were actually found in the corpus.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(keyword_list=filtered_nlm_mwes)

In [None]:
# PMI n-grams (bigrams followed by trigrams) whose space-joined form is not a
# keyword of the Specialist Lexicon processor.
all_ngrams_filt = [
    scored
    for scored in all_bigrams_filtered + all_trigrams_filtered
    if ' '.join(scored[0]) not in keyword_processor
]
print('Identified',len(all_ngrams_filt),'unique MWEs using PMI not found in the Specialist Lexicon.')
# The remaining PMI n-grams are the ones present in the lexicon; their share
# is expressed relative to the number of lexicon MWEs found in the corpus.
print(
    'SL terms identified by PMI:',
    (len(all_bigrams_filtered) + len(all_trigrams_filtered) - len(all_ngrams_filt)),
    '(',
    round(
        (len(all_bigrams_filtered) + len(all_trigrams_filtered) - len(all_ngrams_filt))
        / len(filtered_nlm_mwes) * 100,
        2,
    ),
    '%)',
)

In [None]:
# identification of PMI score threshold
# For bigrams and trigrams separately, collect the PMI scores of the n-grams
# that are lexicon keywords, of those that are not, and of all n-grams.
BI_pmi_mwes_in_SL_scores = np.asarray(
    [score for ngram, score in all_bigrams_filtered if ' '.join(ngram) in keyword_processor]
)
BI_pmi_mwes_NOT_SL_scores = np.asarray(
    [score for ngram, score in all_bigrams_filtered if ' '.join(ngram) not in keyword_processor]
)
BI_pmi_mwes_all_scores = np.asarray([score for _, score in all_bigrams_filtered])

TRI_pmi_mwes_in_SL_scores = np.asarray(
    [score for ngram, score in all_trigrams_filtered if ' '.join(ngram) in keyword_processor]
)
TRI_pmi_mwes_NOT_SL_scores = np.asarray(
    [score for ngram, score in all_trigrams_filtered if ' '.join(ngram) not in keyword_processor]
)
TRI_pmi_mwes_all_scores = np.asarray([score for _, score in all_trigrams_filtered])

In [None]:
# Compare the PMI score densities of n-grams that are Specialist Lexicon (SL)
# keywords with those that are not: bigrams on the left, trigrams on the right.
plt.subplot(1, 2, 1)
ax1 = sns.kdeplot(BI_pmi_mwes_in_SL_scores, label='SL')
# plot the non-SL scores (not the all-scores array) so the curve matches its label
sns.kdeplot(BI_pmi_mwes_NOT_SL_scores, ax=ax1, label='not SL')
plt.legend()
plt.title('Bigrams')

plt.subplot(1, 2, 2)
ax2 = sns.kdeplot(TRI_pmi_mwes_in_SL_scores, label='SL')
# plot the non-SL scores (not the all-scores array) so the curve matches its label
sns.kdeplot(TRI_pmi_mwes_NOT_SL_scores, ax=ax2, label='not SL')
plt.legend()
plt.title('Trigrams')
plt.tight_layout()

## filter the MWEs identified by PMI score

In [None]:
# Keep the bigrams whose PMI score lies strictly between the 50th and 95th
# percentile of all bigram scores.
BI_q50, BI_q95 = np.percentile(BI_pmi_mwes_all_scores, [50, 95])
quartile_filtered_bigrams = [mwe for mwe in all_bigrams_filtered if BI_q50 < mwe[1] < BI_q95]
# the value is labelled "percent", so scale the fraction by 100 before printing
print('number filtered bigrams:', len(quartile_filtered_bigrams), '(percent total bigrams:', round(len(quartile_filtered_bigrams) / len(all_bigrams_filtered) * 100, 2), ')')

# Same strict 50th-95th percentile window, applied to the trigram scores.
TRI_q50, TRI_q95 = np.percentile(TRI_pmi_mwes_all_scores, [50, 95])
quartile_filtered_trigrams = [mwe for mwe in all_trigrams_filtered if TRI_q50 < mwe[1] < TRI_q95]
# the value is labelled "percent", so scale the fraction by 100 before printing
print('number filtered trigrams:', len(quartile_filtered_trigrams), '(percent total trigrams:', round(len(quartile_filtered_trigrams) / len(all_trigrams_filtered) * 100, 2), ')')

## join lexicon MWEs and PMI MWEs

In [None]:
# KeywordProcessor holding the Specialist Lexicon MWEs found in the corpus;
# used below to skip PMI n-grams that are already lexicon keywords.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(keyword_list=filtered_nlm_mwes)

In [None]:
# now filter
# Space-join each percentile-filtered n-gram (bigrams first, then trigrams)
# and keep the phrases that are not already lexicon keywords.
non_SL_MWEs = [
    phrase
    for phrase in (
        ' '.join(ngram)
        for ngram, _ in quartile_filtered_bigrams + quartile_filtered_trigrams
    )
    if phrase not in keyword_processor
]
# final MWE list: lexicon MWEs followed by the PMI-only MWEs
all_mwes = filtered_nlm_mwes + non_SL_MWEs
print('Total MWEs from SL and PMI:', len(all_mwes))

## use KeywordProcessor to replace in whole body of text

In [None]:
# Map each MWE's underscore-joined form (spaCy tokens joined with "_") to a
# one-element list holding the original MWE, and register that mapping with a
# fresh KeywordProcessor.
mwe_dict = {'_'.join(use_spacy(mwe)): [mwe] for mwe in all_mwes}

keyword_replacer = KeywordProcessor()
keyword_replacer.add_keywords_from_dict(mwe_dict)

In [None]:
# Size reported by the replacer; presumably the number of MWE keywords
# registered in it (confirm against flashtext's KeywordProcessor.__len__).
print(len(keyword_replacer))

In [None]:
# produce both a full text and a dict with MWEs replaced

import time
start_time = time.time()

# whole-corpus string with the registered MWEs replaced
all_text_joined_mwe = keyword_replacer.replace_keywords(all_text)

# the same replacement applied to each value of dict_text, keys unchanged
dict_text_MWEs_replaced = {
    doc_key: keyword_replacer.replace_keywords(doc_text)
    for doc_key, doc_text in dict_text.items()
}

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# first 1000 characters of the MWE-joined corpus string
all_text_joined_mwe[:1000]

In [None]:
# first 1000 characters of the first document in the MWE-joined dict
list(dict_text_MWEs_replaced.values())[0][:1000]

## save MWEs and the corpora with the MWEs joined

In [None]:
# Persist the MWE list and both MWE-joined corpora. Each file is opened in a
# with-block so it is flushed and closed as soon as its pickle is written.
with open("OA_CR_mwes.pkl", "wb") as mwes_file:
    pickle.dump(all_mwes, mwes_file)
with open("OA_CR_11_1_full_text_with_MWEs.pkl", "wb") as full_text_file:
    pickle.dump(all_text_joined_mwe, full_text_file)
with open("OA_CR_11_1_doc_dict_with_MWEs.pkl", "wb") as doc_dict_file:
    pickle.dump(dict_text_MWEs_replaced, doc_dict_file)