# LAB 3: Automated Terminology Extraction

Extract technical terms from the ACL Anthology.

Objectives:
* part-of-speech tagging with spaCy
* extracting phrases that match a part-of-speech pattern
* scaling the processing pipeline with Dask
* computing C-values

## Part I: Test c-value function

In [45]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()

In [46]:
from dask.distributed import Client

# Connect to a Dask scheduler at this address; the bare `client` expression
# below makes the notebook display the connection summary.
# NOTE(review): the scheduler port is hard-coded — confirm it matches the
# scheduler you actually started before running this cell.
client = Client("tcp://127.0.0.1:42605")
client

0,1
Client  Scheduler: tcp://127.0.0.1:42605  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [47]:
# Load the corpus from S3 into a pandas DataFrame; 'anon': True is passed
# through as a storage option (anonymous access). Later cells read its
# 'text' column.
df = pd.read_parquet('s3://ling583/acl.parquet', storage_options={'anon':True})

In [48]:
import dask.dataframe as dd
import dask.bag as db

In [49]:
# Re-wrap the pandas DataFrame as a Dask DataFrame with 100 partitions, then
# turn the 'text' column into a Dask bag with one element per row.
df = dd.from_pandas(df, npartitions=100)
texts = df['text'].to_bag()

### Set up spaCy

In [50]:
import spacy

In [51]:
# Load the small English pipeline without the components listed in `exclude`.
# The matcher patterns in this notebook only test the TAG attribute.
# NOTE(review): presumably excluded to speed up processing — confirm nothing
# downstream relies on the excluded components.
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [52]:
from spacy.matcher import Matcher

In [53]:
# Candidate-term pattern over fine-grained tags:
#   first token:  JJ or NN
#   middle:       zero or more tokens tagged JJ, NN, IN or HYPH
#   last token:   NN
# Every match is therefore at least two tokens long.
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN']}},
                      {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'},
                      {'TAG': 'NN'}]])

### Extract candidate terms

In [54]:
def get_candidates(text):
    """Return the candidate terms found in ``text``.

    The text is run through the module-level ``nlp`` pipeline, and every span
    matched by the module-level ``matcher`` becomes one candidate, represented
    as a tuple of its tokens' ``norm_`` strings.
    """
    matched_spans = matcher(nlp(text), as_spans=True)
    found = []
    for span in matched_spans:
        normalized = tuple(token.norm_ for token in span)
        found.append(normalized)
    return found

In [55]:
# Build the task graph: extract candidates from each text, flatten the
# per-text lists into a single bag, and count occurrences of each candidate.
# The result is computed in a later cell via graph.compute().
graph = texts.map(get_candidates) \
             .flatten() \
             .frequencies()

In [56]:
%%time

# Run the graph; the result is a list of (candidate, frequency) pairs.
candidates = graph.compute()

CPU times: user 10.8 s, sys: 1.56 s, total: 12.3 s
Wall time: 6min 2s


In [57]:
# Peek at the first ten (candidate, frequency) pairs.
candidates[:10]

[(('polynomial', 'time'), 234),
 (('recognition', 'phase'), 17),
 (('input', 'string'), 379),
 (('spurious', 'ambiguity'), 148),
 (('function', 'application'), 40),
 (('relative', 'ordering'), 29),
 (('considerable', 'interest'), 40),
 (('large', 'number'), 1357),
 (('same', 'function'), 26),
 (('function', 'argument'), 5)]

In [58]:
# Number of (candidate, frequency) pairs, i.e. distinct candidate terms.
len(candidates)

920136

### Compute c-values

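$$\mbox{C-value}(a)=\begin{cases}\log_2|a|\cdot f(a) & \mbox{if } a \mbox{ is not nested}\\\log_2|a|\left(f(a)-\frac{1}{P(T_a)}\sum_{b\in T_a}f(b)\right) & \mbox{otherwise}\\\end{cases}$$

Here $|a|$ is the number of tokens in candidate term $a$, $f(a)$ is its corpus frequency, $T_a$ is the set of longer candidate terms that contain $a$, and $P(T_a)$ is the number of terms in $T_a$.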


In [59]:
from collections import defaultdict, Counter

In [60]:
# Index the frequencies by term length: freqs[n][term] holds the frequency
# of the n-token candidate `term`.
freqs = defaultdict(Counter)
for c, f in candidates:
    freqs[len(c)][c] = f

In [61]:
from nltk import ngrams

In [63]:
def get_subterms(term):
    """Yield each distinct proper subterm of ``term`` with at least two tokens.

    Subterms are contiguous token windows, produced as tuples from the
    longest (``len(term) - 1`` tokens) down to two tokens, and left to right
    within each length.

    A window that occurs at several positions in ``term`` (for example
    ``('a', 'b')`` in ``('a', 'b', 'a', 'b')``) is yielded only once, so a
    caller that records "this subterm is nested in ``term``" counts ``term``
    a single time rather than once per occurrence.

    Args:
        term: A sliceable sequence of hashable tokens, e.g. a tuple of
            strings.

    Yields:
        Tuples of tokens, one per distinct subterm. Nothing is yielded for
        terms with fewer than three tokens.
    """
    k = len(term)
    # Windows of different lengths can never be equal, so one set suffices.
    seen = set()
    for m in range(k - 1, 1, -1):
        for start in range(k - m + 1):
            subterm = tuple(term[start:start + m])
            if subterm not in seen:
                seen.add(subterm)
                yield subterm

In [64]:
from math import log2

In [65]:
def c_value(F, theta):
    """Score candidate terms with the C-value termhood measure.

    Terms are processed from longest to shortest. A term's score is
    ``log2(k) * (frequency - discount)``, where ``k`` is the term length and
    ``discount`` is the mean frequency of the already-accepted longer terms
    that contain it (zero if there are none). Only terms scoring strictly
    above ``theta`` are kept, and only kept terms contribute to the discount
    of their subterms.

    Args:
        F: Mapping from term length to a mapping of term (tuple of tokens)
            to frequency. It is only read, never modified, so both a
            ``defaultdict`` and a plain ``dict`` are accepted.
        theta: Score threshold; terms whose C-value is not greater than
            ``theta`` are dropped.

    Returns:
        A ``Counter`` mapping each accepted term to its C-value.
    """
    termhood = Counter()
    # longer[t] collects the frequencies of accepted longer terms containing t.
    longer = defaultdict(list)

    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            if term in longer:
                discount = sum(longer[term]) / len(longer[term])
            else:
                discount = 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # Use .get() rather than F[...]: indexing a defaultdict
                    # would insert empty entries into the caller's mapping,
                    # and a plain dict would raise KeyError for lengths that
                    # have no candidates.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [66]:
# Score all candidates and keep those whose C-value exceeds 500.
terms = c_value(freqs, theta=500)

In [39]:
# Show the 20 highest-scoring terms as: C-value, raw frequency, term.
for t, c in terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

 5236.00 5682 language model
 4461.67 5060 machine translation
 4451.14 2330 part - of - speech
 4410.50 5388 natural language
 3583.00 3583 training set
 3379.00 3920 neural network
 3346.00 3346 previous work
 3171.75 1366 end - to - end
 3012.00 3012 other hand
 3003.00 3003 test set
 2923.00 2923 future work
 2589.83 1634 natural language processing
 2370.00 2370 target language
 2317.22 1462 sentence - level
 2301.37 1452 large - scale
 2209.44 1394 word - level
 2174.00 2174 parse tree
 2144.45 1353 n - gram
 2059.00 2059 training corpus
 2019.24 1274 f - score


In [40]:
# Show the last 20 entries of the score-sorted list (the lowest-scoring
# accepted terms) as: C-value, raw frequency, term.
# NOTE(review): `tail` presumably comes from the `cytoolz` star import — confirm.
for t, c in tail(20, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  516.70  326 sub - word
  515.11  325 chinese word segmentation
  515.00  515 lexical information
  515.00  515 morphological analysis
  515.00  515 input sequence
  514.00  514 classification problem
  513.00  513 local context
  512.00  512 time complexity
  511.00  511 text generation
  511.00  511 probabilistic model
  511.00  511 tree kernel
  510.00  510 phrase pair
  509.00  509 distributional similarity
  508.77  321 natural language generation
  508.77  321 f1 - score
  508.77  321 hyper - parameter
  507.19  320 set of candidate
  507.00  507 standard deviation
  504.00  504 beam size
  503.00  503 dependency relation


In [58]:
# Write one accepted term per line, with its tokens joined by single spaces.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default locale encoding, which may be unable to represent
# non-ASCII tokens.
with open('terms.txt', 'w', encoding='utf-8') as f:
    for term in terms:
        print(' '.join(term), file=f)