# LAB 3: Automated Terminology Extraction

Extract technical terms from the ACL Anthology.

Objectives:
* part-of-speech tagging with spaCy
* extract phrases that match a part-of-speech pattern
* scale the processing pipeline with Dask
* compute C-values

## Part I: Test c-value function

In [1]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
# Register tqdm with pandas so that `progress_apply` (used below to show a
# progress bar during candidate extraction) is available.
tqdm.pandas()

In [2]:
# Load the ACL dataset from S3. `storage_options` is forwarded to the storage
# backend; 'anon': True presumably requests anonymous (unauthenticated) access
# to a public bucket -- confirm against the backend's documentation.
df = pd.read_parquet('s3://ling583/acl.parquet', storage_options={'anon':True})

In [3]:
# Work on a random sample of 500 rows while testing; the fixed random_state
# makes the sample reproducible between runs.
df = df.sample(500, random_state=100)

### Set up spaCy

In [4]:
import spacy

In [5]:
# Load the small English pipeline without the components listed in `exclude`;
# the matcher defined below only inspects fine-grained TAG values.
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [6]:
from spacy.matcher import Matcher

In [7]:
# Candidate-term pattern over fine-grained part-of-speech tags:
#   first token : adjective (JJ) or singular noun (NN)
#   middle      : zero or more of JJ / NN / IN (preposition) / HYPH (hyphen)
#   last token  : singular noun (NN)
# The shortest possible match is therefore two tokens long.
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN']}},
                      {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'},
                      {'TAG': 'NN'}]])

### Extract candidate terms

In [8]:
def get_candidates(text):
    """Return the candidate terms found in ``text``.

    The text is run through the spaCy pipeline ``nlp`` and every span matched
    by ``matcher`` is turned into a tuple of the normalized forms (``norm_``)
    of its tokens.  One tuple is returned per matched span, in the order the
    matcher reports them.
    """
    found = []
    for match in matcher(nlp(text), as_spans=True):
        found.append(tuple(token.norm_ for token in match))
    return found

In [9]:
# Extract candidates from every document (with a progress bar) and flatten the
# per-document lists into a single list of token tuples.
candidates = list(concat(df['text'].progress_apply(get_candidates)))

  0%|          | 0/500 [00:00<?, ?it/s]

In [10]:
# Inspect the first ten candidates.
candidates[:10]

[('effective', 'approach'),
 ('approach', 'for', 'resume'),
 ('effective', 'approach', 'for', 'resume'),
 ('resume', 'information'),
 ('approach', 'for', 'resume', 'information'),
 ('effective', 'approach', 'for', 'resume', 'information'),
 ('information', 'extraction'),
 ('resume', 'information', 'extraction'),
 ('approach', 'for', 'resume', 'information', 'extraction'),
 ('effective', 'approach', 'for', 'resume', 'information', 'extraction')]

### Compute c-values

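$$\mbox{C-value}(a)=\begin{cases}\log_2|a|\cdot f(a) & \mbox{if } a \mbox{ is not nested}\\\log_2|a|\left(f(a)-\frac{1}{P(T_a)}\sum_{b\in T_a}f(b)\right) & \mbox{otherwise}\\\end{cases}$$

where $|a|$ is the length of candidate $a$ in tokens, $f(a)$ is its frequency, $T_a$ is the set of longer candidate terms that contain $a$, and $P(T_a)$ is the number of such terms.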


In [11]:
from collections import defaultdict, Counter

In [14]:
# Count candidates grouped by length: freqs[n][term] is the number of times
# the n-token candidate `term` occurs in `candidates`.
freqs = defaultdict(Counter)
for c in candidates:
    freqs[len(c)][c] += 1

In [15]:
from nltk import ngrams

In [16]:
# Sanity check: counting down from 4 with stop value 1 ends at 2, because the
# stop value itself is excluded.
list(range(4, 1, -1))

[4, 3, 2]

In [17]:
def get_subterms(term):
    """Yield the proper sub-sequences of ``term`` that have at least two tokens.

    Sub-sequences are the n-grams of ``term`` for n = len(term) - 1 down to 2,
    so longer sub-sequences come first.  Nothing is yielded when ``term`` has
    fewer than three tokens.
    """
    size = len(term) - 1
    while size > 1:
        yield from ngrams(term, size)
        size -= 1

In [18]:
from math import log2

In [19]:
def c_value(F, theta):
    """Score candidate terms with the C-value termhood measure.

    Parameters
    ----------
    F : mapping of int -> mapping of tuple -> int
        Candidate frequencies grouped by length: ``F[k][term]`` is the number
        of occurrences of the ``k``-token candidate ``term``.  ``F`` is only
        read, never modified.
    theta : float
        Acceptance threshold.  A candidate is kept only when its C-value is
        strictly greater than ``theta``; only kept candidates discount the
        shorter candidates nested inside them.

    Returns
    -------
    collections.Counter
        Maps every accepted term to its C-value.
    """
    termhood = Counter()
    # For each nested candidate, the frequencies of the accepted longer terms
    # that contain it (one entry per containing term).
    longer = defaultdict(list)

    # Longest candidates first: by the time a candidate is scored, every
    # longer term that could contain it has already been processed.
    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            containing = longer.get(term)
            if containing:
                # Average frequency of the longer terms this one is nested in.
                discount = sum(containing) / len(containing)
            else:
                discount = 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                # set(): a subterm that occurs several times inside `term`
                # must still count `term` only once as a containing term.
                for subterm in set(get_subterms(term)):
                    # .get() rather than F[...]: indexing a defaultdict would
                    # insert empty entries into the caller's data, and a plain
                    # dict would raise KeyError for lengths with no candidates.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [20]:
# Score all candidates, keeping only those whose C-value exceeds 75.
terms = c_value(freqs, theta=75)

In [21]:
# Show the 20 highest-scoring terms: C-value, raw frequency, and the term's
# tokens joined by spaces.
for t, c in terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  446.00  446 language model
  420.27  213 part - of - speech
  317.00  458 natural language
  310.00  310 training set
  307.48  194 sentence - level
  298.00  381 machine translation
  271.00  271 other hand
  265.00  265 test set
  261.00  261 previous work
  251.00  306 neural network
  242.50  153 word - level
  238.00  238 word alignment
  229.87   99 end - to - end
  223.48  141 natural language processing
  220.00  220 future work
  209.22  132 n - gram
  209.22  132 large - scale
  198.00  270 co -
  194.95  123 f - measure
  193.37  122 f - score


In [22]:
# Show the 20 lowest-scoring terms that still passed the threshold (the last
# 20 entries of the descending ranking): C-value, raw frequency, and tokens.
for t, c in tail(20, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

   84.00   84 target domain
   83.00   83 sentence level
   82.72   32 part - of - speech tagging
   80.83   51 part of speech
   80.00   80 first step
   80.00   80 head word
   80.00   80 sentence pair
   79.00   79 time step
   78.00   78 baseline system
   78.00   78 - word
   77.66   49 t - test
   77.66   49 character - level
   77.00   77 related work
   77.00   77 sentence compression
   76.00   76 mutual information
   76.00   76 re -
   76.00   76 small number
   76.00   76 deep learning
   76.00   76 recent work
   76.00   76 text classification
