## Lab 3

In [4]:
import pandas as pd 
import numpy as np 
from cytoolz import * 
from tqdm.auto import tqdm 
# Register tqdm's progress-bar integration with pandas.
# NOTE(review): none of the cells visible here appear to use it — confirm it is still needed.
tqdm.pandas()

In [27]:
from dask.distributed import Client

# Connect to an already-running Dask scheduler. The address (including the port)
# is hard-coded, so it has to be edited to match the scheduler of the current session.
client = Client("tcp://127.0.0.1:46513")
# Bare expression so the client summary is rendered as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:46513  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [6]:
# Load the corpus parquet file from S3 into a pandas DataFrame; the storage
# options request anonymous access ('anon': True), so no credentials are passed.
df = pd.read_parquet('s3://ling583/acl.parquet', storage_options = {'anon':True})

In [7]:
# distribute across the cluster 
import dask.dataframe as dd
import dask.bag as db

In [8]:
# Split the frame into 100 partitions so the work can be spread over the workers
# (the original note says this comes to about 60 articles per partition).
# NOTE(review): `df` is rebound from the pandas frame to the Dask frame here, so
# re-running this cell without re-running the load cell feeds a Dask frame back
# into from_pandas — confirm whether a separate name would be safer.
df = dd.from_pandas(df, npartitions = 100)
# Only the 'text' column is needed downstream; expose it as a Dask bag.
texts = df['text'].to_bag()

In [9]:
import spacy

In [10]:
# Load the small English pipeline without the parser, NER, lemmatizer and
# attribute ruler; the code below only reads token TAG values (in the matcher
# pattern) and token norm_ values (in get_candidates).
nlp = spacy.load('en_core_web_sm', exclude = ['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [11]:
# extract candidate terms 

from spacy.matcher import Matcher 

In [12]:
# Candidate-term pattern, expressed over fine-grained TAG values:
#   1. first token tagged JJ or NN,
#   2. then zero or more tokens tagged JJ, NN, IN or HYPH ('OP': '*'),
#   3. last token tagged NN.
# A match therefore always spans at least two tokens.
# (Presumably Penn Treebank tags: adjective, singular noun, preposition, hyphen — confirm.)
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN']}}, 
                     {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'}, 
                     {'TAG': 'NN'}]])

In [17]:
def get_candidates(text):
    """Return the candidate terms found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    module-level ``matcher``; every matched span is returned as a tuple of
    its tokens' ``norm_`` strings, in the order the matcher reports them.
    """
    found = []
    for span in matcher(nlp(text), as_spans=True):
        found.append(tuple(token.norm_ for token in span))
    return found

In [23]:
# Build the (lazy) Dask graph: extract the candidate list of every text,
# flatten the per-text lists into one stream of candidates, and count how
# often each candidate occurs.
graph = (
    texts
    .map(get_candidates)
    .flatten()
    .frequencies()
)

In [28]:
%%time

# Execute the graph. The result is a list of (candidate, frequency) pairs;
# the recorded run of this cell took about 5 min 48 s of wall time.
candidates = graph.compute()

CPU times: user 10.3 s, sys: 1.46 s, total: 11.7 s
Wall time: 5min 48s


In [29]:
# Peek at the first ten (candidate, frequency) pairs.
candidates[:10]

[(('polynomial', 'time'), 234),
 (('recognition', 'phase'), 17),
 (('input', 'string'), 379),
 (('spurious', 'ambiguity'), 148),
 (('function', 'application'), 40),
 (('relative', 'ordering'), 29),
 (('considerable', 'interest'), 40),
 (('large', 'number'), 1357),
 (('same', 'function'), 26),
 (('function', 'argument'), 5)]

In [34]:
# count freq and organize by len
from collections import defaultdict, Counter

In [35]:
# Bucket the counts by term length: freqs[n] maps every n-token candidate
# to its frequency.
freqs = defaultdict(Counter)
for cand, count in candidates:
    freqs[len(cand)][cand] = count

In [36]:
# given a candidate, look at its subterms 
# e.g. if the candidate has 5 tokens, look for sub-sequences of 4, 3 and 2 tokens 

from nltk import ngrams 

In [37]:
def get_subterms(term):
    """Yield the shorter n-grams of ``term``, longest first.

    For a term of k tokens this yields, via ``ngrams``, all n-grams of
    length k-1, then k-2, and so on down to length 2. Nothing is yielded
    for terms of two tokens or fewer.
    """
    size = len(term) - 1
    while size > 1:
        yield from ngrams(term, size)
        size -= 1

In [38]:
from math import log2

In [39]:
def c_value(F, theta):
    """Score candidate terms with a C-value style termhood measure.

    Parameters
    ----------
    F : mapping
        Maps a term length ``k`` to a mapping of ``term -> frequency`` for
        all candidate terms of that length (terms are tuples of tokens).
        It is only read, never modified; a plain ``dict`` works as well as
        a ``defaultdict``.
    theta : number
        Threshold; only terms whose score is strictly greater than
        ``theta`` are kept.

    Returns
    -------
    collections.Counter
        Maps every kept term to its score.

    Notes
    -----
    Lengths are processed from longest to shortest. A term's score is
    ``log2(k) * (frequency - discount)``, where ``discount`` is the mean
    frequency of the already-accepted longer terms that listed it as a
    subterm (0 if there are none). Only accepted terms pass their
    frequency on to their subterms.
    """
    termhood = Counter()
    # subterm -> frequencies of the accepted longer terms it is nested in
    longer = defaultdict(list)

    for k in sorted(F, reverse=True):
        weight = log2(k)  # constant for every term of this length
        for term, freq in F[k].items():
            # .get() on purpose: it does not create an entry in `longer`
            containing = longer.get(term)
            discount = sum(containing) / len(containing) if containing else 0
            c = weight * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # Look the length up with .get(): indexing F directly would
                    # insert empty entries into a defaultdict for lengths that
                    # have no candidates, and raise KeyError on a plain dict.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [43]:
# theta is the score cut-off: c_value keeps only terms scoring strictly above it,
# so raising theta yields fewer terms and lowering it yields more.
terms = c_value(freqs, theta = 250)

In [44]:
# The 20 highest-scoring terms, printed as: score, raw frequency, term.
for term, score in terms.most_common(20):
    freq = freqs[len(term)][term]
    print(f'{score:8.2f} {freq:4d} {" ".join(term)}')

 5236.00 5682 language model
 4935.26 2330 part - of - speech
 4875.20 5388 natural language
 4560.00 5060 machine translation
 3599.25 3920 neural network
 3583.00 3583 training set
 3346.00 3346 previous work
 3171.75 1366 end - to - end
 3012.00 3012 other hand
 3003.00 3003 test set
 2923.00 2923 future work
 2370.00 2370 target language
 2363.18 1634 natural language processing
 2317.22 1462 sentence - level
 2301.37 1452 large - scale
 2278.00 2626 co -
 2209.44 1394 word - level
 2174.00 2174 parse tree
 2144.45 1353 n - gram
 2059.00 2059 training corpus


In [45]:
# The 20 lowest-scoring terms that still passed the threshold,
# printed as: score, raw frequency, term.
for term, score in terms.most_common()[-20:]:
    freq = freqs[len(term)][term]
    print(f'{score:8.2f} {freq:4d} {" ".join(term)}')

  256.00  256 candidate answer
  255.41  110 bag - of - word
  255.00  255 different language
  255.00  255 novel method
  255.00  255 dev set
  255.00  255 abstractive summarization
  254.00  254 head noun
  254.00  254 regular expression
  254.00  254 dimensional vector
  254.00  254 random walk
  253.00  253 meaning representation
  253.00  253 temporal relation
  253.00  253 model score
  253.00  253 classification model
  252.01  159 free word order
  252.00  252 upper bound
  252.00  252 summarization task
  251.00  576 chinese word
  251.00  251 document classification
  250.42  158 fan - out


In [48]:
# Save the selected terms, one space-joined term per line. The encoding is
# passed explicitly so the file does not depend on the platform's default
# text encoding.
with open('terms.txt', 'w', encoding='utf-8') as f:
    for term in terms:
        print(' '.join(term), file=f)