## Lab 3

#### Download and explore the data

In [3]:
import pandas as pd 
import numpy as np 
from cytoolz import * 
from tqdm.auto import tqdm 
tqdm.pandas()

In [8]:
# Load the ACL corpus from S3. No credentials are supplied: 'anon': True is
# passed through as a storage option (presumably anonymous bucket access).
df = pd.read_parquet('s3://ling583/acl.parquet', storage_options = {'anon':True})

In [9]:
df.head()

Unnamed: 0,year,tag,title,text,abstract,body
0,1990,P90-1001,Polynomial Time Parsing of Combinatory Categor...,Polynomial Time Parsing of Combinatory Categor...,In this paper we present a polynomial time par...,Combinatory Categorial Grammar (CCG) is an ex...
1,1990,P90-1002,Structure and Intonation in Spoken Language Un...,Structure and Intonation in Spoken Language Un...,The structure imposed upon spoken sentences by...,"Halliday observed that this constraint, which..."
2,1990,P90-1003,"Prosody, Syntax and Parsing","Prosody, Syntax and Parsing We describe the mo...",We describe the modification of a grammar to t...,"Prosodic information can mark lexical stress, ..."
3,1990,P90-1004,Empirical Study of Predictive Powers of Simple...,Empirical Study of Predictive Powers of Simple...,This empirical study attempts to find answers ...,Difficulty in resolving structural ambiguity i...
4,1990,P90-1005,Structural Disambiguation With Constraint Prop...,Structural Disambiguation With Constraint Prop...,We present a new grammatical formalism called ...,We are interested in an efficient treatment of...


In [7]:
len(df)

6167

In [10]:
# Work on a reproducible 500-row random sample (fixed random_state).
# NOTE: this rebinds `df`, so the full frame is gone until the load cell is re-run.
df = df.sample(500, random_state = 100)

In [11]:
import spacy

In [13]:
# Only tokenization and part-of-speech tags are used below, so the other
# pipeline components are excluded when loading the model.
nlp = spacy.load('en_core_web_sm', exclude = ['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [14]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f914fa8ae50>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f914faa9c70>)]

In [15]:
doc = nlp(df['text'].iloc[0])

In [17]:
doc[0:10]

Resume Information Extraction with Cascaded Hybrid Model This paper presents

In [18]:
doc[200]

The

In [19]:
# get the part-of-speech tag and the normalized form of a token
# DT = determiner
doc[200].tag_, doc[200].norm_

('DT', 'the')

In [20]:
# extract candidate terms 

from spacy.matcher import Matcher 

In [25]:
# Candidate-term pattern over fine-grained POS tags (token.tag_):
#   1. first token:  JJ or NN
#   2. zero or more: JJ, NN, IN or HYPH   ('OP': '*')
#   3. last token:   NN
# Tag meanings (Penn Treebank style): JJ = adjective, NN = singular noun,
# IN = preposition, HYPH = hyphen.
# The matcher returns every span that fits, so shorter candidates nested
# inside longer ones are reported too.
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN']}}, 
                     {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'}, 
                     {'TAG': 'NN'}]])

In [26]:
spans = matcher(doc, as_spans = True)

In [27]:
tuple(tok.norm_ for tok in spans[0])

('effective', 'approach')

In [28]:
def get_candidates(text):
    """Return every candidate term found in ``text``.

    The text is run through the notebook-level ``nlp`` pipeline and then the
    notebook-level ``matcher``. Each matched span is converted to a tuple of
    its tokens' normalized forms (``norm_``).

    Returns a list of tuples of strings, one per match, in the order the
    matcher reports them. No de-duplication or filtering is applied.
    """
    found = []
    for match in matcher(nlp(text), as_spans = True):
        found.append(tuple(token.norm_ for token in match))
    return found

In [30]:
get_candidates(df['text'].iloc[0])

[('effective', 'approach'),
 ('approach', 'for', 'resume'),
 ('effective', 'approach', 'for', 'resume'),
 ('resume', 'information'),
 ('approach', 'for', 'resume', 'information'),
 ('effective', 'approach', 'for', 'resume', 'information'),
 ('information', 'extraction'),
 ('resume', 'information', 'extraction'),
 ('approach', 'for', 'resume', 'information', 'extraction'),
 ('effective', 'approach', 'for', 'resume', 'information', 'extraction'),
 ('automatic', 'resume'),
 ('resume', 'management'),
 ('automatic', 'resume', 'management'),
 ('cascaded', 'information'),
 ('information', 'extraction'),
 ('cascaded', 'information', 'extraction'),
 ('first', 'pass'),
 ('second', 'pass'),
 ('detailed', 'information'),
 ('entire', 'resume'),
 ('appropriate', 'model'),
 ('hybrid', 'model'),
 ('f', '-', 'score'),
 ('hierarchical', 'structure'),
 ('contextual', 'structure'),
 ('structured', 'information'),
 ('automatic', 'construction'),
 ('construction', 'of', 'database'),
 ('automatic', 'construc

In [31]:
# Extract candidates from every sampled document (progress_apply shows a
# progress bar) and flatten the per-document lists into one list of tuples.
# `concat` is presumably the one brought in by the cytoolz star-import.
candidates = list(concat(df['text'].progress_apply(get_candidates)))

  0%|          | 0/500 [00:00<?, ?it/s]

In [33]:
# count candidate frequencies and organize them by candidate length
from collections import defaultdict, Counter

In [34]:
# freqs[n] is a Counter over all candidates that are n tokens long.
freqs = defaultdict(Counter)
for c in candidates:
    freqs[len(c)].update((c,))

In [35]:
freqs.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])

In [38]:
freqs[5].most_common(5)

[(('part', '-', 'of', '-', 'speech'), 213),
 (('end', '-', 'to', '-', 'end'), 99),
 (('tree', '-', 'to', '-', 'string'), 57),
 (('sequence', '-', 'to', '-', 'sequence'), 41),
 (('state', '-', 'ofthe', '-', 'art'), 37)]

In [39]:
# given a candidate, look at its subterms
# if the candidate has 5 tokens, look for sub-sequences of 4, 3, and 2 tokens

from nltk import ngrams 

In [40]:
list(range(4, 1, -1))

[4, 3, 2]

In [41]:
def get_subterms(term):
    """Yield the contiguous sub-sequences of ``term``, longest first.

    Sub-sequence lengths run from ``len(term) - 1`` down to 2. For each
    length, the n-grams are yielded in the order ``ngrams`` produces them.
    Terms of length 2 or less yield nothing.
    """
    for size in reversed(range(2, len(term))):
        yield from ngrams(term, size)

In [42]:
from math import log2

In [49]:
def c_value(F, theta):
    """Score candidate terms with the C-value termhood measure.

    Parameters
    ----------
    F : mapping of int -> mapping of term -> int
        Candidate frequencies grouped by term length, e.g. ``F[3]`` maps
        each 3-token candidate (a tuple) to its frequency. May be a plain
        dict or a defaultdict; it is not modified.
    theta : number
        Threshold. Only terms whose score is strictly greater than
        ``theta`` are kept.

    Returns
    -------
    collections.Counter
        Maps each accepted term to its score
        ``log2(len(term)) * (freq - discount)``, where ``discount`` is the
        mean frequency of the already-accepted longer terms that contain
        the term (0 if there are none).
    """
    termhood = Counter()
    # longer[t] collects the frequencies of accepted longer terms that nest t.
    longer = defaultdict(list)

    # Longest terms first: when a term is scored, every accepted longer term
    # that contains it has already been recorded in `longer`.
    for k in sorted(F, reverse = True):
        for term, freq in F[k].items():
            containing = longer.get(term)
            discount = sum(containing) / len(containing) if containing else 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # Use .get() instead of F[...]: indexing a defaultdict
                    # would insert an empty entry for every missing length,
                    # and a plain dict would raise KeyError.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [56]:
# change theta to get more or fewer technical terms (a higher theta keeps fewer)
terms = c_value(freqs, theta = 75)

In [51]:
# Top 20 terms by C-value.
# Columns: C-value score, raw frequency of the term in freqs, the term itself.
for t, c in terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  420.27  213 part - of - speech
  411.00  446 language model
  317.00  458 natural language
  310.00  310 training set
  307.48  194 sentence - level
  298.00  381 machine translation
  271.00  271 other hand
  265.00  265 test set
  261.00  261 previous work
  251.00  306 neural network
  242.50  153 word - level
  238.00  238 word alignment
  229.87   99 end - to - end
  223.48  141 natural language processing
  220.00  220 future work
  209.22  132 n - gram
  209.22  132 large - scale
  198.00  270 co -
  194.95  123 f - measure
  193.37  122 f - score


In [57]:
# Last 20 entries of the C-value ranking, i.e. the lowest-scoring terms that
# were still kept (c_value only keeps scores above theta).
# Columns: C-value score, raw frequency of the term in freqs, the term itself.
for t, c in tail(20, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

   84.00   84 target domain
   83.00   83 sentence level
   82.72   32 part - of - speech tagging
   80.83   51 part of speech
   80.00   80 first step
   80.00   80 head word
   80.00   80 sentence pair
   79.00   79 time step
   78.00   78 baseline system
   78.00   78 - word
   77.66   49 t - test
   77.66   49 character - level
   77.00   77 related work
   77.00   77 sentence compression
   76.00   76 mutual information
   76.00   76 re -
   76.00   76 small number
   76.00   76 deep learning
   76.00   76 recent work
   76.00   76 text classification


#### Scale up the processing pipeline and annotate the whole corpus

In [None]:
# we only processed 500 articles so far; we still need to do the rest

