In [35]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()

In [70]:
# Load the reference term list: each line of terms.txt becomes a list of
# whitespace-separated tokens. The context manager closes the file handle
# as soon as the list has been built.
with open('terms.txt') as f:
    terms = [t.split() for t in f]

In [131]:
# Preview the first ten tokenized terms.
terms[:10]

[['part', '-', 'of', '-', 'speech', 'tagging'],
 ['word', '-', 'to', '-', 'word'],
 ['part', '-', 'of', '-', 'speech'],
 ['state', '-', 'ofthe', '-', 'art'],
 ['tree', '-', 'to', '-', 'string'],
 ['-', 'fold', 'cross', '-', 'validation'],
 ['end', '-', 'to', '-', 'end'],
 ['state', '-', 'of', '-', 'theart'],
 ['sequence', '-', 'to', '-', 'sequence'],
 ['context', '-', 'free', 'grammar']]

In [38]:
# Read micusp.parquet from S3; storage_options requests anonymous access.
df = pd.read_parquet(
    's3://ling583/micusp.parquet',
    storage_options={'anon': True},
)

In [39]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,filename,text
0,micusp/BIO.G0.15.1.html,"New York City, 1908: different colors of skin..."
1,micusp/BIO.G1.04.1.html,\tThe fish-tetrapod transition has been calle...
2,micusp/BIO.G3.03.1.html,\tIntracellular electric fields are of great ...
3,micusp/BIO.G0.11.1.html,Environmental stresses to plants have been st...
4,micusp/BIO.G1.01.1.html,\tThe recurrent cholera pandemics have been re...


### Set up spaCy

In [40]:
import spacy

In [41]:
# Load the small English pipeline, leaving out the components listed in
# `exclude`.
nlp = spacy.load(
    'en_core_web_sm',
    exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'],
)

In [42]:
from spacy.matcher import Matcher

In [43]:
# Candidate-term pattern over fine-grained tags (Penn Treebank-style:
# JJ adjective, NN singular noun, IN preposition, HYPH hyphen):
#   1. one JJ or NN token,
#   2. zero or more JJ / NN / IN / HYPH tokens,
#   3. one final NN token.
matcher = Matcher(nlp.vocab)
matcher.add(
    'Term',
    [
        [
            {'TAG': {'IN': ['JJ', 'NN']}},
            {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'},
            {'TAG': 'NN'},
        ]
    ],
)

### Extract candidate terms

In [44]:
def get_candidates(text):
    """Return the matcher hits found in `text` as tuples of token norms.

    The text is processed with `nlp`, `matcher` is applied with
    ``as_spans=True``, and every resulting span is converted to a tuple
    holding the ``norm_`` string of each of its tokens.
    """
    parsed = nlp(text)
    found = []
    for span in matcher(parsed, as_spans=True):
        found.append(tuple(tok.norm_ for tok in span))
    return found

In [45]:
# Run candidate extraction on every document's text (with a progress bar) and
# flatten the per-document lists into a single list of candidates.
candidates = list(concat(df['text'].progress_apply(get_candidates)))

  0%|          | 0/788 [00:00<?, ?it/s]

In [46]:
# Preview the first ten candidates.
candidates[:10]

[('skin', 'swirl'),
 ('great', 'melting'),
 ('melting', 'pot'),
 ('great', 'melting', 'pot'),
 ('cultural', 'medley'),
 ('last', 'crevice'),
 ('unprecedented', 'uniformity'),
 ('similar', 'effect'),
 ('global', 'biodiversity'),
 ('monotonous', 'fate')]

### Compute c-values

In [48]:
from collections import defaultdict, Counter

In [49]:
# Tally candidate frequencies, bucketed by candidate length in tokens:
# freqs[length][candidate] -> number of occurrences.
freqs = defaultdict(Counter)
for cand in candidates:
    freqs[len(cand)].update((cand,))

In [50]:
from nltk import ngrams

In [52]:
def get_subterms(term):
    """Yield the n-grams of `term` that are shorter than `term` itself.

    N-gram sizes run from ``len(term) - 1`` down to 2, so longer sub-terms
    are produced first and single tokens are never yielded.
    """
    for size in reversed(range(2, len(term))):
        for gram in ngrams(term, size):
            yield gram

In [53]:
from math import log2

In [54]:
def c_value(F, theta):
    """Score candidate terms with a C-value-style termhood measure.

    Parameters
    ----------
    F : mapping of int -> Counter
        ``F[k][term]`` is the frequency of the ``k``-token candidate
        ``term``. The mapping is only read, never modified, so it may be a
        plain ``dict`` or a ``defaultdict``.
    theta : number
        Threshold; only terms scoring strictly above it are kept.

    Returns
    -------
    collections.Counter
        Maps each kept term to its score.

    Notes
    -----
    Lengths are processed longest first. A term's score is
    ``log2(k) * (freq - discount)``, where ``discount`` is the mean
    frequency of the already-accepted longer terms that contain it
    (0 when there are none).
    """
    termhood = Counter()
    # For each sub-term: frequencies of the accepted longer terms containing it.
    longer = defaultdict(list)

    for k in sorted(F, reverse=True):
        for term in F[k]:
            if term in longer:
                discount = sum(longer[term]) / len(longer[term])
            else:
                discount = 0
            score = log2(k) * (F[k][term] - discount)
            if score > theta:
                termhood[term] = score
                for subterm in get_subterms(term):
                    # Use .get() rather than F[...]: indexing would insert an
                    # empty Counter for an absent length when F is a
                    # defaultdict, and raise KeyError when F is a plain dict.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(F[k][term])
    return termhood

In [122]:
# Score the extracted candidates; only terms scoring above 25 are kept.
nonclterms = c_value(freqs, theta=25)

In [123]:
# Show the 20 highest-scoring terms as: score, raw frequency, term.
for t, c in nonclterms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  282.00  282 other hand
  242.00  264 health care
  222.00  126 part - time faculty
  206.00  206 same time
  177.52  112 long - term
  169.00  169 high school
  160.08  153 part - time
  149.00  167 body color
  138.00  138 eye color
  137.00  137 domestic violence
  133.14   98 self - esteem
  120.46   76 decision - making
  118.00  146 wing venation
  112.53   71 low - income
  111.00  111 renewable energy
  103.02   65 quality of life
  103.02   65 state of nature
  103.02   65 spell - caster
  103.02   65 community violence exposure
  101.00  101 wild type


In [124]:
# Show the 20 lowest-scoring terms that were kept: score, raw frequency, term.
for t, c in tail(20, nonclterms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

   26.00   26 starting point
   26.00   26 surface water
   26.00   26 individual level
   26.00   26 static electricity
   26.00   26 mid -
   26.00   26 test rig
   26.00   26 foam cleaning
   26.00   44 environmental justice
   26.00   26 zooplankton community
   26.00   26 informed consent
   26.00   26 depressed mood
   26.00   26 flow sheet
   25.36   16 necessary in order
   25.36   16 cross - immunity
   25.36   16 mutant body color
   25.36   16 hypothesis - testing
   25.36   16 real - world
   25.36   16 self - presentation
   25.36   16 maximum grip strength
   25.36   16 turbulent kinetic energy


---

**Remove non-specific terms**

---

In [143]:
# Save the scored terms to disk, one space-joined term per line.
with open('nonclterms.txt', 'w') as f:
    for term in nonclterms:
        f.write(' '.join(term) + '\n')

In [144]:
# Read the saved terms back as lists of tokens. The context manager closes
# the file handle as soon as the list has been built.
with open('nonclterms.txt') as f:
    nonclterms = [t.split() for t in f]

In [145]:
# Lists are unhashable; convert each token list to a tuple so the terms can
# be placed in sets.
terms_tuple = list(map(tuple, terms))
nonclterms_tuple = list(map(tuple, nonclterms))

In [146]:
# Deduplicate both term collections.
terms_set = {*terms_tuple}
nonclterms_set = {*nonclterms_tuple}

In [151]:
# Keep only the terms that do not also appear in nonclterms_set.
finalterms = terms_set - nonclterms_set

In [152]:
# Write the remaining terms, one space-joined term per line. Iterating a set
# of string tuples has no stable order across interpreter runs, so the terms
# are sorted to make the output file reproducible.
with open('terms-final.txt', 'w') as f:
    for term in sorted(finalterms):
        print(' '.join(term), file=f)