## Lab 3

In [2]:
import pandas as pd 
import numpy as np 
from cytoolz import * 
from tqdm.auto import tqdm 
tqdm.pandas()

In [75]:
# Read the candidate term list: one term per line, split on whitespace into tokens.
# The context manager closes the file handle as soon as the list is built
# (a bare open() inside the comprehension leaves it open until garbage collection).
with open('terms.txt') as f:
    terms = [line.split() for line in f]

In [76]:
len(terms)

645

In [6]:
terms[:10]

[['such', 'as', 'part', '-', 'of', '-', 'speech'],
 ['part', '-', 'of', '-', 'speech', 'tagger'],
 ['part', '-', 'of', '-', 'speech', 'tagging'],
 ['part', '-', 'of', '-', 'speech', 'tag'],
 ['sequence', '-', 'to', '-', 'sequence', 'model'],
 ['word', '-', 'to', '-', 'word'],
 ['part', '-', 'of', '-', 'speech'],
 ['word', '-', 'by', '-', 'word'],
 ['state', '-', 'ofthe', '-', 'art'],
 ['tree', '-', 'to', '-', 'string']]

In [11]:
# Load the MICUSP parquet file from S3; 'anon': True is forwarded to the
# storage backend so that no credentials are required.
micusp_url = 's3://ling583/micusp.parquet'
df = pd.read_parquet(micusp_url, storage_options={'anon': True})

In [12]:
df.head()

Unnamed: 0,filename,text
0,micusp/BIO.G0.15.1.html,"New York City, 1908: different colors of skin..."
1,micusp/BIO.G1.04.1.html,\tThe fish-tetrapod transition has been calle...
2,micusp/BIO.G3.03.1.html,\tIntracellular electric fields are of great ...
3,micusp/BIO.G0.11.1.html,Environmental stresses to plants have been st...
4,micusp/BIO.G1.01.1.html,\tThe recurrent cholera pandemics have been re...


#### Remove terms that aren't specific to computational linguistics

In [24]:
import spacy

In [25]:
# The term matcher in this notebook only looks at TAG, so the remaining
# pipeline components are excluded when loading the model.
unused_components = ['parser', 'ner', 'lemmatizer', 'attribute_ruler']
nlp = spacy.load('en_core_web_sm', exclude=unused_components)

In [27]:
# Run the pipeline on the first document's text to inspect the result.
doc = nlp(df['text'].iloc[0])

In [29]:
doc[1:10]

New York City, 1908: different colors of

In [30]:
from spacy.matcher import Matcher 

In [31]:
# Candidate-term pattern over fine-grained POS tags:
#   first token : adjective (JJ) or singular noun (NN)
#   middle      : zero or more of JJ / NN / IN (preposition) / HYPH (hyphen)
#   last token  : singular noun (NN)
term_start = {'TAG': {'IN': ['JJ', 'NN']}}
term_middle = {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'}
term_end = {'TAG': 'NN'}

matcher = Matcher(nlp.vocab)
matcher.add('Term', [[term_start, term_middle, term_end]])

In [33]:
# as_spans=True makes the matcher return Span objects rather than
# (match_id, start, end) tuples, so the matched text can be read directly.
spans = matcher(doc, as_spans = True)

In [34]:
spans

[skin swirl,
 great melting,
 melting pot,
 great melting pot,
 cultural medley,
 last crevice,
 unprecedented uniformity,
 similar effect,
 global biodiversity,
 monotonous fate,
 invasive species,
 threat of invasive species,
 international commerce,
 small portion,
 global problem,
 giant global problem,
 zebra mussel,
 human interest,
 ecosystem disturbance,
 hasty action,
 irreparable damage,
 global asset,
 asset of biodiversity,
 global asset of biodiversity,
 invasive predator,
 minimal disturbance,
 human contact,
 Insufficient understanding,
 predatory land,
 land snail,
 predatory land snail,
 African land,
 giant African land,
 land snail,
 African land snail,
 giant African land snail,
 A. fulica,
 Polynesian government,
 French Polynesian government,
 hasty French Polynesian government,
 invasive control,
 control project,
 invasive control project,
 project into action,
 control project into action,
 invasive control project into action,
 resultant extinction,
 painful l

In [35]:
def get_candidates(text):
    """Return the candidate terms found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and every match
    of the module-level ``matcher`` is converted to a tuple of the matched
    tokens' ``norm_`` strings. All matches are kept, so a phrase and the
    shorter phrases inside it can both appear in the result.
    """
    parsed = nlp(text)
    found = []
    for match in matcher(parsed, as_spans=True):
        found.append(tuple(token.norm_ for token in match))
    return found

In [37]:
get_candidates(df['text'].iloc[0])

[('skin', 'swirl'),
 ('great', 'melting'),
 ('melting', 'pot'),
 ('great', 'melting', 'pot'),
 ('cultural', 'medley'),
 ('last', 'crevice'),
 ('unprecedented', 'uniformity'),
 ('similar', 'effect'),
 ('global', 'biodiversity'),
 ('monotonous', 'fate'),
 ('invasive', 'species'),
 ('threat', 'of', 'invasive', 'species'),
 ('international', 'commerce'),
 ('small', 'portion'),
 ('global', 'problem'),
 ('giant', 'global', 'problem'),
 ('zebra', 'mussel'),
 ('human', 'interest'),
 ('ecosystem', 'disturbance'),
 ('hasty', 'action'),
 ('irreparable', 'damage'),
 ('global', 'asset'),
 ('asset', 'of', 'biodiversity'),
 ('global', 'asset', 'of', 'biodiversity'),
 ('invasive', 'predator'),
 ('minimal', 'disturbance'),
 ('human', 'contact'),
 ('insufficient', 'understanding'),
 ('predatory', 'land'),
 ('land', 'snail'),
 ('predatory', 'land', 'snail'),
 ('african', 'land'),
 ('giant', 'african', 'land'),
 ('land', 'snail'),
 ('african', 'land', 'snail'),
 ('giant', 'african', 'land', 'snail'),
 ('a

In [38]:
# Extract candidates from every document (progress_apply shows a tqdm bar),
# then flatten the per-document lists into one list of token tuples.
per_document = df['text'].progress_apply(get_candidates)
candidates = [candidate for found in per_document for candidate in found]

  0%|          | 0/788 [00:00<?, ?it/s]

In [39]:
from collections import defaultdict, Counter

In [40]:
# Count candidates grouped by length: freqs[n] is a Counter over the
# n-token candidates.
freqs = defaultdict(Counter)
for candidate in candidates:
    same_length = freqs[len(candidate)]
    same_length[candidate] += 1

In [41]:
freqs.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [42]:
freqs[5].most_common(5)

[(('trend', 'of', 'part', '-', 'time'), 15),
 (('asymmetry', 'in', 'stock', 'price', 'response'), 13),
 (('cycle', '-', 'to', '-', 'cycle'), 13),
 (('interaction', 'term', 'on', 'stock', 'price'), 10),
 (('basal', 'area', 'per', 'sample', 'area'), 9)]

In [43]:
from nltk import ngrams 

In [44]:
def get_subterms(term):
    """Yield the contiguous sub-sequences of ``term`` that are shorter than it.

    Sub-sequences are produced longest first, from ``len(term) - 1`` tokens
    down to 2 tokens; single tokens and the full term itself are not yielded.
    Terms of two or fewer tokens therefore yield nothing.
    """
    size = len(term) - 1
    while size > 1:
        for gram in ngrams(term, size):
            yield gram
        size -= 1

In [45]:
from math import log2

In [46]:
def c_value(F, theta):
    """Score candidate terms with a C-value style termhood measure.

    Parameters
    ----------
    F : mapping of int -> mapping of tuple -> int
        Candidate frequencies grouped by term length in tokens, e.g. a
        ``defaultdict(Counter)``. A plain ``dict`` of dicts works as well.
    theta : float
        Threshold; only terms whose score is strictly greater than ``theta``
        are kept.

    Returns
    -------
    collections.Counter
        Maps each kept term to its score.

    Notes
    -----
    Lengths are processed from longest to shortest. A term's score is
    ``log2(length) * (frequency - discount)``, where ``discount`` is the mean
    frequency of the already-accepted longer terms that contain it, or 0 when
    there are none. Only accepted terms contribute to the discount of their
    sub-terms. ``F`` is never modified.
    """
    termhood = Counter()
    # sub-term -> frequencies of the accepted longer terms that contain it
    longer = defaultdict(list)

    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            # .get() does not create an entry for terms with no longer parent.
            containing = longer.get(term)
            discount = sum(containing) / len(containing) if containing else 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # F.get() instead of F[...]: indexing a defaultdict would
                    # insert an empty entry into the caller's data, and a
                    # plain dict would raise KeyError, whenever no candidate
                    # of that length exists.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [156]:
# Score the MICUSP candidates and keep only those whose C-value is
# strictly greater than 50 (c_value keeps a term when c > theta).
terms_2 = c_value(freqs, theta = 50)

In [157]:
# 20 highest-scoring terms; columns are C-value, raw frequency, term text.
highest_20 = terms_2.most_common(20)
for t, c in highest_20:
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  282.00  282 other hand
  264.00  264 health care
  252.00  126 part - time faculty
  206.00  206 same time
  177.52  112 long - term
  169.00  169 high school
  167.00  167 body color
  155.33   98 self - esteem
  146.00  146 wing venation
  138.00  138 eye color
  137.00  137 domestic violence
  120.46   76 decision - making
  112.53   71 low - income
  111.00  111 renewable energy
  103.02   65 quality of life
  103.02   65 state of nature
  103.02   65 spell - caster
  103.02   65 community violence exposure
  101.00  101 wild type
   97.00   97 civil society


In [158]:
# 20 lowest-scoring terms that still passed the threshold;
# columns are C-value, raw frequency, term text.
lowest_20 = tail(20, terms_2.most_common())
for t, c in lowest_20:
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

   55.00   55 storm water
   55.00   55 kinetic energy
   55.00   55 sex education
   54.00   54 grip force
   54.00   54 physical activity
   54.00  119 community violence
   53.89   34 client - provider
   53.89   34 recurrent breast cancer
   53.00   53 social interaction
   53.00   53 professional development
   52.30   33 nation - state
   52.00   52 wide range
   52.00   52 front end
   51.00   51 first step
   51.00   51 time period
   51.00   51 non -
   51.00   51 federal government
   51.00  116 violence exposure
   50.72   32 short - term
   50.72   32 idea of body


In [159]:
# Save the accepted terms, one per line, with tokens joined by single spaces.
with open('terms-2.txt', 'w') as f:
    for term in terms_2:
        f.write(' '.join(term) + '\n')

In [160]:
# Reload the saved terms as token lists, in the same shape as `terms`.
# The context manager closes the file handle once the list is built.
# NOTE: this rebinds terms_2 from a Counter to a list of lists, so the cells
# that call terms_2.most_common() fail if re-run after this one.
with open('terms-2.txt') as f:
    terms_2 = [line.split() for line in f]

In [161]:
# Drop every term that was also extracted from the MICUSP corpus, on the
# assumption that such terms are not specific to computational linguistics.
# A set of tuples gives constant-time membership tests instead of scanning
# the whole terms_2 list for each term; the result is unchanged.
excluded_terms = {tuple(t) for t in terms_2}
new_terms = [t for t in terms if tuple(t) not in excluded_terms]

In [162]:
# The list is now shorter than the original `terms` list.
len(new_terms)

633

In [163]:
# Save the filtered term list, one term per line, tokens joined by single spaces.
with open('term-final.txt', 'w') as f:
    for term in new_terms:
        f.write(' '.join(term) + '\n')