# Labelling Health Grant Descriptions with MeSH Terms

In [1]:
import pandas as pd
import numpy as np
import json
import spacy
import textacy
import re

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, Token

from collections import Counter

In [2]:
nlp = spacy.load('en_core_web_sm')

1. Import health terms.
2. Filter terms down to the desired level.
3. Add them as patterns to the matcher, labelled by the terms in the highest category level desired.
4. Find matches in documents.
5. Add a token attribute to hold the category label, and create a callback function to update this attribute. Run matching again.

- Stretch goal: associate terms using part of speech in a network or coocurrence matrix.

**1. Import Health Terms**

In [3]:
with open('../data/processed/mesh_codes_processed_5_8_2018.json', 'r') as f:
    health_terms = json.load(f)

**2. Filter Terms**

In [74]:
def process_string(string):
    string = string.split(', ')
    string = ' '.join(string[::-1])
    return string.lower()

def filter_terms(terms, order, on='ConceptStringProcessed'):
    filtered = {}
    for tree_number, properties in terms.items():
        if properties['tree_order'] >= order:
#             if properties.get('tree_{}_{}'.format(on, order)) is not None:
            if order == 0:
                top_parent = properties['tree_string_0']
            else:
                top_parent = properties['tree_{}_{}'.format(on, order)]
            names = list(set([process_string(properties['{}'.format(t)]) for t in ['TermString', 'ConceptNameString', 'DescriptorNameString']]))
#             names = list(set([process_string(properties['{}'.format(t)]) for t in ['ConceptNameString']]))
            for name in names:
                filtered[name.lower()] = top_parent
    return filtered

In [95]:
health_terms_0 = filter_terms(health_terms, 0)
health_terms_0.pop('will')

'psychiatry and psychology'

In [96]:
with open('../data/processed/mesh_term_label_0_n.json', 'w') as f:
    json.dump(health_terms_0, f)

In [97]:
health_terms_filtered = filter_terms(health_terms, 1)

In [98]:
len(health_terms_filtered)

55183

**4. Import Project Descriptions**

In [7]:
health_grants = pd.read_csv('../data/processed/health_research_grants_4_26_2018.csv')

n_docs = 200
descriptions = health_grants.sample(n=200, random_state=42)['public_description']

In [8]:
%time descriptions_clean = [textacy.preprocess_text(textacy.preprocess.fix_bad_unicode(d), lowercase=True) for d in descriptions]

CPU times: user 546 ms, sys: 3.61 ms, total: 550 ms
Wall time: 551 ms


**3. Create PhraseMatcher**

In [9]:
tokenizer = nlp.tokenizer

%time docs = [tokenizer(d) for d in descriptions_clean]

CPU times: user 1.92 s, sys: 21.3 ms, total: 1.94 s
Wall time: 1.97 s


In [10]:
%time phrases = [(tokenizer(k), v) for k, v in health_terms_filtered.items()]

for phrase, _ in phrases:
    for token in phrase:
        _ = tokenizer.vocab[token.text]

CPU times: user 5.82 s, sys: 183 ms, total: 6 s
Wall time: 6.08 s


In [11]:
matcher_test = PhraseMatcher(tokenizer.vocab, max_length=10)

In [12]:
%%time
for phrase in phrases:
    if phrase[1] is not None:
        if len(phrase[0]) < 10:
            matcher_test.add(phrase[1], None, phrase[0])

CPU times: user 115 ms, sys: 2.63 ms, total: 118 ms
Wall time: 118 ms


**4. Find Matches**

In [13]:
%%time
matches_test = []

for text in descriptions_clean:
    doc = tokenizer(text)
#     for w in doc:
#         _ = doc.vocab[w.text]
    matches_test.append(matcher_test(doc))

CPU times: user 1.87 s, sys: 4.76 ms, total: 1.88 s
Wall time: 1.88 s


In [14]:
print("Estimated time to execute matching on all documents:", (0.6 + 2.4 + 1.8) * (len(health_grants) / n_docs / 60), 'minutes')

Estimated time to execute matching on all documents: 15.398800000000001 minutes


(It will take roughly half an hour to apply the current matching scheme to around 40,000 documents.)

The results here seem pretty promising at first. I originally thought that there would be no matches in many documents, but it looks as if all documents have multiple matches. Perhaps the standardisation of language within the health and medical fields is pretty strong. One issue is words that are wrongly matched because they appear both in the MeSH terms and also in the documens, but clearly not in a medical sense. For example "will" is matched wherever it occurs as a "psychological process", which is clearly not going to be the case for many uses of the word.

There is also an issue with a lot of terms not being picked up. For example "healthcare" is not picked up, as well as many terms consisting of multiple words, such as "silicon sensor", where only "silicon" is detected. Also key technologies, such as "encryption" are missing.

In [16]:
Token.set_extension('is_mesh', default=False)
Token.set_extension('mesh_label', default=None)

In [17]:
for i, match in enumerate(matches_test):
    if i in [0, 100]:
        print(i, '='*100)
        for match_id, start, end in match:
            string_id = nlp.vocab.strings[match_id]  # get string representation
            span = docs[i][start:end]  # the matched span
            print('{:<16}\t{:<48}\t{}'.format(match_id, string_id, span.text))
            
            tokens = [t for t in docs[i][start:end]]
            for t in tokens:
                t._.is_mesh = True
                t._.mesh_label = health_terms_filtered.get(span.text)
            doc_mod = []
        for t in docs[i]:
            if t._.is_mesh:
                doc_mod.append(t.text.upper())
            else:
                doc_mod.append(t.text)
        print('\nFull Text:')
        print(' '.join(doc_mod), '\n')

14104734280646896271	inorganic chemicals                             	silicon
12114241926807388188	psychologic processes                           	will
10687308069696298868	tissues                                         	tissue
12114241926807388188	psychologic processes                           	will
737360591271645834	humanities                                      	knowledge
737360591271645834	humanities                                      	human body
4877011699741374108	human activities                                	travel
10687308069696298868	tissues                                         	tissue
737360591271645834	humanities                                      	knowledge
14104734280646896271	inorganic chemicals                             	silicon
12114241926807388188	psychologic processes                           	will
3380507654813987851	physical processes                              	time
10687308069696298868	tissues                                         	tissue
141

**5. Add Token Attribute and Callback**

(Leaving for now, as there are improvements to make first)

### Rebuilding Docs using Matches Only

Let's create versions of the documents that contain only the MeSH terms.

In [19]:
new_descriptions = []
for i, match in enumerate(matches_test):
    d = []
    for match_id, start, end in match:
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = docs[i][start:end]  # the matched span
        d.append(span.text)
    new_descriptions.append(d)

A document now looks like this:

In [20]:
' '.join(new_descriptions[0])

'silicon will tissue will knowledge human body travel tissue knowledge silicon will time tissue silicon silicon back time time time feedback equipment'

In [21]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [22]:
description_term_counts = Counter(flatten(new_descriptions))

In [23]:
len(flatten(new_descriptions))

6576

In [24]:
description_term_counts.most_common(20)

[('will', 808),
 ('work', 139),
 ('disease', 109),
 ('award', 96),
 ('time', 80),
 ('risk', 74),
 ('technology', 63),
 ('methods', 62),
 ('resources', 60),
 ('hiv', 58),
 ('knowledge', 55),
 ('life', 54),
 ('population', 54),
 ('cells', 53),
 ('programs', 53),
 ('cell', 52),
 ('safety', 51),
 ('policy', 48),
 ('hospital', 45),
 ('education', 44)]

### Apply To All Documents

In [628]:
descriptions_all = health_grants['public_description']

In [632]:
%time descriptions_all_clean = [textacy.preprocess_text(textacy.preprocess.fix_bad_unicode(d), lowercase=True) for d in descriptions_all]

CPU times: user 1min 36s, sys: 1.63 s, total: 1min 38s
Wall time: 1min 41s


In [630]:
tokenizer = nlp.tokenizer

In [633]:
%time docs = [tokenizer(d) for d in descriptions_all_clean]

CPU times: user 5min 21s, sys: 3.76 s, total: 5min 24s
Wall time: 5min 30s


In [48]:
%time phrases = [(tokenizer(k), v) for k, v in health_terms_filtered.items()]

for phrase, _ in phrases:
    for token in phrase:
        _ = tokenizer.vocab[token.text]

CPU times: user 4.63 s, sys: 237 ms, total: 4.86 s
Wall time: 4.93 s


In [635]:
matcher = PhraseMatcher(tokenizer.vocab, max_length=10)

In [636]:
%%time
for phrase in phrases:
    if phrase[1] is not None:
        if len(phrase[0]) < 10:
            matcher.add(phrase[1], None, phrase[0])

CPU times: user 117 ms, sys: 7 ms, total: 124 ms
Wall time: 124 ms


In [637]:
%%time
matches = []

for text in descriptions_all_clean:
    doc = tokenizer(text)
#     for w in doc:
#         _ = doc.vocab[w.text]
    matches.append(matcher(doc))

CPU times: user 5min 26s, sys: 2.21 s, total: 5min 28s
Wall time: 5min 33s


In [638]:
new_descriptions = []
for i, match in enumerate(matches):
    d = []
    for match_id, start, end in match:
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = docs[i][start:end]  # the matched span
        d.append(span.text)
    new_descriptions.append(d)

In [639]:
description_term_counts = Counter(flatten(new_descriptions))

In [661]:
description_term_counts.most_common(20)

[('will', 134092),
 ('disease', 18178),
 ('work', 16793),
 ('cells', 15420),
 ('award', 13002),
 ('risk', 12848),
 ('time', 12533),
 ('technology', 11314),
 ('methods', 11239),
 ('safety', 10701),
 ('cell', 10422),
 ('children', 9542),
 ('policy', 9538),
 ('programs', 9273),
 ('life', 9149),
 ('knowledge', 9094),
 ('population', 8695),
 ('hiv', 8678),
 ('brain', 7709),
 ('education', 7676)]

We definitely need to get rid of the word "will" from all documents" as it occurs 10 times more than for any other document.

In [647]:
new_descriptions_filtered = [[t for t in n_d if t != 'will'] for n_d in new_descriptions]

**Possible Improvements**

1. Enhance the list of terms by splitting into unigrams and filtering both the most uncommon and most common. Manually check for terms that are sufficiently appropriate in each category. - **Not sure if this is actually going to be useful after trying..**.
2. ~~Use POS tagging to stop tagging words that are not used in a medical sense, e.g. "will". It seems like all of the MeSH terms are nouns, but I could be wrong...~~ **Doesn't work unless we use a regular `Matcher`, which is much slower. One alternative could be to filter terms by frequency instead.**
3. Match on lemmas - **Also only possible with a regular `Matcher`.**

**Think About**
- Best way to use the transformed documents - clustering vs network.
- Whether there's a computationally efficient way to find similar words within a document (probably most efficient just to stick with spaCy's PhraseMatcher).
- Using POS tagging to stop tagging words that are not used in a medical sense, e.g. "will". It seems like all of the MeSH terms are nouns, but I could be wrong...

## 2. Improved Iteration

### Create MeSH Term Unigrams

There's a risk that many of the terms in the MeSh list may not be found in documents because the phrasing in the documents does not match the exact form of the MeSh term. To prevent this, we can split the terms into their unigram consituents, find the unigrams that occur under only one parent category, and do any other filtering to be left with a set of terms that are single words corresponding to a MeSH category at the order we have chosen.

In [25]:
string_phrases = [(a.text, b) for a, b in phrases]

In [26]:
string_phrases[:10]

[('calcium channel agonists', 'and proteins peptides amino acids'),
 ('(+)-isomer bay-k-8644', 'heterocyclic compounds'),
 ('(+)-isomer methoxyhydroxyphenylglycol', 'organic chemicals'),
 ('methoxyhydroxyphenylglycol', 'organic chemicals'),
 ('muscarinic antagonists', 'heterocyclic compounds'),
 ('(+)-isomer oxyphenonium bromide', 'organic chemicals'),
 ('serotonin receptor agonists', 'and proteins peptides amino acids'),
 ('(+,-)-isomer 2,5-dimethoxy-4-methylamphetamine', 'organic chemicals'),
 ('cytochrome p-450 cyp2d6 inhibitors', 'heterocyclic compounds'),
 ('(+-)-isomer bupropion', 'organic chemicals')]

Outline of the process:

- Map unigrams to labels
- Eliminate those that appear in multiple labels
- Eliminate those that are too uncommon
- Eliminate those that are too common
- Eliminate from map 
- Update original map

First we split the phrases into unigrams and map them to their labels (`unigram_mesh_terms`).

In [27]:
unigram_mesh_terms = []

for phrase, label in string_phrases:
    phrase = phrase.split(' ')
    for p in phrase:
        unigram_mesh_terms.append((p, label))

We then find out how many categories each of the unigrams appears in (`unigram_mesh_terms_label_counts`).

In [28]:
unigram_mesh_terms_label_counts = {}

for phrase, label in unigram_mesh_terms:
    if phrase not in unigram_mesh_terms_label_counts:
        unigram_mesh_terms_label_counts[phrase] = {'terms': [],
                                                  'count': 0}
    if label in unigram_mesh_terms_label_counts[phrase]['terms']:
        continue
    else:
        unigram_mesh_terms_label_counts[phrase]['terms'].append(label)
        unigram_mesh_terms_label_counts[phrase]['count'] = len(unigram_mesh_terms_label_counts[phrase]['terms'])

Next, we find all of those that appear in more than one category, and we remove them from the original list of unigrams (`unigram_mesh_term_singles`), and make a `Counter` of the terms (`unigram_mesh_term_counts`).

In [29]:
unigram_mesh_terms_label_multis = [phrase for phrase, properties in unigram_mesh_terms_label_counts.items() if properties['count'] != 1 ]

In [30]:
unigram_mesh_term_singles = [(t[0], t[1]) for t in unigram_mesh_terms if t[0] not in unigram_mesh_terms_label_multis]

In [31]:
len(unigram_mesh_term_singles)

39079

In [32]:
unigram_mesh_term_counts = Counter([t[0] for t in unigram_mesh_term_singles])

We're still left with 30,000 unigrams, so we probably need to filter a bit more.

In [33]:
len(unigram_mesh_term_counts)

30083

Let's have a look at some of the most common...

In [34]:
unigram_mesh_term_counts.most_common(20)

[('transporter', 73),
 ('chromosomes', 61),
 ('republic', 43),
 ('rats', 42),
 ('institute', 27),
 ('electrophoresis', 26),
 ('oxidoreductases', 26),
 ('cyclin', 26),
 ('assays', 25),
 ('transporters', 24),
 ('lactobacillus', 24),
 ('isotopes', 23),
 ('waste', 22),
 ('spectrometry', 22),
 ('myosin', 21),
 ('gel', 20),
 ('acetyltransferase', 20),
 ('inborn', 20),
 ('perceptions', 18),
 ('methyltransferase', 18)]

... and the most uncommon.

In [35]:
uncommon_n = 1
unigram_mesh_term_counts_uncommon = Counter({k: v for k, v in unigram_mesh_term_counts.items() if v <= uncommon_n})

In [36]:
unigram_mesh_term_counts_uncommon.most_common(20)

[('bay-k-8644', 1),
 ('oxyphenonium', 1),
 ('(+,-)-isomer', 1),
 ('2,5-dimethoxy-4-methylamphetamine', 1),
 ('bupropion', 1),
 ('fenfluramine', 1),
 ('tocainide', 1),
 ('tomoxetine', 1),
 ('bisoprolol', 1),
 ('encainide', 1),
 ('(1', 1),
 ('alpha,2', 1),
 ('(1(s*(s*)),2', 1),
 ('fosinopril', 1),
 ('antihypertensive', 1),
 ('(11', 1),
 ('(11a', 1),
 ('anthramycin', 1),
 ('(13', 1),
 ('radiopharmaceuticals', 1)]

From this we can see that we can filter:
- Terms that are only numbers or numbers and punctuation
- Short tokens

In [37]:
unigram_mesh_term_singles = [t for t in unigram_mesh_term_singles if not t[0].isdigit()]
unigram_mesh_term_singles = [t for t in unigram_mesh_term_singles if re.match("^[A-Za-z0-9]*$", t[0])]
unigram_mesh_term_singles = [t for t in unigram_mesh_term_singles if len(t[0]) > 2]

In [38]:
unigram_mesh_term_singles_counts = Counter(unigram_mesh_term_singles)

After doing this, we can see that we're left with a more sensible list. There's a few terms that seem a bit vague, but without handpicking them out, it seems like it could be an unnecessarily complex task to remove them, so we'll leave it for now.

In [39]:
unigram_mesh_term_singles_counts.most_common(20)

[(('transporter', 'and proteins peptides amino acids'), 73),
 (('chromosomes', 'genetic processes'), 61),
 (('republic', 'geographic locations'), 43),
 (('rats', 'eukaryota'), 42),
 (('institute', 'health care economics and organizations'), 27),
 (('electrophoresis', 'investigative techniques'), 26),
 (('oxidoreductases', 'enzymes and coenzymes'), 26),
 (('cyclin', 'and proteins peptides amino acids'), 26),
 (('assays', 'investigative techniques'), 25),
 (('transporters', 'and proteins peptides amino acids'), 24),
 (('lactobacillus', 'bacteria'), 24),
 (('isotopes', 'inorganic chemicals'), 23),
 (('waste', 'environment and public health'), 22),
 (('spectrometry', 'investigative techniques'), 22),
 (('myosin', 'and proteins peptides amino acids'), 21),
 (('gel', 'investigative techniques'), 20),
 (('acetyltransferase', 'enzymes and coenzymes'), 20),
 (('inborn', 'nutritional and metabolic diseases'), 20),
 (('perceptions', 'psychologic processes'), 18),
 (('methyltransferase', 'enzymes 

In [40]:
uncommon_n = 1
unigram_mesh_term_sinlges_counts_uncommon = Counter({k: v for k, v in unigram_mesh_term_singles_counts.items() if v <= uncommon_n})

In [41]:
unigram_mesh_term_sinlges_counts_uncommon.most_common(20)

[(('oxyphenonium', 'organic chemicals'), 1),
 (('bupropion', 'organic chemicals'), 1),
 (('fenfluramine', 'organic chemicals'), 1),
 (('tocainide', 'organic chemicals'), 1),
 (('tomoxetine', 'organic chemicals'), 1),
 (('bisoprolol', 'organic chemicals'), 1),
 (('encainide', 'organic chemicals'), 1),
 (('fosinopril', 'and proteins peptides amino acids'), 1),
 (('antihypertensive', 'heterocyclic compounds'), 1),
 (('anthramycin', 'heterocyclic compounds'), 1),
 (('radiopharmaceuticals', 'organic chemicals'), 1),
 (('ethynodiol', 'polycyclic compounds'), 1),
 (('menthol', 'lipids'), 1),
 (('antipruritics', 'signs and symptoms pathological conditions'), 1),
 (('enprostil', 'biological factors'), 1),
 (('permethrin', 'organic chemicals'), 1),
 (('swainsonine', 'heterocyclic compounds'), 1),
 (('lincomycin', 'carbohydrates'), 1),
 (('pipecuronium', 'heterocyclic compounds'), 1),
 (('pregnanolone', 'polycyclic compounds'), 1)]

In [42]:
unigram_mesh_terms = list(set(unigram_mesh_term_singles))

In [43]:
len(unigram_mesh_terms)

25617

**3. Create PhraseMatcher**

In [47]:
unigram_mesh_terms_parsed = [(tokenizer(k), v) for k, v in unigram_mesh_terms]

In [49]:
%time phrases_all = phrases + unigram_mesh_terms_parsed

for phrase, _ in phrases_all:
    for token in phrase:
        _ = tokenizer.vocab[token.text]

CPU times: user 1.07 ms, sys: 64 µs, total: 1.13 ms
Wall time: 1.14 ms


In [50]:
matcher_test_all = PhraseMatcher(tokenizer.vocab, max_length=10)

In [51]:
%%time
for phrase in phrases_all:
    if phrase[1] is not None:
        if len(phrase[0]) < 10:
            matcher_test_all.add(phrase[1], None, phrase[0])

CPU times: user 168 ms, sys: 6.97 ms, total: 175 ms
Wall time: 175 ms


**4. Find Matches**

In [52]:
%%time
matches_test_all = []

for text in descriptions_clean:
    doc = tokenizer(text)
#     for w in doc:
#         _ = doc.vocab[w.text]
    matches_test_all.append(matcher_test(doc))

CPU times: user 1.9 s, sys: 18.9 ms, total: 1.92 s
Wall time: 1.95 s


In [53]:
print("Estimated time to execute matching on all documents:", (0.6 + 2.4 + 1.8) * (len(health_grants) / n_docs / 60), 'minutes')

Estimated time to execute matching on all documents: 15.398800000000001 minutes


(It will take roughly half an hour to apply the current matching scheme to around 40,000 documents.)

The results here seem pretty promising at first. I originally thought that there would be no matches in many documents, but it looks as if all documents have multiple matches. Perhaps the standardisation of language within the health and medical fields is pretty strong. One issue is words that are wrongly matched because they appear both in the MeSH terms and also in the documens, but clearly not in a medical sense. For example "will" is matched wherever it occurs as a "psychological process", which is clearly not going to be the case for many uses of the word.

There is also an issue with a lot of terms not being picked up. For example "healthcare" is not picked up, as well as many terms consisting of multiple words, such as "silicon sensor", where only "silicon" is detected. Also key technologies, such as "encryption" are missing.

In [54]:
for i, match in enumerate(matches_test):
    if i in [0, 100]:
        print(i, '='*100)
        for match_id, start, end in match:
            string_id = nlp.vocab.strings[match_id]  # get string representation
            span = docs[i][start:end]  # the matched span
            print('{:<16}\t{:<48}\t{}'.format(match_id, string_id, span.text))
            
            tokens = [t for t in docs[i][start:end]]
            for t in tokens:
                t._.is_mesh = True
                t._.mesh_label = health_terms_filtered.get(span.text)
            doc_mod = []
        for t in docs[i]:
            if t._.is_mesh:
                doc_mod.append(t.text.upper())
            else:
                doc_mod.append(t.text)
        print('\nFull Text:')
        print(' '.join(doc_mod), '\n')

14104734280646896271	inorganic chemicals                             	silicon
12114241926807388188	psychologic processes                           	will
10687308069696298868	tissues                                         	tissue
12114241926807388188	psychologic processes                           	will
737360591271645834	humanities                                      	knowledge
737360591271645834	humanities                                      	human body
4877011699741374108	human activities                                	travel
10687308069696298868	tissues                                         	tissue
737360591271645834	humanities                                      	knowledge
14104734280646896271	inorganic chemicals                             	silicon
12114241926807388188	psychologic processes                           	will
3380507654813987851	physical processes                              	time
10687308069696298868	tissues                                         	tissue
141

In [60]:
health_terms_all = {k.text: v for k, v in phrases_all}

In [63]:
with open('../data/processed/mesh_term_label_1_n.json', 'w') as f:
    json.dump(health_terms_all, f)

#### spaCy Custom Pipeline Module for Phrase Matching

In [None]:
Token.set_extension('is_mesh', default=False)
Token.set_extension('mesh_label', default=None)
Doc.set_extension('mesh_terms', default=[])
Doc.set_extension('mesh_label', default=[])

class MeshTermMatcher(object):
    name = 'mesh_term_matcher'

    def __init__(self, tokenizer, terms):
        patterns = [(tokenizer(text), label) for text, label in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        for pattern, label in patterns:
            self.matcher.add(label, None, pattern)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc._.mesh_labels = terms.get(label)
            for token in span:
                label = terms.get(token.text)
                token._.is_mesh = True
                token._.mesh_label = label
                doc._.mesh_labels.append(label)
                doc._.mesh_terms.append(token.text)
        return doc

### Play with MeSH Term Hierarchy

In [511]:
def filter_terms(terms, min_order, max_order, on='ConceptStringProcessed', get_strings_from=['TermString', 'ConceptNameString', 'DescriptorNameString']):
    filtered = {}
    for tree_number, properties in terms.items():
        if (properties['tree_order'] >= min_order) & (properties['tree_order'] <= max_order):
            top_parent = properties['tree_{}_{}'.format(on, min_order)]
            names = list(set([process_string(properties['{}'.format(t)]) for t in get_strings_from]))
            for name in names:
                filtered[name.lower()] = top_parent
    return filtered

In [512]:
health_terms_filtered_1_3 = filter_terms(health_terms, 1, 3)

In [513]:
len(health_terms_filtered_1_3)

12986