## Narrative Text Preprocessing

Requires scispaCy model "en_core_sci_sm"

To keep drug names together, a token that contains both a number and a hyphen is treated as a drug name and its number and word parts are combined, unless the token contains "year" or "old" (e.g. "45-year-old").
Remove numbers, punctuation, stopwords, and lemmatize

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Token
import pandas as pd
import re
import string

# Load the scispaCy "en_core_sci_sm" pipeline (must be installed separately).
nlp = spacy.load('en_core_sci_sm')

# remove some stop words
# These four negation words are taken out of the pipeline's default stop-word
# set (the set is modified in place).
# NOTE(review): rm_stopwords() filters on token.is_stop; confirm that flag
# actually reflects this change for the installed spaCy version.
nlp.Defaults.stop_words -= {"no", "not", "none", "nowhere"}

# All ASCII punctuation except the period.
# A plain str.replace is used instead of re.sub('\.', ...): the pattern was a
# non-raw string containing an invalid escape sequence, and no regex is needed
# to drop a single literal character.
my_punct = string.punctuation.replace('.', '')

### set token extensions
# has_num: True when at least one character of the token text is a digit
def num_getter(token):
    """Return True if ``token.text`` contains any digit character."""
    return any(map(str.isdigit, token.text))

Token.set_extension("has_num", getter=num_getter, force=True)

# has_alpha: True when at least one character of the token text is a letter
def alpha_getter(token):
    """Return True if ``token.text`` contains any alphabetic character."""
    return any(map(str.isalpha, token.text))

Token.set_extension("has_alpha", getter=alpha_getter, force=True)

# has_punct: True when at least one character of the token text is ASCII
# punctuation (a member of string.punctuation)
def punct_getter(token):
    """Return True if ``token.text`` contains any ``string.punctuation`` character."""
    for char in token.text:
        if char in string.punctuation:
            return True
    return False

Token.set_extension("has_punct", getter=punct_getter, force=True)

# is_drug: token has a number and a '-', and contains neither 'year' nor 'old'
def drug_getter(token):
    """Return True if the token looks like a hyphenated drug name.

    The token must contain a digit (``has_num``) and a hyphen, and its text
    must not contain the substrings 'year' or 'old'.
    """
    text = token.text
    if not (token._.has_num and '-' in text):
        return False
    # NOTE(review): these substring checks are case-sensitive; confirm the
    # text is lower-cased upstream if "45-Year-Old" should also be excluded.
    return 'year' not in text and 'old' not in text

# A stricter check was left disabled in the original code:
#   re.match(pattern='^[0-9]+\-[a-z]+', string=token.text)
Token.set_extension("is_drug", getter=drug_getter, force=True)

# returns the tokens that contain no number, plus any token flagged as a drug
def rm_numbers(doc):
    """Drop number-bearing tokens from ``doc`` while keeping drug names.

    A token is discarded only when its ``has_num`` extension is true and its
    ``is_drug`` extension is false. The order of the kept tokens is preserved.
    """
    kept = []
    for tok in doc:
        if tok._.has_num and not tok._.is_drug:
            continue
        kept.append(tok)
    return kept

# returns the tokens that are not punctuation
def rm_punct(doc):
    """Drop punctuation tokens from ``doc``, preserving order.

    A token is discarded when its ``is_punct`` flag is set, or when its text
    occurs inside ``string.punctuation``. The latter is a substring test, so
    it matches single punctuation characters and contiguous runs such as "()".
    """
    kept = []
    for tok in doc:
        if tok.is_punct or tok.text in string.punctuation:
            continue
        kept.append(tok)
    return kept

# returns the tokens whose is_stop flag is not set
def rm_stopwords(doc):
    """Drop every token of ``doc`` flagged as a stop word, preserving order."""
    kept = []
    for tok in doc:
        if not tok.is_stop:
            kept.append(tok)
    return kept

# strips punctuation from lemmas: drug names are squeezed together, other
# tokens get '-' and '/' turned into spaces and all other punctuation dropped
def lemmatize(doc):
    """Return the lemma of every token in ``doc`` with punctuation cleaned up.

    Tokens are updated in place (``token.lemma_`` is overwritten):

    * drug tokens (``is_drug``): every punctuation and whitespace character is
      removed from the token text, e.g. "1,25-dihydroxy" -> "125dihydroxy";
    * other tokens containing punctuation (``has_punct``): '-' and '/' become
      spaces, the remaining punctuation is deleted, and runs of whitespace
      are collapsed, e.g. "and/or." -> "and or";
    * all other tokens keep the lemma they already have.

    ``doc`` is iterated twice, so it must be a re-iterable sequence of tokens.

    Returns:
        A list with one lemma string per token, in input order.
    """
    # Translation tables are built once, outside the loop. Unlike an unescaped
    # character class built from string.punctuation, they also cover the
    # backslash (in a regex class, "\]" is read as an escaped "]").
    drug_table = str.maketrans('', '', string.punctuation)
    dropped = string.punctuation.replace('-', '').replace('/', '')
    word_table = str.maketrans('-/', '  ', dropped)

    for token in doc:
        if token._.is_drug:
            # split()/join removes any whitespace left inside the name
            token.lemma_ = ''.join(token.text.translate(drug_table).split())
        elif token._.has_punct:
            # split()/join collapses the spaces introduced for '-' and '/'
            token.lemma_ = ' '.join(token.text.translate(word_table).split())

    return [token.lemma_ for token in doc]