In [1]:
import spacy
from spacy.attrs import ORTH

In [35]:
# Load the small English pipeline that every later cell uses as `nlp`.
nlp = spacy.load("en_core_web_sm")

# Register each legal phrase as a tokenizer special case whose single output
# token carries the phrase verbatim (ORTH must equal the matched string).
for phrase in ('Vet. App.', 'Veterans Law Judge', 'Veterans Affairs', "Veterans' Appeals"):
    nlp.tokenizer.add_special_case(phrase, [{ORTH: phrase}])

In [36]:
# Tag a handful of single-token texts as punctuation so that spacy_tokenize,
# which skips tokens whose pos_ is 'PUNCT', drops them.
ruler = nlp.get_pipe("attribute_ruler")
# One single-token pattern per exact token text.
# NOTE(review): the bare "t" entry may have been intended as "\t" -- confirm.
patterns = [[{"TEXT": token_text}] for token_text in ("[", "\n", "'", "\r", "t")]
attrs = {"POS": "PUNCT"}
# index=0 selects the first (here: only) token of each match as the one to modify.
ruler.add(patterns=patterns, attrs=attrs, index=0)


In [37]:
def spacy_tokenize(txt):
    """Tokenize ``txt`` with the module-level ``nlp`` pipeline and normalize the tokens.

    Each token is handled by the first rule that matches:

    * ``pos_ == 'PUNCT'``: the token is dropped;
    * ``pos_ == 'NUM'``: the token becomes ``<NUM{k}>`` with ``k = len(token)``;
    * lemma equal to ``'s``: the token is dropped;
    * lemma containing ``(``: every ``(`` is removed and the remaining text is
      tokenized again through a recursive call;
    * lemma containing a newline: the lemma is split on newlines, pieces equal
      to ``''`` or ``' '`` are discarded, the rest are joined with spaces and
      tokenized again through a recursive call;
    * anything else: the lower-cased lemma is kept.

    Args:
        txt: The text to tokenize.

    Returns:
        list: The normalized token strings, in document order.
    """
    clean_tokens = []
    for t in nlp(txt):
        lemma = t.lemma_
        if t.pos_ == 'PUNCT':
            continue
        if t.pos_ == 'NUM':
            clean_tokens.append(f'<NUM{len(t)}>')
        elif lemma == "'s":
            continue
        elif '(' in lemma:
            # Build the cleaned text from THIS token only. The previous version
            # kept appending to one buffer shared by all tokens, so a second
            # token with "(" was re-tokenized together with the first one's text.
            without_paren = lemma.replace('(', '')
            clean_tokens.extend(spacy_tokenize(without_paren))
        elif "\n" in lemma:
            # Same per-token rule here: only this lemma's own pieces are
            # re-tokenized, each preceded by a single space.
            pieces = [piece for piece in lemma.split('\n') if piece != ' ' and piece != '']
            rejoined = ''.join(' ' + piece for piece in pieces)
            clean_tokens.extend(spacy_tokenize(rejoined))
        else:
            clean_tokens.append(lemma.lower())
    return clean_tokens

In [41]:
# Sample sentences used as inputs for spacy_tokenize in the cell below.
# NOTE(review): the name prefixes (procedure, fact, cit, seg, basic, rule,
# mixed) presumably label the kind of sentence each sample represents -- confirm.
example_procedure_1 = "Archive Date: 07/11/16"
example_fact_1 = "He related that he was \"having a reoccurrence of bronchitis [symptoms].\""
example_procedure_2 = "Veterans Law Judge, Board of Veterans' Appeals\n\n\n\nDepartment of Veterans Affairs"
example_cit_2 = "38 U.S.C.A. \u00a7\u00a7 1101, 1110 (West 2014); 38 C.F.R. \u00a7\u00a7 3.303, 3.304 (2015)."
# Contains newlines inside and between phrases that are registered as tokenizer special cases.
example_seg_1 = "Acting Veterans \n Law Judge, Board of Veterans' Appeals\n\nUnder 38 U.S.C.A. \u00a7 7252, only \ntoday a decision of the Board is appealable to the Court."
example_basic_1 = 'In sum, as the preponderance of the evidence is against the Veteran\'s claim, his appeal must be denied.'
example_cit_1 = 'Smith v. Gober, 14 Vet. App. 227 (2000), aff\'d 281 F.3d 1384 (Fed. Cir. 2002); Dela Cruz v. Principi, 15 Vet. App. 143 (2001); see also Quartuccio v. Principi, 16 Vet. App. 183 (2002).'
example_rule_1 = '"To establish a right to compensation for a present disability, a Veteran must show: "(1) the existence of a present disability; (2) in-service incurrence or aggravation of a disease or injury; and (3) a causal relationship between the present disability and the disease or injury incurred or aggravated during service"-the so-called "nexus" requirement."'
# NOTE(review): the U+FFFD replacement characters in the next string look like
# mis-decoded section signs (the other samples use \u00a7) -- confirm whether
# they are intentional test data before changing them.
example_mixed_1 = 'In Dingess v. Nicholson, 19 Vet. App. 473 (2006), the U.S. Court of Appeals for Veterans Claims held that, upon receipt of an application for a service-connection claim, 38 U.S.C.A. � 5103(a) and 38 C.F.R. � 3.159(b) require VA to provide the claimant with notice that a disability rating and an effective date for the award of benefits will be assigned if service connection is awarded. '

In [42]:
# Show one sample next to its normalized tokens and the token count.
text = example_seg_1
print(text)
# Tokenize once and reuse the result: running the pipeline a second time just
# to obtain the length repeats the same work for the same output.
tokens = spacy_tokenize(text)
print(tokens, len(tokens))

Acting Veterans 
 Law Judge, Board of Veterans' Appeals

Under 38 U.S.C.A. § 7252, only 
today a decision of the Board is appealable to the Court.
['acting', 'veterans', 'law', 'judge', 'board', 'of', "veterans' appeal", 'under', '<NUM2>', 'u.s.c.a.', '§', '<NUM4>', 'only', 'today', 'a', 'decision', 'of', 'the', 'board', 'be', 'appealable', 'to', 'the', 'court'] 24
