In [9]:
import json 

import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

## 1) Hent original modell

In [10]:
# Load the pretrained Norwegian spaCy model that the customisations below build on.
nlp = spacy.load("nb_core_news_sm")

## 2) Justere på tokenizeren

### Ikke splitte ord med bindestrek

Ønsker å beholde sammensatte ord med bindestreker, ikke splitte dem:

In [3]:
# Baseline: tokenize a hyphenated compound with the model's default tokenizer.
doc = nlp("14a-vedtak")
print([token.text for token in doc])

['14a', '-', 'vedtak']


In [11]:
# Quote characters usable as infixes; the plain apostrophe is removed from the set.
_quotes = CONCAT_QUOTES.replace("'", "")

# Infix regex templates, kept in their original order. Placeholders are filled
# in a single pass below: {a}=ALPHA, {al}=ALPHA_LOWER, {au}=ALPHA_UPPER, {q}=_quotes.
# None of them matches a single hyphen that follows a letter, which is why
# compounds such as "14a-vedtak" stay in one piece.
_infix_templates = [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}])\.(?=[{au}])",
    r"(?<=[{a}])[,!?](?=[{a}])",
    r"(?<=[{a}])[:<>=](?=[{a}])",
    r"(?<=[{a}]),(?=[{a}])",
    r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])",
    r"(?<=[{a}])--(?=[{a}])",
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])",
]

infixes = list(LIST_ELLIPSES)
infixes.extend(LIST_ICONS)
infixes.extend(
    template.format(a=ALPHA, al=ALPHA_LOWER, au=ALPHA_UPPER, q=_quotes)
    for template in _infix_templates
)

# Compile the combined patterns and install them as the tokenizer's infix finder.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Re-run the earlier example to show the effect of the new infix rules.
doc = nlp("14a-vedtak")
print([token.text for token in doc])

['14a-vedtak']


### Fjerne noen spesielle regler

Den norske spacy-modellen inneholder en liste med regler og unntak som gjør at vanlige forkortelser som "t.o.m." ikke splittes opp til "t", ".", "o", ".", "m", "." 

Det er vel og bra, men på lista er det også noen ting som ikke gir mening å beholde samlet, som "i.".

In [12]:
# Replacement table of tokenizer special cases. Each key is an exact input
# string and each value is a list of per-token attribute dicts for that string.
# NOTE(review): the integer keys look like spaCy attribute IDs (65 presumably
# ORTH; 73/74/75 presumably LEMMA/POS/TAG) — confirm against spacy.attrs for
# the installed spaCy version.
# This appears to be the model's default exception list with entries that
# should not be kept together (e.g. "i.") left out; no "i." key is present.
custom_rules = dict({'\t': [{65: '\t', 74: 103, 75: '_SP'}],
 '\n': [{65: '\n', 74: 103, 75: '_SP'}],
 ' ': [{65: ' ', 74: 103, 75: '_SP'}],
 '")': [{65: '")'}],
 "'": [{65: "'"}],
 "''": [{65: "''"}],
 '(*_*)': [{65: '(*_*)'}],
 '(-8': [{65: '(-8'}],
 '(-:': [{65: '(-:'}],
 '(-;': [{65: '(-;'}],
 '(-_-)': [{65: '(-_-)'}],
 '(._.)': [{65: '(._.)'}],
 '(:': [{65: '(:'}],
 '(;': [{65: '(;'}],
 '(=': [{65: '(='}],
 '(>_<)': [{65: '(>_<)'}],
 '(^_^)': [{65: '(^_^)'}],
 '(o:': [{65: '(o:'}],
 '(¬_¬)': [{65: '(¬_¬)'}],
 '(ಠ_ಠ)': [{65: '(ಠ_ಠ)'}],
 '(╯°□°）╯︵┻━┻': [{65: '(╯°□°）╯︵┻━┻'}],
 ')-:': [{65: ')-:'}],
 '):': [{65: '):'}],
 '-_-': [{65: '-_-'}],
 '-__-': [{65: '-__-'}],
 '._.': [{65: '._.'}],
 '0.0': [{65: '0.0'}],
 '0.o': [{65: '0.o'}],
 '0_0': [{65: '0_0'}],
 '0_o': [{65: '0_o'}],
 '8)': [{65: '8)'}],
 '8-)': [{65: '8-)'}],
 '8-D': [{65: '8-D'}],
 '8D': [{65: '8D'}],
 ":'(": [{65: ":'("}],
 ":')": [{65: ":')"}],
 ":'-(": [{65: ":'-("}],
 ":'-)": [{65: ":'-)"}],
 ':(': [{65: ':('}],
 ':((': [{65: ':(('}],
 ':(((': [{65: ':((('}],
 ':()': [{65: ':()'}],
 ':)': [{65: ':)'}],
 ':))': [{65: ':))'}],
 ':)))': [{65: ':)))'}],
 ':*': [{65: ':*'}],
 ':-(': [{65: ':-('}],
 ':-((': [{65: ':-(('}],
 ':-(((': [{65: ':-((('}],
 ':-)': [{65: ':-)'}],
 ':-))': [{65: ':-))'}],
 ':-)))': [{65: ':-)))'}],
 ':-*': [{65: ':-*'}],
 ':-/': [{65: ':-/'}],
 ':-0': [{65: ':-0'}],
 ':-3': [{65: ':-3'}],
 ':->': [{65: ':->'}],
 ':-D': [{65: ':-D'}],
 ':-O': [{65: ':-O'}],
 ':-P': [{65: ':-P'}],
 ':-X': [{65: ':-X'}],
 ':-]': [{65: ':-]'}],
 ':-o': [{65: ':-o'}],
 ':-p': [{65: ':-p'}],
 ':-x': [{65: ':-x'}],
 ':-|': [{65: ':-|'}],
 ':-}': [{65: ':-}'}],
 ':/': [{65: ':/'}],
 ':0': [{65: ':0'}],
 ':1': [{65: ':1'}],
 ':3': [{65: ':3'}],
 ':>': [{65: ':>'}],
 ':D': [{65: ':D'}],
 ':O': [{65: ':O'}],
 ':P': [{65: ':P'}],
 ':X': [{65: ':X'}],
 ':]': [{65: ':]'}],
 ':o': [{65: ':o'}],
 ':o)': [{65: ':o)'}],
 ':p': [{65: ':p'}],
 ':x': [{65: ':x'}],
 ':|': [{65: ':|'}],
 ':}': [{65: ':}'}],
 ':’(': [{65: ':’('}],
 ':’)': [{65: ':’)'}],
 ':’-(': [{65: ':’-('}],
 ':’-)': [{65: ':’-)'}],
 ';)': [{65: ';)'}],
 ';-)': [{65: ';-)'}],
 ';-D': [{65: ';-D'}],
 ';D': [{65: ';D'}],
 ';_;': [{65: ';_;'}],
 '<.<': [{65: '<.<'}],
 '</3': [{65: '</3'}],
 '<3': [{65: '<3'}],
 '<33': [{65: '<33'}],
 '<333': [{65: '<333'}],
 '<space>': [{65: '<space>'}],
 '=(': [{65: '=('}],
 '=)': [{65: '=)'}],
 '=/': [{65: '=/'}],
 '=3': [{65: '=3'}],
 '=D': [{65: '=D'}],
 '=|': [{65: '=|'}],
 '>.<': [{65: '>.<'}],
 '>.>': [{65: '>.>'}],
 '>:(': [{65: '>:('}],
 '>:o': [{65: '>:o'}],
 '><(((*>': [{65: '><(((*>'}],
 '@_@': [{65: '@_@'}],
 'Aq.': [{65: 'Aq.'}],
 'C++': [{65: 'C++'}],
 'E. coli': [{65: 'E. coli'}],
 'O.O': [{65: 'O.O'}],
 'O.o': [{65: 'O.o'}],
 'O_O': [{65: 'O_O'}],
 'O_o': [{65: 'O_o'}],
 'V.V': [{65: 'V.V'}],
 'V_V': [{65: 'V_V'}],
 'XD': [{65: 'XD'}],
 'XDD': [{65: 'XDD'}],
 '[-:': [{65: '[-:'}],
 '[:': [{65: '[:'}],
 '\\")': [{65: '\\")'}],
 '\\n': [{65: '\\n', 74: 103, 75: '_SP'}],
 '\\t': [{65: '\\t', 74: 103, 75: '_SP'}],
 '^_^': [{65: '^_^'}],
 '^__^': [{65: '^__^'}],
 '^___^': [{65: '^___^'}],
 'a.m.': [{65: 'a.m.'}],
 'adm.dir.': [{65: 'adm.dir.'}],
 'b.c.': [{65: 'b.c.'}],
 'bl.a.': [{65: 'bl.a.'}],
 'c.c.': [{65: 'c.c.'}],
 'cand.mag.': [{65: 'cand.mag.'}],
 'd.d.': [{65: 'd.d.'}],
 'd.m.': [{65: 'd.m.'}],
 'd.y.': [{65: 'd.y.'}],
 'dr.philos.': [{65: 'dr.philos.'}],
 'e.Kr.': [{65: 'e.Kr.'}],
 'e.l.': [{65: 'e.l.'}],
 'f.Kr.': [{65: 'f.Kr.'}],
 'f.eks.': [{65: 'f.eks.'}],
 'f.o.m.': [{65: 'f.o.m.'}],
 'h.r.adv.': [{65: 'h.r.adv.'}],
 'kgl.res.': [{65: 'kgl.res.'}],
 'm.a.o.': [{65: 'm.a.o.'}],
 'm.m.': [{65: 'm.m.'}],
 'mag.art.': [{65: 'mag.art.'}],
 'o.0': [{65: 'o.0'}],
 'o.O': [{65: 'o.O'}],
 'o.a.': [{65: 'o.a.'}],
 'o.l.': [{65: 'o.l.'}],
 'o.o': [{65: 'o.o'}],
 'o_0': [{65: 'o_0'}],
 'o_O': [{65: 'o_O'}],
 'o_o': [{65: 'o_o'}],
 'p.a.': [{65: 'p.a.'}],
 'p.m.': [{65: 'p.m.'}],
 'p.t.': [{65: 'p.t.'}],
 'ph.d.': [{65: 'ph.d.'}],
 'red.anm.': [{65: 'red.anm.'}],
 'res.kap.': [{65: 'res.kap.'}],
 's.d.': [{65: 's.d.'}],
 's.u.': [{65: 's.u.'}],
 's.å.': [{65: 's.å.'}],
 'st.meld.': [{65: 'st.meld.'}],
 'st.prp.': [{65: 'st.prp.'}],
 't.o.m.': [{65: 't.o.m.'}],
 'tl;dr': [{65: 'tl;dr'}],
 'v.v': [{65: 'v.v'}],
 'v_v': [{65: 'v_v'}],
 'vit.ass.': [{65: 'vit.ass.'}],
 'xD': [{65: 'xD'}],
 'xDD': [{65: 'xDD'}],
 '\xa0': [{65: '\xa0', 74: 103, 73: '  ', 75: '_SP'}],
 '¯\\(ツ)/¯': [{65: '¯\\(ツ)/¯'}],
 'ಠ_ಠ': [{65: 'ಠ_ಠ'}],
 'ಠ︵ಠ': [{65: 'ಠ︵ಠ'}],
 '—': [{65: '—'}],
 '’': [{65: '’'}],
                    })

# Add one special case per whole-number percentage, "1%" through "100%",
# so that each such string is kept as a single token.
for i in range(1, 101):
    percent_text = f'{i}%'
    custom_rules[percent_text] = [{65: percent_text}]

In [14]:
# Replace the tokenizer's special-case rules wholesale with the custom table.
nlp.tokenizer.rules = custom_rules

## 3) Lager en entity ruler

### Personnavn

Ønsker å vaske bort alle personnavn. Tilrettelegger for dette ved å legge inn SSBs navnelister som entity rules

In [15]:
# Read the name patterns from a JSON Lines file (one JSON pattern object per line).
# The context manager closes the file handle, and the encoding is pinned to
# UTF-8 (the JSON Lines convention) so names containing æ/ø/å load the same way
# on every platform instead of depending on the locale default.
with open("NAVN_PATTERN.jsonl", "r", encoding="utf-8") as pattern_file:
    contents = pattern_file.read()

# Blank lines are skipped so that an empty file or a stray empty line does not
# make json.loads fail on an empty string.
patterns = [json.loads(line) for line in contents.split('\n') if line.strip()]

In [16]:
# Create the rule-based entity matcher for this pipeline.
# NOTE(review): overwrite_ents=True presumably lets rule matches replace
# overlapping entities set by earlier components — confirm in the EntityRuler docs.
ent_ruler = EntityRuler(nlp, overwrite_ents = True)
# Register the patterns currently held in `patterns` (the name list read in the preceding cell).
ent_ruler.add_patterns(patterns)

### Diagnosekoder

Ønsker å vaske bort alle sykdommer og diagnoser

In [17]:
# Read the diagnosis patterns from a JSON Lines file (one JSON pattern object per line).
# The context manager closes the file handle, and the encoding is pinned to
# UTF-8 (the JSON Lines convention) so non-ASCII text loads the same way on
# every platform instead of depending on the locale default.
with open("DIAG_PATTERN.jsonl", "r", encoding="utf-8") as pattern_file:
    contents = pattern_file.read()

# Blank lines are skipped so that an empty file or a stray empty line does not
# make json.loads fail on an empty string.
patterns = [json.loads(line) for line in contents.split('\n') if line.strip()]

In [18]:
# Add the diagnosis patterns read in the preceding cell to the same entity ruler.
ent_ruler.add_patterns(patterns)

### FNR

Fjerner fødselsnr vha. regex, men legger til en regel for å kunne beholde fnr-tagen vi kommer til å bruke:

In [19]:
# Three-token rule labelled "[fnr]": a punctuation token, a token whose
# lowercase form is "fnr", and another punctuation token.
pattern = [
    {
        "label": "[fnr]",
        "pattern": [
            {"IS_PUNCT": True},
            {"lower": "fnr"},
            {"IS_PUNCT": True},
        ],
    }
]

In [20]:
# Register the "[fnr]" placeholder rule defined in the preceding cell.
ent_ruler.add_patterns(pattern)

### Mail

In [21]:
# Three-token rule labelled "[email]": a punctuation token, a token whose
# lowercase form is "email", and another punctuation token.
pattern = [
    {
        "label": "[email]",
        "pattern": [
            {"IS_PUNCT": True},
            {"lower": "email"},
            {"IS_PUNCT": True},
        ],
    }
]

In [22]:
# Register the "[email]" placeholder rule defined in the preceding cell.
ent_ruler.add_patterns(pattern)

### Dato


In [23]:
# Three-token rule labelled "[dato]": a punctuation token, a token whose
# lowercase form is "dato", and another punctuation token.
pattern = [
    {
        "label": "[dato]",
        "pattern": [
            {"IS_PUNCT": True},
            {"lower": "dato"},
            {"IS_PUNCT": True},
        ],
    }
]

In [24]:
# Register the "[dato]" placeholder rule defined in the preceding cell.
ent_ruler.add_patterns(pattern)

## Fjerner datoer

In [25]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed; the cell only echoes the string.
# NOTE(review): it could not run as written — `sentences` is not defined in any
# visible cell, re.search returns match objects rather than pattern dicts, and
# add_patterns is called everywhere else in this notebook with a single list.
# Consider deleting this cell or replacing it with working date rules.
'''import re
date_pattern1 = re.search(r'\d{2}/\d{2}',sentences)
date_pattern2 = re.search(r'\d{2}/\d{1}',sentences)
date_pattern3 = re.search(r'\d{1}/\d{1}',sentences)
date_pattern4 = re.search(r'\d{1}/\d{2}',sentences)

ent_ruler.add_patterns(date_pattern1,date_pattern2,date_pattern3,date_pattern4)'''

"import re\ndate_pattern1 = re.search(r'\\d{2}/\\d{2}',sentences)\ndate_pattern2 = re.search(r'\\d{2}/\\d{1}',sentences)\ndate_pattern3 = re.search(r'\\d{1}/\\d{1}',sentences)\ndate_pattern4 = re.search(r'\\d{1}/\\d{2}',sentences)\n\nent_ruler.add_patterns(date_pattern1,date_pattern2,date_pattern3,date_pattern4)"

## 4) Legger til entity ruleren i tekstprosesserings-pipen

In [26]:
# Attach the configured entity ruler to the processing pipeline.
# NOTE(review): passing the component object directly looks like the spaCy v2
# calling convention — confirm against the installed spaCy version.
nlp.add_pipe(ent_ruler)

## 5) Lagrer til disk

In [27]:
# Write the customised pipeline to the directory 'spacy_norsk_custom' (relative to the working directory).
nlp.to_disk('spacy_norsk_custom')

Modellen kan da senere tas i bruk ved å laste inn

In [28]:
# Load the saved pipeline back from disk; this rebinds `nlp` to the reloaded copy.
nlp = spacy.load('spacy_norsk_custom')

In [30]:
# Display the reloaded tokenizer's special-case rules to check that the custom entries (e.g. the percent rules) are present.
nlp.tokenizer.rules

{'\t': [{65: '\t', 74: 103, 75: '_SP'}],
 '\n': [{65: '\n', 74: 103, 75: '_SP'}],
 ' ': [{65: ' ', 74: 103, 75: '_SP'}],
 '")': [{65: '")'}],
 "'": [{65: "'"}],
 "''": [{65: "''"}],
 '(*_*)': [{65: '(*_*)'}],
 '(-8': [{65: '(-8'}],
 '(-:': [{65: '(-:'}],
 '(-;': [{65: '(-;'}],
 '(-_-)': [{65: '(-_-)'}],
 '(._.)': [{65: '(._.)'}],
 '(:': [{65: '(:'}],
 '(;': [{65: '(;'}],
 '(=': [{65: '(='}],
 '(>_<)': [{65: '(>_<)'}],
 '(^_^)': [{65: '(^_^)'}],
 '(o:': [{65: '(o:'}],
 '(¬_¬)': [{65: '(¬_¬)'}],
 '(ಠ_ಠ)': [{65: '(ಠ_ಠ)'}],
 '(╯°□°）╯︵┻━┻': [{65: '(╯°□°）╯︵┻━┻'}],
 ')-:': [{65: ')-:'}],
 '):': [{65: '):'}],
 '-_-': [{65: '-_-'}],
 '-__-': [{65: '-__-'}],
 '._.': [{65: '._.'}],
 '0.0': [{65: '0.0'}],
 '0.o': [{65: '0.o'}],
 '0_0': [{65: '0_0'}],
 '0_o': [{65: '0_o'}],
 '1%': [{65: '1%'}],
 '10%': [{65: '10%'}],
 '100%': [{65: '100%'}],
 '11%': [{65: '11%'}],
 '12%': [{65: '12%'}],
 '13%': [{65: '13%'}],
 '14%': [{65: '14%'}],
 '15%': [{65: '15%'}],
 '16%': [{65: '16%'}],
 '17%': [{65: '17%'}]