# Word tokenization
## Tokenize words without tokenization hints

In [1]:
from estnltk import Text

# Sample input for the tokenization examples: a single-line string containing
# e-mail addresses, digit groups, abbreviations, initials and unit expressions.
# Adjacent literals are joined at compile time, so `t` contains no newline.
t = (
    'Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile '
    'foo@foo.ee 10 tunni jooksul 2017. aastal. '
    'A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'
)
t

'Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'

In [2]:
# Wrap the sample string in a Text object and tag the 'words' layer directly;
# no tokenization_hints layer has been created on this object.
text = Text(t)
text.tag_layer(['words'])
# Display the resulting words layer.
text['words']

text
Aadressilt
bla
@
bla.ee
tuli
10
000
kirja
","
st


## Tokenize words with tokenization hints
### Create tokenization hints layer

The word tokenizer can use hints from the `tokenization_hints` layer. This layer can be created with `TokenizationHintsTagger`.

In [3]:
from estnltk.taggers import TokenizationHintsTagger

# Create the tagger with no constructor arguments.
tokenization_hints_tagger = TokenizationHintsTagger()

# Rebind `text` to a new Text object built from the same string
# (presumably so the words layer tagged in the earlier cell does not carry over).
text = Text(t)
# Empty dict handed to the tagger; the next cell displays it with a
# 'number_of_conflicts' entry after this call.
status = {}
tokenization_hints_tagger.tag(text, status)
# Display the tokenization_hints layer now present on `text`.
text['tokenization_hints']

text,normalized,_priority_
bla@bla.ee,,"(0, 0)"
10 000,10000,"(1, 0)"
st.,,"(4, 0)"
foo@foo.ee,,"(0, 0)"
10,10,"(1, 0)"
2017.,2017,"(1, 0)"
A. H. Tammsaare,A. H. Tammsaare,"(3, 0)"
10,10,"(1, 0)"
m / s,m/s,"(2, 0)"
"3, 67",367,"(1, 0)"


In [4]:
# Show the dict that was passed to tokenization_hints_tagger.tag() above.
status

{'number_of_conflicts': 0}

### Tokenize words
The word tokenizer uses the tokenization hints layer if it is present.

In [5]:
# `text` already carries the tokenization_hints layer created above, so tagging
# 'words' here runs with the hints available.
text.tag_layer(['words'])
# Display the words layer produced with hints.
text['words']

text
Aadressilt
bla@bla.ee
tuli
10 000
kirja
","
st.
spammi
aadressile
foo@foo.ee
