# Word tokenization
## Tokenize words without tokenization hints

In [1]:
from estnltk import Text
# Sample input used throughout the notebook. It contains e-mail addresses,
# space-separated digit groups, a year followed by a period, name initials
# and unit expressions. The two adjacent literals are joined by the parser
# into one single-line string (no newline between the sentences).
t = (
    'Aadressilt bla@bla.ee tuli 10 000 kirja aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. '
    'A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'
)
t

'Aadressilt bla@bla.ee tuli 10 000 kirja aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'

In [2]:
# Wrap the sample string in a fresh Text object; no tokenization_hints layer
# has been added to it in this cell.
text = Text(t)
# Tag only the 'words' layer.
text.tag_layer(['words'])
# Last expression of the cell: display the text of each word span.
text.words.text

['Aadressilt',
 'bla',
 '@',
 'bla.ee',
 'tuli',
 '10',
 '000',
 'kirja',
 'aadressile',
 'foo',
 '@',
 'foo.ee',
 '10',
 'tunni',
 'jooksul',
 '2017.',
 'aastal',
 '.',
 'A.',
 'H.',
 'Tammsaare',
 ':',
 '1,0',
 'm',
 '/',
 's',
 '=',
 '3',
 ',',
 '67',
 'km',
 '/',
 'h',
 '.']

## Tokenize words with tokenization hints
### Create tokenization hints layer

The word tokenizer can use hints from the `tokenization_hints` layer. This layer can be created with `TokenizationHintsTagger`.

In [3]:
from estnltk.taggers import TokenizationHintsTagger

# Prepare the inputs first: a fresh Text for the sample string and an empty
# dict that is handed to the tagger as its status argument.
text = Text(t)
status = {}

# Create the tagger and run it on the text.
tokenization_hints_tagger = TokenizationHintsTagger()
tokenization_hints_tagger.tag(text, status)

# Last expression of the cell: display the tokenization_hints layer.
text.tokenization_hints

SL[Span(bla@bla.ee, {'normalized': None, '_priority_': (0, 0)}),
Span(10 000 , {'normalized': '10000', '_priority_': (1, 0)}),
Span(foo@foo.ee, {'normalized': None, '_priority_': (0, 0)}),
Span(10 , {'normalized': '10', '_priority_': (1, 0)}),
Span(2017., {'normalized': '2017', '_priority_': (1, 0)}),
Span(A. H. Tammsaare, {'normalized': 'A. H. Tammsaare', '_priority_': (3, 0)}),
Span(1,0 , {'normalized': '1,0', '_priority_': (1, 0)}),
Span(m / s, {'normalized': 'm/s', '_priority_': (2, 0)}),
Span(3, 67 , {'normalized': '3,67', '_priority_': (1, 0)}),
Span(km/h, {'normalized': 'km/h', '_priority_': (2, 0)})]

In [4]:
# Inspect the dict that was passed to tokenization_hints_tagger.tag() as `status`.
status

{'number_of_conflicts': 0}

### Tokenize words
The word tokenizer uses the tokenization hints layer if it is present.

In [5]:
# `text` is the object that tokenization_hints_tagger was run on above, so the
# tokenization_hints layer is present when the 'words' layer is created.
text.tag_layer(['words'])
# Last expression of the cell: display the text of each word span.
text.words.text

['Aadressilt',
 'bla@bla.ee',
 'tuli',
 '10 000 ',
 'kirja',
 'aadressile',
 'foo@foo.ee',
 '10 ',
 'tunni',
 'jooksul',
 '2017.',
 'aastal',
 '.',
 'A. H. Tammsaare',
 ':',
 '1,0 ',
 'm / s',
 '=',
 '3, 67 ',
 'km/h',
 '.']