# Word tokenization
## Tokenize words without tokenization hints

In [1]:
from estnltk import Text
from estnltk.layer_operations import repr_html

# Sample Estonian text used throughout the notebook. It contains e-mail
# addresses, a number with a space as thousands separator, an ordinal year,
# name initials and unit expressions with irregular spacing.
# The two adjacent literals are joined into one single-line string.
t = (
    'Aadressilt bla@bla.ee tuli 10 000 kirja aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. '
    'A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'
)
t

'Aadressilt bla@bla.ee tuli 10 000 kirja aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'

In [2]:
# Wrap the raw string in a Text object and tag the 'words' layer directly,
# i.e. before any tokenization_hints layer has been added to it.
text = Text(t)
text.tag_layer(['words'])
# Show the text of every word. Without hints, e-mail addresses, '10 000',
# 'A. H. Tammsaare' and the unit expressions come out as several separate tokens.
text.words.text

['Aadressilt',
 'bla',
 '@',
 'bla.ee',
 'tuli',
 '10',
 '000',
 'kirja',
 'aadressile',
 'foo',
 '@',
 'foo.ee',
 '10',
 'tunni',
 'jooksul',
 '2017.',
 'aastal',
 '.',
 'A.',
 'H.',
 'Tammsaare',
 ':',
 '1,0',
 'm',
 '/',
 's',
 '=',
 '3',
 ',',
 '67',
 'km',
 '/',
 'h',
 '.']

## Tokenize words with tokenization hints
### Create tokenization hints layer

The word tokenizer can use hints from the `tokenization_hints` layer. This layer can be created with `TokenizationHintsTagger`.

In [3]:
from estnltk.taggers import TokenizationHintsTagger

# Create the hints tagger with no arguments.
tokenization_hints_tagger = TokenizationHintsTagger()

# Start from a new Text object that has no 'words' layer yet; the one
# created earlier in the notebook was already tagged without hints.
text = Text(t)
# `status` starts out empty; the next cell's output shows it holding a
# 'number_of_conflicts' entry once tag() has run.
status = {}
tokenization_hints_tagger.tag(text, status)
# Display the resulting 'tokenization_hints' layer.
repr_html(text['tokenization_hints'])

Unnamed: 0,text,_priority_,normalized
0,bla@bla.ee,"(0, 0)",
1,10 000,"(1, 0)",10000
2,foo@foo.ee,"(0, 0)",
3,10,"(1, 0)",10
4,2017.,"(1, 0)",2017
5,A. H. Tammsaare,"(3, 0)",A. H. Tammsaare
6,10,"(1, 0)",10
7,m / s,"(2, 0)",m/s
8,"3, 67","(1, 0)",367
9,km/h,"(2, 0)",km/h


In [4]:
# Inspect the status dict that was passed to the hints tagger's tag() call.
status

{'number_of_conflicts': 0}

### Tokenize words
The word tokenizer uses the `tokenization_hints` layer if it is present.

In [5]:
# Tag the 'words' layer on the Text that already carries the
# 'tokenization_hints' layer created with the hints tagger.
text.tag_layer(['words'])
# Show the text of every word. Spans covered by a hint, such as 'bla@bla.ee',
# 'A. H. Tammsaare' and 'km/h', now come out as single words.
text.words.text

['Aadressilt',
 'bla@bla.ee',
 'tuli',
 '10 000 ',
 'kirja',
 'aadressile',
 'foo@foo.ee',
 '10 ',
 'tunni',
 'jooksul',
 '2017.',
 'aastal',
 '.',
 'A. H. Tammsaare',
 ':',
 '1,0 ',
 'm / s',
 '=',
 '3, 67 ',
 'km/h',
 '.']