# Text segmentation
## Taggers
The following taggers are involved in text segmentation.
### TokensTagger
Uses NLTK's `WordPunctTokenizer` to tokenize raw text. Creates the `tokens` layer.

In [1]:
# Create a TokensTagger with its default settings.
from estnltk.taggers import TokensTagger
tokens_tagger = TokensTagger()
# Bare last expression: the notebook renders the tagger's configuration table.
tokens_tagger

name,layer,attributes,depends_on
TokensTagger,tokens,[],[]

0,1
apply_punct_postfixes,True


In [2]:
from estnltk import Text
# Sample raw text used throughout the first part of the notebook. It contains
# an e-mail address, numbers, an abbreviation, initials and units; the blank
# line inside the string separates what later becomes two paragraphs.
T = '''Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul.

A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h.'''
text = Text(T)
# Tagging adds the 'tokens' layer to `text` in place.
tokens_tagger.tag(text)
# Display the newly created layer.
text['tokens']

layer name,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,44

text
Aadressilt
bla
@
bla
.
ee
tuli
10
000
kirja


### CompoundTokenTagger

In [3]:
# Create a CompoundTokenTagger with its default settings.
from estnltk.taggers import CompoundTokenTagger
compound_token_tagger = CompoundTokenTagger()
# Display the tagger's configuration (output layer, attributes, dependencies).
compound_token_tagger

name,layer,attributes,depends_on
CompoundTokenTagger,compound_tokens,"[type, normalized]",[tokens]

0,1
tag_initials,True
conflict_resolving_strategy,MAX
compound_types_to_merge,"{abbrevation, name}"
tag_emails,True
tag_numbers,True
tag_abbreviations,True
tag_units,True


In [4]:
# Requires the 'tokens' layer, which was added to `text` in an earlier cell.
compound_token_tagger.tag(text)
# Display the 'compound_tokens' layer (it envelops 'tokens', per the output table).
text['compound_tokens']

layer name,attributes,parent,enveloping,ambiguous,span count
compound_tokens,"type, normalized",,tokens,False,6

text,type,normalized
bla@bla.ee,e-mail,
st.,non_ending_abbreviation,st.
A. H. Tammsaare,name,A. H. Tammsaare
1935.,numeric,1935
m / s,unit,m/s
km/h,unit,km/h


### WordTokenizer

In [5]:
# Create a WordTokenizer with its default settings.
from estnltk.taggers import WordTokenizer
word_tokenizer = WordTokenizer()
# Display the tagger's configuration (output layer, attributes, dependencies).
word_tokenizer

name,layer,attributes,depends_on
WordTokenizer,words,[],[compound_tokens]


In [6]:
# Requires the 'compound_tokens' layer, which was added to `text` in an earlier cell.
word_tokenizer.tag(text)
# Display the 'words' layer.
text['words']

layer name,attributes,parent,enveloping,ambiguous,span count
words,,,,False,30

text
Aadressilt
bla@bla.ee
tuli
10
000
kirja
.
Kirjad
","
st.


### SentenceTokenizer

In [7]:
# Create a SentenceTokenizer with its default settings.
from estnltk.taggers import SentenceTokenizer
sentence_tokenizer = SentenceTokenizer()
# Display the tagger's configuration (output layer, attributes, dependencies).
sentence_tokenizer

name,layer,attributes,depends_on
SentenceTokenizer,sentences,[],"[compound_tokens, words]"


In [8]:
# Requires the 'compound_tokens' and 'words' layers added in earlier cells.
sentence_tokenizer.tag(text)
# Display the 'sentences' layer (it envelops 'words', per the output table).
text['sentences']

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,3

text
Aadressilt bla@bla.ee tuli 10 000 kirja.
"Kirjad, st. spamm saabus 10 tunni jooksul."
"A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."


t = '''Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. \
A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'''

In [9]:
# Create a ParagraphTokenizer with its default settings.
from estnltk.taggers import ParagraphTokenizer
paragraph_tokenizer = ParagraphTokenizer()
# Display the tagger's configuration, including its `regex` setting.
paragraph_tokenizer

name,layer,attributes,depends_on
ParagraphTokenizer,paragraphs,[],[sentences]

0,1
regex,\s*\n\n


In [10]:
# Requires the 'sentences' layer, which was added to `text` in an earlier cell.
paragraph_tokenizer.tag(text)
# Display the 'paragraphs' layer (it envelops 'sentences', per the output table).
text['paragraphs']

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul."
"A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."


In [11]:
# Display the Text object together with a summary of every layer added so far.
text

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul. A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
tokens,,,,False,44
compound_tokens,"type, normalized",,tokens,False,6
words,,,,False,30


# tag_layer
All of the above is equivalent to

In [12]:
# Start from a fresh Text built from the same raw string `T`. Requesting only
# 'paragraphs' also produces the layers it depends on (see the output table).
Text(T).tag_layer(['paragraphs'])

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul. A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
tokens,,,,False,44
compound_tokens,"type, normalized",,tokens,False,6
words,,,,False,30


# analyse
One can also write

In [13]:
# Start from a fresh Text built from `T`. Per the output table, the result
# has only the 'paragraphs', 'sentences' and 'words' layers.
Text(T).analyse('segmentation')

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul. A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
words,,,,False,30


Here the auxiliary layers (`tokens` and `compound_tokens`) are deleted.

# Word tokenization
## Tokenize words without tokenization hints

In [14]:
from estnltk import Text

# Second sample text. The trailing backslash inside the string literal joins
# the two source lines, so `t` is a single line without a newline.
t = '''Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. \
A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'''
# Display the resulting string.
t

'Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'

In [15]:
# Rebinds `text` to a new Text built from the single-line sample `t`.
text = Text(t)
# Create the 'words' layer through the default tag_layer pipeline.
text.tag_layer(['words'])
# Display the 'words' layer.
text['words']

layer name,attributes,parent,enveloping,ambiguous,span count
words,,,,False,29

text
Aadressilt
bla@bla.ee
tuli
10
000
kirja
","
st.
spammi
aadressile


## TokenizationHintsTagger

TokenizationHintsTagger is not currently used in the segmentation pipeline.

In [16]:
# Create a TokenizationHintsTagger with its default settings.
from estnltk.taggers import TokenizationHintsTagger
tokenization_hints_tagger = TokenizationHintsTagger()
# Display the tagger's configuration (output layer, attributes, tag_* switches).
tokenization_hints_tagger

name,layer,attributes,depends_on
TokenizationHintsTagger,tokenization_hints,"[normalized, _priority_]",[]

0,1
tag_email,True
overlapped,False
tag_initials,False
tag_numbers,True
tag_abbreviations,False
tag_unit,True
conflict_resolving_strategy,MAX


In [17]:
# Inspect the tagger held in the `_tagger` attribute; the output shows it is a
# RegexTagger. NOTE(review): the leading underscore suggests a private
# attribute, so this may not be a stable public API; confirm.
tokenization_hints_tagger._tagger

name,layer,attributes,depends_on
RegexTagger,tokenization_hints,"[normalized, _priority_]",[]

0,1
conflict_resolving_strategy,MAX
overlapped,False


In [18]:
# Rebinds `text` to a fresh Text built from the first sample string `T`.
text = Text(T)
# Empty dict passed as the second argument to tag().
# NOTE(review): what the tagger is expected to write into it is not shown here; confirm.
status = {}
tokenization_hints_tagger.tag(text, status)
# Display the 'tokenization_hints' layer.
text['tokenization_hints']

layer name,attributes,parent,enveloping,ambiguous,span count
tokenization_hints,"normalized, _priority_",,,False,8

text,normalized,_priority_
bla@bla.ee,,"(0, 0)"
10 000,10000,"(1, 0)"
10,10,"(1, 0)"
1935.,1935,"(1, 0)"
10,10,"(1, 0)"
m / s,m/s,"(2, 0)"
"3, 67",367,"(1, 0)"
km/h,km/h,"(2, 0)"


In [19]:
# Display the status dict after tagging; the recorded output shows it is still empty.
status

{}