# Text segmentation
## Taggers
The following taggers are involved in text segmentation.
### TokensTagger
Uses NLTK's `WordPunctTokenizer` to tokenize raw text. Creates the `tokens` layer.

In [1]:
# TokensTagger produces the `tokens` layer and needs no input layers
# (see the summary table this cell displays).
from estnltk.taggers import TokensTagger
tokens_tagger = TokensTagger()
# Bare last expression: render the tagger's configuration as the cell output.
tokens_tagger

name,output layer,output attributes,input layers
TokensTagger,tokens,(),()

0,1
apply_punct_postfixes,True


In [2]:
from estnltk import Text
# Sample input reused by the following cells: two blocks of text separated by
# a blank line. It contains an e-mail address, a number written with a space
# ("10 000"), an abbreviation ("st."), initials before a surname, a year
# followed by a period, and units of measurement ("m / s", "km/h").
T = '''Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul.

A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h.'''
text = Text(T)
# Tag `text` with the tokens tagger; afterwards the layer is available as
# text['tokens'], which is displayed as the cell output.
tokens_tagger.tag(text)
text['tokens']

layer name,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,44

text
Aadressilt
bla
@
bla
.
ee
tuli
10
000
kirja


### CompoundTokenTagger

In [3]:
# CompoundTokenTagger reads the `tokens` layer and produces the
# `compound_tokens` layer with attributes `type` and `normalized`
# (see the summary table this cell displays).
from estnltk.taggers import CompoundTokenTagger
compound_token_tagger = CompoundTokenTagger()
# Bare last expression: render the tagger's configuration, including its
# tag_* switches, as the cell output.
compound_token_tagger

name,output layer,output attributes,input layers
CompoundTokenTagger,compound_tokens,"('type', 'normalized')","('tokens',)"

0,1
custom_abbreviations,()
ignored_words,set()
tag_numbers,True
tag_units,True
tag_email_and_www,True
tag_emoticons,True
tag_xml,True
tag_initials,True
tag_abbreviations,True
tag_case_endings,True


In [4]:
# Requires the `tokens` layer added to `text` by the earlier TokensTagger cell.
compound_token_tagger.tag(text)
# Display the new layer: each span envelops several tokens (e.g. the pieces of
# an e-mail address) and carries a `type` and, where set, a `normalized` value.
text['compound_tokens']

layer name,attributes,parent,enveloping,ambiguous,span count
compound_tokens,"type, normalized",,tokens,False,8

text,type,normalized
"['bla', '@', 'bla', '.', 'ee']",['email'],
"['10', '000']",['numeric'],10000
"['st', '.']",['non_ending_abbreviation'],st.
"['A', '.', 'H', '.', 'Tammsaare']",['name_with_initial'],A. H. Tammsaare
"['1935', '.']",['numeric'],1935.
"['1', ',', '0']",['numeric'],10
"['m', '/', 's']",['unit'],m/s
"['km', '/', 'h']",['unit'],km/h


### WordTagger

In [5]:
# WordTagger reads the `tokens` and `compound_tokens` layers and produces the
# `words` layer with attribute `normalized_form`
# (see the summary table this cell displays).
from estnltk.taggers import WordTagger
word_tagger = WordTagger()
# Bare last expression: render the tagger's configuration as the cell output.
word_tagger

name,output layer,output attributes,input layers
WordTagger,words,"('normalized_form',)","('tokens', 'compound_tokens')"

0,1
make_ambiguous,True


In [6]:
# Requires the `tokens` and `compound_tokens` layers added by the earlier cells.
word_tagger.tag(text)
# Display the new layer: compound tokens such as "bla@bla.ee" and "10 000"
# appear as single words in the output.
text['words']

layer name,attributes,parent,enveloping,ambiguous,span count
words,normalized_form,,,True,27

text,normalized_form
Aadressilt,
bla@bla.ee,
tuli,
10 000,10000
kirja,
.,
Kirjad,
",",
st.,st.
spamm,


### SentenceTokenizer

In [7]:
# NBVAL_IGNORE_OUTPUT
# NOTE(review): the displayed configuration contains the repr of the base
# sentence tokenizer object, including a memory address; presumably that is
# why this cell's output is excluded from nbval comparison -- confirm.
# SentenceTokenizer reads the `words` and `compound_tokens` layers and produces
# the `sentences` layer (see the summary table this cell displays).
from estnltk.taggers import SentenceTokenizer
sentence_tokenizer = SentenceTokenizer()
# Bare last expression: render the tagger's configuration as the cell output.
sentence_tokenizer

name,output layer,output attributes,input layers
SentenceTokenizer,sentences,(),"('words', 'compound_tokens')"

0,1
base_sentence_tokenizer,<nltk.tokenize.punkt.PunktSentenceTokenizer object at 0x0000015C1399D198>
fix_paragraph_endings,True
fix_compound_tokens,True
fix_numeric,True
fix_parentheses,True
fix_double_quotes,True
fix_inner_title_punct,True
fix_repeated_ending_punct,True
fix_double_quotes_based_on_counts,False
use_emoticons_as_endings,True


In [8]:
# Requires the `words` and `compound_tokens` layers added by the earlier cells.
sentence_tokenizer.tag(text)
# Display the new layer: each sentence span envelops spans of the `words` layer.
text['sentences']

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,3

text
"['Aadressilt', 'bla@bla.ee', 'tuli', '10 000', 'kirja', '.']"
"['Kirjad', ',', 'st.', 'spamm', 'saabus', '10', 'tunni', 'jooksul', '.']"
"['A. H. Tammsaare', '1935.', 'aatal', ':', '1,0', 'm / s', '=', '3', ',', '67', 'km/h', '.']"


In [9]:
# NOTE(review): `t` is not read by any of the following cells visible here
# before it is assigned again (with the same value) in the "Word tokenization"
# section; the cells in between keep using `text` and `T`. Confirm whether this
# cell is still needed.
t = '''Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. \
A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'''

In [10]:
# ParagraphTokenizer reads the `sentences` layer and produces the `paragraphs`
# layer; the displayed configuration shows its default split pattern
# (see the summary table this cell displays).
from estnltk.taggers import ParagraphTokenizer
paragraph_tokenizer = ParagraphTokenizer()
# Bare last expression: render the tagger's configuration as the cell output.
paragraph_tokenizer

name,output layer,output attributes,input layers
ParagraphTokenizer,paragraphs,(),"('sentences',)"

0,1
regex,\s*\n\n
paragraph_tokenizer,"RegexpTokenizer(pattern='\\s*\n\n', gaps=True, discard_empty=True, flags=<RegexF ..., type: <class 'nltk.tokenize.regexp.RegexpTokenizer'>"


In [11]:
# Requires the `sentences` layer added to `text` by the SentenceTokenizer cell.
paragraph_tokenizer.tag(text)
# Display the new layer: each paragraph span envelops spans of the `sentences`
# layer; the sample text yields two paragraphs.
text['paragraphs']

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2

text
"['Aadressilt', 'bla@bla.ee', 'tuli', '10 000', 'kirja', '.', 'Kirjad', ',', 'st. ..., type: <class 'list'>, length: 15"
"['A. H. Tammsaare', '1935.', 'aatal', ':', '1,0', 'm / s', '=', '3', ',', '67', 'km/h', '.']"


In [12]:
# Display the Text object itself: the raw text plus a summary table of every
# layer added so far (paragraphs, sentences, tokens, compound_tokens, words).
text

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul.A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
tokens,,,,False,44
compound_tokens,"type, normalized",,tokens,False,8
words,normalized_form,,,True,27


# tag_layer
All of the above is equivalent to:

In [13]:
# Build a fresh Text from the same string `T` and request only the
# `paragraphs` layer; the output lists the same five layers that the
# step-by-step tagging above produced.
Text(T).tag_layer(['paragraphs'])

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul.A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
tokens,,,,False,44
compound_tokens,"type, normalized",,tokens,False,8
words,normalized_form,,,True,27


# analyse
One can also write

In [14]:
# Same input string `T`, but via analyse('segmentation'); the output lists only
# the `paragraphs`, `sentences` and `words` layers.
Text(T).analyse('segmentation')

text
"Aadressilt bla@bla.ee tuli 10 000 kirja. Kirjad, st. spamm saabus 10 tunni jooksul.A. H. Tammsaare 1935. aatal: 1,0 m / s = 3, 67 km/h."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,2
sentences,,,words,False,3
words,normalized_form,,,True,27


Here the auxiliary layers (`tokens` and `compound_tokens`) are deleted.

# Word tokenization

In [15]:
from estnltk import Text

# Single-paragraph sample for this section. The trailing backslash inside the
# triple-quoted string joins the two source lines, so the resulting string
# contains no newline (see the displayed value).
t = '''Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. \
A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'''
# Bare last expression: show the resulting string.
t

'Aadressilt bla@bla.ee tuli 10 000 kirja, st. spammi aadressile foo@foo.ee 10 tunni jooksul 2017. aastal. A. H. Tammsaare: 1,0 m / s = 3, 67 km/h.'

In [16]:
# Rebinds `text` (previously built from `T`) to a new Text built from `t`.
text = Text(t)
# Request only the `words` layer.
text.tag_layer(['words'])
# Display the words layer of the new text.
text['words']

layer name,attributes,parent,enveloping,ambiguous,span count
words,normalized_form,,,True,26

text,normalized_form
Aadressilt,
bla@bla.ee,
tuli,
10 000,10000
kirja,
",",
st.,st.
spammi,
aadressile,
foo@foo.ee,
