In [1]:
from estnltk import Text
from estnltk.converters.TCF_exporter import export_TCF
from estnltk.converters.TCF_importer import import_TCF
from estnltk.text import Layer

# TCF exporter and importer

Export a `Text` object to TCF and import a `Text` object from TCF.
## Short example

In [2]:
# Create a Text object and add the segmentation layers
# (the output below lists them: paragraphs, sentences and words).
text = Text('Tere maailm!')
text.analyse('segmentation')

text
Tere maailm!

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,1
words,normalized_form,,,False,3


In [3]:
# Export the Text object to an XML file and also print the XML that
# export_TCF returns.
# The paragraphs layer is not exported.
print(export_TCF(text, file='tcf/tcf_short.xml'))

<D-Spin version="0.4" xmlns="http://www.dspin.de/data">
  <MetaData xmlns="http://www.dspin.de/data/metadata"/>
  <TextCorpus lang="et" xmlns="http://www.dspin.de/data/textcorpus">
    <text>Tere maailm!</text>
    <tc:tokens xmlns:tc="http://www.dspin.de/data/textcorpus">
      <tc:token ID="t0" start="0" end="4">Tere</tc:token>
      <tc:token ID="t1" start="5" end="11">maailm</tc:token>
      <tc:token ID="t2" start="11" end="12">!</tc:token>
    </tc:tokens>
    <tc:sentences xmlns:tc="http://www.dspin.de/data/textcorpus">
      <tc:sentence ID="s0" tokenIDs="t0 t1 t2"/>
    </tc:sentences>
  </TextCorpus>
</D-Spin>



In [4]:
# The TCF importer reads an XML file and returns a Text object.
# The result is displayed below; it carries the sentences and words layers
# but no paragraphs layer, because paragraphs were not exported.
import_TCF(file='tcf/tcf_short.xml')

text
Tere maailm!

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
words,normalized_form,,,False,3


## Longer example

In [5]:
# Build the longer example: a two-sentence text with segmentation and
# morphology, plus three hand-made layers that envelop spans of `words`.
# `Layer` is imported in the imports cell at the top of the notebook.
text = Text('Karin, kes lendab New Yorki, tahab seal veeta puhkuse. Ta tuleb teisel augustil tagasi.')
text.analyse('segmentation')
text.analyse('morphology')

# clauses layer
layer = Layer(name='clauses', enveloping='words')
# "kes lendab New Yorki"
layer.add_annotation(text.words[2:6])
# A non-contiguous clause: "Karin" + "tahab seal veeta puhkuse"
layer.add_annotation([text.words[0], *text.words[7:11]])
# "Ta tuleb teisel augustil tagasi"
layer.add_annotation(text.words[12:17])
text.add_layer(layer)

# verb_chains layer
layer = Layer(name='verb_chains', enveloping='words')
# "lendab"
layer.add_annotation(text.words[3:4])
# Step 2 picks words 7 and 9: "tahab", "veeta"
layer.add_annotation(text.words[7:10:2])
# Step 3 picks words 13 and 16: "tuleb", "tagasi"
layer.add_annotation(text.words[13:17:3])
text.add_layer(layer)

# time_phrases layer
layer = Layer(name='time_phrases', enveloping='words')
# "teisel augustil"
layer.add_annotation(text.words[14:16])
text.add_layer(layer)

# Display the text together with the table of its layers.
text

text
"Karin, kes lendab New Yorki, tahab seal veeta puhkuse. Ta tuleb teisel augustil tagasi."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,18
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,18
clauses,,,words,False,3
time_phrases,,,words,False,1
verb_chains,,,words,False,3


In [6]:
# Inspect the sentences layer: each span is the list of words in one sentence.
text['sentences']

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,2

text
"['Karin', ',', 'kes', 'lendab', 'New', 'Yorki', ',', 'tahab', 'seal', 'veeta', 'puhkuse', '.']"
"['Ta', 'tuleb', 'teisel', 'augustil', 'tagasi', '.']"


In [7]:
# Inspect the hand-made clauses layer; the first clause shown below is
# non-contiguous ("Karin" + "tahab seal veeta puhkuse").
text['clauses']

layer name,attributes,parent,enveloping,ambiguous,span count
clauses,,,words,False,3

text
"['Karin', 'tahab', 'seal', 'veeta', 'puhkuse']"
"['kes', 'lendab', 'New', 'Yorki']"
"['Ta', 'tuleb', 'teisel', 'augustil', 'tagasi']"


In [8]:
# Inspect the hand-made verb_chains layer.
text['verb_chains']

layer name,attributes,parent,enveloping,ambiguous,span count
verb_chains,,,words,False,3

text
['lendab']
"['tahab', 'veeta']"
"['tuleb', 'tagasi']"


In [9]:
# Inspect the hand-made time_phrases layer.
text['time_phrases']

layer name,attributes,parent,enveloping,ambiguous,span count
time_phrases,,,words,False,1

text
"['teisel', 'augustil']"


The TCF exporter produces version 0.4 output by default. The resulting XML document can be used as input for the [WebLicht service](https://weblicht.sfs.uni-tuebingen.de/weblicht/).

In [10]:
# Write the default (version 0.4) TCF document to tcf/tcf.xml.
# The path is passed by keyword, matching the other export_TCF/import_TCF
# calls in this notebook, so the call does not depend on parameter order.
export_TCF(text, file='tcf/tcf.xml')
None

The output is in the file [tcf/tcf.xml](tcf/tcf.xml).

Version '0.5' produces XML for a possible future TCF version. It additionally includes layers for clauses, verb_chains and time_phrases.

In [11]:
# Print the XML produced for version 0.5 (no file argument is passed here).
print(export_TCF(text, version='0.5'))

<D-Spin version="0.5" xmlns="http://www.dspin.de/data">
  <MetaData xmlns="http://www.dspin.de/data/metadata"/>
  <TextCorpus lang="et" xmlns="http://www.dspin.de/data/textcorpus">
    <text>Karin, kes lendab New Yorki, tahab seal veeta puhkuse. Ta tuleb teisel augustil tagasi.</text>
    <tc:tokens xmlns:tc="http://www.dspin.de/data/textcorpus">
      <tc:token ID="t0" start="0" end="5">Karin</tc:token>
      <tc:token ID="t1" start="5" end="6">,</tc:token>
      <tc:token ID="t2" start="7" end="10">kes</tc:token>
      <tc:token ID="t3" start="11" end="17">lendab</tc:token>
      <tc:token ID="t4" start="18" end="21">New</tc:token>
      <tc:token ID="t5" start="22" end="27">Yorki</tc:token>
      <tc:token ID="t6" start="27" end="28">,</tc:token>
      <tc:token ID="t7" start="29" end="34">tahab</tc:token>
      <tc:token ID="t8" start="35" end="39">seal</tc:token>
      <tc:token ID="t9" start="40" end="45">veeta</tc:token>
      <tc:token ID="t10" start="46" end="53">puhkuse</tc:t

## Tests for exporter and importer

In [12]:
# Show the text and its layers before running the round-trip checks.
text

text
"Karin, kes lendab New Yorki, tahab seal veeta puhkuse. Ta tuleb teisel augustil tagasi."

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,2
words,normalized_form,,,False,18
morph_analysis,"lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,18
clauses,,,words,False,3
time_phrases,,,words,False,1
verb_chains,,,words,False,3


In [13]:
# Version 0.4: export -> import -> export must reproduce the original export.
export_TCF(import_TCF(export_TCF(text))) == export_TCF(text)

True

In [14]:
# Version 0.5: export -> import -> export must reproduce the original export.
export_TCF(import_TCF(export_TCF(text, version='0.5')), version='0.5') == export_TCF(text, version='0.5')

True

In [15]:
# Version 0.5: the paragraphs layer is not exported, so remove it before
# comparing the Text with the result of an export -> import round trip.
# NOTE(review): `del` mutates `text` in place, so re-running this cell
# presumably fails once the layer is gone — confirm, and restart the kernel
# before re-running.
del text.paragraphs

text == import_TCF(export_TCF(text, version='0.5'))

True

In [16]:
# Version 0.4: presumably only version 0.5 carries the clauses, verb_chains
# and time_phrases layers (TODO confirm), so remove them before comparing the
# Text with the result of an export -> import round trip.
# NOTE(review): this cell relies on the previous cell having already removed
# the paragraphs layer, and its `del` statements mutate `text` in place, so it
# is presumably not re-runnable without a kernel restart — confirm.
del text.clauses
del text.verb_chains
del text.time_phrases

text == import_TCF(export_TCF(text))

True