# Storing Text objects in Postgres

This tutorial demonstrates how to store and query EstNLTK `Text` objects in a PostgreSQL database.

In [1]:
from estnltk import Text
from estnltk.storage.postgres import PostgresStorage, JsonbTextQuery, JsonbLayerQuery, create_schema, delete_schema
from estnltk.taggers import VabamorfTagger

The first line of the `pgpass` file that matches the given `PostgresStorage` arguments is used to connect to an existing PostgreSQL database. File format:

    host:port:dbname:user:password

In [2]:
# Connection details not given here are taken from the matching pgpass entry.
storage = PostgresStorage(pgpass_file='~/.pgpass',
                          dbname='test_db',
                          schema='my_schema')
# Create the schema for this storage before any collection is added.
create_schema(storage)

INFO:storage.py:40: connecting to host: 'localhost', port: '5432', dbname: 'test_db', user: 'pault'
INFO:storage.py:56: schema: 'my_schema', temporary: False, role: 'pault'


In [3]:
# Create two demo collections; each is addressed by name through the storage
# object and given a descriptive comment.
for name, comment in (('my_first_collection', 'first demo collection'),
                      ('my_second_collection', 'second demo collection')):
    storage[name].create(comment)

# The bare expression renders the storage overview table.
storage

INFO:collection.py:103: new empty collection 'my_first_collection' created
INFO:collection.py:103: new empty collection 'my_second_collection' created


Unnamed: 0_level_0,Unnamed: 1_level_0,total_size,comment
collection,layers,Unnamed: 2_level_1,Unnamed: 3_level_1
my_first_collection,,32 kB,first demo collection
my_second_collection,,32 kB,second demo collection


List of collection names.

In [4]:
# Names of the collections currently held by this storage.
storage.collections

['my_first_collection', 'my_second_collection']

Delete collections.

In [5]:
# Remove a collection with `del` on the storage item ...
del storage['my_first_collection']
# ... or by calling delete() on the collection object itself.
storage['my_second_collection'].delete()

# Display the storage overview again.
storage

## Collections

A collection stores `Text` objects and provides a read/write API.

Create a new collection:

In [6]:
# The string argument is the collection's descriptive comment. The value
# returned by create() is kept as `collection` and used by the following cells.
collection = storage["my_collection"].create('demo collection')

INFO:collection.py:103: new empty collection 'my_collection' created


Add some data:

In [7]:
# Tag both texts up front, then insert them inside a single insert context.
text1 = Text('ööbik laulab.').tag_layer(['morph_analysis'])
text2 = Text('öökull ei laula.').tag_layer(['morph_analysis'])

with collection.insert() as collection_insert:
    # No key given: the collection assigns one.
    collection_insert(text1)
    print(text1)

    # Explicit key for the second text.
    key2 = collection_insert(text2, key=7)
    print(text2)

Text(text='ööbik laulab.')
Text(text='öökull ei laula.')


In [8]:
# Displaying the collection renders a table describing its layers.
collection

Unnamed: 0,detached,attributes,ambiguous,parent,enveloping,_base,meta
compound_tokens,False,"(type, normalized)",False,,tokens,compound_tokens,[]
morph_analysis,False,"(lemma, root, root_tokens, ending, clitic, for...",True,words,,words,[]
sentences,False,(),False,,words,sentences,[]
tokens,False,(),False,,,tokens,[]
words,False,"(normalized_form,)",False,,,words,[]


Iterate over collection:

In [9]:
# Without arguments, select() yields the (key, text) pairs stored in the collection.
for key, text in collection.select():
    print(key, text)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


Search for a particular entry by key:

In [10]:
# 7 is the key that was passed explicitly when the second text was inserted.
txt = collection.select_by_key(7)
print(txt)

Text(text='öökull ei laula.')


Search using layer attributes:

In [11]:
# Match texts whose 'morph_analysis' layer contains the lemma 'laulma'.
lemma_query = JsonbTextQuery('morph_analysis', lemma='laulma')
for key, txt in collection.select(query=lemma_query):
    print(key, txt)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


Search using multiple layer attributes:

In [12]:
# Several attributes in one query: lemma 'laulma' together with form 'b'.
lemma_and_form_query = JsonbTextQuery('morph_analysis', lemma='laulma', form='b')
for key, txt in collection.select(query=lemma_and_form_query):
    print(key, txt)

1 Text(text='ööbik laulab.')


Search using "OR" query:

In [13]:
# Build the two alternatives separately, then combine them with `|` (OR).
nightingale_query = JsonbTextQuery('morph_analysis', lemma='ööbik')
owl_query = JsonbTextQuery('morph_analysis', lemma='öökull')

for key, txt in collection.select(query=nightingale_query | owl_query):
    print(key, txt)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


Search using "AND" query:

In [14]:
# Combine two queries with `&` (AND). Neither demo text contains both lemmas,
# so this loop prints nothing.
nightingale_query = JsonbTextQuery('morph_analysis', lemma='ööbik')
owl_query = JsonbTextQuery('morph_analysis', lemma='öökull')

for key, txt in collection.select(query=nightingale_query & owl_query):
    print(key, txt)

Search using a composite query:

In [15]:
# (ööbik OR öökull) AND laulma, assembled from named parts.
nightingale_query = JsonbTextQuery('morph_analysis', lemma='ööbik')
owl_query = JsonbTextQuery('morph_analysis', lemma='öökull')
sings_query = JsonbTextQuery('morph_analysis', lemma='laulma')

singing_bird_query = (nightingale_query | owl_query) & sings_query
for key, txt in collection.select(query=singing_bird_query):
    print(key, txt)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


or use a convenience method `find_fingerprint`:

In [16]:
# Each set is a conjunction of lemmas; the list is a disjunction of those sets:
# (ööbik AND laulma) OR (öökull AND laulma)
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": [{'ööbik', 'laulma'}, {'öökull', 'laulma'}],
}
for key, txt in collection.find_fingerprint(query=fingerprint_query, order_by_key=True):
    print(key, txt)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


In [17]:
# A flat list of lemmas is a disjunction: öökull OR laulma
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": ['öökull', 'laulma'],
}
for key, txt in collection.find_fingerprint(query=fingerprint_query, order_by_key=True):
    print(key, txt)

1 Text(text='ööbik laulab.')
7 Text(text='öökull ei laula.')


In [18]:
# A single set inside the list is a conjunction: öökull AND laulma
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": [{'öökull', 'laulma'}],
}
for key, txt in collection.find_fingerprint(query=fingerprint_query, order_by_key=True):
    print(key, txt)

7 Text(text='öökull ei laula.')


Delete collection

In [19]:
# Drop the demo collection; the next section creates a new one.
collection.delete()

## Working with layers

Let's say you want to create a collection which stores only layers up to "sentences":

In [20]:
# get_collection() returns a handle by name; create() then creates the
# (initially empty) collection.
collection = storage.get_collection('collection_with_layers')
collection.create()

INFO:collection.py:103: new empty collection 'collection_with_layers' created


Unnamed: 0,detached,attributes,ambiguous,parent,enveloping,_base,meta


In [21]:
# Insert two texts tagged up to the "sentences" layer.
with collection.insert() as collection_insert:
    for raw_text in ('see on esimene lause', 'see on teine lause'):
        collection_insert(Text(raw_text).tag_layer(["sentences"]))

Check what layers are present:

In [22]:
# Besides 'sentences', the printed keys also include 'words', 'compound_tokens'
# and 'tokens', which were tagged along with it.
for key, text in collection.select():
    print(key, text, text.layers.keys())

1 Text(text='see on esimene lause') dict_keys(['sentences', 'words', 'compound_tokens', 'tokens'])
2 Text(text='see on teine lause') dict_keys(['sentences', 'words', 'compound_tokens', 'tokens'])


Now, you want to add new layers "my_first_layer" and "my_second_layer" to store other information. However, you want to store them in a separate table. For this purpose collection object has a `create_layer` method:

In [23]:
from estnltk.storage.postgres import RowMapperRecord


layer1 = "my_first_layer"
tagger1 = VabamorfTagger(disambiguate=False, layer_name=layer1)


def row_mapper_1(row):
    """Tag the text of one collection row with ``tagger1``.

    ``row`` is indexed positionally; its second item is the text to tag.
    Returns a one-element list with a ``RowMapperRecord`` that carries the
    new layer and no metadata.

    ``tagger1`` is a module-level name resolved on every call, so rebinding
    it later changes which tagger this mapper uses.
    """
    tagged_layer = tagger1.tag(row[1], return_layer=True)
    return [RowMapperRecord(layer=tagged_layer, meta=None)]


collection.create_layer(
    layer1,
    data_iterator=collection.select(),
    row_mapper=row_mapper_1,
)


layer2 = "my_second_layer"
tagger2 = VabamorfTagger(disambiguate=False, layer_name=layer2)


def row_mapper_2(row):
    """Tag the text of one collection row with ``tagger2``.

    ``row`` is indexed positionally; its second item is the text to tag.
    Returns a one-element list with a ``RowMapperRecord`` that carries the
    new layer and no metadata.
    """
    tagged_layer = tagger2.tag(row[1], return_layer=True)
    return [RowMapperRecord(layer=tagged_layer, meta=None)]


collection.create_layer(
    layer2,
    data_iterator=collection.select(),
    row_mapper=row_mapper_2,
)

INFO:collection.py:830: collection: 'collection_with_layers'
INFO:collection.py:849: preparing to create a new layer: 'my_first_layer'
INFO:collection.py:915: layer created: 'my_first_layer'
INFO:collection.py:830: collection: 'collection_with_layers'
INFO:collection.py:849: preparing to create a new layer: 'my_second_layer'
INFO:collection.py:915: layer created: 'my_second_layer'


In the cell above, the `row_mapper` function is applied to each `(key, text)` row produced by `data_iterator` and is expected to return a list of `RowMapperRecord` objects, each wrapping a `Layer` instance.

If your `tagger` uses layers that are stored in separate tables, use `layers` argument to specify the layers to fetch. These layers will be merged with the text object before passing it to `callable`:

```
collection.create_layer(layer,
                        layers=['layer1', 'layer2', ...],
                        callable=lambda t: tagger.tag(t, return_layer=True))
```

Make sure the new layer has been created:

In [24]:
# Should list the two newly created layers alongside the layers stored with the texts.
collection.get_layer_names()

['my_first_layer',
 'compound_tokens',
 'tokens',
 'sentences',
 'words',
 'my_second_layer']

### Searching layers

Iterate over collection and new layers using `select` method:

In [25]:
# Request the two detached layers explicitly so that they are attached to each
# returned Text.
for key, text in collection.select(layers=['my_first_layer', 'my_second_layer']):
    print(key, text, text.layers.keys())

1 Text(text='see on esimene lause') dict_keys(['my_first_layer', 'compound_tokens', 'tokens', 'sentences', 'words', 'my_second_layer'])
2 Text(text='see on teine lause') dict_keys(['my_first_layer', 'compound_tokens', 'tokens', 'sentences', 'words', 'my_second_layer'])


Notice that the detached layers 'my_first_layer' and 'my_second_layer' are available within the returned `Text` object.

Search layer using `JsonbLayerQuery`:

In [26]:
# Query on the detached layer `layer1`: lemma 'esimene' OR lemma 'teine'.
first_or_second = (JsonbLayerQuery(layer_name=layer1, lemma='esimene') |
                   JsonbLayerQuery(layer_name=layer1, lemma='teine'))

for key, text in collection.select(layer_query={layer1: first_or_second}):
    print(key, text)

1 Text(text='see on esimene lause')
2 Text(text='see on teine lause')


Search over multiple layers using `JsonbLayerQuery`:

In [27]:
# One query per detached layer, keyed by layer name.
first_layer_query = (JsonbLayerQuery(layer_name=layer1, lemma='esimene') |
                     JsonbLayerQuery(layer_name=layer1, lemma='teine'))
second_layer_query = JsonbLayerQuery(layer_name=layer2, lemma='esimene')

layer_queries = {layer1: first_layer_query, layer2: second_layer_query}
for key, text in collection.select(layer_query=layer_queries):
    print(key, text)

1 Text(text='see on esimene lause')


The same layer query can be specified using convenience method `find_fingerprint`:

In [28]:
# Fingerprint form of the per-layer queries: one spec per detached layer.
fingerprint_layer_query = {
    layer1: {
        "field": "lemma",
        "query": ["esimene", "teine"],
        "ambiguous": True,
    },
    layer2: {
        "field": "lemma",
        "query": ["esimene"],
        "ambiguous": True,
    },
}
for key, text in collection.find_fingerprint(layer_query=fingerprint_layer_query):
    print(key, text)

1 Text(text='see on esimene lause')


Delete layer

In [29]:
# Drop the detached layer 'my_first_layer' (the value of `layer1`).
collection.delete_layer(layer1)

INFO:collection.py:1024: layer deleted: 'my_first_layer'


### Indexing layers

An ngram index makes it possible to index ngrams of layer attribute values.
For example, a bigram index on an attribute with values `['see', 'on', 'esimene', 'lause']` will contain pairs *'see-on'*, *'on-esimene'*, *'esimene-lause'*.
Indices of a higher order are also supported.

To build an ngram index, provide an argument *ngram_index* when creating a new layer.
The following code creates a bi-gram index on an attribute *lemma* for a newly created layer *indexed_layer*:

In [30]:
indexed_layer = 'indexed_layer'
# Use a dedicated tagger name. Rebinding `tagger1` here would silently change
# what `row_mapper_1` produces, because that function reads `tagger1` at call time.
indexed_tagger = VabamorfTagger(disambiguate=False, layer_name=indexed_layer)


def row_mapper(row):
    """Tag the text of one collection row for the indexed layer.

    ``row`` is indexed positionally; its second item is the text to tag.
    Returns a one-element list with a ``RowMapperRecord`` that carries the
    new layer and no metadata.
    """
    layer = indexed_tagger.tag(row[1], return_layer=True)
    return [RowMapperRecord(layer=layer, meta=None)]


# Pass the mapper defined in this cell (previously `row_mapper_1` was passed
# and `row_mapper` was never used). {"lemma": 2} requests a bigram index on
# the `lemma` attribute.
collection.create_layer(indexed_layer,
                        data_iterator=collection.select(),
                        row_mapper=row_mapper,
                        ngram_index={"lemma": 2})

INFO:collection.py:830: collection: 'collection_with_layers'
INFO:collection.py:849: preparing to create a new layer: 'indexed_layer'
INFO:collection.py:915: layer created: 'indexed_layer'


To search an ngram index, use method `find_fingerprint` along with `layer_ngram_query` argument.

Search entries containing lemma bigram 'see-olema':

In [31]:
# One bigram, given as a tuple of consecutive lemmas.
ngram_query = {indexed_layer: {"lemma": [("see", "olema")]}}

for key, text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(key, text)

1 Text(text='see on esimene lause')
2 Text(text='see on teine lause')


Search 'teine-lause' OR 'olema-esimene':

In [32]:
# Bigrams listed side by side are alternatives (OR).
ngram_query = {indexed_layer: {"lemma": [("teine", "lause"), ("olema", "esimene")]}}

for key, text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(key, text)

1 Text(text='see on esimene lause')
2 Text(text='see on teine lause')


Search 'see-olema' AND 'olema-esimene':

In [33]:
# Bigrams grouped in an inner list must all occur (AND).
required_bigrams = [("see", "olema"), ("olema", "esimene")]
ngram_query = {indexed_layer: {"lemma": [required_bigrams]}}

for key, text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(key, text)

1 Text(text='see on esimene lause')


Delete schema and all collections

In [34]:
# Removes the schema together with every collection stored in it.
delete_schema(storage)

Close database connection

In [35]:
# Release the database connection held by this storage object.
storage.close()

## Working with fragments

In [36]:
# Open a new storage on a separate schema for the fragment examples.
schema = "test_fragment"
storage = PostgresStorage(pgpass_file='~/.pgpass',
                          schema=schema, dbname='test_db')
create_schema(storage)

INFO:storage.py:40: connecting to host: 'localhost', port: '5432', dbname: 'test_db', user: 'pault'
INFO:storage.py:56: schema: 'test_fragment', temporary: False, role: 'pault'


In [37]:
from estnltk.storage.postgres import select_raw

table_name = 'fragment_test'
collection = storage.get_collection(table_name)
collection.create()

# Tag both texts first, then insert them in one insert context.
text1 = Text('see on esimene lause').tag_layer(["sentences"])
text2 = Text('see on teine lause').tag_layer(["sentences"])

with collection.insert() as collection_insert:
    collection_insert(text1)
    collection_insert(text2)

layer_fragment_name = "layer_fragment_1"
tagger = VabamorfTagger(disambiguate=False, layer_name=layer_fragment_name)


def tag_fragment_layer(row):
    """Tag the text in ``row[1]`` with ``tagger`` and wrap the layer in a record.

    Returns a one-element list with a ``RowMapperRecord`` that carries the
    new layer and no metadata.
    """
    return [RowMapperRecord(layer=tagger.tag(row[1], return_layer=True), meta=None)]


# Create the detached layer that the fragment below reads as its parent.
collection.old_slow_create_layer(layer_fragment_name,
                                 data_iterator=collection.select(),
                                 row_mapper=tag_fragment_layer)

fragment_name = "fragment_1"


def row_mapper(row):
    """Build two fragment records from one raw row.

    ``row`` must unpack into exactly four items; only the last one, the
    mapping of detached layers, is used. The entry for
    ``layer_fragment_name`` supplies the parent layer and its id.

    Returns a list of two separate dicts with identical content, each with
    the keys ``'fragment'`` and ``'parent_id'``.
    """
    _, _, _, detached_layers = row
    parent = detached_layers[layer_fragment_name]
    record = {'fragment': parent['layer'], 'parent_id': parent['layer_id']}
    # Second entry is a copy so the two records stay distinct objects.
    return [record, dict(record)]

# Raw rows of the collection, including the detached parent layer.
fragment_rows = select_raw(storage=storage,
                           collection_name=table_name,
                           layers=[layer_fragment_name])

collection.create_fragment(fragment_name,
                           data_iterator=fragment_rows,
                           row_mapper=row_mapper,
                           create_index=False,
                           ngram_index=None)

INFO:collection.py:103: new empty collection 'fragment_test' created
INFO:collection.py:689: collection: 'fragment_test'
INFO:collection.py:700: preparing to create a new layer: 'layer_fragment_1'
INFO:collection.py:762: layer created: 'layer_fragment_1'


In [38]:
# Clean up: drop the fragment schema, then close the connection.
delete_schema(storage)
storage.close()