# Storing Text objects in Postgres

This tutorial demonstrates how to store and query EstNLTK `Text` objects in a PostgreSQL database.

In [2]:
import os

from estnltk import Text
from estnltk.storage.postgres import PostgresStorage, JsonbTextQuery, JsonbLayerQuery
from estnltk.taggers import VabamorfTagger

Connect to an existing postgres database

In [3]:
# Locate the password file portably instead of hard-coding one developer's
# absolute Windows path: honour the standard libpq PGPASSFILE environment
# variable and fall back to ~/.pgpass in the current user's home directory.
pgpass_file = os.environ.get("PGPASSFILE", os.path.expanduser("~/.pgpass"))
storage = PostgresStorage(pgpass_file=pgpass_file,
                          schema="my_schema")

Create schema if needed:

In [4]:
# Create the schema the storage object was configured with (schema="my_schema").
# NOTE(review): behaviour when the schema already exists is not shown here - confirm before re-running.
storage.create_schema()

## Collections

A collection stores `Text` objects and provides a read/write API.

Create a new collection:

In [5]:
# Obtain a collection object for the name "my_collection" ...
collection = storage.get_collection("my_collection")
# ... and create it in the database (presumably get_collection alone does not create it - TODO confirm).
collection.create()

Add some data:

In [6]:
# Prepare two texts analysed up to the 'morph_analysis' layer.
text1 = Text('ööbik laulab.').tag_layer(['morph_analysis'])
text2 = Text('öökull ei laula.').tag_layer(['morph_analysis'])

# insert() returns the key under which the text was stored:
# without `key` the collection picks one, with `key=7` the given key is used.
key1 = collection.insert(text1)
key2 = collection.insert(text2, key=7)

for stored_key, stored_text in ((key1, text1), (key2, text2)):
    print(stored_key, stored_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


Iterate over collection:

In [7]:
# select() without arguments yields one (key, Text) pair per stored entry.
for entry_key, entry_text in collection.select():
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


Search for a particular entry by key:

In [8]:
# Look up the single entry that was stored under key 7.
stored_text = collection.select_by_key(7)
print(stored_text)

Text(text="öökull ei laula.")


Search using layer attributes:

In [9]:
# Match texts whose 'morph_analysis' layer contains the lemma 'laulma'.
lemma_query = JsonbTextQuery('morph_analysis', lemma='laulma')
for entry_key, entry_text in collection.select(query=lemma_query):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


Search using multiple layer attributes:

In [10]:
# Constrain two attributes of the 'morph_analysis' layer in one query.
lemma_form_query = JsonbTextQuery('morph_analysis', lemma='laulma', form='b')
for entry_key, entry_text in collection.select(query=lemma_form_query):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")


Search using "OR" query:

In [11]:
# `|` combines two queries into an OR query.
lemma_oobik = JsonbTextQuery('morph_analysis', lemma='ööbik')
lemma_ookull = JsonbTextQuery('morph_analysis', lemma='öökull')
for entry_key, entry_text in collection.select(query=lemma_oobik | lemma_ookull):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


Search using "AND" query:

In [12]:
# `&` combines two queries into an AND query; neither stored text contains
# both lemmas, so this loop prints nothing.
lemma_oobik = JsonbTextQuery('morph_analysis', lemma='ööbik')
lemma_ookull = JsonbTextQuery('morph_analysis', lemma='öökull')
for entry_key, entry_text in collection.select(query=lemma_oobik & lemma_ookull):
    print(entry_key, entry_text)

Search using a composite query:

In [13]:
# (ööbik OR öökull) AND laulma, built from three named sub-queries.
lemma_oobik = JsonbTextQuery('morph_analysis', lemma='ööbik')
lemma_ookull = JsonbTextQuery('morph_analysis', lemma='öökull')
lemma_laulma = JsonbTextQuery('morph_analysis', lemma='laulma')
composite_query = (lemma_oobik | lemma_ookull) & lemma_laulma
for entry_key, entry_text in collection.select(query=composite_query):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


or use a convenience method `find_fingerprint`:

In [14]:
# A list of sets: (ööbik AND laulma) OR (öökull AND laulma)
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": [{'ööbik', 'laulma'}, {'öökull', 'laulma'}],
}
for entry_key, entry_text in collection.find_fingerprint(query=fingerprint_query,
                                                         order_by_key=True):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


In [15]:
# A flat list of values: öökull OR laulma
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": ['öökull', 'laulma'],
}
for entry_key, entry_text in collection.find_fingerprint(query=fingerprint_query,
                                                         order_by_key=True):
    print(entry_key, entry_text)

1 Text(text="ööbik laulab.")
7 Text(text="öökull ei laula.")


In [16]:
# A list holding a single set: öökull AND laulma
fingerprint_query = {
    "layer": "morph_analysis",
    "ambiguous": True,
    "field": "lemma",
    "query": [{'öökull', 'laulma'}],
}
for entry_key, entry_text in collection.find_fingerprint(query=fingerprint_query,
                                                         order_by_key=True):
    print(entry_key, entry_text)

7 Text(text="öökull ei laula.")


Delete collection

In [17]:
# Delete the collection obtained earlier in this section ("my_collection").
collection.delete()

## Working with layers

Let's say you want to create a collection which stores only layers up to "sentences":

In [18]:
collection = storage.get_collection("collection_with_layers")
collection.create()

# Store two texts that are tagged only up to the "sentences" layer.
for raw_text in ('see on esimene lause', 'see on teine lause'):
    collection.insert(Text(raw_text).tag_layer(["sentences"]))

Check what layers are present:

In [19]:
# Show each stored text together with the names of the layers it carries.
for entry_key, entry_text in collection.select():
    print(entry_key, entry_text, entry_text.layers.keys())

1 Text(text="see on esimene lause") dict_keys(['tokens', 'sentences', 'words', 'compound_tokens'])
2 Text(text="see on teine lause") dict_keys(['tokens', 'sentences', 'words', 'compound_tokens'])


Now, you want to add new layers "my_first_layer" and "my_second_layer" to store other information. However, you want to store them in a separate table. For this purpose collection object has a `create_layer` method:

In [20]:
# Names of the two detached layers and of the tables that back them.
layer1 = "my_first_layer"
layer2 = "my_second_layer"
layer1_table = collection.layer_name_to_table_name(layer1)
layer2_table = collection.layer_name_to_table_name(layer2)

# One tagger per layer; each writes its analysis under the layer's own name.
tagger1 = VabamorfTagger(disambiguate=False, layer_name=layer1)
tagger2 = VabamorfTagger(disambiguate=False, layer_name=layer2)

# The callable receives a text and returns the tagger's layer for it.
collection.create_layer(layer1, callable=lambda t: tagger1.tag(t, return_layer=True))
collection.create_layer(layer2, callable=lambda t: tagger2.tag(t, return_layer=True))

Specified `callable` function is applied to each `text` entry in a collection and is expected to return a `Layer` instance.

If your `tagger` uses layers that are stored in separate tables, use the `layers` argument to specify which layers to fetch. These layers will be merged with the text object before it is passed to `callable`:

```
collection.create_layer(layer,
                        layers=['layer1', 'layer2', ...],
                        callable=lambda t: tagger.tag(t, return_layer=True))
```

Make sure the new layer has been created:

In [21]:
# List the layers created with create_layer; in the output below, layers stored
# inside the Text objects themselves (e.g. "sentences") are not included.
collection.get_layer_names()

['my_first_layer', 'my_second_layer']

### Searching layers

Iterate over collection and new layers using `select` method:

In [22]:
# Ask select() to fetch the two detached layers along with each text.
detached_layers = ['my_first_layer', 'my_second_layer']
for entry_key, entry_text in collection.select(layers=detached_layers):
    print(entry_key, entry_text, entry_text.layers.keys())

1 Text(text="see on esimene lause") dict_keys(['tokens', 'my_first_layer', 'sentences', 'words', 'compound_tokens', 'my_second_layer'])
2 Text(text="see on teine lause") dict_keys(['tokens', 'my_first_layer', 'sentences', 'words', 'compound_tokens', 'my_second_layer'])


Notice that the detached layers 'my_first_layer' and 'my_second_layer' are available within the returned `Text` object.

Search layer using `JsonbLayerQuery`:

In [23]:
# OR query on the first detached layer: lemma 'esimene' or lemma 'teine'.
first_layer_query = (
    JsonbLayerQuery(layer_table=layer1_table, lemma='esimene')
    | JsonbLayerQuery(layer_table=layer1_table, lemma='teine')
)
for entry_key, entry_text in collection.select(layer_query={layer1: first_layer_query}):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")
2 Text(text="see on teine lause")


Search over multiple layers using `JsonbLayerQuery`:

In [24]:
# One query per detached layer, keyed by layer name.
first_layer_query = (
    JsonbLayerQuery(layer_table=layer1_table, lemma='esimene')
    | JsonbLayerQuery(layer_table=layer1_table, lemma='teine')
)
second_layer_query = JsonbLayerQuery(layer_table=layer2_table, lemma='esimene')
layer_queries = {
    layer1: first_layer_query,
    layer2: second_layer_query,
}
for entry_key, entry_text in collection.select(layer_query=layer_queries):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")


The same layer query can be specified using convenience method `find_fingerprint`:

In [25]:
# Fingerprint form of a two-layer query: one specification per detached layer.
layer_fingerprints = {
    layer1: {
        "field": "lemma",
        "query": ["esimene", "teine"],
        "ambiguous": True,
    },
    layer2: {
        "field": "lemma",
        "query": ["esimene"],
        "ambiguous": True,
    },
}
for entry_key, entry_text in collection.find_fingerprint(layer_query=layer_fingerprints):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")


Delete layer

In [26]:
# Delete the detached layer named by `layer1` ("my_first_layer") from the collection.
collection.delete_layer(layer1)

### Indexing layers

An ngram index makes it possible to index ngrams of layer attribute values.
For example, a bigram index on an attribute with values `['see', 'on', 'esimene', 'lause']` will contain pairs *'see-on'*, *'on-esimene'*, *'esimene-lause'*.
Indices of a higher order are also supported.

To build an ngram index, provide an argument *ngram_index* when creating a new layer.
The following code creates a bi-gram index on an attribute *lemma* for a newly created layer *indexed_layer*:

In [27]:
indexed_layer = "indexed_layer"
indexed_layer_table = collection.layer_name_to_table_name(indexed_layer)
# Give this tagger its own name rather than rebinding `tagger1`: the lambda
# passed to the earlier create_layer call looks `tagger1` up at call time, so
# reusing the name would silently point it at a differently configured tagger.
indexed_tagger = VabamorfTagger(disambiguate=False, layer_name=indexed_layer)
collection.create_layer(indexed_layer,
                        callable=lambda t: indexed_tagger.tag(t, return_layer=True),
                        ngram_index={"lemma": 2}  # bigram (order 2) index on the 'lemma' attribute
                       )

To search an ngram index, use method *find_fingerprint* along with *layer_ngram_query* argument.

Search entries containing lemma bigram 'see-olema':

In [28]:
# One lemma bigram to look up in the ngram index of the indexed layer.
ngram_query = {
    indexed_layer: {"lemma": [("see", "olema")]},
}
for entry_key, entry_text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")
2 Text(text="see on teine lause")


Search 'teine-lause' OR 'olema-esimene':

In [29]:
# Bigrams listed side by side are alternatives: 'teine-lause' OR 'olema-esimene'.
ngram_query = {
    indexed_layer: {"lemma": [("teine", "lause"), ("olema", "esimene")]},
}
for entry_key, entry_text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")
2 Text(text="see on teine lause")


Search 'see-olema' AND 'olema-esimene':

In [30]:
# Bigrams grouped in an inner list must all match: 'see-olema' AND 'olema-esimene'.
required_bigrams = [("see", "olema"), ("olema", "esimene")]
ngram_query = {
    indexed_layer: {"lemma": [required_bigrams]},
}
for entry_key, entry_text in collection.find_fingerprint(layer_ngram_query=ngram_query):
    print(entry_key, entry_text)

1 Text(text="see on esimene lause")


Delete collection

In [31]:
# Delete the collection used in this section ("collection_with_layers").
collection.delete()

Close database connection

In [32]:
# Close the database connection held by the storage object.
storage.close()