# Extracting and Storing Addresses

This tutorial demonstrates how to extract addresses from text and store the results in Postgres using the `PostgresStorage` class.

In [1]:
from estnltk import Text
from estnltk.taggers import AddressPartTagger, AddressGrammarTagger
from estnltk.storage.postgres import PostgresStorage, LayerQuery, RowMapperRecord
from estnltk.storage.postgres import delete_schema

In this tutorial we are going to use the following small toy dataset:

In [2]:
# Toy corpus: three short Estonian sentences, each mentioning a street address.
text_corpus = [
    'Kontor asub aadressil Rävala 5, Tallinn.',
    'Salong asub uuel aadressil, üle tee asuvas Rävala pst 7 hoones',
    'Korterite müük: Gonsiori tn 36, Tallinn',
]

First, let's save our dataset to the database:

In [3]:
# Open the Postgres storage using the pgpass file at ~/.pgpass; the
# 'grammarextractor' schema is created if it does not exist yet.
storage = PostgresStorage(
    pgpass_file='~/.pgpass',
    schema="grammarextractor",
    create_schema_if_missing=True,
)

# Register the collection that will hold the toy corpus.
collection = storage.add_collection("texts_with_addresses")

# Insert each sentence as a Text object tagged with the 'words' layer;
# the sentence's position in the corpus is used as its key.
with collection.insert() as insert_text:
    for key, raw_text in enumerate(text_corpus):
        tagged_text = Text(raw_text).tag_layer(['words'])
        insert_text(tagged_text, key=key)

INFO:storage.py:52: connecting to host: 'localhost', port: '5432', dbname: 'test_db', user: 'postgres'
INFO:storage.py:70: new schema 'grammarextractor' created
INFO:storage.py:80: schema: 'grammarextractor', temporary: False, role: 'postgres'
INFO:storage.py:156: new empty collection 'texts_with_addresses' created
INFO:collection_text_object_inserter.py:100: inserted 3 texts into the collection 'texts_with_addresses'


In [4]:
# Display the storage summary (collections, relations, row counts and sizes).
storage

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rows,total_size,comment
collection,version,relations,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
texts_with_addresses,3.0,,0,56 kB,created by postgres on Mon Oct 24 15:48:36 2022
texts_with_addresses,3.0,structure,0,32 kB,


In [5]:
# Display the layer structure of the collection.
collection

Unnamed: 0,layer_type,attributes,ambiguous,sparse,parent,enveloping,meta
tokens,attached,(),False,False,,,[]
words,attached,"(normalized_form,)",True,False,,,[]
compound_tokens,attached,"(type, normalized)",False,False,,tokens,[]


Next, we extract addresses and save them in a separate layer:

In [6]:
# Tagger whose output layer is named 'address_tokens'.
address_token_tagger = AddressPartTagger(output_layer='address_tokens')


def row_mapper_1(row):
    """Tag the text found in ``row`` and wrap its 'address_tokens' layer.

    ``row`` is indexed positionally: ``row[0]`` is read as the text id and
    ``row[1]`` is the object handed to ``address_token_tagger``.

    Returns a one-element list holding a ``RowMapperRecord`` with the
    'address_tokens' layer and ``meta=None``.

    NOTE(review): this helper is not passed to the ``create_layer`` call
    below, which receives the tagger directly -- confirm whether it is
    still needed.
    """
    text_id, text_obj = row[0], row[1]
    tagged = address_token_tagger.tag(text_obj)
    return [RowMapperRecord(layer=tagged["address_tokens"], meta=None)]


# Create the tagger's layer in the collection, stored as a sparse layer.
collection.create_layer(tagger=address_token_tagger, sparse=True)


# Grammar tagger that reads the 'address_tokens' layer and writes 'addresses'.
address_tagger = AddressGrammarTagger(
    output_layer='addresses',
    input_layer='address_tokens',
)


def row_mapper_2(row):
    """Tag the text found in ``row`` and wrap its 'addresses' layer.

    ``row`` is indexed positionally: ``row[0]`` is read as the text id and
    ``row[1]`` is the object handed to ``address_tagger``.

    Returns a one-element list holding a ``RowMapperRecord`` with the
    'addresses' layer and ``meta=None``.

    NOTE(review): this helper is not passed to the ``create_layer`` call
    below, which receives the tagger directly -- confirm whether it is
    still needed.
    """
    text_id, text_obj = row[0], row[1]
    tagged = address_tagger.tag(text_obj)
    return [RowMapperRecord(layer=tagged['addresses'], meta=None)]


# Create the tagger's layer in the collection, stored as a sparse layer.
collection.create_layer(tagger=address_tagger, sparse=True)

# Display the updated layer structure of the collection.
collection

INFO:collection.py:915: collection: 'texts_with_addresses'
INFO:collection.py:934: preparing to create a new layer: 'address_tokens'
INFO:collection.py:966: inserting data into the 'address_tokens' layer table
INFO:collection_detached_layer_inserter.py:86: inserted 3 detached 'address_tokens' layers into the collection 'texts_with_addresses'
INFO:collection.py:1001: layer created: 'address_tokens'
INFO:collection.py:915: collection: 'texts_with_addresses'
INFO:collection.py:934: preparing to create a new layer: 'addresses'
INFO:collection.py:966: inserting data into the 'addresses' layer table
INFO:collection_detached_layer_inserter.py:86: inserted 3 detached 'addresses' layers into the collection 'texts_with_addresses'
INFO:collection.py:1001: layer created: 'addresses'


Unnamed: 0,layer_type,attributes,ambiguous,sparse,parent,enveloping,meta
tokens,attached,(),False,False,,,[]
words,attached,"(normalized_form,)",True,False,,,[]
compound_tokens,attached,"(type, normalized)",False,False,,tokens,[]
address_tokens,detached,"(grammar_symbol, type)",True,True,,,[]
addresses,detached,"(grammar_symbol, TÄNAV, MAJA, ASULA, MAAKOND, INDEKS)",True,True,,address_tokens,[]


Let's now load one text object and see what's inside:

In [7]:
# Select the layers to include when texts are fetched from the collection.
collection.selected_layers = [
    'tokens',
    'compound_tokens',
    'words',
    'address_tokens',
    'addresses',
]

# Fetch the text stored under key 0 and display it.
collection[0]

text
"Kontor asub aadressil Rävala 5, Tallinn."

layer name,attributes,parent,enveloping,ambiguous,span count
tokens,,,,False,8
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,8
address_tokens,"grammar_symbol, type",,,True,4
addresses,"grammar_symbol, TÄNAV, MAJA, ASULA, MAAKOND, INDEKS",,address_tokens,True,1


As we can see, the `addresses` layer has the attributes TÄNAV, MAJA, ASULA, MAAKOND and INDEKS, which can be used in searches. For example, we can search for records containing the street name 'Rävala' and the house number '5':

In [8]:
# Query the 'addresses' layer for street 'Rävala' combined with house number '5'.
street_and_house_query = LayerQuery(
    layer_name="addresses",
    TÄNAV='Rävala',
    MAJA='5',
)

# Print every text returned for the query.
matching_texts = collection.select(query=street_and_house_query)
for key, text in matching_texts:
    print(text)

Text(text='Kontor asub aadressil Rävala 5, Tallinn.')


We can also search for the street name 'Gonsiori tn':

In [9]:
# Query the 'addresses' layer for the street name 'Gonsiori tn' only.
street_query = LayerQuery(
    layer_name="addresses",
    TÄNAV="Gonsiori tn",
)

# Print every text returned for the query.
matching_texts = collection.select(query=street_query)
for key, text in matching_texts:
    print(text)

Text(text='Korterite müük: Gonsiori tn 36, Tallinn')


In [10]:
# Clean up: drop the tutorial schema created at the start of the notebook.
delete_schema(storage)

# Close the database connection so it does not stay open until the kernel
# is shut down.
storage.close()