# Lemmatization and full text search (FTS)

1. Install ElasticSearch (ES).
2. Install an ES plugin for Polish https://github.com/allegro/elasticsearch-analysis-morfologik

In [2]:
from elasticsearch import Elasticsearch
es = Elasticsearch(['https://localhost:9200/'], verify_certs = False, basic_auth=('elastic','1CtGXMjQe8eRoAfQZCUS'))

  _transport = transport_class(


In [3]:
es.info()



ObjectApiResponse({'name': 'KRKML0004.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': '4RgVB8ahROeKb-55kRVLEQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

3. Define an ES analyzer for Polish texts containing:
    * standard tokenizer
    * synonym filter with the following definitions:
        - kpk - kodeks postępowania karnego
        - kpc - kodeks postępowania cywilnego
        - kk - kodeks karny
        - kc - kodeks cywilny
    * Morfologik-based lemmatizer
    * lowercase filter

In [4]:
index_settings={
    "settings": {
        "index":{
            'analysis': {
                "analyzer": {
                    "defined_analyzer":{
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "polish_synonym", "morfologik_stem"]
                    }
                },
                "filter": {
                    "polish_synonym":{
                        "type": "synonym",
                        "tokenizer": "standard",
                        "synonyms": [
                            "kpk, kodeks postępowania karnego",
                            "kpc, kodeks postępowania cywilnego",
                            "kk, kodeks karny",
                            "kc, kodeks cywilny"
                        ]
                    }
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "defined_analyzer"
            },
            "name": {"type": "text"}
        }
    }
}

4. Define an ES index for storing the contents of the legislative acts.

In [6]:
es.indices.delete(index="polish_index")

es.indices.create(index="polish_index", body=index_settings)

  es.indices.create(index="polish_index", body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'polish_index'})

5. Load the data to the ES index.

In [10]:
import os
path = "../ustawy"
ustawy = []
os.chdir(path)
  
    
for file in os.listdir():
    if file.endswith(".txt"):
        file_path = f"{file}"
        with open(file_path, 'r') as f:
            text = f.read()
            es.create(
                index="polish_index",
                id=file,
                body={"text": text},
            )

  es.create(


























































































6. Determine the number of legislative acts containing the word ustawa (in any form).

In [12]:
es.search(
    body={
        "query": {
            "match": {
                "text": {
                    "query": "ustawa"
                }
            }
        }
    },
    index="polish_index",
)["hits"]["total"]["value"]

  es.search(


1178

7. Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.

In [25]:
ustawa_search = es.search(index="polish_index", body={
    "size": 10000,
    "query": {
        "query_string": {
            "fields": ["text"],
            "query": "ustawa",
            "analyzer": "defined_analyzer"
        }
    }
})

ustawa_count = 0
for h in ustawa_search['hits']['hits']:
    vectors = es.termvectors(
            index="polish_index",
            id=h['_id'],
            fields=['text'], 
            field_statistics=False
    )
    ustawa_count += vectors['term_vectors']['text']['terms']["ustawa"]['term_freq']
    
ustawa_count

  ustawa_search = es.search(index="polish_index", body={










































































24934

8. Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.

In [26]:
ustaw_search = es.search(index="polish_index", body={
#     "size": 10000,
    "query": {
        "query_string": {
            "fields": ["text"],
            "query": "ustawić",
            "analyzer": "defined_analyzer"
        }
    }
})

ustaw_count = 0
for h in ustaw_search['hits']['hits']:
    vectors = es.termvectors(
            index="polish_index",
            id=h['_id'],
            fields=['text'], 
            field_statistics=False
    )
    ustaw_count += vectors['term_vectors']['text']['terms']["ustawić"]['term_freq']
    
ustaw_count

  ustaw_search = es.search(index="polish_index", body={


82

In [27]:
ustaw_search['hits']['total']

{'value': 378, 'relation': 'eq'}

In [28]:
ustaw_count + ustawa_count

25016

9. Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form.

In [29]:
es.search(
    index="polish_index",
    body={
        "query": {
            "match_phrase": {
                "text": {
                    "query": "kodeks postępowania cywilnego"
                }
            }
        }
    }
)["hits"]["total"]["value"]

  es.search(


99

10. Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.

In [30]:
es.search(
    index="polish_index",
    body={
        "query": {
            "match_phrase": {
                "text": {
                    "query": "wchodzi w życie",
                    "slop": 2
                }
            }
        }
    }
)["hits"]["total"]["value"]

  es.search(


1174

11. Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [44]:
konstytucja_search = es.search(
    index="polish_index",
    body={
        "query": {
            "match": {
                "text": {
                    "query": "konstytucja",
                }
            }
        },
        "highlight": {
            "fields": {
                "text": {}
            },
            "boundary_scanner": "sentence",
            "number_of_fragments": 3
        }
    },
    sort="_score",
    size="10"
)["hits"]["hits"]

for document in konstytucja_search:
    print(f"Document: {document['_id']} Score: {document['_score']}")

Document: 1997_629.txt Score: 6.869376
Document: 2000_443.txt Score: 6.6642833
Document: 1997_604.txt Score: 6.633483
Document: 1996_350.txt Score: 6.628302
Document: 1997_642.txt Score: 6.2530584
Document: 2001_23.txt Score: 6.0589767
Document: 1996_199.txt Score: 5.9289904
Document: 1999_688.txt Score: 5.8507533
Document: 2001_1082.txt Score: 5.467437
Document: 1997_681.txt Score: 5.467437


  konstytucja_search = es.search(


12. Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.

In [46]:
for document in konstytucja_search:
    for highlight in document['highlight']['text']:
        print(f"Document: {document['_id']}\n Text: {highlight}\n\n")

Document: 1997_629.txt
 Text: o zmianie ustawy konstytucyjnej o trybie przygotowania
           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej


Document: 1997_629.txt
 Text: W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i 
uchwalenia <em>Konstytucji</em>


Document: 1997_629.txt
 Text: Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz 
                obywateli popierających zgłoszenie


Document: 2000_443.txt
 Text: umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89
     ust. 1 lub art. 90 <em>Konstytucji</em>


Document: 2000_443.txt
 Text: międzynarodowej lub załącznika nie
     wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>


Document: 2000_443.txt
 Text: co do zasadności wyboru
  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2
  <em>Konstytucji</em>


Document: 1997_604.txt
 Text: Jeżeli Trybunał Konstytucyjny wyda orzeczenie o sprzeczności celów partii 
  