In [119]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections
# Low-level client created with the library's default arguments.
client = Elasticsearch()

# Register the default elasticsearch_dsl connection against the local host.
connections.configure(default={'hosts': 'localhost'})

In [120]:
from elasticsearch_dsl import analyzer as build_analyzer

# Custom analyzer "std_pl": standard tokenizer followed by the
# `morfologik_stem` token filter.
# The factory is imported under an alias so that binding the result to
# `analyzer` (the name the index and mapping cells refer to) does not
# overwrite the factory function itself.
analyzer = build_analyzer(
    'std_pl',
    tokenizer="standard",
    filter=["morfologik_stem"],
)

In [121]:
from elasticsearch_dsl import Index
# Handle for the 'judgments' index, with the custom analyzer attached to its settings.
judgments = Index('judgments')
judgments.analyzer(analyzer)
# One-off step, left disabled (presumably so re-running the cell does not drop the index):
# judgments.delete(ignore=404)
# Show the settings that would be sent when the index is created.
print(judgments.to_dict())
# One-off step, left disabled (presumably because the index already exists):
# judgments.create()


{'settings': {'analysis': {'analyzer': {'std_pl': {'tokenizer': 'standard', 'filter': ['morfologik_stem'], 'type': 'custom'}}}}}


In [146]:
from elasticsearch_dsl import DocType, Text, Date, Keyword, Nested, InnerDoc

# Close the index before the document classes are declared; it is reopened
# right before loading data.
# NOTE(review): presumably needed so analysis settings can be updated — confirm.
judgments.close()

class Judge(InnerDoc):
    """Inner document describing one judge attached to a judgment."""
    # The keyword field type does not accept an `analyzer` parameter
    # (Elasticsearch rejects such a mapping), so the field is declared plain.
    name = Keyword()

@judgments.doc_type
class Judgment(DocType):
    """Document mapping for a single court judgment stored in the 'judgments' index."""
    # Full text of the judgment, analyzed with the custom "std_pl" analyzer.
    content = Text(analyzer='std_pl', fielddata=True)
    judgment_date = Date()
    signature = Keyword()
    # NOTE(review): the field is declared as `judges`, but load_data() passes
    # the list under the keyword argument `judge` — confirm which name is intended.
    judges = Nested(Judge)

    class Meta:
        index = 'judgments'

In [123]:
# Notebook-wide configuration.
# NOTE: DATA_DIR is a machine-specific absolute path; adjust it before running elsewhere.
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
# Year prefix compared against each item's "judgmentDate" string.
CHOSEN_YEAR = "2011"
# File name of a pickled file list.
FILE_LIST = 'files.pickle'


In [124]:
import json
import os
import pickle

from tqdm import tqdm

# Reopen the index (it is closed in the mapping cell) so documents can be saved.
judgments.open()

def load_data():
    """Index every judgment dated in ``CHOSEN_YEAR`` found under ``DATA_DIR``.

    The directory listing is read fresh on every call. Each entry whose name
    starts with ``"judgment"`` is parsed as JSON; the elements of its
    ``"items"`` list whose ``"judgmentDate"`` starts with ``CHOSEN_YEAR`` are
    saved one by one as ``Judgment`` documents.

    Returns:
        list: always empty -- nothing is collected; the value is kept so that
        existing callers still receive a list.
    """
    results = []
    for file_name in tqdm(os.listdir(DATA_DIR)):
        if not file_name.startswith("judgment"):
            continue

        file_path = os.path.join(DATA_DIR, file_name)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Local name kept distinct from the module-level `judgments` Index object.
        matching_items = [
            item for item in data["items"]
            if item["judgmentDate"].startswith(CHOSEN_YEAR)
        ]

        for item in matching_items:
            Judgment(
                content=item['textContent'],
                judgment_date=item['judgmentDate'],
                signature=item['id'],
                # NOTE(review): the mapping declares `judges`, but the list is
                # stored under `judge`; the aggregation on "judge.name.keyword"
                # relies on this key, so it is intentionally left unchanged.
                judge=[Judge(name=judge['name']) for judge in item['judges']],
            ).save()
    return results

# Run the import; the recorded run below processed 3192 files in about 3m45s.
load_data()

100%|██████████| 3192/3192 [03:45<00:00, 14.13it/s]


[]

### 6. Znajdź liczbę orzeczeń, w których występuje słowo szkoda.
```
GET judgments/_search 
{
  "query": {
    "match": {
      "content": "szkoda"
    }
  }, 
  "explain": true
}
```

In [132]:
# Base search over Judgment documents with `explain=True` added to the request
# body; `s` is reused by the later query cells.
s = Judgment.search().extra(explain=True)
# Task 6: match query for "szkoda" on the `content` field.
query_content = s.query("match", content="szkoda")
print(query_content.to_dict())
szkoda_occurences = query_content.execute()

# Total number of matching documents reported by the response.
print(szkoda_occurences.hits.total)

{'query': {'match': {'content': 'szkoda'}}, 'explain': True}
1828


### 7. Znajdź liczbę orzeczeń, w których występuje fraza trwały uszczerbek na zdrowiu, dokładnie w tej kolejności ale w dowolnej formie fleksyjnej.

```
GET judgments/_search
{
    "query": {
        "match_phrase": {
            "content": "trwały uszczerbek na zdrowiu"
        }
    }
}
```

In [126]:
from elasticsearch_dsl.query import Match, Q, MatchPhrase
# Task 7: phrase query -- the words must appear in exactly this order in `content`.
m = MatchPhrase(content="trwały uszczerbek na zdrowiu")

# Build the search once and reuse it for both the preview and the request.
phrase_search = s.query(m)
print(phrase_search.to_dict())
permanent_damage = phrase_search.execute()

{'query': {'match_phrase': {'content': 'trwały uszczerbek na zdrowiu'}}, 'explain': True}


In [127]:
permanent_damage.hits.total

24

### 8. Jak wyżej, ale z uwzględnieniem możliwości wystąpienia maksymalnie 2 dodatkowych słów pomiędzy dowolnymi elementami frazy.
```
GET judgments/_search
{
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "content" : "trwały" } },
                { "span_term" : { "content" : "uszczerbek" } },
                { "span_term" : { "content" : "na" } },
                { "span_term" : { "content" : "zdrowie" } }
            ],
            "slop" : 2,
            "in_order" : true
        }
    }
}
```

In [128]:
from elasticsearch_dsl.query import SpanNear
# Task 8: ordered span query allowing a slop of 2 between the terms.
# span_term is a term-level query, so its values are not analyzed; the last
# term is given as "zdrowie" (presumably the stemmed form of "zdrowiu").
span_terms = ["trwały", "uszczerbek", "na", "zdrowie"]
span = SpanNear(
    clauses=[{"span_term": {"content": term}} for term in span_terms],
    slop=2,
    in_order=True,
)

# Build the search once and reuse it for both the preview and the request.
span_search = s.query(span)
print(span_search.to_dict())
permanent_damage_with_span = span_search.execute()

{'query': {'span_near': {'clauses': [{'span_term': {'content': 'trwały'}}, {'span_term': {'content': 'uszczerbek'}}, {'span_term': {'content': 'na'}}, {'span_term': {'content': 'zdrowie'}}], 'slop': 2, 'in_order': True}}, 'explain': True}


In [134]:
permanent_damage_with_span.hits.total

24

In [143]:
from elasticsearch_dsl import aggs, Search
# Terms aggregation over judge names; "size": 0 requests no hits, only buckets.
# NOTE(review): "judge.name.keyword" is not a declared mapping field (the
# mapping declares `judges.name`); it presumably exists through dynamic mapping
# of the `judge` key written by load_data() — confirm.
body = {
    "size": 0,
    "aggs": {
        "judge": {
            "terms": {
                "field": "judge.name.keyword",
            }
        }
    }
}
# Derive the search from the Judgment document class so the index and document
# type always match what Judgment.save() wrote, instead of a hard-coded type
# name that may differ from the one the library assigns.
query = Judgment.search().update_from_dict(body)
judges = query.execute()

[]