In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections
client = Elasticsearch()

connections.configure(
    default={'hosts': 'localhost'},
)

In [2]:
from elasticsearch_dsl import analyzer
analyzer = analyzer(
    'my_analyzer',
    tokenizer="standard",
    filter=["morfologik_stem"],
    analyzer="morfologik"
)

In [7]:
from elasticsearch_dsl import Index
judgments = Index('judgments')
judgments.analyzer(analyzer)
judgments.delete(ignore=404)
judgments.create()


In [8]:
from elasticsearch_dsl import DocType, Text, Date, Keyword, Nested, InnerDoc

class Judge(InnerDoc):
    name = Text(analyzer=analyzer)

@judgments.doc_type
class Judgment(DocType):
    content = Text(analyzer=analyzer)
    judgment_date = Date()
    signature = Keyword()
    judges = Nested(Judge)

    class Meta:
        index = 'judgments'


In [9]:
# configs 
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
CHOSEN_YEAR = str(2011)
FILE_LIST = 'files.pickle'


In [11]:
import json
import os
import pickle

from tqdm import tqdm

def load_data():    
    files = pickle.load(open(FILE_LIST, 'rb'))
    files = os.listdir(DATA_DIR)
    results = []
    for file in tqdm(files):
        if file.startswith("judgment"):
            file_path = os.path.join(DATA_DIR, file)

            with open(file_path, 'r') as f:
                data = json.load(f)
                judgments = [x for x in data["items"] if x["judgmentDate"].startswith(CHOSEN_YEAR)]

            for judgment in judgments:
                Judgment(
                    content=judgment['textContent'],
                    judgment_date=judgment['judgmentDate'],
                    signature=judgment['id'],
                    judge=[Judge(name=judge['name']) for judge in judgment['judges']],
                ).save()
    return results


load_data()

100%|██████████| 3192/3192 [02:45<00:00, 19.28it/s]


[]

### 6. Znajdź liczbę orzeczeń, w których występuje słowo szkoda.
```
GET judgments/_search 
{
  "query": {
    "match": {
      "content": "szkoda"
    }
  }, 
  "explain": true
}
```

In [95]:
s = Judgment.search().extra(explain=True)
print(s.query("match", content="szkoda").to_dict())
szkoda_occurences = s.query("match", content="szkoda").execute()

{'query': {'match': {'content': 'szkoda'}}, 'explain': True}


In [38]:

szkoda_occurences.hits.total

413

### 7. Znajdź liczbę orzeczeń, w których występuje fraza trwały uszczerbek na zdrowiu, dokładnie w tej kolejności ale w dowolnej formie fleksyjnej.

```
GET judgments/_search
{
    "query": {
        "match_phrase": {
            "content": "trwały uszczerbek na zdrowiu"
        }
    }
}
```

In [69]:
from elasticsearch_dsl.query import Match, Q, MatchPhrase
m = Match(
    content={"query": "trwały uszczerbek na zdrowiu", "type": "phrase"},
)
# q = Q(
#     "match_phrase", 
#     query={"content": "trwały uszczerbek na zdrowiu"},
# )

# permanent_damage = s.query(q).execute()
m = MatchPhrase(
    content="trwały uszczerbek na zdrowiu"
)
print(s.query(m).to_dict())
permanent_damage = s.query(m).execute()

{'query': {'match_phrase': {'content': 'trwały uszczerbek na zdrowiu'}}, 'explain': True}


In [84]:
permanent_damage.hits.total

10

### 8. Jak wyżej, ale z uwzględnieniem możliwości wystąpienia maksymalnie 2 dodatkowych słów pomiędzy dowolnymi elementami frazy.
```
GET /_search
{
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "content" : "trwały" } },
                { "span_term" : { "content" : "uszczerbek" } },
                { "span_term" : { "content" : "na" } },
                { "span_term" : { "content" : "zdrowiu" } }
            ],
            "slop" : 2,
            "in_order" : true
        }
    }
}
```

In [93]:
from elasticsearch_dsl.query import SpanNear
span = SpanNear(
    clauses=[
        {"span_term": {"content": "trwały"}},
        {"span_term": {"content": "uszczerbek"}},
        {"span_term": {"content": "na"}},
        {"span_term": {"content": "zdrowiu"}},
    ],
    slop=2,
    in_order=True,
)
print(s.query(span).to_dict())
permanent_damage_with_span = s.query(span).execute()

{'query': {'span_near': {'clauses': [{'span_term': {'content': 'trwały'}}, {'span_term': {'content': 'uszczerbek'}}, {'span_term': {'content': 'na'}}, {'span_term': {'content': 'zdrowiu'}}], 'slop': 2, 'in_order': True}}, 'explain': True}


In [94]:
permanent_damage.hits.total

10