In [18]:
import requests

PORT = "9200"
URL = "http://localhost:" + PORT + "/"

def parse_analysis(string):
    split_input = [x.strip() for x in string.split("\t") if x != ""]
    
    if len(split_input) != 3:
        return ()
    
    base, form_explicit, _disamb = split_input
    form = form_explicit.split(":")[0]
    
    return (base, form)

def analyze(string):
    content = requests.post(URL, data=string.encode('utf-8')).text
    analyzed_words = [parse_analysis(analysis) for analysis in content.split("\n")[1::2] if analysis != ""]
    
    return [analysis for analysis in analyzed_words if analysis != ()]
    
def is_valid_unigram(unigram):
    word, analysis = unigram
    return analysis != "interp" and not analysis.startswith("num")


In [20]:
analyze("ala ma kota\n\n\n\n12     123.")

[('Ala', 'subst'), ('mieć', 'fin'), ('kot', 'subst')]

In [21]:
FILE_LIST = 'files.p'
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
CHOSEN_YEAR = "2011"

In [30]:
import regex as re

judgment_types = {
    "I":  "A?C.*",
    "II": "A?U.*",
    "III": "A?K.*",
    "IV": "G.*",
    "V": "A?P.*",
    "VI": "R.*",
    "VII": "W.*",
    "VIII": "Am.*",
}

def get_judgment_type(number):
    for key, value in judgment_types.items():
        if re.match(value, number):
            print("Found a match for {}".format(number))
            return key
    print("Did not match for {}".format(
        number
    ))

In [None]:
import os, json, pickle
from collections import defaultdict

import regex
from tqdm import tqdm

word_pattern = "\p{Letter}+"


def load_data():
    total_judgments = []
    files = pickle.load(open(FILE_LIST, 'rb'))
    
    for file in tqdm(files):
        if file.startswith("judgment"):
            file_path = os.path.join(DATA_DIR, file)

            with open(file_path, 'r') as f:
                data = json.load(f)
                judgments = [
                    (x["textContent"], x["courtCases"][0]["caseNumber"]) for x in data["items"] 
                    if x["judgmentDate"].startswith(CHOSEN_YEAR) and x["courtType"] in ("COMMON", "SUPREME")
                ]
                
            total_judgments += judgments

    judgments_with_explanations = []    
    for judgment, _case_num in total_judgments:
        explanation_pattern = "[uU]zasadnienie:"
        
        match = regex.search(explanation_pattern, judgment)        
        if match:
            judgments_with_explanations.append((judgment[match.end():], _case_num))
            
    analyzed_judgments = defaultdict(lambda: [])
    raw_judgments = defaultdict(lambda: [])
    
    for judgment, case_number in tqdm(judgments_with_explanations):
        
        judgment = regex.sub("<.*?>", "", judgment)
        judgment = regex.sub("-\n(\p{Letter}+)", r"\1", judgment)
        judgment = judgment.lower()

        judgment_type = get_judgment_type(case_number)

        raw_judgments[judgment_type].append(judgment)
        analyzed_judgments[judgment_type].append(analyze(judgment))
        
    
    return analyzed_judgments, raw_judgments

analyzed_judgments, raw_judgments = load_data()

100%|██████████| 68/68 [00:01<00:00, 51.50it/s]
  0%|          | 0/18 [00:00<?, ?it/s]

Did not match for III KRS 4/11


  6%|▌         | 1/18 [00:00<00:15,  1.10it/s]

Did not match for III UK 174/10


 11%|█         | 2/18 [00:02<00:22,  1.38s/it]

Did not match for III K 48/11


 17%|█▋        | 3/18 [00:03<00:15,  1.02s/it]

Did not match for V Ca 1179/11


 22%|██▏       | 4/18 [00:07<00:26,  1.88s/it]

Did not match for III AUa 191/11


 28%|██▊       | 5/18 [00:11<00:29,  2.30s/it]

Did not match for V Ca 1140/11


 33%|███▎      | 6/18 [00:13<00:26,  2.24s/it]

Did not match for III ZS 21/10


In [40]:
len(analyzed_judgments.keys())

1

In [41]:
list(analyzed_judgments.values())[0]

[[('kolegium', 'subst'),
  ('sąd', 'subst'),
  ('okręgowy', 'adj'),
  (',', 'interp'),
  ('działać', 'pcon'),
  ('na', 'prep'),
  ('podstawa', 'subst'),
  ('artykuł', 'brev'),
  ('.', 'interp'),
  ('73', 'adj'),
  ('§', 'interp'),
  ('2', 'adj'),
  ('i', 'conj'),
  ('3', 'adj'),
  ('prawo', 'subst'),
  ('o', 'prep'),
  ('rok', 'brev'),
  ('.', 'interp'),
  ('o', 'prep'),
  ('krajowy', 'adj'),
  ('rada', 'subst'),
  ('sądownictwo', 'subst'),
  ('(', 'interp'),
  ('jednolity', 'adj'),
  ('tekst', 'subst'),
  (':', 'interp'),
  ('dziennik', 'brev'),
  ('.', 'interp'),
  ('u', 'prep'),
  ('.', 'interp'),
  ('z', 'prep'),
  ('2010', 'adj'),
  ('rok', 'brev'),
  ('.', 'interp'),
  ('numer', 'brev'),
  ('11', 'num'),
  (',', 'interp'),
  ('pozycja', 'brev'),
  ('.', 'interp'),
  ('67', 'adj'),
  (')', 'interp'),
  ('1922', 'num'),
  ('/', 'interp'),
  ('2010', 'num'),
  ('w', 'prep'),
  ('sprawa', 'subst'),
  ('odmowa', 'subst'),
  ('przeniesienie', 'subst'),
  ('w', 'prep'),
  ('stan', 'subs