In [None]:
import requests

PORT = "9200"
URL = "http://localhost:" + PORT + "/"

def parse_analysis(string):
    split_input = [x.strip() for x in string.split("\t") if x != ""]
    
    if len(split_input) != 3:
        return ()
    
    base, form_explicit, _disamb = split_input
    form = form_explicit.split(":")[0]
    
    return (base, form)

def analyze(string):
    content = requests.post(URL, data=string.encode('utf-8')).text
    analyzed_words = [parse_analysis(analysis) for analysis in content.split("\n")[1::2] if analysis != ""]
    
    return [analysis for analysis in analyzed_words if analysis != ()]
    
def is_valid_unigram(unigram):
    word, analysis = unigram
    return analysis != "interp" and not analysis.startswith("num")


In [None]:
analyze("ala ma kota\n\n\n\n12     123.")

In [3]:
FILE_LIST = 'files.p'
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
CHOSEN_YEAR = "2011"

In [8]:
import regex as re

judgment_types = {
    "I":  "A?C.*",
    "II": "A?U.*",
    "III": "A?K.*",
    "IV": "G.*",
    "V": "A?P.*",
    "VI": "R.*",
    "VII": "W.*",
    "VIII": "Am.*",
}

def get_judgment_type(number):
    for key, value in judgment_types.items():
        if re.search(value, number):
            return key
    print("Unmatched {}".format(number))

In [14]:
import os, json, pickle
from collections import defaultdict

import regex
from tqdm import tqdm

word_pattern = "\p{Letter}+"


def load_data():
    total_judgments = []
    files = pickle.load(open(FILE_LIST, 'rb'))
    
    for file in tqdm(files):
        if file.startswith("judgment"):
            file_path = os.path.join(DATA_DIR, file)

            with open(file_path, 'r') as f:
                data = json.load(f)
                judgments = [
                    (x["textContent"], x["courtCases"][0]["caseNumber"]) for x in data["items"] 
                    if x["judgmentDate"].startswith(CHOSEN_YEAR) and x["courtType"] in ("COMMON", "SUPREME")
                ]
                
            total_judgments += judgments
    
    print("Found {} judgments from common and supreme courts".format(len(total_judgments)))
    
    judgments_with_explanations = []    
    for judgment, _case_num in total_judgments:        
        match = regex.search("uzasadnienie", judgment, regex.IGNORECASE)        
        if match:
            judgments_with_explanations.append((judgment[match.end():], _case_num))
            
    print("{} of them have explanation section".format(len(judgments_with_explanations)))
    
    analyzed_judgments = defaultdict(lambda: [])
    raw_judgments = defaultdict(lambda: [])
    
    for judgment, case_number in tqdm(judgments_with_explanations):
        
        judgment = regex.sub("<.*?>", "", judgment)
        judgment = regex.sub("-\n(\p{Letter}+)", r"\1", judgment)
        judgment = judgment.lower()

        judgment_type = get_judgment_type(case_number)

        raw_judgments[judgment_type].append(judgment)
        analyzed_judgments[judgment_type].append(analyze(judgment))
    
    return analyzed_judgments, raw_judgments

analyzed_judgments, raw_judgments = load_data()

100%|██████████| 68/68 [00:01<00:00, 56.75it/s]
  0%|          | 0/3215 [00:00<?, ?it/s]

Found 3791 judgments from common and supreme courts
3215 of them have explanation section


  2%|▏         | 58/3215 [02:24<2:10:53,  2.49s/it]

Unmatched II Ns 576/08


  4%|▍         | 138/3215 [05:19<1:58:49,  2.32s/it]

Unmatched SNO 6/11


  7%|▋         | 230/3215 [08:29<1:50:13,  2.22s/it]

Unmatched SNO 19/11


 10%|▉         | 309/3215 [14:10<2:13:17,  2.75s/it]

Unmatched SDI 11/11


 12%|█▏        | 395/3215 [17:54<2:07:49,  2.72s/it]

Unmatched SNO 24/11


 16%|█▌        | 522/3215 [23:10<1:59:31,  2.66s/it]

Unmatched III SO 4/11


 17%|█▋        | 550/3215 [24:03<1:56:34,  2.62s/it]

Unmatched SNO 22/11


 17%|█▋        | 551/3215 [24:04<1:56:25,  2.62s/it]

Unmatched SNO 26/11


 17%|█▋        | 552/3215 [24:06<1:56:17,  2.62s/it]

Unmatched SNO 27/11


 17%|█▋        | 553/3215 [24:06<1:56:05,  2.62s/it]

Unmatched SNO 28/11


 22%|██▏       | 711/3215 [30:06<1:46:00,  2.54s/it]

Unmatched SNO 29/11


 26%|██▌       | 824/3215 [34:10<1:39:09,  2.49s/it]

Unmatched III SO 9/11


 43%|████▎     | 1390/3215 [59:55<1:18:40,  2.59s/it]

Unmatched SDI 26/10


 45%|████▌     | 1451/3215 [1:03:30<1:17:13,  2.63s/it]

Unmatched SNO 53/10


 48%|████▊     | 1549/3215 [1:06:46<1:11:49,  2.59s/it]

Unmatched III SO 18/10


 48%|████▊     | 1551/3215 [1:06:49<1:11:41,  2.59s/it]

Unmatched III ZS 19/10


 49%|████▉     | 1575/3215 [1:07:40<1:10:27,  2.58s/it]

Unmatched SNO 54/10


 49%|████▉     | 1576/3215 [1:07:40<1:10:23,  2.58s/it]

Unmatched SNO 55/10


 53%|█████▎    | 1696/3215 [1:12:38<1:05:03,  2.57s/it]

Unmatched III ZS 21/10


 54%|█████▍    | 1740/3215 [1:14:47<1:03:24,  2.58s/it]

Unmatched III SO 19/10


 54%|█████▍    | 1743/3215 [1:14:50<1:03:12,  2.58s/it]

Unmatched III SZ 2/10


 60%|█████▉    | 1918/3215 [1:21:30<55:06,  2.55s/it]  

Unmatched SDI 3/11


 61%|██████    | 1957/3215 [1:22:57<53:19,  2.54s/it]

Unmatched III ZS 1/11


 62%|██████▏   | 1989/3215 [1:23:57<51:44,  2.53s/it]

Unmatched III BO 1/10


 62%|██████▏   | 1993/3215 [1:24:04<51:33,  2.53s/it]

Unmatched III SO 1/11


 62%|██████▏   | 1994/3215 [1:24:05<51:29,  2.53s/it]

Unmatched III SO 2/11


 69%|██████▉   | 2222/3215 [1:30:54<40:37,  2.45s/it]

Unmatched SNO 3/11


 73%|███████▎  | 2352/3215 [1:35:36<35:04,  2.44s/it]

Unmatched SNO 11/11


 75%|███████▌  | 2426/3215 [1:38:11<31:55,  2.43s/it]

Unmatched III SZ 1/11


 76%|███████▌  | 2438/3215 [1:38:30<31:23,  2.42s/it]

Unmatched III ZS 3/11


 76%|███████▌  | 2439/3215 [1:38:31<31:20,  2.42s/it]

Unmatched III ZS 4/11


 81%|████████  | 2610/3215 [1:44:27<24:12,  2.40s/it]

Unmatched SNO 33/11


 81%|████████  | 2611/3215 [1:44:30<24:10,  2.40s/it]

Unmatched SNO 34/11


 98%|█████████▊| 3137/3215 [2:02:27<03:02,  2.34s/it]

Unmatched SNO 40/11


 98%|█████████▊| 3138/3215 [2:02:28<03:00,  2.34s/it]

Unmatched SNO 42/11


 98%|█████████▊| 3158/3215 [2:03:08<02:13,  2.34s/it]

Unmatched SDI 22/11


100%|██████████| 3215/3215 [2:06:11<00:00,  2.35s/it]


In [24]:
import pickle
pickle.dump(list(analyzed_judgments.items()), open("analyzed_judgments.p", "wb"))
pickle.dump(list(raw_judgments.items()), open("raw_judgments.p", "wb"))


In [28]:
print([len(x) for x in raw_judgments.values()])
print([len(x) for x in analyzed_judgments.values()])

[2100, 510, 371, 132, 36, 33, 8, 25]
[2100, 510, 371, 132, 36, 33, 8, 25]
