In [118]:
import requests

PORT = "9200"
URL = "http://localhost:" + PORT + "/"

def parse_analysis(string):
    split_input = [x.strip() for x in string.split("\t") if x != ""]
    
    if len(split_input) != 3:
        return ()
    
    base, form_explicit, _disamb = split_input
    form = form_explicit.split(":")[0]
    
    return (base, form)

def analyze(string):
    content = requests.post(URL, data=string.encode('utf-8')).text
    analyzed_words = [parse_analysis(analysis) for analysis in content.split("\n")[1::2] if analysis != ""]
    
    return [analysis for analysis in analyzed_words if analysis != ()]
    
def is_valid_unigram(unigram):
    word, analysis = unigram
    return analysis != "interp" and not analysis.startswith("num")


In [119]:
analyze("ala ma kota\n\n\n\n12     123.")

[('Ala', 'subst'), ('mieć', 'fin'), ('kot', 'subst')]

In [127]:
FILE_LIST = 'files.p'
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
CHOSEN_YEAR = "2011"

In [129]:
import os, json, pickle
from collections import defaultdict

import regex
from tqdm import tqdm

word_pattern = "\p{Letter}+"


def load_data():
    total_judgments = []
    files = pickle.load(open(FILE_LIST, 'rb'))
    
    for file in tqdm(files):
        if file.startswith("judgment"):
            file_path = os.path.join(DATA_DIR, file)

            with open(file_path, 'r') as f:
                data = json.load(f)
                judgments = [x["textContent"] for x in data["items"] if x["judgmentDate"].startswith(CHOSEN_YEAR)]
            total_judgments += judgments
            
    analyzed_judgments = []
    
#     i = 50
    for judgment in tqdm(total_judgments):
        previous_word = None

        judgment = regex.sub("<.*?>", "", judgment)
        judgment = regex.sub("-\n(\p{Letter}+)", r"\1", judgment)
        judgment = judgment.lower()
        
        analyzed_judgments.append(analyze(judgment))
        
#         i -= 1
#         if i == 0:
#             break
    
    return analyzed_judgments

analyzed_judgments = load_data()

# print(len(bigrams))


100%|██████████| 68/68 [00:01<00:00, 56.13it/s]
100%|██████████| 6699/6699 [5:21:51<00:00,  2.88s/it]  


In [130]:
# import pickle

# with open('analyzed_judgments.p', 'wb') as f:
#     pickle.dump(analyzed_judgments, f)

In [None]:
import pickle

analyzed_judgments = pickle.load(open('analyzed_judgments.p', 'rb'))

In [131]:
for x in analyzed_judgments[1]:
    print(x)

('sygnatura', 'brev')
('.', 'interp')
('akt', 'subst')
('v', 'subst')
('circa', 'brev')
('264', 'num')
('/', 'interp')
('11', 'adj')
('dzień', 'subst')
('29', 'adj')
('kwiecień', 'subst')
('2011', 'adj')
('rok', 'brev')
('.', 'interp')
('w', 'prep')
('skład', 'subst')
(':', 'interp')
('sędzia', 'subst')
('SO', 'subst')
('Ewa', 'subst')
('cylc', 'subst')
('so', 'xxx')
('Agnieszka', 'subst')
('fronczak', 'subst')
('po', 'prep')
('rozpoznać', 'ger')
('w', 'prep')
('dzień', 'subst')
('29', 'adj')
('kwiecień', 'subst')
('2011', 'adj')
('rok', 'brev')
('.', 'interp')
('w', 'prep')
('Warszawa', 'subst')
('femininum', 'brev')
('.', 'interp')
('a', 'subst')
('.', 'interp')
('strona', 'brev')
('.', 'interp')
(',', 'interp')
('koło', 'brev')
('.', 'interp')
('j', 'interj')
('.', 'interp')
('strona', 'brev')
('.', 'interp')
(',', 'interp')
('a', 'conj')
('.', 'interp')
('z', 'brev')
('.', 'interp')
(',', 'interp')
('koło', 'brev')
('.', 'interp')
('język', 'brev')
('.', 'interp')
('(', 'interp')
(

In [124]:
from collections import defaultdict

bigrams = defaultdict(lambda: 0)
unigrams = defaultdict(lambda: 0)

for judgment in analyzed_judgments:
    judgment = [x for x in judgment if is_valid_unigram(x)]
    
    previous_unigram = None
    for current_unigram in judgment:
        if previous_unigram:
            bigrams[(previous_unigram, current_unigram)] += 1
        
        unigrams[current_unigram] += 1
        previous_unigram = current_unigram