In [2]:
import requests

PORT = "9200"
URL = "http://localhost:" + PORT + "/"

def parse_analysis(string):
    split_input = [x.strip() for x in string.split("\t") if x != ""]
    
    if len(split_input) != 3:
        return ()
    
    base, form_explicit, _disamb = split_input
    form = form_explicit.split(":")[0]
    
    return (base, form)

def analyze(string):
    content = requests.post(URL, data=string.encode('utf-8')).text
    analyzed_words = [parse_analysis(analysis) for analysis in content.split("\n")[1::2] if analysis != ""]
    
    return [analysis for analysis in analyzed_words if analysis != ()]
    
def is_valid_unigram(unigram):
    word, analysis = unigram
    return analysis != "interp" and not analysis.startswith("num")


In [5]:
analyze("ala ma kota\n\n\n\n12     123.")

[('Ala', 'subst'), ('mieć', 'fin'), ('kot', 'subst')]

In [127]:
FILE_LIST = 'files.p'
DATA_DIR = "/run/media/maciej/Nowy/data/json/"
CHOSEN_YEAR = "2011"

In [129]:
import os, json, pickle
from collections import defaultdict

import regex
from tqdm import tqdm

word_pattern = "\p{Letter}+"


def load_data():
    total_judgments = []
    files = pickle.load(open(FILE_LIST, 'rb'))
    
    for file in tqdm(files):
        if file.startswith("judgment"):
            file_path = os.path.join(DATA_DIR, file)

            with open(file_path, 'r') as f:
                data = json.load(f)
                judgments = [x["textContent"] for x in data["items"] if x["judgmentDate"].startswith(CHOSEN_YEAR)]
            total_judgments += judgments
            
    analyzed_judgments = []
    
#     i = 50
    for judgment in tqdm(total_judgments):
        previous_word = None

        judgment = regex.sub("<.*?>", "", judgment)
        judgment = regex.sub("-\n(\p{Letter}+)", r"\1", judgment)
        judgment = judgment.lower()
        
        analyzed_judgments.append(analyze(judgment))
        
#         i -= 1
#         if i == 0:
#             break
    
    return analyzed_judgments

analyzed_judgments = load_data()

100%|██████████| 68/68 [00:01<00:00, 56.13it/s]
100%|██████████| 6699/6699 [5:21:51<00:00,  2.88s/it]  


In [130]:
# import pickle

# with open('analyzed_judgments.p', 'wb') as f:
#     pickle.dump(analyzed_judgments, f)

In [1]:
import pickle

analyzed_judgments = pickle.load(open('analyzed_judgments.p', 'rb'))

In [18]:
def get_unigram_count(judgments):
    return sum([len(j) for j in judgments])

print("Count of unigrams before filtering: {}".format(get_unigram_count(analyzed_judgments)))
      
filtered_judgments = [[unigram for unigram in j if is_valid_unigram(unigram)] for j in analyzed_judgments]
print("Count of unigrams after filtering: {}".format(get_unigram_count(filtered_judgments)))

Count of unigrams before filtering: 10951114
Count of unigrams after filtering: 8726760


In [25]:
from collections import defaultdict

from tqdm import tqdm

bigrams = defaultdict(lambda: 0)
unigrams = defaultdict(lambda: 0)

for judgment in tqdm(filtered_judgments):    
    previous_unigram = None
    
    for current_unigram in judgment:
        if previous_unigram:
            bigrams[(previous_unigram, current_unigram)] += 1
        
        unigrams[current_unigram] += 1
        previous_unigram = current_unigram

bigram_count = sum(bigrams.values())

100%|██████████| 6699/6699 [00:11<00:00, 563.45it/s]


In [26]:
import numpy as np


def get_k(bigram):
    bigram_occurrence_count = bigrams[bigram]
    k_11 = bigram_occurrence_count
    k_12 = unigrams[bigram[1]] - bigram_occurrence_count
    k_21 = unigrams[bigram[0]] - bigram_occurrence_count
    k_22 = bigram_count - k_11 - k_12 - k_21

    return np.array(
        [[k_11, k_12],
         [k_21, k_22]]
    )


def H(k):
    """Count Shannon's entropy"""
    N = np.sum(k)

    return np.sum(k / N * np.log(k / N + (k == 0)))


def get_llr(bigram):
    k = get_k(bigram)
    return 2 * np.sum(k) * (H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))


In [34]:
sorted(bigrams.items(), key=lambda x:x[1], reverse=True)[:3]

[((('z', 'prep'), ('dzień', 'subst')), 39804),
 ((('ustęp', 'brev'), ('1', 'adj')), 25432),
 ((('z', 'prep'), ('artykuł', 'brev')), 20679)]

In [38]:
get_llr((("krwinka", "subst"), ("czerwony", "adj")))

164.5663334064712

In [40]:
get_llr((('z', 'prep'), ('dzień', 'subst')))

197932.28599531404