In [5]:
import os
import json
import re
import string
import nltk
import pprint
import random
import math
import numpy as np

In [6]:
all_words = []

def read_file(file_path):
    with open (file_path) as file:
        json_content = json.load(file)
        item_count = 0
        for item in json_content:
            item_count += 1
            text_content = re.sub("<.*?>", "", item["textContent"])
            text_content = text_content.replace('-\n', '')
            word_content = text_content.split()
            topicSpecificPunctuation = '„”–§…«»'
            translator = str.maketrans('', '', string.punctuation+topicSpecificPunctuation)
            
            for word in word_content:
                word = word.translate(translator).lower()
                if len(word)>0:
                    all_words.append(word)
                
        print(item_count)
        
def read_all_judgments_from_2018():
    for filename in os.listdir("data_filtered/"):
        print(filename)
        read_file("data_filtered/" + filename)
        
read_all_judgments_from_2018()
print(len(all_words))

judgments-3163.json
9
judgments-3168.json
100
judgments-3164.json
100
judgments-3171.json
100
judgments-3165.json
100
judgments-3167.json
100
judgments-3169.json
100
judgments-3173.json
81
judgments-3172.json
100
judgments-3166.json
100
judgments-3170.json
100
2530124


In [7]:
CHANGE_TO_BASIC_FORM = False
sorted_rank = {}
TOTAL_COUNT = len(all_words)

def generate_rank():        
    words_rank = {}
    for word in all_words:
        current = words_rank.get(word)
        
        if current is None:
            words_rank[word] = 1
        else:
            words_rank[word] = current + 1
    
    return words_rank

sorted_rank = generate_rank()
print (type(sorted_rank))

<class 'dict'>


In [8]:
def custom_filter(word):
    if re.match("^[a-ząćęłńóśźż]+$", word):
        return True
    return False

def filter_list(list_of_words):
    return [word for word in list_of_words if custom_filter(word)]

In [9]:
words = filter_list(all_words)
pprint.pprint(words[:10])

['uzasadnienie', 'powód', 'sp', 'z', 'oo', 'sp', 'k', 'z', 'siedzibą', 'w']


In [10]:
counter, limit = 0, 10
for k, v in sorted_rank.items():
    if counter < limit:
        print (k, v)
    counter += 1

uzasadnienie 1092
powód 3746
sp 1461
z 69427
oo 1468
k 16810
siedzibą 1189
w 117267
p 5069
wniósł 1181


In [11]:
bigrams = nltk.bigrams(words)

In [12]:
bigrams_frequency = nltk.FreqDist(bigrams)
counter, limit = 0, 10


for k,v in bigrams_frequency.items():
    if counter < limit:
        print (k,v)
    counter += 1


('uzasadnienie', 'powód') 73
('powód', 'sp') 5
('sp', 'z') 1417
('z', 'oo') 1463
('oo', 'sp') 14
('sp', 'k') 14
('k', 'z') 426
('z', 'siedzibą') 1169
('siedzibą', 'w') 1052
('w', 'p') 694


In [13]:
def pointwise_mutual_information(bigram, bigram_count):
    probability_of_bigram = bigram_count / TOTAL_COUNT
    first, second = bigram[0], bigram[1]
    probability_of_first, probability_of_second = sorted_rank[first]/TOTAL_COUNT, sorted_rank[second]/TOTAL_COUNT
    return math.log(probability_of_bigram/(probability_of_first*probability_of_second))
    

In [14]:
# Pointwise mutual information
bigram_result = {}
for bigram, bigram_count in bigrams_frequency.items():
    bigram_result[bigram] = (pointwise_mutual_information(bigram, bigram_count), bigram_count)

In [15]:
counter, limit = 0, 10

for k,v in bigram_result.items():
    if counter < limit:
        print (k,v)
    counter += 1

('uzasadnienie', 'powód') (3.8100282731998756, 73)
('powód', 'sp') (0.8378964890397356, 5)
('sp', 'z') (3.565168579474035, 1417)
('z', 'oo') (3.5923359433797177, 1463)
('oo', 'sp') (2.8043035800504668, 14)
('sp', 'k') (0.3662305628222664, 14)
('k', 'z') (-0.07954212859923658, 426)
('z', 'siedzibą') (3.5787838163153407, 1169)
('siedzibą', 'w') (2.9491507028296353, 1052)
('w', 'p') (1.083143328257217, 694)


In [38]:
MINIMUM_BIGRAM_OCCURENCE = -1

sd = sorted(bigram_result.items(), key=lambda key_value: key_value[1][0], reverse=True)
counter, limit = 0, 30
for k,v in sd:
    if counter < limit and v[1] > MINIMUM_BIGRAM_OCCURENCE:
        print (k,v)
        counter += 1

('prosze', 'stawic') (14.74377887136037, 1)
('obowiazuje', 'cie') (14.74377887136037, 1)
('cie', 'miesieczby') (14.74377887136037, 1)
('porysowała', 'barierkę') (14.74377887136037, 1)
('podziemny', 'zamykający') (14.74377887136037, 1)
('kompleksów', 'skraplania') (14.74377887136037, 1)
('ewidencyjno', 'sprawozdawczym') (14.74377887136037, 1)
('wielichowska', 'opalska') (14.74377887136037, 1)
('nowakowski', 'andżelika') (14.74377887136037, 1)
('andżelika', 'pruk') (14.74377887136037, 1)
('odsuniętej', 'szybie') (14.74377887136037, 1)
('wgnieciona', 'obręcz') (14.74377887136037, 1)
('krata', 'wlotu') (14.74377887136037, 1)
('odepchnięty', 'obrócił') (14.74377887136037, 1)
('nawierzchnią', 'gruntowotrawiastą') (14.74377887136037, 1)
('sadowski', 'czasowoprzestrzenna') (14.74377887136037, 1)
('dodatnią', 'prognozą') (14.74377887136037, 1)
('osobnikiem', 'zdemoralizowanym') (14.74377887136037, 1)
('beaty', 'tonasko') (14.74377887136037, 1)
('artykułując', 'przezwiska') (14.74377887136037, 1

In [17]:
def shannon_entrophy(word_occurences, total_words):
    sum = 0
    for x in np.nditer(word_occurences):
        if x!= 0:
            sum += (x/total_words) * math.log(x/total_words)
    return sum

def H(word_occurences):
    return shannon_entrophy(word_occurences, TOTAL_COUNT)

In [23]:
def calculate_contingency_table(bigram, bigram_count, total_words):
    first, second = bigram[0], bigram[1]
    first_occurence, second_occurence = sorted_rank[first], sorted_rank[second]
    '''
    |------  |---------| 
    | A,B    |B,notA   |
    |------  |---------|
    | A,notB |notA,notB|
    |------|---------|
    '''
    return np.array([
            [bigram_count, first_occurence-bigram_count],
            [second_occurence-bigram_count, total_words-first_occurence-second_occurence]])

In [24]:
def log_likeliheood_ratio(bigram_key):
    k = calculate_contingency_table(bigram_key, bigram_result[bigram_key][1], TOTAL_COUNT)

    return 2 * np.sum(k) * (H(k) - H(k.sum(axis=0)) -H(k.sum(axis=1)))

In [43]:
print (bigram_result[('stawek', 'dziennych')])
counter, limit = 0, 10

log_ratios = [(key, log_likeliheood_ratio(key), value[1]) for key, value in bigram_result.items()]

# print (log_likeliheood_ratio(('stawek', 'dziennych')))

(9.305699562437173, 120)


In [42]:
sd = sorted(log_ratios, key=lambda key_value: key_value[1], reverse=True)
pprint.pprint(sd[:30])

[(('z', 'dnia'), 86952.744987227095, (3.1599235647023516, 11852)),
 (('na', 'podstawie'), 46081.661634982847, (4.02483424835121, 4694)),
 (('ubezpieczeń', 'społecznych'),
  44522.179009044834,
  (6.784678163048855, 2620)),
 (('w', 'dniu'), 39886.546419177925, (3.005591800027203, 5205)),
 (('art', 'kpc'), 35546.057674896678, (4.808019871681763, 3207)),
 (('sygn', 'akt'), 34392.293122109972, (6.085967109668274, 2435)),
 (('art', 'ust'), 32659.421496778694, (4.626651795043272, 3142)),
 (('sąd', 'okręgowy'), 30570.779660573575, (5.187985965521512, 2452)),
 (('w', 'sprawie'), 29855.654486413114, (2.704789408281566, 4647)),
 (('zgodnie', 'z'), 28773.168524345179, (3.4943666905382997, 3374)),
 (('art', 'kc'), 27015.85769147292, (4.852914016146875, 2404)),
 (('sąd', 'rejonowy'), 26907.068155841178, (5.182833943022735, 2170)),
 (('na', 'rzecz'), 26315.727073114645, (3.9176388942580305, 2851)),
 (('w', 'tym'), 23268.145128867029, (2.4019279949601753, 4197)),
 (('sądu', 'najwyższego'), 22077.4301