In [2]:
import os
import json
import re
import string
import nltk
import pprint
import random
import math

In [3]:
all_words = []

def read_file(file_path):
    with open (file_path) as file:
        json_content = json.load(file)
        item_count = 0
        for item in json_content:
            item_count += 1
            text_content = re.sub("<.*?>", "", item["textContent"])
            text_content = text_content.replace('-\n', '')
            word_content = text_content.split()
            topicSpecificPunctuation = '„”–§…«»'
            translator = str.maketrans('', '', string.punctuation+topicSpecificPunctuation)
            
            for word in word_content:
                word = word.translate(translator).lower()
                if len(word)>0:
                    all_words.append(word)
                
        print(item_count)
        
def read_all_judgments_from_2018():
    for filename in os.listdir("data_filtered/"):
        print(filename)
        read_file("data_filtered/" + filename)
        
read_all_judgments_from_2018()
print(len(all_words))

judgments-3163.json
9
judgments-3168.json
100
judgments-3164.json
100
judgments-3171.json
100
judgments-3165.json
100
judgments-3167.json
100
judgments-3169.json
100
judgments-3173.json
81
judgments-3172.json
100
judgments-3166.json
100
judgments-3170.json
100
2530124


In [4]:
CHANGE_TO_BASIC_FORM = False
sorted_rank = {}
TOTAL_COUNT = len(all_words)

def generate_rank():        
    words_rank = {}
    for word in all_words:
        current = words_rank.get(word)
        
        if current is None:
            words_rank[word] = 1
        else:
            words_rank[word] = current + 1
    
    return words_rank

sorted_rank = generate_rank()
print (type(sorted_rank))

<class 'dict'>


In [5]:
def custom_filter(word):
    if re.match("^[a-ząćęłńóśźż]+$", word):
        return True
    return False

def filter_list(list_of_words):
    return [word for word in list_of_words if custom_filter(word)]

In [6]:
words = filter_list(all_words)
pprint.pprint(words[:10])

['uzasadnienie', 'powód', 'sp', 'z', 'oo', 'sp', 'k', 'z', 'siedzibą', 'w']


In [7]:
counter, limit = 0, 10
for k, v in sorted_rank.items():
    if counter < limit:
        print (k, v)
    counter += 1

uzasadnienie 1092
powód 3746
sp 1461
z 69427
oo 1468
k 16810
siedzibą 1189
w 117267
p 5069
wniósł 1181


In [8]:
bigrams = nltk.bigrams(words)

In [9]:
bigrams_frequency = nltk.FreqDist(bigrams)
counter, limit = 0, 10


for k,v in bigrams_frequency.items():
    if counter < limit:
        print (k,v)
    counter += 1


('uzasadnienie', 'powód') 73
('powód', 'sp') 5
('sp', 'z') 1417
('z', 'oo') 1463
('oo', 'sp') 14
('sp', 'k') 14
('k', 'z') 426
('z', 'siedzibą') 1169
('siedzibą', 'w') 1052
('w', 'p') 694


In [10]:
def pointwise_mutual_information(bigram, bigram_count):
    probability_of_bigram = bigram_count / TOTAL_COUNT
    first, second = bigram[0], bigram[1]
    probability_of_first, probability_of_second = sorted_rank[first]/TOTAL_COUNT, sorted_rank[second]/TOTAL_COUNT
    return math.log(probability_of_bigram/(probability_of_first*probability_of_second))
    

In [11]:
# Pointwise mutual information
bigram_result = {}
for bigram, bigram_count in bigrams_frequency.items():
    bigram_result[bigram] = (pointwise_mutual_information(bigram, bigram_count), bigram_count)

In [12]:
counter, limit = 0, 10

for k,v in bigram_result.items():
    if counter < limit:
        print (k,v)
    counter += 1

('uzasadnienie', 'powód') (3.8100282731998756, 73)
('powód', 'sp') (0.8378964890397356, 5)
('sp', 'z') (3.565168579474035, 1417)
('z', 'oo') (3.5923359433797177, 1463)
('oo', 'sp') (2.8043035800504668, 14)
('sp', 'k') (0.3662305628222664, 14)
('k', 'z') (-0.07954212859923658, 426)
('z', 'siedzibą') (3.5787838163153407, 1169)
('siedzibą', 'w') (2.9491507028296353, 1052)
('w', 'p') (1.083143328257217, 694)


In [13]:
MINIMUM_BIGRAM_OCCURENCE = 100

sd = sorted(bigram_result.items(), key=lambda key_value: key_value[1][0], reverse=True)
counter, limit = 0, 30
for k,v in sd:
    if counter < limit and v[1] > MINIMUM_BIGRAM_OCCURENCE:
        print (k,v)
        counter += 1

('samorządowej', 'jednostce') (9.659856309547225, 101)
('trybunału', 'konstytucyjnego') (9.655208351930142, 114)
('zdrowotnej', 'finansowanych') (9.584079242226641, 107)
('księgę', 'wieczystą') (9.58262761293319, 126)
('sfery', 'budżetowej') (9.476135511997146, 144)
('doznaną', 'krzywdę') (9.47101457923695, 109)
('uzasadnieniem', 'doręczyć') (9.427498804438159, 104)
('stawek', 'dziennych') (9.305699562437173, 120)
('grach', 'hazardowych') (9.292592434072608, 174)
('gospodarstwa', 'rolnego') (9.260375133292637, 152)
('jednostce', 'sfery') (9.187870519097498, 105)
('obszaru', 'ograniczonego') (9.164520854771906, 120)
('gospodarstwie', 'rolnym') (9.092264825249924, 180)
('ograniczonego', 'użytkowania') (9.046099639743833, 189)
('służby', 'wojskowej') (9.030311306116026, 112)
('godzinach', 'nadliczbowych') (8.989466834746212, 124)
('st', 'sekr') (8.9886036047791, 134)
('tekst', 'jednolity') (8.983946361279594, 166)
('utraciła', 'zdolność') (8.923984075626493, 103)
('doświadczenia', 'życiow

In [14]:
def shannon_entrophy(word_occurences, total_words):
    values = [(word/total_words) * math.log(word/total_words) if word != 0 else 0 for word in word_occurences]
    return sum(values)

def H(word_occurences, total_words):
    return shannon_entrophy(word_occurences, total_words)

In [43]:
def calculate_contingency_table(bigram, bigram_count, total_words):
    print("Calcualte contigency table called for", bigram, bigram_count, total_words)
    first, second = bigram[0], bigram[1]
    first_occurence, second_occurence = sorted_rank[first], sorted_rank[second]
    '''
    |------  |---------| 
    | A,B    |B,notA   |
    |------  |---------|
    | A,notB |notA,notB|
    |------|---------|
    '''
    return np.array[[bigram_count, first_occurence-bigram_count],
            [second_occurence-bigram_count, total_words-first_occurence-second_occurence]]

In [83]:
def log_likeliheood_ratio(bigram):
    print ("log_likelihood_ratio called for ", bigram)
    print ("bigram", bigram)
    # TODO: FIX THAT
#     print("bigram_result[bigram[0]]", bigram_result[(bigram[0])])
    print (bigram_result[('Nowy', 'Jork')])
    k = calculate_contingency_table(bigram, bigram_result[bigram[0]], TOTAL_COUNT)

    return 2 * np.sum(k) * (H(k) - H(k.sum(axis=0)) -H(k.sum(axis=1)))

In [84]:
def test_calculate_contingency_table():
    text = ['Nowy', 'Jork', 'to', 'piekne', 'miasto', 'Nowy', 'ale', 'nie', 'pies', 'Jork', 'stary', 'ani', 'Nowy']
    bigram_of_text = nltk.bigrams(text)
    bigrams_frequency = nltk.FreqDist(bigram_of_text)
    for bigram in bigrams_frequency.items():
#         print (bigram)
        print (log_likeliheood_ratio(bigram))
    #     contingency_table = calculate_contingency_table()

test_calculate_contingency_table()

log_likelihood_ratio called for  (('Nowy', 'Jork'), 1)
bigram (('Nowy', 'Jork'), 1)


TypeError: sample() missing 1 required positional argument: 'k'