In [88]:
import os, json, re, string, requests, pprint, nltk, operator, numpy as np, math, tqdm
from time import gmtime, strftime
from functools import wraps
from time import time

In [89]:
def timing(f):
    """Decorate ``f`` so every call prints its name, arguments and duration.

    The wrapped function's return value is passed through unchanged, and
    ``functools.wraps`` copies ``f``'s metadata onto the wrapper.
    """
    @wraps(f)
    def timed_call(*args, **kw):
        started = time()
        outcome = f(*args, **kw)
        elapsed = time() - started
        # The report is emitted after the call, so nothing is printed if ``f`` raises.
        print('func:%r args:[%r, %r] took: %2.4f sec' % (f.__name__, args, kw, elapsed))
        return outcome
    return timed_call

In [91]:
def process_judgment(judgment):
    """Send the words of one judgment to the local service and parse its reply.

    The words are joined with commas, UTF-8 encoded and POSTed to
    ``http://localhost:9200``.  Only response lines containing ":" are kept.
    For each of them tabs become spaces, every occurrence of "none" is
    removed, the first character is dropped, the text before the first ":"
    is taken, and its remaining spaces are turned into ":".

    NOTE(review): the service is presumably a morphological tagger that
    returns tab-separated "lemma<TAB>tag:..." lines -- confirm against the
    service's output format.

    :param judgment: iterable of word strings.
    :return: list of parsed strings, one per kept response line.
    """
    payload = ",".join(judgment).encode("utf-8")
    response = requests.post(url="http://localhost:9200", data=payload)

    parsed_lines = []
    for raw_line in response.text.splitlines():
        if ":" not in raw_line:
            continue
        cleaned = raw_line.replace("\t", " ").replace("none", "")[1:]
        # Only the part before the first ":" is kept.
        head = cleaned.split(":")[0]
        parsed_lines.append(head.replace(" ", ":"))
    return parsed_lines

In [None]:
# NOTE(review): not referenced by any other code visible in this notebook --
# possibly a leftover; confirm before removing.
all_words = []

def read_file(file_path):
    """Tag every judgment in a JSON file and save the results next to it.

    ``file_path`` must contain a JSON array of objects with a "textContent"
    key.  For each object the markup-like ``<...>`` spans are removed,
    hyphenated line breaks ("-\\n") are joined, the text is split on
    whitespace, punctuation is stripped and the words are lower-cased.  The
    non-empty words are passed to ``process_judgment`` and every string it
    returns is written on its own line to ``file_path + "_results.txt"``.

    The results of *all* judgments are written to the output file.  They are
    collected first and written in one go at the end, so a failure part-way
    through does not leave a truncated results file behind that a later run
    would mistake for a finished one.

    :param file_path: path of the JSON file to process.
    """
    # Loop-invariant: build the punctuation-stripping table once.
    topicSpecificPunctuation = '„”–§…«»'
    translator = str.maketrans('', '', string.punctuation + topicSpecificPunctuation)

    with open(file_path) as file:
        json_content = json.load(file)

    result_lines = []
    for item in tqdm.tqdm(json_content):
        text_content = re.sub("<.*?>", "", item["textContent"])
        text_content = text_content.replace('-\n', '')

        judgment = []
        for word in text_content.split():
            word = word.translate(translator).lower()
            if word:
                judgment.append(word)

        result_lines.extend(process_judgment(judgment))

    # Opened exactly once: reopening with 'w' per judgment would truncate the
    # file each time and keep only the last judgment's output.
    with open(file_path + "_results.txt", 'w') as out_file:
        for unigram in result_lines:
            out_file.write(unigram + "\n")
     
def read_all_judgments_from_2018(data_dir="../data_filtered/"):
    """Run ``read_file`` on every not-yet-processed input file in ``data_dir``.

    A file counts as processed when ``<name>_results.txt`` already exists in
    the directory.  Start and finish of each file are appended, with a UTC
    timestamp, to ``raport.txt`` inside ``data_dir``.

    The report file itself, previously written ``*_results.txt`` files and
    sub-directories are skipped: they live in the same directory as the
    inputs but are not JSON inputs, so handing them to ``read_file`` would
    fail.

    :param data_dir: directory holding the input files (defaults to the
        location originally hard-coded here).
    """
    report_name = "raport.txt"
    results_suffix = "_results.txt"

    def log(raport_file, message):
        # One timestamped line, flushed immediately so progress survives a crash.
        raport_file.write(str(strftime("%Y-%m-%d %H:%M:%S", gmtime())) + " ")
        raport_file.write(message)
        raport_file.flush()

    with open(os.path.join(data_dir, report_name), "a") as raport_file:
        # List the directory once instead of once per file.
        existing = set(os.listdir(data_dir))
        pending = [
            name for name in sorted(existing)
            if name != report_name
            and not name.endswith(results_suffix)
            and name + results_suffix not in existing
            and os.path.isfile(os.path.join(data_dir, name))
        ]
        for filename in tqdm.tqdm(pending):
            log(raport_file, "Writing to file " + filename + "\n")
            read_file(os.path.join(data_dir, filename))
            log(raport_file, "Writing to file " + filename + "FINISHED \n")
        
# Long-running: processes the files under ../data_filtered/ and issues one
# HTTP request per judgment (via read_file -> process_judgment).
read_all_judgments_from_2018()

  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:02<00:20,  2.60s/it][A
 22%|██▏       | 2/9 [00:04<00:14,  2.00s/it][A
 33%|███▎      | 3/9 [00:05<00:10,  1.78s/it][A
 44%|████▍     | 4/9 [00:07<00:09,  1.86s/it][A
 56%|█████▌    | 5/9 [00:11<00:08,  2.25s/it][A
 67%|██████▋   | 6/9 [00:25<00:12,  4.24s/it][A
 78%|███████▊  | 7/9 [00:26<00:07,  3.82s/it][A
 89%|████████▉ | 8/9 [00:28<00:03,  3.51s/it][A
100%|██████████| 9/9 [00:31<00:00,  3.46s/it][A
  8%|▊         | 1/12 [00:31<05:42, 31.11s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<00:50,  1.95it/s][A
  2%|▏         | 2/100 [00:02<01:43,  1.06s/it][A
  3%|▎         | 3/100 [00:05<03:13,  1.99s/it][A
  4%|▍         | 4/100 [00:10<04:12,  2.63s/it][A
  5%|▌         | 5/100 [00:11<03:30,  2.21s/it][A
  6%|▌         | 6/100 [00:14<03:41,  2.36s/it][A
  7%|▋         | 7/100 [00:14<03:15,  2.10s/it][A
  8%|▊         | 8/100 [00:17<03

In [None]:
def read_list_from_file(file_path):
    """Return the text of ``file_path`` split on newline characters.

    A file that ends with a newline yields a trailing empty string, because
    the text is split rather than parsed line by line.

    :param file_path: path of the text file to read.
    :return: list of strings.
    """
    with open(file_path) as handle:
        text = handle.read()
    return text.split("\n")

In [None]:
# `processed_unigrams` was never defined in this notebook, so this cell only
# worked with leftover kernel state.  Rebuild it from the "*_results.txt"
# files written by read_file(): one list of tagged tokens per results file.
# NOTE(review): assumes those files are the intended source -- confirm.
results_dir = "../data_filtered/"
processed_unigrams = [
    read_list_from_file(os.path.join(results_dir, name))
    for name in sorted(os.listdir(results_dir))
    if name.endswith("_results.txt")
]
# Flatten into a single token stream.  Empty strings (read_list_from_file
# returns one after a trailing newline) are dropped so they are not counted
# as tokens.
splited_response = [item for sublist in processed_unigrams for item in sublist if item]


In [None]:
# Frequency of every token in the flattened stream.
unigrams_frequency = nltk.FreqDist(splited_response)

# Frequency of every pair of adjacent tokens.
# NOTE(review): `bigrams` is presumably a one-shot iterator that FreqDist
# consumes -- do not reuse it afterwards without rebuilding; confirm.
bigrams = nltk.bigrams(splited_response)
bigrams_frequency = nltk.FreqDist(bigrams)

In [None]:
# (token, count) pairs, most frequent first; show the top 30.
sorted_unigrams = sorted(
    unigrams_frequency.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
pprint.pprint(sorted_unigrams[:30])

In [None]:
# (bigram, count) pairs, most frequent first; show the top 30.
sorted_bigrams = sorted(
    bigrams_frequency.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
pprint.pprint(sorted_bigrams[:30])

In [None]:
# Number of tokens in the flattened stream; used as the normaliser in H()
# and as the grand total passed to calculate_contingency_table().
TOTAL_COUNT = len(splited_response)

In [None]:
def shannon_entrophy(word_occurences, total_words):
    """Return the sum of ``p * ln(p)`` over all non-zero counts, ``p = count / total_words``.

    Note the sign: no minus is applied, so for probabilities in (0, 1] the
    result is non-positive (the negative of the usual Shannon entropy, in
    nats).  Zero counts are skipped.

    :param word_occurences: array-like of counts, iterated element-wise
        with ``np.nditer`` regardless of its shape.
    :param total_words: value every count is divided by.
    :return: the accumulated sum (``0`` when every count is zero).
    """
    accumulator = 0
    for count in np.nditer(word_occurences):
        if count == 0:
            continue
        probability = count / total_words
        accumulator += probability * math.log(probability)
    return accumulator

def H(word_occurences):
    """Shorthand for ``shannon_entrophy`` normalised by the global ``TOTAL_COUNT``."""
    return shannon_entrophy(word_occurences=word_occurences, total_words=TOTAL_COUNT)

def calculate_contingency_table(bigram, bigram_count, total_words, unigram_counts=None):
    """Build the 2x2 contingency table for a bigram ``(A, B)``.

    Layout of the returned array::

                      |  B                      |  not B
        --------------+-------------------------+--------------------------
        A             |  count(A, B)            |  count(A) - count(A, B)
        not A         |  count(B) - count(A, B) |  total - count(A)
                      |                         |    - count(B) + count(A, B)

    The bottom-right cell follows from inclusion-exclusion: the bigram count
    is subtracted twice via count(A) and count(B), so it must be added back
    once.  With that, the four cells sum to ``total_words``.

    :param bigram: pair ``(first, second)`` of tokens.
    :param bigram_count: number of occurrences of the bigram.
    :param total_words: grand total the table should sum to.
    :param unigram_counts: mapping token -> count; defaults to the global
        ``unigrams_frequency``.
    :return: ``numpy`` array of shape (2, 2).
    """
    if unigram_counts is None:
        unigram_counts = unigrams_frequency

    first, second = bigram[0], bigram[1]
    first_occurence, second_occurence = unigram_counts[first], unigram_counts[second]

    return np.array([
        [bigram_count, first_occurence - bigram_count],
        [second_occurence - bigram_count,
         total_words - first_occurence - second_occurence + bigram_count],
    ])

def log_likeliheood_ratio(bigram_key):
    """Score a bigram from its 2x2 contingency table.

    Computes ``2 * sum(k) * (H(k) - H(column sums) - H(row sums))`` where
    ``k`` is the table returned by ``calculate_contingency_table`` for the
    bigram, built with the global ``bigrams_frequency`` and ``TOTAL_COUNT``.

    :param bigram_key: pair of tokens, a key of ``bigrams_frequency``.
    :return: the score as a number.
    """
    table = calculate_contingency_table(
        bigram_key, bigrams_frequency[bigram_key], TOTAL_COUNT
    )
    cell_term = H(table)
    column_term = H(table.sum(axis=0))
    row_term = H(table.sum(axis=1))
    return 2 * np.sum(table) * (cell_term - column_term - row_term)

In [None]:
# One (bigram, score, raw count) triple per distinct bigram.
log_ratios = [
    (bigram, log_likeliheood_ratio(bigram), count)
    for bigram, count in bigrams_frequency.items()
]

In [None]:
# Highest-scoring bigrams first; display the top 30.
sorted_by_log_ratios = sorted(
    log_ratios,
    key=lambda entry: entry[1],
    reverse=True,
)
sorted_by_log_ratios[:30]

In [None]:
# Keep bigrams whose first token contains "subst" and whose second token
# contains "subst" or "adj".
# NOTE(review): this is a substring test on the whole token, so it also
# matches tokens whose word part merely contains these letters -- confirm
# that is acceptable.
filtered_by_tags = [
    entry for entry in log_ratios
    if "subst" in entry[0][0]
    and ("subst" in entry[0][1] or "adj" in entry[0][1])
]
filtered_by_tags_sorted = sorted(
    filtered_by_tags,
    key=lambda entry: entry[1],
    reverse=True,
)
filtered_by_tags_sorted[:30]