In [1]:
## imports
import matplotlib.pyplot as plt
import requests
from pathlib import Path
import os
import regex
from bs4 import BeautifulSoup
from spacy.tokenizer import Tokenizer
import unicodedata
%reload_ext restmagic

### TASK 1 --- clean all acts from eventual HTML syntax

In [2]:
acts_directory = Path("ustawy/")
files = acts_directory.glob('*')
os.makedirs("ustawy_cleaned/", exist_ok=True)
acts_cleaned_dir = Path("ustawy_cleaned/")

for file in files: 
    file_name = file.name
    with open(file, 'r', encoding='utf-8') as f:
        cnt = regex.sub("\p{Cf}", "", f.read().lower())
        cnt = cnt.replace("-","")
        #cnt2 = regex.sub("\p{Z}+", " ", cnt)
        act_cleaned = BeautifulSoup(cnt, "lxml").get_text()  ## remove all html code

    cleaned_file_path = acts_cleaned_dir.joinpath(file_name)
    with open(cleaned_file_path, "w", encoding="utf-8") as f:
        f.write(act_cleaned)

### TASK 2 -- tokenize acts using SpaCy tokenizer 

In [3]:
from spacy.lang.pl import Polish

files_cleaned = acts_cleaned_dir.glob("*")

tokens = dict()

tokenizer = Polish().tokenizer

for file in files_cleaned:
    file_name = file.stem
    with open(file, "r", encoding="utf-8") as f:
        doc = tokenizer(f.read())
        tokens[file_name] = doc
        

### TASK 3 --- frequency list for each processed file 

In [4]:

freqs = dict()
for file, doc in tokens.items():
    freqs[file] = dict()
    for token in doc:
        try:
            freqs[file][token.text] += 1
        except KeyError:
            freqs[file][token.text] = 1

In [5]:
# sort each token dict
def print_frequency_sorted(token_dct, reverse=True):
    return sorted(token_dct.items(), reverse=reverse, key= lambda item: item[1])

print_frequency_sorted(freqs["1996_269"])

[('.', 55),
 (',', 34),
 ('nr', 17),
 ('poz', 17),
 ('z', 12),
 ('r', 12),
 ('w', 12),
 ('\n', 10),
 ('"', 10),
 ('o', 9),
 ('1', 8),
 ('2', 8),
 ('oraz', 7),
 ('art', 7),
 (')', 7),
 ('ust', 6),
 ('\n         ', 6),
 ('i', 5),
 (':', 5),
 ('dnia', 4),
 ('\n ', 4),
 ('po', 4),
 ('prokuratora', 4),
 ('generalnego', 4),
 ('dz.u', 3),
 ('kwietnia', 3),
 ('się', 3),
 ('otrzymuje', 3),
 ('brzmienie', 3),
 ('\n             ', 3),
 ('1996', 2),
 ('ustawa', 2),
 ('ustawy', 2),
 ('policji', 2),
 ('urzędzie', 2),
 ('ochrony', 2),
 ('państwa', 2),
 ('ustawie', 2),
 ('6', 2),
 ('1990', 2),
 ('(', 2),
 ('30', 2),
 ('1991', 2),
 ('94', 2),
 ('422', 2),
 ('107', 2),
 ('461', 2),
 ('1992', 2),
 ('54', 2),
 ('254', 2),
 ('1994', 2),
 ('53', 2),
 ('214', 2),
 ('1995', 2),
 ('4', 2),
 ('17', 2),
 ('34', 2),
 ('163', 2),
 ('104', 2),
 ('515', 2),
 ('wprowadza', 2),
 ('następujące', 2),
 ('zmiany', 2),
 ('ciężkiego', 2),
 ('lub', 2),
 (';', 2),
 ('\n      ', 2),
 ('minister', 2),
 ('spraw', 2),
 ('wewnętrz

In [6]:
# TASK 4 --- global frequency list 

def reduce_words_freqs(freqs):
    dct = dict()
    for token, count in [items for tokens_dicts in freqs.values() for items in tokens_dicts.items()]:
        try:
            dct[token] += count
        except KeyError:
            dct[token] = count
    return dct
        
    
global_freqs = reduce_words_freqs(freqs)
g_freq_arr = print_frequency_sorted(global_freqs)
for i in range(50):
    print(g_freq_arr[i])

('.', 431296)
(',', 341126)
('w', 201201)
('\n', 181338)
(')', 100139)
('i', 89918)
('art', 83799)
('z', 82438)
('1', 71416)
('o', 64766)
('do', 60730)
('2', 56385)
('\n  ', 54976)
('"', 54914)
('ust', 53636)
('\n     ', 53410)
('na', 50642)
('\n   ', 46689)
('się', 45886)
('lub', 45800)
('poz', 45224)
('nr', 44942)
(':', 39203)
('3', 37937)
('oraz', 33558)
('r', 33061)
('\n           ', 32959)
('mowa', 28783)
(';', 25266)
('4', 23382)
('nie', 22988)
(' ', 22966)
('przez', 20951)
('\n       ', 20542)
('\n             ', 20485)
('§', 20464)
('pkt', 19124)
('dnia', 17954)
('których', 17934)
('\n ', 17455)
('a', 16923)
('5', 16854)
('_', 16809)
('od', 16682)
('\n         ', 16483)
('po', 13546)
('jest', 13197)
('ustawy', 13099)
('(', 13035)
('6', 12292)


In [7]:
len(global_freqs)

66812

In [11]:
# TASK 5 --- Reject entries shorter than 2 characters or containing non-letter characters 
#pattern = regex.compile(r"\p{Ll}{2,}+$", regex.UNICODE) 
pattern_pol_only = regex.compile(r"[A-Za-zĄąĆćĘęŁłŃńÓóŚśŹźŻż]{2,}+$", regex.UNICODE) 


def filter_out_non_letters(item):
    word = item[0].lower()
    #return regex.match(pattern, word)
    return regex.match(pattern_pol_only, word)


g_freq_arr2_filtered = list(filter(filter_out_non_letters, g_freq_arr))  # jest posortowana
#g_freq_arr2 = list(filter(filter_out_non_letters, g_freq_arr))

In [12]:
len(g_freq_arr2_filtered)

55223

In [13]:
print(g_freq_arr2_filtered[:100])

[('art', 83799), ('do', 60730), ('ust', 53636), ('na', 50642), ('się', 45886), ('lub', 45800), ('poz', 45224), ('nr', 44942), ('oraz', 33558), ('mowa', 28783), ('nie', 22988), ('przez', 20951), ('pkt', 19124), ('dnia', 17954), ('których', 17934), ('od', 16682), ('po', 13546), ('jest', 13197), ('ustawy', 13099), ('może', 12096), ('jeżeli', 12038), ('którym', 11790), ('za', 11139), ('brzmienie', 10576), ('spraw', 10021), ('otrzymuje', 9835), ('albo', 8708), ('dodaje', 8423), ('ich', 8199), ('dla', 7934), ('pracy', 7631), ('minister', 7569), ('której', 7477), ('brzmieniu', 7296), ('drodze', 7179), ('podstawie', 6852), ('stosuje', 6680), ('przypadku', 6503), ('niż', 6452), ('tym', 6366), ('jego', 6320), ('są', 6156), ('być', 6120), ('zakresie', 6101), ('właściwy', 6094), ('państwa', 5838), ('przepisy', 5838), ('wyrazy', 5817), ('ze', 5509), ('ustawie', 5479), ('także', 5445), ('przepisów', 5347), ('rozporządzenia', 5346), ('tych', 5048), ('określonych', 5046), ('postępowania', 4909), ('osó

### TASK 6 --- plot ranks of terms (frequency) in logarithmic scale

In [None]:
ranks = list(range(len(g_freq_arr2_filtered)))

y_freq = [item[1] for item in reversed(g_freq_arr2_filtered)]

In [None]:
g_freq_arr2_filtered[:100]

In [None]:
fig1 = plt.figure(1)
fig1.set_size_inches(20.5, 8.5)
ax1 = fig1.gca()

ax1.set_xscale("log", base=10); 
ax1.set_yscale("log", base=10)

ax1.tick_params(bottom=False, left=False)
#ax1.set_ylim(bottom=10**3)

#ax1.set_xticklabels(ranks)
#ax1.set_yticklabels(y_freq)
#xlbls = ax1.get_xticklabels()
#for lbl in xlbls:
##    lbl.set_rotation("vertical")
  #  lbl.set_fontsize(9)

xs = [items[0] for items in g_freq_arr2_filtered]
ys = [items[1] for items in g_freq_arr2_filtered]


plt.plot(xs, ys)




plt.show()

### TASK 7 --- find words not contained in the dictionary

In [None]:
import morfeusz2
morf = morfeusz2.Morfeusz(generate=False)  # for speed
#analys = morf.analyse("porozumiewać")
analys

In [14]:
# TASK 7 -- Find words that are not in the dictionary 

import morfeusz2
morf = morfeusz2.Morfeusz()  # for speed

not_known_words = set()

for token,_ in g_freq_arr2_filtered:
    analysis = morf.analyse(token)
    for edge in analysis:
        if edge[2][2] == 'ign' and edge[2][0] == edge[2][1]:
            print(analysis)
            #break
            not_known_words.add(token)
    #if not_known_words:
    #    break

[(0, 1, ('poz', 'poz', 'ign', [], []))]
[(0, 1, ('późn', 'późn', 'ign', [], []))]
[(0, 1, ('str', 'str', 'ign', [], []))]
[(0, 1, ('gmo', 'gmo', 'ign', [], []))]
[(0, 1, ('sww', 'sww', 'ign', [], []))]
[(0, 1, ('skw', 'skw', 'ign', [], []))]
[(0, 1, ('zm', 'zm', 'ign', [], []))]
[(0, 1, ('ex', 'ex', 'ign', [], []))]
[(0, 1, ('ike', 'ike', 'ign', [], []))]
[(0, 1, ('remediacji', 'remediacji', 'ign', [], []))]
[(0, 1, ('ure', 'ure', 'ign', [], []))]
[(0, 1, ('rozdz', 'rozdz', 'ign', [], []))]
[(0, 1, ('uke', 'uke', 'ign', [], []))]
[(0, 1, ('itp', 'itp', 'ign', [], []))]
[(0, 1, ('kn', 'kn', 'ign', [], []))]
[(0, 1, ('np', 'np', 'ign', [], []))]
[(0, 1, ('cn', 'cn', 'ign', [], []))]
[(0, 1, ('pkwiu', 'pkwiu', 'ign', [], []))]
[(0, 1, ('udt', 'udt', 'ign', [], []))]
[(0, 1, ('bswsg', 'bswsg', 'ign', [], []))]
[(0, 1, ('bswp', 'bswp', 'ign', [], []))]
[(0, 1, ('biobójczych', 'biobójczych', 'ign', [], []))]
[(0, 1, ('phs', 'phs', 'ign', [], []))]
[(0, 1, ('fep', 'fep', 'ign', [], []))]
[(0,

In [15]:
len(not_known_words)

2541

### TASK 8 -- 30 words w/ highest rank not belonging to dictionary 

In [16]:
limit = 30
not_known_occurences = set()
for item in g_freq_arr2_filtered:
    for token in not_known_words:
        if token == item[0]:
            not_known_occurences.add(item)
            break  

In [17]:
highest_rank_not_known_srt = print_frequency_sorted(dict(not_known_occurences))

In [None]:
not_known30 = highest_rank_not_known_srt[:limit]
not_known30

### TASK 9 --- 30 random words w/ 5 occurences not belonging to dict

In [19]:
import random as r

def find_5_occur(item):
    return item[1] == 5

not_known_5_occ = list(filter(find_5_occur, not_known_occurences))
r.choices(not_known_5_occ, k=limit)

[('państwamistronami', 5),
 ('agave', 5),
 ('kpwig', 5),
 ('tzn', 5),
 ('ppkt', 5),
 ('ami', 5),
 ('regazyfikacyjnego', 5),
 ('urt', 5),
 ('regazyfikacyjnego', 5),
 ('inci', 5),
 ('agave', 5),
 ('glinowo', 5),
 ('odgazowywacze', 5),
 ('państwamistronami', 5),
 ('betezda', 5),
 ('ośc', 5),
 ('winopochodne', 5),
 ('najmnie', 5),
 ('vista', 5),
 ('denitracyjne', 5),
 ('odgazowywacze', 5),
 ('tzn', 5),
 ('tów', 5),
 ('swine', 5),
 ('glinowo', 5),
 ('rr', 5),
 ('instrumen', 5),
 ('swine', 5),
 ('instrumen', 5),
 ('państwamistronami', 5)]

### TASK 10 --- Levenshtein distance custom (from [norvig-spell-correct](https://norvig.com/spell-correct.html) )

In [262]:
not_known_5_occ

NameError: name 'not_known_5_occ' is not defined

In [256]:
# create possible set of words 1 edit away:
def one_edit(word):
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return [w for w in set(deletes + transposes + replaces + inserts) if w in zip(*g_freq_arr2_filtered)[0]]

def two_edits(word): 
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def get_probability_of_word(words):
    # zwraca posortowne probabilities dla wszystkich zmian w słowie 
    g_freq_arr2_filtered
    
    pass

In [None]:
wlimit = 10
for token in not_known30:
    possible_replacements = one_edit(token)
print(get_probability_of_word(possible_replacements)[:wlimit])

In [None]:
wlimit = 10
for token in not_known_5_occ:
    possible_replacements5 = one_edit(token)
print(get_probability_of_word(possible_replacements5)[:wlimit])

### TASK 11 --- fuzzy search in ES 

In [None]:
from elasticsearch import Elasticsearch, helpers
#from elasticsearch import Elasticsearch
def insert_sgjp_to_es(filename: str): #-> #Generator[Dict[str, str], None, None]:
    with open(filename, encoding="UTF-8") as file:
        # skip header and license, later proper rows have 4 elements divided with tabs, where 1st is the word
        for line in file:
            if len(line.split()) != 4:
                continue
            else:
                yield {
                    "_index": "dict_jp",
                    "word": line.split()[0],
                }
                break
        
        i = 0
        for line in file:
            yield {
                "_index": "dict_jp",
                "word": line.split()[0],
            }
            
            i += 1
            if i % 100000 == 0:
                print(i)

filename = "sgjp.tab"
es = Elasticsearch()

helpers.bulk(es, insert_sgjp_to_es("sgjp-20211121.tab"), chunk_size=100000, request_timeout=6000)

### Fuzzy search  top 30 słów 

In [None]:
highest_rank_not_known_srt30 = [ ('poz', 45224),('późn', 1065), ('str', 516), ('gmo', 298),('sww', 216),('skw', 196),('zm', 192),
 ('ex', 167),('ike', 162), ('remediacji', 120), ('ure', 103), ('rozdz', 102), ('uke', 97), ('itp', 96),
 ('kn', 95), ('np', 85), ('cn', 83), ('pkwiu', 81), ('udt', 77), ('bswsg', 70), ('bswp', 66),
 ('biobójczych', 63), ('phs', 53), ('fep', 42), ('mgo', 41), ('utk', 40), ('frd', 36), ('uokik', 35),
 ('ron', 35), ('cbśp', 34) ]

In [None]:
#import json
def search_req_dct(word):
    return {
      "query": {
        "fuzzy": {
          "word": {
            "value": word,
            "fuzziness": "AUTO",
            "max_expansions": 50,
            "prefix_length": 0,
            "transpositions": True,
            "rewrite": "constant_score"
          }
        }
      }
    }


responses = []
for word in highest_rank_not_known_srt30:
    responses.append(es.search(index="dict_jp", body=search_req_dct(word[0])))

### Fuzzy search top5 wybranych losowo słów

In [None]:
## TODO
#There is a problem with line endings. Can scan lines looking if 