In [1]:
from python_files.getwrdlist import *
from python_files.load_corpus import *
from IPython.display import clear_output
import pyperclip

word_dict_p = {}
for item in word_dict:
    for i in range(0,len(word_dict[item])):
        word_dict_p[item + str(i+1).replace("1","")] = word_dict[item][i]

In [2]:
##Load combined frequency data

hjk_corpus  = load_corpus_summary("corpus/hjk.summary", inverse_dict)["corpus"]
wiki_corpus = load_corpus_summary("corpus/wikipedia.summary", inverse_dict)["corpus"]
open_corpus = load_corpus_summary("corpus/opensub.summary", inverse_dict)["corpus"]

freq_ = create_combined_corpus([hjk_corpus, wiki_corpus, open_corpus])

for word in word_dict:
    if word not in freq_:
        freq_[word] = 0


Loaded corpus from corpus/hjk.summary (88.316596M words)
Loaded corpus from corpus/wikipedia.summary (61.987162M words)
Loaded corpus from corpus/opensub.summary (196.755303M words)
Created combined corpus


# <h1>Verbs</h1>

The following lines of code create a file called "verbs.html" that contains the most common verbs.

Verbs in Croatian can be grouped in families. Like phrasal verbs in English, Croatian verbs can be formed combining a base form with a preposition; the only difference is that in Croatian the preposition goes in front of the verb and they form a single word (they are not separated by a space). Verbs that have the same base belong to the same "extended family", as I decided to call them. The file verbs_extended.txt contains the all the families of verbs

On the other hand, Croatian verbs can be perfective or imperfective, and one can often list perfective/imperfective pairs. I have called this pairs "cousins". Cousins are listed in verbs_cousins.txt

The complete family of a verb is the extended family plus all the cousins of each of the members of the family.

To make the listing easier, I've created a filed called "verbs_main_list.txt" that contains the list of the main verbs of each family. This list was built by hand considering the most common verbs. Also, I've tried to list mostly the imperfective verbs.

The FREQ_THRESHOLD determines where the frequency threshold is set to display other family members of each verb of the main verbs list. If the frequency of a member of the family is lower than the threshold, in is not added to the final list. I determined 0.014 to be a good number considering a intermidiate level.

In [4]:
FREQ_THRESHOLD = 0.014

In [5]:
## Load extended families and cousins
##
cousins_  = {}
extended_ = {}
fulllist_ = []

with open("verbs_lists/verbs_cousins.txt","r",encoding="UTF-8") as f:
    for line in f:
        line_ = line.replace("\n","").split(",")
        cousins_[line_[0]] = line_[1]
    f.close()

antisymmetric_words = {}
for item in cousins_:
    if not cousins_[item] in cousins_:
        antisymmetric_words[cousins_[item]] = item
        
cousins_.update(antisymmetric_words)
    
with open("verbs_lists/verbs_extended.txt","r",encoding="UTF-8") as f:
    for line in f:
        line_ = line.replace("\n","").split(",")

        stem = line_[0]
        line_.remove(stem)
        extended_[stem] = line_

In [6]:
## Load main verbs list,
###
main_verb_list = []     ## LIST WITH THE MAIN STEMS
extended_verb_list = [] ## LIST WITH MAIN STEMS PLUS EXTENDED FAMILY PLUS COUSINS

with open("verbs_lists/verbs_main_list.txt","r",encoding="UTF-8") as f:
    for line in f:
        line = line.replace("\n","")
        
        main_verb_list.append(line)
        
        if line not in extended_:
            extended_[line] = [line]
        
        extended_verb_list.extend(extended_[line])
        
        if line not in extended_verb_list:
            extended_verb_list.append(line)
            
        if line in cousins_ and cousins_[line] not in extended_verb_list:
            extended_verb_list.append(cousins_[line])      
            
        for item in extended_[line]:
            if item in cousins_ and cousins_[item] not in extended_verb_list:
                extended_verb_list.append(cousins_[item])

        
    f.close()

### Sort list
### Create a dictionary with general frequency scores for word families
### Each verb on the main list gets a score equal to the highest frequency
### of all the verbs of the family (extended and cousins)
main_verb_list_score_dict = {}
for item in main_verb_list:
    
    max_score = 0
    if item in freq_:
        max_score = freq_[item]
    if item in cousins_ and cousins_[item] in freq_ and freq_[cousins_[item]] > max_score:
        max_score = freq_[cousins_[item]]
    
    for subitem in extended_[item]:
        if subitem in freq_ and freq_[subitem] > max_score:
            max_score = freq_[subitem]
        if subitem in cousins_ and cousins_[subitem] in freq_ and freq_[cousins_[subitem]] > max_score:
            max_score = freq_[cousins_[subitem]]
            
    main_verb_list_score_dict[item] = max_score
    
main_verb_list.sort(key = lambda x:main_verb_list_score_dict[x], reverse=True)

## Find verbs that are on the list, and verb in the list that are below the freq threshold
exclude = []
for word in main_verb_list:
    if word not in word_dict:
        continue
    if freq_[word.replace("2","")] < FREQ_THRESHOLD:
        exclude.append(word)
for word in exclude:
    main_verb_list.remove(word)
    
## Missing verbs
for word in word_dict:
    for w in word_dict[word]:
        if "Verb" not in w.typ:
            continue
        if w.name in main_verb_list:
            continue
        if freq_[w.name] < FREQ_THRESHOLD:
            continue
        main_verb_list.append(word)
        

    
## Find family members that lie below the frequency threshold 
uncommon_verbs = []
for word in extended_verb_list:
    word = word.replace("2","")
    if word not in freq_:
        freq_[word] = 0
    if freq_[word] < FREQ_THRESHOLD:
        #print(word + " (" + str(freq_[word]) + ")")
        uncommon_verbs.append(word)

In [8]:
## Get irregular verbs

def get_tense(tense, verb):
    table = str(word_dict_p[verb].tables_str).replace("[","").replace("]","").replace("'","").split(",")
    if len(table) == 0 or table[0] == "":
        return ""
    
    if tense in ["1","2","3","4","5","6"]:
        m = table.index("Present")
        return table[m + int(tense)]
    
    if tense == "past":
        if not "Activepastparticiple" in table:
            return ""
        m = table.index("Activepastparticiple")
        return table[m+1]
    
    if tense == "imperative":
        if not "Imperative" in table:
            return ""
        m = table.index("Imperative")
        return table[m+2]

def conjugate(tense, verb):
    
    
    tense_ = {}
    if verb.endswith("ovati"):
        tense_["1"] = verb[:-5] + "ujem"
        tense_["3"] = verb[:-5] + "uje"
        tense_["6"] = verb[:-5] + "uju"
        tense_["past"] = verb[:-5] + "ovao"
        tense_["imperative"] = verb[:-5] + "uj"
        
    elif verb.endswith("ivati"):
        tense_["1"] = verb[:-5] + "ujem"
        tense_["3"] = verb[:-5] + "uje"
        tense_["6"] = verb[:-5] + "uju"
        tense_["past"] = verb[:-5] + "ivao"
        tense_["imperative"] = verb[:-5] + "uj"
        
    elif verb.endswith("jeti"):
        tense_["1"] = verb[:-4] + "em"
        tense_["3"] = verb[:-4] + "e"
        tense_["6"] = verb[:-4] + "u"
        tense_["past"] = verb[:-4] + "io"
        tense_["imperative"] = verb[:-4] + "i" 
    
    elif verb.endswith("ati"):
        tense_["1"] = verb[:-3] + "am"
        tense_["3"] = verb[:-3] + "a"
        tense_["6"] = verb[:-3] + "aju"
        tense_["past"] = verb[:-3] + "ao"
        tense_["imperative"] = verb[:-3] + "aj"
        
    elif verb.endswith("eti"):
        tense_["1"] = verb[:-3] + "em"
        tense_["3"] = verb[:-3] + "e"
        tense_["6"] = verb[:-3] + "u"
        tense_["past"] = verb[:-3] + "o"
        tense_["imperative"] = verb[:-3] + "i"
        
    elif verb.endswith("uti"):
        tense_["1"] = verb[:-3] + "em"
        tense_["3"] = verb[:-3] + "e"
        tense_["6"] = verb[:-3] + "u"
        tense_["past"] = verb[:-3] + "uo"
        tense_["imperative"] = verb[:-3] + "i"
        
    elif verb.endswith("iti"):
        tense_["1"] = verb[:-3] + "im"
        tense_["3"] = verb[:-3] + "i"
        tense_["6"] = verb[:-3] + "e"
        tense_["past"] = verb[:-3] + "io"
        tense_["imperative"] = verb[:-3] + "i"
        
        
    if tense not in tense_:
        return ""
    return tense_[tense]

  
def get_irregulars(tense):
    global total
    total = 0
    irregulars = []
    for v in word_dict_p:
        v = v.replace("2","")
        if v not in word_dict_p or "Verb" not in word_dict_p[v].typ or word_dict_p[v].tables_str == "":
            continue

        if get_tense(tense,v) != conjugate(tense,v):
            irregulars.append(v)
        total += 1
    
    return irregulars

print(len(get_irregulars("1")))

1524


In [9]:
## Print verbs

printed_verbs = []
irregularities = get_irregulars("1")
text = ""
count = 0
d_count = 0
g_count = 0

def get_verb_definition(v):
    if v in word_dict_p:
        definition = word_dict_p[v].get_english().split(" / ")[0]
        if definition.startswith("perfective") or definition.startswith("imperfective"):
            definition = word_dict_p[cousins_[v]].get_english().split(" / ")[0]
        if "(" in definition:
            definition = definition[definition.find(")")+2:]
        if len(definition) > 12:
            definition = definition.split(",")[0]
        if " (" in definition:
            definition = definition.split(" (")[0]
        if len(definition) > 12:    
            definition = definition[0:10] + "..."
        return ": " + definition.replace("to ","")
    return ""

def text_verb(v):
    return_ = ""
    if v not in word_dict_p:
        return v
    
    if "imperfective or perfective" in word_dict_p[v].typ:
        return_ = "<i>" + v.replace("2","") + " *</i>"
    elif "imperfective" in word_dict_p[v].typ:
        return_ = "<i>" + v.replace("2","") + "</i>"
    else:
        return_ = v.replace("2","")
    
    if v in irregularities:
        return_ += " ! "
    
    return return_

def text_of_verb_and_cousin(v):
    text = ""
    global count
    global d_count
    
    v_ok = True
    cousin_ok = True
    if v in printed_verbs or v in uncommon_verbs or v not in word_dict_p:
        v_ok = False
    if v not in cousins_:
        cousin_ok = False
    elif cousins_[v] in printed_verbs or cousins_[v] in uncommon_verbs or cousins_[v] not in word_dict_p:
        cousin_ok = False  
    
    
    if v_ok and cousin_ok:
        text += str(count) + ". " + text_verb(v)
        text += " (" + text_verb(cousins_[v]) + ")"       
        printed_verbs.append(v)
        printed_verbs.append(cousins_[v])
        count += 1
        d_count += 2    
            
    if v_ok and not cousin_ok:
        text += str(count) + ". " + text_verb(v)     
        printed_verbs.append(v)
        
        if v in cousins_ and cousins_[v] in word_dict_p:
            text += " (<s>" + text_verb(cousins_[v]) + "</s>)"
            printed_verbs.append(cousins_[v])
        
        count += 1
        d_count += 1
        
    if not v_ok and cousin_ok:
        text += str(count) + ". " + "<s>" + text_verb(v) + "</s>"
        text += " (" + text_verb(cousins_[v]) + ")"       
        printed_verbs.append(v)
        printed_verbs.append(cousins_[v])
        count += 1
        d_count += 1
    

    if not v_ok and not cousin_ok:
        return ""
    
    return text + get_verb_definition(v) + "<br>\n"
    
    
##############################################################
    
for v in main_verb_list:
    
    text_n = text_of_verb_and_cousin(v)
    if v in extended_:
        for item in extended_[v]:
            text_n += text_of_verb_and_cousin(item)
    
    if text_n == "":
        continue
    text += text_n + "<br>\n"
    g_count += 1
    
f_out = open("most_frequent/Verb.html","w+",encoding="UTF-8")
f_out.write(text)
f_out.close()
#pyperclip.copy(text)
print("Total number of verbs:  " + str(d_count))
print("Total number of groups: " + str(g_count))
print("Total number of rows:   " + str(count-1))

Total number of verbs:  1605
Total number of groups: 704
Total number of rows:   1214


In [20]:
## Missing verbs
for word in word_dict:
    for w in word_dict[word]:
        if "Verb" not in w.typ:
            continue
        if w.name in printed_verbs:
            continue
        if freq_[w.name] < FREQ_THRESHOLD:
            continue
        print(w.name)
        

# Other reports 

In [31]:
FREQ_THRESHOLD = 0.014

In [32]:
## Nouns with unexpected endings

fem = []
mas = []
neu = []

for word in word_dict:
    for w in word_dict[word]:
        
        if word in ["put"]:
            continue
        
        if "Noun" not in w.typ:
            continue
         
        if freq_[word] < FREQ_THRESHOLD:
            continue
        
        if word[-1] not in ["a"] and "(f)" in w.typ and not word[-3:] == "ost":
            fem.append(word)
        if word[-1] in ["a","e","o"] and "(m)" in w.typ:
            mas.append(word)
        if word[-1] not in ["e","o"] and "(n)" in w.typ:
            neu.append(word)

print(mas)
print(fem)
print(neu)

['anđeo', 'babo', 'dečko', 'dio', 'do', 'euro', 'finale', 'gospoda', 'groblje', 'kama', 'kolega', 'luda', 'oba', 'obojica', 'ocjena', 'odaja', 'orao', 'pakao', 'posao', 'prijetnja', 'prvenstvo', 'radio', 'sluga', 'smisao', 'sto', 'ubojica', 'udio', 'veo', 'vojvoda', 'vođa', 'žele', 'auto', 'papa', 'ugao', 'video', 'tata', 'moto', 'Meksiko']
['bit', 'bol', 'bolest', 'cijev', 'crven', 'desni', 'dob', 'dobit', 'doprinos', 'financije', 'glad', 'grudi', 'hlače', 'jesen', 'kap', 'korist', 'krv', 'kćer', 'kći', 'laž', 'lisice', 'ljubav', 'mast', 'mati', 'minut', 'misao', 'momčad', 'motiv', 'moć', 'napast', 'narav', 'nauk', 'nit', 'novine', 'noć', 'obavijest', 'obitelj', 'obje', 'oblast', 'ovlast', 'pomoć', 'povijest', 'počast', 'propast', 'ravan', 'riječ', 'savjest', 'skrb', 'smrt', 'strast', 'stvar', 'svijest', 'tvar', 'urednik', 'uš', 'vaš', 'večer', 'vijest', 'vile', 'vlast', 'zamisao', 'zapovijed', 'zelen', 'zvijer', 'čast', 'os', 'ponoć', 'sol', 'četvrt', 'vrst']
['doba', 'kola', 'leđa',

In [33]:
## Collectives and plural only nouns
collectives = []
plural_only = []

for word in word_dict:
    for w in word_dict[word]:
        
        if "Noun" not in w.typ:
            continue
         
        if freq_[word] < FREQ_THRESHOLD:
            continue
            
        if "collective" in w.get_english() or "collectively" in w.get_english():
            collectives.append(word)
        
        if "plural only" in w.get_english():
            plural_only.append(word)

print(collectives)
print(plural_only)

['braća', 'cvijeće', 'djeca', 'drveće', 'dvojica', 'gospoda', 'kamenje', 'momčad', 'povrće', 'raja', 'trojica', 'voće', 'zelen']
['desni', 'financije', 'hlače', 'kola', 'leđa', 'lisice', 'nosila', 'novine', 'pluća', 'prsa', 'usta', 'vile', 'vrata']


In [34]:
## Irregular adjectives
bad_feminines = []
bad_comparatives = []

for word in word_dict:
    for w in word_dict[word]:
        
        if "Adjective" not in w.typ:
            continue
        if freq_[word] < FREQ_THRESHOLD:
            continue
        
        conj = w.tables
        for i in range(0, len(conj)):
            conj[i] = conj[i].replace("a","")

            
        ##IRREGULARITIES IN FEMININE
        if "feminine" in conj:
            ind = conj.index("feminine")

            y = word
            if y[-1] == "i":
                y = y[:-1]

            if y.replace("a","") != conj[ind+1]:
                bad_feminines.append(word)

        ##IRREGULARITIES IN COMPARATIVE
        if conj.count("singulr") > 2:

            if word.replace("a","") + "iji" not in conj and word.replace("a","") + "ji" not in conj and conj[ind+1] + "iji" not in conj and conj[ind+1] + "ji" not in conj:

                ind2 = conj.index("singulr",conj.index("singulr",conj.index("singulr")+1)+1)
                comparative = conj[ind2+17]

                if "og()" in comparative:
                    continue

                bad_comparatives.append(word)

print(bad_feminines)
print(bad_comparatives)

['bio', 'blizak', 'bolestan', 'cio', 'debeo', 'evropski', 'gladak', 'izvrstan', 'koristan', 'mio', 'nagao', 'nizak', 'nuždan', 'okrugao', 'podao', 'redak', 'rijedak', 'sladak', 'smio', 'svijetao', 'svjestan', 'težak', 'topao', 'uveo', 'uzak', 'veseo', 'zao', 'zreo', 'častan']
['bijel', 'bijesan', 'blag', 'blizak', 'brz', 'dalek', 'debeo', 'dobar', 'drag', 'dubok', 'dug', 'gladak', 'glup', 'grub', 'gust', 'jak', 'kratak', 'kriv', 'lak', 'lijep', 'ljut', 'lud', 'malen', 'mek', 'mlad', 'nizak', 'redak', 'rijedak', 'siv', 'skup', 'sladak', 'smiješan', 'strog', 'suh', 'svijetao', 'tanak', 'težak', 'tih', 'tvrd', 'uzak', 'velik', 'visok', 'vrijedan', 'zao', 'čest', 'čvrst', 'širok', 'žestok', 'živ', 'žut', 'mal', 'zainteresiran']


 <h1>Nouns, adjectives and adverbs</h1>

In [35]:
WORD_TYPE  = "Noun"    ## Choose Noun, Adjective or Adverb

In [36]:
word_count = {}
word_count["Noun"] = 2000
word_count["Adverb"] = 500
word_count["Adjective"] = 500
word_count["AdjectiveAdverb"] = 1000

In [37]:
left_out = {}
left_out["Noun"] = ["bilo","bio","kad","do","bit","nova","bez","kod","pravo","pod","tim","oko","sad","kada","rad","lak","treba","oka","dok","kraj","dug","mora","jak","igra","njega","radio","skup","crven","žele","težak","kim","nit","duga","baš","bod","bok","puta","minut","drug","kim","tim","ud"]
left_out["Adjective"] = ["bio"]
left_out["Adverb"] = ["bio","sam","ja","po","le","di","što","do","od"]

## combine adjectives and adverbs
left_out["AdjectiveAdverb"] = []
left_out["AdjectiveAdverb"].extend(left_out["Adjective"])
left_out["AdjectiveAdverb"].extend(left_out["Adverb"])

In [38]:
most_frequent = list(freq_.keys())
most_frequent.sort(key = lambda x:freq_[x], reverse=True)

output = open("most_frequent/" + WORD_TYPE + ".html","w+",encoding="UTF-8")

i = 0
for x in most_frequent:
    word_type_ok = False
    for item in word_dict[x]:
        if WORD_TYPE in item.typ or item.typ in WORD_TYPE:
            word_type_ok = True
            break
    
    if x in left_out[WORD_TYPE]:
        continue
    
    if x[0] == x[0].upper():
        continue
    
    if not word_type_ok:
        continue

    if x in bad_feminines or x in bad_comparatives:
        x = x + " !"
    if x in collectives:
        x = x + " (coll)"
    if x in plural_only:
        x = x + " (pl.)"
    if x in fem or x in mas or x in neu:
        x = x + word_dict[x][0].typ.replace("Noun "," ")
    
    output.write(x + "\n")
    i += 1
    if i == word_count[WORD_TYPE]:
        break
        
output.close()

In [43]:
dave_source["um"]

0.1864905262553457