Esmalt loeb failid listi ning viib need paremini käsitletavale kujule.

def format_files(dir_name):
    """Drop the first line of every regular file in ``dir_name``, in place.

    Each file is read completely and then rewritten without its first line.
    Entries that are not regular files (for example subdirectories) are
    skipped, because ``open()`` on them raises.

    The operation is destructive and not idempotent: every call removes one
    more line from each file.

    Args:
        dir_name: Path of the directory whose files are rewritten.
    """
    for entry in listdir(dir_name):
        path = join(dir_name, entry)
        if not isfile(path):
            # Same guard as read_files_from_dir: only regular files are data.
            continue
        with open(path, 'r') as fin:
            lines = fin.read().splitlines(True)  # True keeps the line endings
        with open(path, 'w') as fout:
            fout.writelines(lines[1:])

# Strip the first line from every file of the corpus directory, in place.
# NOTE: this is not idempotent - each run removes one more line per file,
# so run it exactly once on a fresh copy of the data.
format_files("eesti_seadus")

In [1]:
from estnltk import Text
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup as bs
import re
import html

# Regex capturing the text between <title> and </title>. It is used with
# re.search and no flags, so "." does not cross newlines and the whole
# element has to sit on a single line to match.
name_matcher = r"\<title\>(.+)\<\/title\>"
# Directory that read_files_from_dir is pointed at to build the corpus.
sample_data = "eesti_seadus"

def is_content_word(word):
    """Tell whether the first analysis of ``word`` has a content-word tag.

    Only ``word["analysis"][0]["partofspeech"]`` is inspected; the result is
    True when that value contains at least one of S, V, A, C, U or D
    (presumably estnltk part-of-speech codes - confirm against its docs).
    """
    first_tag = word["analysis"][0]["partofspeech"]
    content_tags = {'S', 'V', 'A', 'C', 'U', 'D'}
    return not content_tags.isdisjoint(first_tag)

def lemmafy_text(text):
    """Return ``text`` as a space-separated string of lemmas.

    The text is analysed with estnltk's ``Text(...).tag_analysis()``. Every
    word accepted by ``is_content_word`` contributes the lemma of its first
    analysis; every other word is replaced by the placeholder ``'?'``.
    """
    analysed = Text(text).tag_analysis()
    tokens = []
    for word in analysed.words:
        if is_content_word(word):
            tokens.append(word["analysis"][0]["lemma"])
        else:
            tokens.append('?')
    return " ".join(tokens)
    

def read_files_from_dir(dir_name):
    """Load and lemmatise every regular file found directly in ``dir_name``.

    Returns:
        A pair ``(documents, labels)`` of parallel lists. ``documents`` holds
        the output of ``lemmafy_text`` for each file. ``labels`` holds the
        HTML-unescaped title reported by ``formatted_data`` or, when no title
        was matched, the file name.
    """
    documents = []
    labels = []

    for entry in listdir(dir_name):
        path = join(dir_name, entry)
        if not isfile(path):
            continue
        text, title_match = formatted_data(path)
        documents.append(lemmafy_text(text))
        labels.append(html.unescape(title_match.group(1)) if title_match else entry)
    return documents, labels

def formatted_data(filename):
    """Read ``filename`` and return its plain text plus the title match.

    Returns:
        A pair ``(text, match)``. ``text`` is the ``.text`` of the content
        parsed by BeautifulSoup with the "lxml" parser. ``match`` is the
        result of ``re.search(name_matcher, content)`` on the raw content,
        i.e. a match object or None.
    """
    with open(filename, "r") as source:
        raw = source.read()
    title_match = re.search(name_matcher, raw)
    return bs(raw, "lxml").text, title_match

# Parallel lists: corpus[i] is the lemmatised text of the document whose
# display name (title or file name) is names[i].
corpus, names = read_files_from_dir(sample_data)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature counts are read from vocabulary_ (one entry per feature).
# get_feature_names() was removed in scikit-learn 1.2, while vocabulary_ is
# available in every release and yields the same number.

# Bag of words: counts of single tokens.
BoW = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X_BoW = BoW.fit_transform(corpus)
print("BoW feature names:", len(BoW.vocabulary_))

# Bag of n-grams: counts of token pairs (bigrams only).
BoN = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X_BoN = BoN.fit_transform(corpus)
print("BoN feature names:", len(BoN.vocabulary_))

# TF-IDF weighting with the vectorizer's default settings.
Tfidf = TfidfVectorizer()
X_Tfidf = Tfidf.fit_transform(corpus)
print("TF-IDf feature names:", len(Tfidf.vocabulary_))

BoW feature names: 28423
BoN feature names: 420551
TF-IDf feature names: 28423


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def _closest_pairs(matrix, count):
    """Pair every document with its most similar other document.

    Args:
        matrix: Document-feature matrix with one row per document.
        count: Number of documents (rows) to process.

    Returns:
        A list of ``count`` tuples. Entry ``i`` is ``(i, j)`` where ``j`` is
        the row with the highest cosine similarity to row ``i`` (``j != i``;
        on ties the lowest ``j`` wins). If no candidate exists the entry is
        ``(-1, -1)``.
    """
    if count == 0:
        return []
    # One call yields the whole pairwise matrix, instead of one sklearn call
    # per (i, j) pair.
    scores = cosine_similarity(matrix)
    pairs = []
    for i in range(count):
        best_score, best_j = -1, -1
        for j in range(count):
            # Strict ">" keeps the first (lowest-index) maximum.
            if i != j and scores[i, j] > best_score:
                best_score, best_j = scores[i, j], j
        pairs.append((i, best_j) if best_j != -1 else (-1, -1))
    return pairs


# Nearest-neighbour pairs per model; later cells use them as graph edges.
similarities_BoW = _closest_pairs(X_BoW, len(names))
similarities_tfidf = _closest_pairs(X_Tfidf, len(names))
similarities_BoN = _closest_pairs(X_BoN, len(names))

In [4]:
def _agreement(left, right):
    """Return the share of documents for which both models picked the same pair."""
    same = 0
    for idx in range(len(names)):
        if left[idx] == right[idx]:
            same += 1
    return same / len(names)


print("BoW - BoN: " + str(_agreement(similarities_BoW, similarities_BoN)))
print("BoW - TF_IDF: " + str(_agreement(similarities_BoW, similarities_tfidf)))
print("BoN - TF_IDF: " + str(_agreement(similarities_BoN, similarities_tfidf)))

BoW - BoN: 0.4413265306122449
BoW - TF_IDF: 0.6556122448979592
BoN - TF_IDF: 0.38010204081632654


In [7]:
import networkx as nx
# Each model's nearest-neighbour pairs are treated as the edge list of an
# undirected graph; the largest connected component is reported per model.
graphs = [
    ("BoW", similarities_BoW),
    ("TF-IDF", similarities_tfidf),
    ("BoN", similarities_BoN),
]

for case, edges in graphs:
    components = nx.connected_components(nx.Graph(edges))
    # max() returns the first of equally large components; an empty graph
    # yields an empty set.
    largest = max(components, key=len, default=set())
    members = ", ".join([names[x] for x in largest])

    print(case + " järgi suurim seaduste grupp on suurusega (" + str(len(largest)) + "): {" + members + "}\n")

BoW järgi suurim seaduste grupp on suurusega (31): { Prokuratuuriseadus ( terviktekst juuni 2001 ) ,  Avaliku teenistuse seadus ( terviktekst veebr 2001 ) ,  Kaitseväeteenistuse seadus ( terviktekst dets 2001 ) ,  Asjaõigusseadus ( terviktekst dets 2001 ) ,  Elundite ja kudede siirdamise seadus ( aprill 2002 ) ,  Riigi omandisse kuuluvat maavara sisaldavale maatükile ehitise rajamise kord ( veebruar 1999 ) ,  Tsiviilseadustiku üldosa seadus ( ELIT 1996/12 ) ,  Asjaõigusseaduse rakendamise seadus ( terviktekst dets 2001 ) ,  Pärimisseadus ( terviktekst juuni 2001 ) ,  Kinnistusraamatuseadus ( terviktekst dets 2001 ) ,  Kinnisasja sundvõõrandamise seadus ( ELIT 1997/2 ) ,  Riigireservi seadus ( terviktekst dets 2001 ) ,  Riigi poolt eraõiguslike juriidiliste isikute asutamise ja nendes osalemise seadus ( september 1999 ) ,  Maareformi seadus ( terviktekst juuni 2001 ) ,  Haldusmenetluse seadus ( okt 2001 ) ,  Kohtuekspertiisiseadus ( okt 2001 ) ,  Riigivastutuse seadus ( juuni 2001 ) ,  

Hinnang:

---

Lõppkokkuvõteks
 - Suurim seaduste grupp oli oodatust palju suurem
 - Sarnasus erinevate mudelite vahel on oodatust väiksem
 - scikit-learn ning networkx teegid on väga kasulikud
 - Seadustes on üllatavalt palju erinevaid sõnu kasutatud
 - TF-IDF on hea mõõdik sõna tähtsuse leidmiseks
 - estnltk 1.4 on keerukusastme võrra kiirem 1.6 versioonist
 - Peale vaadates BoN grupi siseselt teemade lõikes sarnasusi ei leidnud.
 - BoW grupeeris palju maaga seotud seadusi. Ehk on seal palju raha/hüvitiste terminoloogiat.
 - TF-IDF grupeeris omanditega seotud seadusi

