In [1]:
from esperanto_analyzer.analyzers.morphological import *
from esperanto_analyzer.speech import *

In [2]:
class Lemmatizer:
    def __init__(self):
        pass
    def lemmatize(self, word: str) -> str:
        # adjective
        adjective = AdjectiveMorphologicalAnalyzer(word)
        if adjective.analyze():
            #print("Word is an adjective!")
            new_word = list(word)
            if new_word[-1] == 'n':
                new_word.pop()
            if new_word[-1] == 'j':
                new_word.pop()
            return ''.join(new_word)
       
            
        
        # adverb
        adverb = AdverbMorphologicalAnalyzer(word)
        if adverb.analyze():
            #print("Word is an adverb!")
            new_word = list(word)
            if new_word[-1] == 'n':
                new_word.pop()
            return ''.join(new_word)
        
        # verb
        verb = VerbMorphologicalAnalyzer(word)
        if verb.analyze():
            #print("Word is a verb!")
            new_word = list(word)
            if new_word[-1] != 'i':
                new_word.pop() # глаголы заканчиваются на -as, -os, -is
                new_word.pop()
                new_word.append('i')
            return ''.join(new_word)
            
        
        # noun
        noun = NounMorphologicalAnalyzer(word)
        if noun.analyze():
            #print("Word is a noun!")
            new_word = list(word)
            if new_word[-1] == 'n':
                new_word.pop()
            if new_word[-1] == 'j':
                new_word.pop()
            return ''.join(new_word)
        
        
        # pronoun
        pronoun = PronounMorphologicalAnalyzer(word)
        if pronoun.analyze():
            #print("Word is a pronoun!")
            new_word = list(word)
            if new_word[-1] == 'n':
                new_word.pop()
            if new_word[-1] == 'j':
                new_word.pop()
            return ''.join(new_word)
          
            
        # article
        article = ArticleMorphologicalAnalyzer(word)
        if article.analyze():
            #print("Word is an article!")
            return word
        
        
        # conjunction
        conjunction = ConjunctionMorphologicalAnalyzer(word)
        if conjunction.analyze():
            #print("Word is a conjunction!")
            return word
        
        # preposition
        preposition = PrepositionMorphologicalAnalyzer(word)
        if preposition.analyze():
            #print("Word is a preposition")
            return word
    

        # number
        number = NumeralMorphologicalAnalyzer(word)
        if number.analyze():
            #print("Word is a number")
            return word
        
        # interjection
        interjection = InterjectionMorphologicalAnalyzer(word)
        if interjection.analyze():
            #print("Word is an interjection!")
            return word
        
        #print("Unknown word: may be a proper name or sth else")
        return ""

In [3]:
lemmatizer = Lemmatizer()

tests = ['belajn', 'hejme', 'iras', 'iri', 'vortaro', 'vortarojn', 'mi', 'vi', 'vin', 'siajn', 'la', 'kun', 'tri', 'deka', 'dekan', 'asdasdasd']
for test in tests:
    print(test, lemmatizer.lemmatize(test))


belajn bela
hejme hejme
iras iri
iri iri
vortaro vortaro
vortarojn vortaro
mi mi
vi vi
vin vi
siajn sia
la la
kun kun
tri tri
deka deka
dekan deka
asdasdasd 


In [4]:
def remove_punctuation(word: str) -> str:
    new_word = ''
    for letter in word:
        if letter.isalpha() or letter == '-':
            new_word += letter.lower()
    return new_word

In [5]:
def process_file(filename: str):
    with open(filename, "r") as file:
        out = open(filename + "_out", "w")
        for line in file.readlines():
            for word in line.split():
                processed_word = normalize_word(remove_punctuation(word))
                out.write(processed_word)
            out.write("\n")
        out.close()

In [6]:
!ls -lh data

total 2048520
-rw-r--r--@ 1 h1de0us  staff   126K Jul  7  2011 111-fabloj_RuLit_Net_99691.txt
-rw-r--r--@ 1 h1de0us  staff    55K Oct  5  2012 Asch_La_sorcistino_el_Kastilio_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff   3.7M Dec  7  2012 Avtor_neizvesten_La_Sankta_Biblio_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff    64K May 14  2012 Avtor_neizzvesten_Fabloj_de_Cent_paraboloj_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff    11K Jun 17  2012 Baghy_La_sagaca_knabino_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff   214K Sep 27  2012 Baum_La_Eksterordinara_Lando_Oz_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff   298K Sep 30  2012 Baum_La_Mikscifona_Knabino_de_Oz_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff    57K Aug 30  2016 Baza-Radikaro-Oficiala_RuLit_Me.txt
-rw-r--r--@ 1 h1de0us  staff   736K Feb  1  2019 Bein_Vortaro-de-Esperanto-1910_RuLit_Me.txt
-rw-r--r--@ 1 h1de0us  staff   255K Jul 27  2011 Berrouz_Ĉe_la_koro_de_la_tero_RuLit_Net.txt
-rw-r--r--@ 1 h1de0us  staff   314K Jun  3  2013 

-rw-r--r--@ 1 h1de0us  staff   440B Jan 11 15:52 rikki-tikki-tavi_RuLit_Net_99651.txt
-rw-r--r--@ 1 h1de0us  staff   138K Jan 11 13:06 robinsono_kruso.u.txt
-rwxr-xr-x  1 h1de0us  staff   331B May 15  2008 [31mrogxerborgxes.txt[m[m
-rwxr-xr-x  1 h1de0us  staff   431B May 16  2008 [31msupernova.txt[m[m
-rw-r--r--@ 1 h1de0us  staff   454K Jul  7  2011 tarzan-de-la-simioj_RuLit_Net_99573.txt
-rw-r--r--@ 1 h1de0us  staff   451K Jan  7  2014 tarzan_de_la_simioj.u.txt
-rwxr-xr-x  1 h1de0us  staff   5.8K May 23  2008 [31mtelefono.txt[m[m
-rw-r--r--@ 1 h1de0us  staff    58K Jan  6  2014 tri_noveloj.u.txt


In [7]:
total_words = 0
missed_words = 0

def process_file(filename: str):
    global total_words, missed_words
    with open('data/' + filename, encoding='iso-8859-1') as file:
        out_filename = "processed_data/" + filename[:-4] + "_processed.txt"
        out = open(out_filename, "w")
        print
        for line in file.readlines():
            for word in line.split():
                processed_word = lemmatizer.lemmatize(remove_punctuation(word))
                total_words += 1
                #print(processed_word)
                out.write(processed_word)
                out.write(" ")
                if (processed_word == ""):
                    missed_words += 1
            out.write("\n")
        out.close()
    
print(total_words, missed_words)

0 0


In [8]:
import os

directory = os.fsencode("data")
    
for file in os.listdir(directory):
    print(file)
    process_file(file.decode("utf-8"))

b'Piron_\xc4\x88u_ni_kunvenis_vane_RuLit_Net.txt'
b'Bein_Vortaro-de-Esperanto-1910_RuLit_Me.txt'
b'handzlik.txt'
b'supernova.txt'
b'Reyto_La_nigra_kapitano_RuLit_Net.txt'
b'tarzan_de_la_simioj.u.txt'
b'gulivero.txt'
b'fabeloj_al_helenjo.u.txt'
b'robinsono_kruso.u.txt'
b'Zamenhof_Esenco-kaj-estonteco-de-la-ideo-de-Lingvo-Internacia_RuLit_Net.txt'
b'Garrison_La-stratoj-de-Askelono_RuLit_Net.txt'
b'la_aventuro_de_la_dancantoj.u.txt'
b'Reyto_La_blonda_ciklono_RuLit_Net.txt'
b'Konan_Doyl_La_ses_Napoleonoj_RuLit_Net.txt'
b'monumento.txt'
b'rogxerborgxes.txt'
b'Baza-Radikaro-Oficiala_RuLit_Me.txt'
b'kd.txt'
b'Reyto_Fred_la_malpura_la_kapitano_RuLit_Net.txt'
b'dolchamar.txt'
b'Mopassan_La-maljunulo_RuLit_Me.txt'
b'Obama_Inaugura_parolado_de_Barack_Obama_20-a_de_januaro_2009_RuLit_Net.txt'
b'Konan_Doyl_La_vaka_domo_RuLit_Net.txt'
b'lafaraono.txt'
b'Orzeszkowa_La-bona-sinjorino_RuLit_Net.txt'
b'Konan_Doyl_Skandalo_en_Bohemio_RuLit_Net.txt'
b'telefono.txt'
b'la-nebulozo-de-andromedo_RuLit_Net_996

In [9]:
# combine all processed files into one 
out_directory = os.fsencode("processed_data")

with open("result.txt", "w") as result:
    for filename in os.listdir(out_directory):
        if filename != b'.DS_Store' and filename.decode("utf-8") != "eo_wiki_corpus_processed.txt":
            file = open('processed_data/' + filename.decode("utf-8"), "r")
            for line in file.readlines():
                for word in line.split():
                    result.write(word)
                    result.write(" ")
            file.close()
            result.write('\n')

In [10]:
!wc result.txt

     174 5659651 30821975 result.txt


In [None]:
# не записываю википедию в один файл с художественной литературой

In [11]:
# добавляю новые художественные тексты, газеты и веб-страницы

def process_file_2(filename: str, in_folder: str, out_folder: str):
    global total_words, missed_words
    with open(in_folder + filename, encoding='iso-8859-1') as file:
        out_filename = out_folder + filename[:-4] + "_processed.txt"
        out = open(out_filename, "w")
        print
        for line in file.readlines():
            for word in line.split():
                processed_word = lemmatizer.lemmatize(remove_punctuation(word))
                total_words += 1
                #print(processed_word)
                out.write(processed_word)
                out.write(" ")
                if (processed_word == ""):
                    missed_words += 1
            out.write("\n")
        out.close()
    
process_file_2("epo_literature_2011_300K-sentences.txt", "literature_300k/", "processed_literature_300k/")
print("Added literature")
process_file_2("epo_newscrawl_2017_300K-sentences.txt", "newscrawl_300k/", "processed_newscrawl_300k/")
print("Added newscrawl")
process_file_2("epo_web_2012_300K-sentences.txt", "web_300k/", "processed_web_300k/")
print("Added web pages")

Added literature
Added newscrawl
Added web pages


In [12]:
with open("result.txt", "a") as result:
    file = open('processed_literature_300k/epo_literature_2011_300K-sentences_processed.txt', "r")
    count = 0
    for line in file.readlines():
        for word in line.split():
            result.write(word)
            result.write(" ")
        if count % 300 == 0: # делаю пачки из 300 предложений
            result.write('\n')
        count += 1
            
    file.close()
    result.write('\n')

In [13]:
!wc result.txt

    1175 9374293 50306260 result.txt


In [14]:
with open("result_newscrawl_and_web.txt", "w") as result:
    file = open('processed_newscrawl_300k/epo_newscrawl_2017_300K-sentences_processed.txt', "r")
    for line in file.readlines():
        for word in line.split():
            result.write(word)
            result.write(" ")
        if count % 300 == 0:
            result.write('\n')
        count += 1
    file.close()
    result.write('\n')
                
with open("result_newscrawl_and_web.txt", "a") as result:
    file = open('processed_web_300k/epo_web_2012_300K-sentences_processed.txt', "r")
    for line in file.readlines():
        for word in line.split():
            result.write(word)
            result.write(" ")
        if count % 300 == 0:
            result.write('\n')
        count += 1
    file.close()
    result.write('\n')

In [15]:
!wc result_newscrawl_and_web.txt

    2002 8907981 51974248 result_newscrawl_and_web.txt


In [16]:
with open("result_wiki.txt", "w") as result:
    file = open('processed_data/eo_wiki_corpus_processed.txt', "r")
    for line in file.readlines():
        for word in line.split():
            result.write(word)
            result.write(" ")
        if count % 2000 == 0:
            result.write('\n')
    file.close()
    result.write('\n')

In [17]:
!wc result_wiki.txt

 7612070 123484971 715483477 result_wiki.txt


In [8]:
token_pattern = "[A-Za-zĉĥĵŝŭĈĜĤĴŜŬ]+"

from sklearn.feature_extraction.text import TfidfVectorizer
def make_table_and_dict(corpus_path, min_df, max_df, token_pattern = None, use_idf = True):
    with open(corpus_path, 'r') as corpus_file:
        if token_pattern:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df, token_pattern=token_pattern, use_idf=use_idf)
        else:
            vectorizer = TfidfVectorizer(analyzer='word', min_df=min_df)
        data_vectorized = vectorizer.fit_transform(corpus_file)
    return data_vectorized, vectorizer.get_feature_names(), vectorizer.idf_

In [9]:
esp_data_vectorized, esp_dictionary, idfs = make_table_and_dict('result.txt', 0.01,  0.8 , token_pattern)
pairs = dict(zip(esp_dictionary, idfs))
with open('dictionary.txt', 'w') as output_file:
    for word in esp_dictionary:
        if (pairs[word] > 1.4):
            output_file.write(word)
            output_file.write('\n')

        else:
            print('here')

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


In [10]:
pairs = list(zip(esp_dictionary, idfs))
sorted(pairs, key = lambda x: x[1])

[('en', 1.007682496172016),
 ('la', 1.010256500167189),
 ('de', 1.0119761910467155),
 ('esti', 1.0119761910467155),
 ('kaj', 1.0162884012648958),
 ('pli', 1.0171530792262495),
 ('da', 1.0206192872027355),
 ('el', 1.0240975515790605),
 ('ki', 1.0240975515790605),
 ('por', 1.0240975515790605),
 ('pri', 1.0240975515790605),
 ('al', 1.0249690113292016),
 ('kun', 1.0249690113292016),
 ('kiu', 1.0258412311838874),
 ('alia', 1.0284624646637617),
 ('ili', 1.0284624646637617),
 ('ti', 1.0284624646637617),
 ('tio', 1.0284624646637617),
 ('vi', 1.0284624646637617),
 ('havi', 1.0293377382426168),
 ('sia', 1.0293377382426168),
 ('kiel', 1.0302137785964967),
 ('povi', 1.0302137785964967),
 ('fari', 1.031090587070031),
 ('li', 1.031090587070031),
 ('pro', 1.031090587070031),
 ('per', 1.0319681650113899),
 ('se', 1.0319681650113899),
 ('sed', 1.0319681650113899),
 ('tie', 1.0319681650113899),
 ('tiu', 1.0319681650113899),
 ('ui', 1.0319681650113899),
 ('dum', 1.0328465137722962),
 ('granda', 1.0328465

In [5]:
from scipy.sparse.linalg import svds
import numpy as np

In [4]:
def create_table(data_vectorized, k, name):
    u, sigma, vt = svds(data_vectorized, k)
    print(sigma)
    with open(name + str(k) + '.npy', 'wb') as f:
        np.save(f, np.dot(np.diag(sigma), vt).T)

In [23]:
create_table(esp_data_vectorized, 1024, "esp_sigma_v")

[ 0.14508604  0.14529951  0.14544627 ...  4.78065603  7.01406035
 28.35116013]


In [24]:
# делаем векторы размера n * m : n = 5 (размер n - граммы), m = 1024 (размер вектора)

In [32]:
with open('result.txt', 'r') as file:
    f = open('texts_to_vectors.txt', 'w')
    for line in file.readlines():
        for word in line.split():
            try:
                pos = 

17864

In [22]:
from bisect import bisect_left 
import numpy as np
def binary_search(a, x): 
    i = bisect_left(a, x) 
    if i != len(a) and a[i] == x: 
        return i 
    else: 
        return -1

def texts_to_series(text_path = 'result.txt', dictionary_path = 'dictionary.txt', table_path = 'esp_sigma_v1024.npy'):
    with open(dictionary_path, 'r') as my_dict:
        dictionary = [word.strip() for word in my_dict.readlines()]
    table = np.load(table_path)
    with open(text_path, 'r') as texts:
        with open('vectorized_texts.txt', 'w') as file:
            for text in texts.readlines():
                for word in text.split():
                    word_index = binary_search(dictionary, word)
                    if word_index != -1:
                        word_vector = table[word_index]
                        for word in reversed(word_vector):
                            file.write(str(word))
                            file.write(' ')
                file.write('\n')       

In [11]:
create_table(esp_data_vectorized, 8, "esp_sigma_v")

[ 2.91070343  3.38073138  3.51769953  3.92386255  4.29448547  4.78065603
  7.01406035 28.35116013]


In [23]:
texts_to_series(table_path='esp_sigma_v8.npy')