In [1]:
from gensim.models import FastText, Word2Vec
import sqlite3
from nltk.tokenize import wordpunct_tokenize
from tqdm.auto import tqdm
from collections import Counter
import pickle
import pandas as pd
import os
from bs4 import BeautifulSoup

In [13]:
DB = './translate/translations.db'
db = sqlite3.connect(DB)
cur = db.cursor()

In [14]:
cur.execute('SELECT text FROM sentences')
texts = [wordpunct_tokenize(text[0].lower()) for text in cur.fetchall()]

In [15]:
ft = FastText(texts, workers=-1, size=300, min_count=10, iter=5)

In [16]:
ft.wv.most_similar('mann')

[('manns', 0.49959316849708557),
 ('manna', 0.48749732971191406),
 ('manni', 0.48149383068084717),
 ('mannaður', 0.4410877525806427),
 ('manninn', 0.4307960271835327),
 ('mannkaup', 0.4208546280860901),
 ('mannamunur', 0.4096454381942749),
 ('mannmargt', 0.39843112230300903),
 ('manndóm', 0.39427629113197327),
 ('austmann', 0.38897883892059326)]

In [17]:
del texts

In [18]:
pickle.dump(ft, open('ft.pkl', 'wb'))

MemoryError: 

In [None]:
freqdict = Counter()
for text in tqdm(texts):
    freqdict += Counter(text)

In [None]:
df = pd.DataFrame(freqdict.items()).sort_values(by=1, ascending=False)
df.to_csv('freq_forms.tsv', index=None, header=None, sep='\t')

In [None]:
! head freq_forms.tsv

In [2]:
def get_lemmas_from_xml(soup):

    data_lemmas = []

    for sent in soup.find_all("s"):
    
        s1, s2, s3 = [], [], []
    
        for wo in sent:
        
            if wo.name == 'w':
                s2.append(wo.attrs['lemma'])
       
            if wo.name == 'c':
                s2.append(wo.text)
        
        data_lemmas.append(s2)
    
    return data_lemmas

In [3]:
PATH = './translate/xml'
texts = []
for fname in tqdm(os.listdir(PATH)):
    text = BeautifulSoup(open(os.path.join(PATH, fname)).read(), 'lxml')
    texts.extend(get_lemmas_from_xml(text))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [4]:
texts = [[word.lower() for word in sent] for sent in texts]
texts[:2]

[['ófeigur', 'heita', 'maður', '.'],
 ['hann', 'búa', 'norður', 'í', 'miðfjörður', 'á', 'reykir', '.']]

In [5]:
freqdict = Counter()
for text in tqdm(texts):
    freqdict += Counter(text)

df = pd.DataFrame(freqdict.items()).sort_values(by=1, ascending=False)
df.to_csv('freq_lemmas.tsv', index=None, header=None, sep='\t')

HBox(children=(IntProgress(value=0, max=101359), HTML(value='')))




In [6]:
! head freq_lemmas.tsv

.	97293
og	85828
hann	82793
vera	61356
að	50685
það	28374
í	26346
,	26134
til	24914
en	24049


In [8]:
w2v = Word2Vec(texts, workers=-1, size=300, iter=20, min_count=15)

In [9]:
w2v.wv.most_similar('stofa')

[('hamrammur', 0.24035866558551788),
 ('heimill', 0.20861855149269104),
 ('rauf', 0.20094311237335205),
 ('roskinn', 0.19731876254081726),
 ('heili', 0.1932411789894104),
 ('minnka', 0.18783366680145264),
 ('jaðar', 0.18083727359771729),
 ('dylja', 0.17953938245773315),
 ('rangur', 0.16946963965892792),
 ('stórættaður', 0.1622796356678009)]

In [10]:
pickle.dump(w2v, open('w2v.pkl', 'wb'))