In [4]:
# Widen the notebook container so wide outputs (e.g. the pyLDAvis panel) fit.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [5]:
import codecs
import nltk
import os
from gensim import corpora, models
import pyLDAvis.gensim
import numpy as np
from multiprocessing import Pool
import pickle
import re
import json
import pandas as pd

In [24]:
# Directory that is listed for '.txt' documents and used as the default
# base directory when parsing a file.
DATA_PATH = "/data/hackathon/ustawy_pdf/"

In [25]:
# Collect the text documents to process.
# - `endswith` instead of a substring test, so names such as "x.txt.bak"
#   are not picked up by accident.
# - `sorted` because os.listdir() returns entries in arbitrary order; a
#   stable order keeps `files`, `texts` and `corpus` reproducible across runs.
files = sorted(f for f in os.listdir(DATA_PATH) if f.endswith('.txt'))
len(files)

33484

In [35]:
# Load the stop-word list (one word per line) and add "art", which is
# filtered out together with the regular stop words.
# The encoding is given explicitly so that non-ASCII characters are decoded
# the same way regardless of the machine's locale.
# NOTE(review): assumes the list is stored as UTF-8, like the documents -- confirm.
with open("/data/hackathon/polish_stopwords", "r", encoding="utf-8") as f:
    POLISH_STOPWORDS = {line.strip() for line in f} | {"art"}

def make_proper_word(w, stopwords=POLISH_STOPWORDS):
    """Normalise a single token, or reject it.

    The token is stripped of leading/trailing '.', ',', '%' and '-' and
    lower-cased. It is rejected (``None`` is returned) when the result

    * has two characters or fewer,
    * is contained in ``stopwords``, or
    * still contains a digit, '.', ',', '%' or a quote character.

    Otherwise every run of non-ASCII characters is replaced by a single
    underscore and the resulting string is returned.

    Args:
        w: Raw token.
        stopwords: Container of words to drop; checked before the
            non-ASCII replacement, so it is matched against the original
            (lower-cased) spelling.

    Returns:
        The normalised token, or ``None`` if the token should be discarded.
    """
    w = w.strip(".,%-").lower()
    if len(w) <= 2 or w in stopwords:
        return None
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # re.search is enough here -- only the existence of a match matters.
    if re.search(r"""[\d.,%"']""", w):
        return None
    return re.sub(r'[^\x00-\x7F]+', '_', w)

def parse_file(filename, dir = DATA_PATH):
    """Read one UTF-8 text file and return its cleaned tokens.

    Args:
        filename: Name of the file, relative to ``dir``.
        dir: Directory containing the file.

    Returns:
        List of tokens produced by ``nltk.word_tokenize`` for which
        ``make_proper_word`` returned a non-empty value, in document order.
    """
    fpath = os.path.join(dir, filename)
    # Built-in open() replaces codecs.open(); newline="" keeps line endings
    # untranslated, so the text handed to the tokenizer is unchanged.
    with open(fpath, encoding="UTF-8", newline="") as f:
        content = f.read()
    # Generator: no need to materialise the intermediate list with Nones.
    words = (make_proper_word(w) for w in nltk.word_tokenize(content))
    return [w for w in words if w]

In [36]:
# Tokenise every document in a pool of 10 worker processes.
# `texts[i]` holds the token list for `files[i]`.
with Pool(processes=10) as pool:
    texts = pool.map(parse_file, files)

In [None]:
# Build the token <-> id mapping from the tokenised documents.
dictionary = corpora.Dictionary(texts)
# Prune the vocabulary in place using the document-frequency thresholds
# below; keep_n=None puts no cap on the number of remaining tokens.
dictionary.filter_extremes(
    no_below=20,
    no_above=0.3,
    keep_n=None,
)
# Convert every document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
%%time 
lda_model = models.LdaMulticore(corpus, num_topics=30, id2word=dictionary, passes=100, workers=20)

In [6]:
# Prepare the visualisation data for the trained model and render it
# inline as the cell output.
lda_vis_data = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis_data)

In [None]:
# Persist every artefact of the pipeline into a single pickle file named
# 'backup' in the current working directory.
backup_payload = {
    'texts': texts,
    'dictionary': dictionary,
    'corpus': corpus,
    'model': lda_model,
    'files': files,
}
with open('backup', 'wb') as f:
    pickle.dump(backup_payload, f)