In [None]:
import logging
# Route log records of level INFO and above through the root logger, formatted
# as "<asctime> : <levelname> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
import pandas as pd

In [None]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact, UTF-8 encoded JSON.

    Args:
        data: Object to serialize with ``simplejson``. Because
            ``iterable_as_array=True`` is passed, iterables are accepted
            and emitted as JSON arrays.
        filename: Path of the output file; it is opened in text write
            mode, so an existing file is overwritten.
    """
    # No whitespace after item/key separators keeps the file as small as possible.
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as out_file:
        simplejson.dump(
            data,
            out_file,
            separators=compact_separators,
            iterable_as_array=True,
        )

In [None]:
# Load the training and test splits from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer='./train.csv')
df_test = pd.read_csv(filepath_or_buffer='./test.csv')

In [None]:
# dev: uncomment the two lines below to iterate quickly on a 50-row sample of each split
# df_train = df_train.sample(50, random_state=42)
# df_test = df_test.sample(50, random_state=42)

In [None]:
# Sanity check: display the (rows, columns) shape of each split.
(df_train.shape, df_test.shape)

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def corpus2tokens(raw_corpus):
    """Tokenize every document of ``raw_corpus`` and drop stopwords.

    Each document is run through gensim's ``simple_preprocess``; tokens
    that are members of gensim's ``STOPWORDS`` are discarded. Iteration
    over the documents is wrapped in a notebook progress bar.

    Args:
        raw_corpus: Iterable of raw documents accepted by
            ``simple_preprocess`` (presumably strings).

    Returns:
        A list with one entry per input document, each entry being the
        list of kept tokens in the order ``simple_preprocess`` produced them.
    """
    tokenized_docs = []
    for raw_doc in tqdm_notebook(raw_corpus):
        kept_tokens = [
            token
            for token in simple_preprocess(raw_doc)
            if token not in STOPWORDS
        ]
        tokenized_docs.append(kept_tokens)
    return tokenized_docs

In [None]:
%%time
# Tokenize the `text` column of the training split into stopword-free token lists.
corpus_words_train = corpus2tokens(df_train['text'].tolist())

In [None]:
# Persist the tokenized training documents so later runs can reuse them.
json_save(data=corpus_words_train, filename='./lda.corpus.train.json')

In [None]:
%%time
from gensim import corpora

# Build the token <-> integer id mapping from the tokenized training documents.
dictionary = corpora.Dictionary(documents=corpus_words_train)
# Called without arguments, so gensim's default frequency thresholds decide
# which tokens are dropped from the vocabulary.
dictionary.filter_extremes()
# Keep a plain-text copy of the filtered vocabulary on disk.
dictionary.save_as_text(fname='./lda.dictionary.txt')

In [None]:
%%time
# Convert every tokenized training document to its bag-of-words representation.
corpus_train = list(map(dictionary.doc2bow, corpus_words_train))
# Write the bag-of-words corpus to disk through gensim's MmCorpus serializer.
corpora.MmCorpus.serialize(fname='./lda.corpus.txt', corpus=corpus_train)

In [None]:
from gensim import models

In [None]:
%%time
import os

from gensim import models

# LdaMulticore starts one process per worker. Keep the original ceiling of 63
# workers, but never request more than (available cores - 1), which is the
# value gensim documents as its own default. This lets the cell run sensibly
# on machines smaller than the one it was written on.
lda_workers = max(1, min(63, (os.cpu_count() or 2) - 1))

# Train a 20-topic LDA model on the bag-of-words training corpus.
lda = models.ldamulticore.LdaMulticore(
    corpus_train,
    id2word=dictionary,
    num_topics=20,
    chunksize=4000,
    passes=20,
    workers=lda_workers,
    # Seed the model so repeated runs start from the same random
    # initialisation. NOTE(review): multi-process training may still not be
    # bit-for-bit reproducible -- confirm if exact reproducibility matters.
    random_state=42,
)
# Persist the trained model so it can be reloaded without retraining.
lda.save("./lda.model")

In [None]:
%%time
# Tokenize the `text` column of the test split with the same pipeline as the training split.
text_test = corpus2tokens(df_test['text'].tolist())

In [None]:
# Persist the tokenized test documents next to the training ones.
json_save(data=text_test, filename='./lda.corpus.test.json')