In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import simplejson

def json_load(filename):
    """Read the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, mode='r', encoding='utf-8') as source:
        parsed = simplejson.load(source)
    return parsed

def json_save(data, filename):
    """Write ``data`` as JSON to ``filename`` (UTF-8, opened in ``'w'`` mode).

    The output uses ``','`` / ``':'`` separators, i.e. no padding whitespace,
    and ``iterable_as_array=True`` is forwarded to ``simplejson.dump``.
    """
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as sink:
        simplejson.dump(
            data,
            sink,
            separators=compact_separators,
            iterable_as_array=True,
        )

In [None]:
# Load the per-document token records that corpus2tokens consumes: each entry
# is read as a mapping with a 'tokens' list of per-token attribute dicts.
# NOTE(review): the file name suggests it was produced by a separate doc2vec
# preprocessing step — confirm where it comes from.
corpus_token_objects = json_load('./doc2vec.corpus_token_objects.json')

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
def corpus2tokens(corpus_token_objects):
    """Reduce per-document token records to lists of normalised content words.

    Parameters
    ----------
    corpus_token_objects : iterable of dict
        One mapping per document. Each must have a ``'tokens'`` entry holding
        a sequence of token mappings with the keys ``'is_stop'``,
        ``'is_punct'``, ``'is_space'``, ``'ent_type_'`` and ``'lower_'``.

    Returns
    -------
    list of list of str
        One list per input document, in input order. Each contains the
        ``'lower_'`` value of every kept token with spaces replaced by
        underscores. A token is dropped when it is a stop word, punctuation,
        whitespace, or tagged with one of the skipped entity labels. A
        document with no surviving tokens yields an empty list.

    Notes
    -----
    Iteration over the documents is wrapped in ``tqdm_notebook`` so a progress
    bar is shown while the corpus is processed.
    """
    # frozenset: membership is tested once per token, so constant-time lookup
    # beats scanning a list, and the collection cannot be mutated by accident.
    ent_labels_to_skip = frozenset((
        "DATE",      # Absolute or relative dates or periods
        "CARDINAL",  # Numerals that do not fall under another type
        "PERCENT",   # Percentage, including "%"
        "TIME",      # Times smaller than a day
        "MONEY",     # Monetary values, including unit
        "ORDINAL",   # "first", "second", etc.
        "QUANTITY",  # Measurements, as of weight or distance
        "PERSON",
        "ORG",
        "GPE",
        "FAC",
    ))
    return_tokens = []
    for doc in tqdm_notebook(corpus_token_objects):
        return_tokens.append([
            # Multi-word tokens become a single underscore-joined term.
            token['lower_'].replace(' ', '_')
            for token in doc['tokens']
            if not (
                token['is_stop']
                or token['is_punct']
                or token['is_space']
                or token['ent_type_'] in ent_labels_to_skip
            )
        ])
    return return_tokens

In [None]:
# %%time
# corpus_words_train = corpus2tokens(df_train.text.to_list())

In [None]:
%%time
# Filter every loaded document down to its kept, normalised tokens; the
# result is one token list per document and feeds the dictionary/BoW cells.
corpus_words_full = corpus2tokens(corpus_token_objects)

In [None]:
# Sanity check: display the filtered tokens of the first document.
corpus_words_full[0]

In [None]:
# json_save(corpus_words_train, './lda.corpus.train.json')

In [None]:
# Persist the filtered token lists so they can be reloaded without repeating
# the tokenisation step.
json_save(corpus_words_full, './lda.corpus.full.json')

In [None]:
# %%time
# from gensim import corpora

# dictionary = corpora.Dictionary(corpus_words_train)
# dictionary.filter_extremes()
# dictionary.save_as_text('./lda.dictionary.txt')

In [None]:
%%time
from gensim import corpora

# Build the vocabulary from the filtered token lists of the full corpus.
dictionary = corpora.Dictionary(corpus_words_full)
# Called without arguments, so the library's default pruning thresholds apply.
# NOTE(review): confirm those defaults suit this corpus size.
dictionary.filter_extremes()
# Saved after pruning, so the text file reflects the pruned vocabulary.
dictionary.save_as_text('./lda.dictionary.txt')

In [None]:
# %%time
# corpus_train = [dictionary.doc2bow(text) for text in corpus_words_train]
# corpora.MmCorpus.serialize('./lda.corpus.txt', corpus_train)

In [None]:
%%time
# Convert each document's token list into a bag-of-words via the pruned
# dictionary; the whole corpus is materialised as an in-memory list.
corpus_full = [dictionary.doc2bow(text) for text in corpus_words_full]
# Also write the bag-of-words corpus to disk with MmCorpus.
# NOTE(review): the file carries a .txt suffix although MmCorpus presumably
# writes Matrix Market format — confirm the extension is intentional.
corpora.MmCorpus.serialize('./lda.corpus.txt', corpus_full)

In [None]:
from gensim import models

In [None]:
# %%time
# from gensim import models

# lda = models.ldamulticore.LdaMulticore(
#     corpus_train,
#     id2word=dictionary, 
#     num_topics=20,
#     chunksize=4000,
#     passes=20,
#     workers=64
# )
# lda.save("./lda.model")

In [None]:
%%time
from gensim import models

# Train a 14-topic LDA model on the full bag-of-words corpus and save it.
# random_state pins the random initialisation so a Restart & Run All starts
# from the same state instead of producing different topics on every run.
# NOTE(review): with several worker processes the order in which chunks are
# merged may still cause small run-to-run differences — confirm if exact
# reproducibility matters.
lda = models.ldamulticore.LdaMulticore(
    corpus_full,
    id2word=dictionary,
    num_topics=14,
    chunksize=4000,
    passes=100,
    workers=3,  # the commented-out training cell used 64
    random_state=42,
)
lda.save("./lda.model")

In [None]:
# %%time
# text_test = corpus2tokens(df_test.text.to_list())

In [None]:
# json_save(text_test, './lda.corpus.test.json')

In [None]:
# df_production = pd.read_csv('../data/BBairline200722.csv')

In [None]:
# df_production.head()

In [None]:
# %%time
# text_production = corpus2tokens(df_production["5"].to_list())

In [None]:
# json_save(text_production, './lda.corpus.production.json')