In [None]:
import logging
# Route INFO-level (and higher) records to the root logger with a timestamped
# "<time> : <level> : <message>" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
import pandas as pd

In [13]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact, UTF-8 encoded JSON.

    Args:
        data: Object to serialize; passed to ``simplejson.dump`` with
            ``iterable_as_array=True``.
        filename: Path of the file to create or overwrite.
    """
    # No whitespace after item/key separators keeps the output compact.
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as out_file:
        simplejson.dump(
            data,
            out_file,
            separators=compact_separators,
            iterable_as_array=True,
        )

In [None]:
# Load the training and test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [None]:
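# Dev shortcut: uncomment the two lines below to run the rest of the notebook
# on a reproducible 50-row sample of each split.
# df_train = df_train.sample(50, random_state=42)
# df_test = df_test.sample(50, random_state=42)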

In [None]:
# Sanity check: show the shape of the train frame, then the test frame.
tuple(frame.shape for frame in (df_train, df_test))

In [10]:
from tqdm.notebook import tqdm_notebook

In [1]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def corpus2tokens(raw_corpus):
    """Tokenize every document in ``raw_corpus`` and drop stop words.

    Each document is passed through gensim's ``simple_preprocess``; tokens
    found in gensim's ``STOPWORDS`` are discarded. Progress is reported
    through a ``tqdm_notebook`` bar.

    Args:
        raw_corpus: Iterable of raw documents (callers in this notebook pass
            a list taken from a DataFrame column).

    Returns:
        A list holding one list of tokens per input document, in input order.
    """
    return [
        [
            token
            for token in simple_preprocess(doc)
            if token not in STOPWORDS
        ]
        for doc in tqdm_notebook(raw_corpus)
    ]

In [None]:
%%time
# Tokenize the training documents (the `text` column of df_train).
corpus_words_train = corpus2tokens(df_train['text'].tolist())

In [None]:
# Persist the tokenized training corpus so it can be reloaded without re-tokenizing.
json_save(data=corpus_words_train, filename='./lda.corpus.train.json')

In [None]:
%%time
from gensim import corpora

# Build the token <-> id mapping from the tokenized training documents.
dictionary = corpora.Dictionary(documents=corpus_words_train)

# Prune the vocabulary; filter_extremes() is called with the library defaults.
dictionary.filter_extremes()

# Keep a plain-text copy of the pruned dictionary next to the notebook.
dictionary.save_as_text('./lda.dictionary.txt')

In [None]:
%%time
# Run every tokenized training document through dictionary.doc2bow, then
# persist the resulting corpus with MmCorpus.serialize.
corpus_train = list(map(dictionary.doc2bow, corpus_words_train))
corpora.MmCorpus.serialize('./lda.corpus.txt', corpus_train)

In [None]:
from gensim import models

In [None]:
%%time
from gensim import models

import os

# Train a 20-topic LDA model on the bag-of-words training corpus.
#
# * workers: never request more worker processes than the machine has spare
#   cores (cpu_count - 1, at least 1); 64 remains the upper bound so hosts
#   with more than 64 cores behave as before.
# * random_state: seeds the model's random number generator so repeated runs
#   start from the same state (same seed as the dev sampling cell, 42).
#   NOTE(review): with several workers the result may still vary slightly
#   between runs -- confirm if exact reproducibility is required.
lda = models.ldamulticore.LdaMulticore(
    corpus_train,
    id2word=dictionary,
    num_topics=20,
    chunksize=4000,
    passes=20,
    workers=min(64, max(1, (os.cpu_count() or 2) - 1)),
    random_state=42,
)

# Persist the trained model next to the notebook.
lda.save("./lda.model")

In [None]:
%%time
# Tokenize the test documents (the `text` column of df_test).
text_test = corpus2tokens(df_test['text'].tolist())

In [None]:
# Persist the tokenized test corpus.
json_save(data=text_test, filename='./lda.corpus.test.json')

In [4]:
# Load the production articles from the shared data directory.
df_production = pd.read_csv(filepath_or_buffer='../data/BBairline200722.csv')

In [6]:
# Preview the first five rows of the production frame.
df_production.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,https://www.bloomberg.com/news/articles/2021-0...,2021-07-19T09:08:37.527Z,Anger at Heathrow as Johnson’s French U-Turn A...,"['Laura Wright', 'Christopher Jasper']",Politics,London’s Heathrow airport was thronged with tr...
1,https://www.bloomberg.com/news/articles/2021-0...,2021-04-24T22:54:17.513Z,World Pledges Aid for India as Cases Surge: Vi...,[],prognosis,Healthcare workers administer doses of the Joh...
2,https://www.bloomberg.com/news/articles/2021-0...,2021-07-09T04:00:00.010Z,Want to End Flying Shame? Meet Sustainable Jet...,['Jack Wittels'],QuickTake,Workers fill an Airbus A350 passenger plane wi...
3,https://www.bloomberg.com/news/articles/2021-0...,2021-07-14T21:26:00.339Z,Missouri County Sounds Alarm; Tokyo Cases Surg...,[],prognosis,Health officials in southwestern Missouri aske...
4,https://www.bloomberg.com/news/articles/2021-0...,2021-06-02T10:22:31.347Z,Belarus Accused of Letting Illegal Migrants Cr...,['Milda Seputyte'],Politics,Alexander Lukashenko on May 28. Lithuania accu...


In [11]:
%%time
# Tokenize the production documents; column "5" is the one fed to the tokenizer.
text_production = corpus2tokens(df_production["5"].tolist())

HBox(children=(FloatProgress(value=0.0, max=490.0), HTML(value='')))


CPU times: user 452 ms, sys: 35.4 ms, total: 487 ms
Wall time: 479 ms


In [14]:
# Persist the tokenized production corpus.
json_save(data=text_production, filename='./lda.corpus.production.json')