In [None]:
import sentencepiece as spm
import pandas as pd

from gensim.models import ldamodel
from gensim.corpora import Dictionary

In [None]:
# Analysis parameters, gathered here so they can be tuned in one place.
input_file = 'issues.txt'  # used below both as tokenizer training text and as JSON Lines input
model_prefix = 'm1'        # stem of the tokenizer model file; f'{model_prefix}.model' is loaded below
vocab_size = 1_000         # vocabulary size handed to the SentencePiece trainer
num_topics = 5             # number of topics handed to the LDA model

In [None]:
# Train a BPE subword tokenizer; the '.model' file named after `model_prefix`
# is loaded right below.
# NOTE(review): the training text is the raw input file, which is later parsed
# as JSON Lines. Presumably the learned vocabulary therefore also reflects JSON
# keys and punctuation, not only issue subjects; confirm this is intended.
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type='bpe',
)

# Processor used further down to split issue subjects into subword pieces.
s = spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')

In [None]:
# Parse the input file as JSON Lines (one JSON object per line) into a DataFrame.
df = pd.read_json(input_file, lines=True)

In [None]:
# Keep only the rows whose tracker name is one of the two listed below
# (the Japanese labels for "feature" and "bug").
# 'tracker.name' is a single column whose name contains a literal dot.
df2 = df.loc[df['tracker.name'].isin(['機能', 'バグ'])]

In [None]:
# Tokenize every issue subject into a list of subword pieces (out_type=str
# yields the pieces as strings rather than ids).
# The column is iterated directly instead of via iterrows(): only 'subject'
# is needed, and iterrows() builds a full Series per row. Missing subjects
# are dropped because a NaN cannot be passed to the tokenizer.
dt = [s.encode(subject, out_type=str) for subject in df2['subject'].dropna()]

In [None]:
def convert(s):
    """Return `s` with every '▁' character removed.

    '▁' is the marker stripped from the tokenizer's pieces before filtering.
    """
    # Splitting on the marker and re-joining with nothing deletes each occurrence.
    return ''.join(s.split('▁'))

def check(s):
    """Return True when token `s` should be kept.

    A token is kept only if it is longer than one character and does not
    consist solely of digits.
    """
    # Empty and single-character tokens are rejected outright.
    if len(s) <= 1:
        return False
    return not str.isdigit(s)

# For each tokenized document: strip the marker from every piece with
# convert(), then keep only the pieces accepted by check().
dt2 = [
    [piece for piece in map(convert, ts) if check(piece)]
    for ts in dt
]

In [None]:
# Build the token <-> id mapping from the filtered documents, then turn each
# document into its bag-of-words representation.
dic = Dictionary(dt2)
corpus = list(map(dic.doc2bow, dt2))

# Fit the topic model; random_state is fixed so repeated runs are comparable.
lda = ldamodel.LdaModel(
    corpus=corpus,
    id2word=dic,
    num_topics=num_topics,
    random_state=1,
)

In [None]:
# One comma-separated line per topic, listing the terms returned by
# get_topic_terms() after mapping their ids back to tokens via the dictionary.
rs = [
    ', '.join(dic[term_id] for term_id, _weight in lda.get_topic_terms(topic_id))
    for topic_id in range(lda.num_topics)
]

for r in rs:
    print(r)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensim_models

In [None]:
# Build the pyLDAvis payload from the trained model, its corpus and dictionary.
# mds='mmds' is forwarded to pyLDAvis to select how topics are laid out.
data = gensim_models.prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dic,
    mds='mmds',
)
# Kept as the cell's last expression so the visualization renders inline.
pyLDAvis.display(data)

In [None]:
# Write the prepared visualization to 'result1.html'.
pyLDAvis.save_html(data, 'result1.html')