In [2]:
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Configuration: everything a reader might want to tune lives here.
input_path = './data/en/en_output_categorized.txt'
output_path = './data/en/en_output-tmp.txt'
n_topics = 30  # set between 20-40
# A word whose strongest topic weight is below this value is labelled UNKNOWN
# instead of being forced into a topic.
unknown_threshold = 0.1

# Read file and extract words.
# Only lines that split into exactly three whitespace-separated fields are
# kept; the third field is read but not used anywhere in this cell.
lines = []
words = []
numbers = []
skipped_line_count = 0  # non-blank lines that did not have exactly 3 fields
with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no data; ignore them without counting.
            continue
        parts = re.split(r'\s+', stripped)
        if len(parts) != 3:
            # Count the drop so it can be reported instead of vanishing silently.
            skipped_line_count += 1
            continue
        word, number, _ = parts
        lines.append(stripped)
        words.append(word)
        numbers.append(number)

if skipped_line_count:
    print(f"Skipped {skipped_line_count} malformed line(s) in {input_path} (expected exactly 3 fields)")

# Fail early with a message that names the input file, rather than letting
# the vectorizer fail later on an empty word list.
if not words:
    raise ValueError(f"No usable 3-field lines found in {input_path}; nothing to model")

# Topic modeling on words: every word is treated as its own document and is
# described by the counts of its character 2- to 4-grams.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 4))
X = vectorizer.fit_transform(words)
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)
topics = lda.transform(X)  # one row of topic weights per word

# Assign topic or UNKNOWN: pick the strongest topic of each row unless even
# that topic is weaker than unknown_threshold.
topic_labels = [f"topic_{i}" for i in range(n_topics)]
assigned_topics = [
    "UNKNOWN" if topic_dist.max() < unknown_threshold
    else topic_labels[topic_dist.argmax()]
    for topic_dist in topics
]

# Write output: one "word number topic" line per kept input line, in input order.
with open(output_path, 'w', encoding='utf-8') as f:
    for word, number, topic in zip(words, numbers, assigned_topics):
        f.write(f"{word} {number} {topic}\n")

In [3]:
# Print the highest-weighted character n-grams of every fitted topic.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the first n_top_words indices.
    strongest = weights.argsort()[::-1][:n_top_words]
    summary = ' '.join(feature_names[j] for j in strongest)
    print(f"Topic {topic_idx}: {summary}")

Topic 0: er nc ce an ra nce anc se ar rc
Topic 1: in ti ng ing tin io ri ting pi rin
Topic 2: oo om oc co po lo ok ot op ock
Topic 3: ch he ro che ha es ow hes ches th
Topic 4: se os us es ss ses pr as use ou
Topic 5: ac ra ap es ck ack ss st as ess
Topic 6: sh el ho els ls en sho ow ab we
Topic 7: ar rd or rt art ear ard er ari ars
Topic 8: la li ol ll an el ar ir ba ha
Topic 9: ca um le ug an su mp ap ru ump
Topic 10: ta st re sp es ec sta tar ar res
Topic 11: ea ck ke nd an ds ic ker er and
Topic 12: ur to or tor ou rs our ct ec ors
Topic 13: sh he el she es ce be ic ush lt
Topic 14: in ng ing lin li ling di ki din kin
Topic 15: er te ter ers rs pe nt per un ters
Topic 16: am im ra on me mp er tra ti tr
Topic 17: ts et ke lo ets ak bl co ake le
Topic 18: ge er ne es st te ag age ner av
Topic 19: ff in fi fin uf uff of ass ui as
Topic 20: ie es ies tt er te tte ter si it
Topic 21: ll il le ns all ill ul lle ler it
Topic 22: de er ch der us st chi nd rs ers
Topic 23: re ee es tu ur re