Using gensim

In [None]:
import gensim
from gensim import corpora

# Sample documents
documents = [
    "artificial intelligence machine learning data",
    "deep learning neural network data"
]

# Preprocess documents: plain whitespace tokenisation (no lowercasing or
# stop-word removal is applied here).
texts = [doc.split() for doc in documents]

# Create a dictionary (token <-> integer id) and a bag-of-words corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA model with 2 topics.
# random_state is pinned so that re-running the cell on a fresh kernel yields
# the same topics; without it the model is seeded differently on every run.
lda = gensim.models.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=0,
)

# Print the topics as (topic_id, "weight*word + ...") tuples, 4 words each
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)


(0, '0.166*"learning" + 0.166*"data" + 0.166*"intelligence" + 0.166*"artificial"')
(1, '0.167*"data" + 0.167*"learning" + 0.166*"neural" + 0.166*"network"')


Using scikit-learn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Toy corpus: five short sentences used as the input documents.
documents = [
    "artificial intelligence machine learning data",
    "deep learning neural network data",
    "neural networks are part of deep learning",
    "machine learning is a subset of artificial intelligence",
    "data science and artificial intelligence are related fields"
]

# Step 1: Bag-of-Words counts, using the vectorizer's 'english' stop-word setting.
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(documents)

# Step 2: Build the 2-topic LDA estimator and fit it in one expression;
# fit() hands back the estimator itself, so `lda` is the fitted model.
lda = LatentDirichletAllocation(random_state=0, n_components=2).fit(doc_term_matrix)

# Step 3: Display the topics and the words associated with them
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()`` (one row per topic).
        feature_names: Indexable collection mapping a column index to its term.
        no_top_words: How many of the highest-weighted terms to show per topic.

    Returns:
        None. Two lines are printed per topic: a ``Topic <idx>:`` header and
        the selected terms joined by single spaces, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort() is ascending, so flip it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in strongest_first]
        print(" ".join(top_terms))

# Look up the vocabulary learned by the vectorizer, then print the
# five highest-weighted terms of each fitted topic.
feature_names = vectorizer.get_feature_names_out()
no_top_words = 5
display_topics(model=lda, feature_names=feature_names, no_top_words=no_top_words)


Topic 0:
learning deep neural machine data
Topic 1:
intelligence artificial data science related
