In [None]:
# pip install gensim nltk

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords
import string

# Fetch the NLTK data used below. NLTK 3.9+ loads the Punkt tokenizer from the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested to keep word_tokenize working on older and newer NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalize a raw document into a list of lemmatized tokens.

    The steps run in this order: strip ASCII punctuation, lowercase, drop
    stop words with gensim's ``remove_stopwords``, tokenize with NLTK's
    ``word_tokenize``, then lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: The raw document as a string.

    Returns:
        A list of lemmatized token strings.
    """
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Remove stopwords. Only gensim's filter is applied; the NLTK stop-word
    # set that used to be built here was never consulted, so it is gone.
    text = remove_stopwords(text)

    # Tokenize and lemmatize words
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def topic_modeling(documents, num_topics=5, passes=20, random_state=None):
    """Train an LDA topic model on a collection of raw text documents.

    Each document is run through ``preprocess_text``, the resulting tokens
    are mapped to a gensim ``Dictionary``, converted to bag-of-words vectors,
    and used to fit an ``LdaModel``.

    Args:
        documents: Iterable of raw document strings.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Optional seed (or random state object) forwarded to
            ``LdaModel``. Leave as ``None`` for the previous unseeded
            behaviour; pass an int to make runs reproducible.

    Returns:
        The trained ``LdaModel``; its ``id2word`` attribute holds the
        dictionary built here.
    """
    # Preprocess the documents
    processed_docs = [preprocess_text(doc) for doc in documents]

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(processed_docs)

    # Convert the documents to a bag-of-words representation
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Train the LDA model
    lda_model = LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    return lda_model

# Small sample corpus used to demonstrate the pipeline end to end.
documents = [
    "Natural language processing is a subfield of artificial intelligence that focuses on the interaction between computers and humans through natural language.",
    "Machine learning is a subset of artificial intelligence that provides systems the ability to learn and improve from experience without being explicitly programmed.",
    "Topic modeling is a useful technique to discover hidden themes or topics in a collection of text documents.",
    "Python is a popular programming language for data analysis and natural language processing tasks.",
    "The internet has revolutionized the way we access information and communicate with others.",
]

# Fit the model with the demo settings.
num_topics = 3
passes = 50
lda_model = topic_modeling(documents, num_topics=num_topics, passes=passes)

# Show every topic along with its highest-weighted words.
print(f"{num_topics} Topics:")
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(f"Topic {topic_id}: {topic_terms}")

# Report, per document, the topic with the largest probability.
print("\nDocument Topics:")
for doc_number, raw_doc in enumerate(documents, start=1):
    doc_tokens = preprocess_text(raw_doc)
    doc_bow = lda_model.id2word.doc2bow(doc_tokens)
    topic_probs = lda_model.get_document_topics(doc_bow)
    # Each entry is a (topic_id, probability) pair; keep the id of the best.
    best_topic, _ = max(topic_probs, key=lambda pair: pair[1])
    print(f"Document {doc_number} -> Dominant Topic: {best_topic}")
