<a href="https://colab.research.google.com/github/cur10usityDrives/Latent-Dirichlet-Allocation/blob/main/nlp_with_lda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Gensim if you haven't already
# !pip install gensim

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download
import string

# Sample documents: a tiny toy corpus, one string per document.
documents = [
    "Topic modeling is an unsupervised machine learning technique used to discover hidden topics in a collection of documents.",
    "Latent Dirichlet Allocation (LDA) is a popular topic modeling algorithm.",
    "Gensim is a Python library for topic modeling.",
    "Natural Language Processing (NLP) is a field of study focused on making sense of human language using computers.",
    "Topic modeling can be used for clustering similar documents or for text summarization."
]

# Preprocessing resources.
# Fetch the NLTK data used below: the stop-word list, the tokenizer models
# used by word_tokenize, and WordNet for the lemmatizer.
download('stopwords')
download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize working on old and new versions.
download('punkt_tab')
download('wordnet')

# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    ``stop_words`` and tokens made up solely of ASCII punctuation are dropped;
    every remaining token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw document string.

    Returns:
        A list of token strings, in their original order.
    """
    # string.punctuation is a str, so `token not in string.punctuation` is a
    # substring test: it lets multi-character tokens such as the `` and ''
    # quote tokens (or "..." and "--") through. Checking character by
    # character against a set removes every punctuation-only token.
    punctuation_chars = set(string.punctuation)

    tokens = word_tokenize(text.lower())
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]
    return [lemmatizer.lemmatize(token) for token in tokens]

# Tokenize / clean every sample document.
processed_docs = [preprocess_text(doc) for doc in documents]

# Create dictionary and corpus
# The dictionary maps each distinct token to an integer id; doc2bow turns a
# token list into (token_id, count) pairs.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Build LDA model
# LDA training is randomly initialised; fixing random_state makes the topics
# and the coherence score reproducible across Restart & Run All.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    random_state=42,
)

# Print the topics
for topic_id, topic in lda_model.print_topics():
    print(f"Topic {topic_id}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Topic 0: 0.116*"topic" + 0.093*"modeling" + 0.046*"document" + 0.046*"used" + 0.032*"hidden" + 0.032*"machine" + 0.032*"learning" + 0.032*"collection" + 0.032*"discover" + 0.032*"technique"
Topic 1: 0.070*"language" + 0.042*"using" + 0.042*"focused" + 0.042*"nlp" + 0.042*"human" + 0.042*"computer" + 0.042*"making" + 0.042*"study" + 0.042*"sense" + 0.042*"natural"
Coherence Score: 0.26095153549246014


In [None]:
# Install Gensim if you haven't already
# !pip install gensim

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download
import string

# Sample documents: a tiny toy corpus, one string per document.
documents = [
    "Artificial intelligence and robotics are leading the next wave of digital transformation.",
    "Global markets are increasingly volatile, affecting international trade and investment strategies.",
    "Advancements in biotechnology are making personalized medicine more accessible than ever.",
    "Climate change is the defining issue of our time, affecting global weather patterns and ecosystems.",
    "Quantum computing could revolutionize data processing by significantly speeding up problem-solving capabilities."
]

# Preprocessing resources.
# Fetch the NLTK data used below: the stop-word list, the tokenizer models
# used by word_tokenize, and WordNet for the lemmatizer.
download('stopwords')
download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize working on old and new versions.
download('punkt_tab')
download('wordnet')

# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    ``stop_words`` and tokens made up solely of ASCII punctuation are dropped;
    every remaining token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw document string.

    Returns:
        A list of token strings, in their original order.
    """
    # string.punctuation is a str, so `token not in string.punctuation` is a
    # substring test: it lets multi-character tokens such as the `` and ''
    # quote tokens (or "..." and "--") through. Checking character by
    # character against a set removes every punctuation-only token.
    punctuation_chars = set(string.punctuation)

    tokens = word_tokenize(text.lower())
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]
    return [lemmatizer.lemmatize(token) for token in tokens]

# Tokenize / clean every sample document.
processed_docs = [preprocess_text(doc) for doc in documents]

# Create dictionary and corpus
# The dictionary maps each distinct token to an integer id; doc2bow turns a
# token list into (token_id, count) pairs.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Build LDA model
# LDA training is randomly initialised; fixing random_state makes the topics
# and the coherence score reproducible across Restart & Run All.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    random_state=42,
)

# Print the topics
for topic_id, topic in lda_model.print_topics():
    print(f"Topic {topic_id}: {topic}")

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Topic 0: 0.062*"affecting" + 0.062*"global" + 0.037*"defining" + 0.037*"issue" + 0.037*"change" + 0.037*"pattern" + 0.037*"investment" + 0.037*"time" + 0.037*"international" + 0.037*"trade"
Topic 1: 0.033*"computing" + 0.033*"data" + 0.033*"processing" + 0.033*"capability" + 0.033*"could" + 0.033*"revolutionize" + 0.033*"speeding" + 0.033*"quantum" + 0.033*"significantly" + 0.033*"problem-solving"
Coherence Score: 0.2983492322315788


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
