<a href="https://colab.research.google.com/github/geersenthil/Topic-Modeling-/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [9]:
import pandas as pd
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer

from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

#nltk
import nltk 
nltk.download("stopwords")
from nltk.corpus import stopwords


#Spacy
import spacy


#Dataset
from sklearn.datasets import fetch_20newsgroups


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Load Data into Dataframe

In [10]:
news_group = fetch_20newsgroups()

news_group_data = news_group.data
news_group_target_names = news_group.target_names
news_group_target = news_group.target

In [11]:
news_df = pd.DataFrame({'news': news_group_data})
news_data = news_df['news'].sample(500)

Pre-process the data
Lemmatization to get base words

In [13]:
def lemmatization(texts, allowed_postags=["NOUN","ADJ","VERB","ADV"]):
  nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
  texts_out = []
  for text in texts:
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    texts_out.append(final)
  return (texts_out)


In [14]:
lemmatizated_text =  lemmatization(news_data)
print(lemmatizated_text)




In [15]:
stop_words = stopwords.words('english')
print(len(stop_words))
stop_words.extend(['from', 'subject', 're', 'edu', 'use','cdw','would','line','article'])
print(len(stop_words))

179
188


In [17]:
#tokenize
def gen_words(texts):
  final = []

  for text in texts:
    new = gensim.utils.simple_preprocess(text, deacc=True)
    for w in new:
      if w in stop_words:
        new.remove(w)
    final.append(new)
  return (final)

data_word = gen_words(lemmatizated_text)

In [18]:
print(data_word)



Prepare Corpus

In [19]:
def prepare_corpus(docs):
  #term dictionary
  dictionary = corpora.Dictionary(docs)
  # convert list into document matrix
  doc_term_matrix = [dictionary.doc2bow(i) for i in docs]
  return dictionary, doc_term_matrix
  

In [20]:
def create_gensim_lsa_model(doc,num_tops,words):
  dictionary, doc_term_matrix=prepare_corpus(doc)
  #create LSA model
  lsamodel =LsiModel(doc_term_matrix, num_topics= num_tops, id2word=dictionary)
  print(lsamodel.print_topics(num_topics=num_tops,num_words=words))
  return lsamodel

In [21]:
from gensim.models import coherencemodel
def compute_coherence_values(dictionary,doc_term_matrix,doc, stop, start= 2, step=3):
  coherence_values = []
  model_list = []
  for num_topics in range(start, stop, step):
    #generate LSA model
    model = LsiModel(doc_term_matrix, num_topics= num_topics, id2word=dictionary)
    model_list.append(model)
    coherencemodel= CoherenceModel(model=model, texts=doc,dictionary=dictionary,coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [22]:
def plot_graph(doc,start,stop,step):
  dictionary, doc_term_matrix=prepare_corpus(doc)
  model_list, coherence_values = compute_coherence_values(dictionary,doc_term_matrix,doc,stop, start,step)
  x=range(start,stop,step)
  plt.plot(x,coherence_values)
  plt.xlabel("Number of Topics")
  plt.ylabel("Coherence score")
  plt.legend(("coherence_values"),loc="best")
  plt.show()

In [24]:
# LSA Model
number_of_topics=7
words=10
clean_text=prepare_corpus(data_word)
model=create_gensim_lsa_model(clean_text,number_of_topics,words)
start,stop,step=2,12,1
plot_graph(clean_text,start,stop,step)

TypeError: ignored