<a href="https://colab.research.google.com/github/geersenthil/Topic-Modeling-/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [6]:

import pandas as pd
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt


#Dataset
from sklearn.datasets import fetch_20newsgroups


Pre-process the data

In [7]:
def preprocess_data(doc):
  """Turn raw documents into lists of stemmed, stop-word-free tokens.

  Each document is lower-cased, split into ``\\w+`` tokens, stripped of
  NLTK English stop words and finally reduced to Porter stems.

  Args:
    doc: iterable of strings, one entry per document.

  Returns:
    A list with one list of stemmed tokens per input document, in the
    same order as ``doc``.
  """
  # Build the helpers once; they are reused for every document.
  word_tokenizer = RegexpTokenizer(r'\w+')
  english_stopwords = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  def _clean(raw_text):
    # Lower-case before tokenizing so stop-word matching is case-insensitive.
    words = word_tokenizer.tokenize(raw_text.lower())
    kept_words = [word for word in words if word not in english_stopwords]
    return [stemmer.stem(word) for word in kept_words]

  return [_clean(raw_text) for raw_text in doc]


Load Data into a DataFrame

In [8]:
# Fetch the 20 Newsgroups dataset using scikit-learn's default arguments.
news_group = fetch_20newsgroups()

# Unpack the attributes of the returned object into module-level names.
news_group_data = news_group.data  # the documents themselves
news_group_target_names = news_group.target_names  # presumably the category names -- confirm against sklearn docs
news_group_target = news_group.target  # presumably one category value per document -- confirm against sklearn docs

In [9]:
# Wrap the documents in a DataFrame with a single column named 'news'.
news_df = pd.DataFrame({'news': news_group_data})
# Draw 100 entries from the 'news' column. NOTE: no random_state is passed,
# so the selected subset is not pinned to a seed here.
news_data = news_df['news'].sample(100)

Prepare Corpus

In [10]:
def prepare_corpus(docs):
  """Build the gensim dictionary and bag-of-words corpus for ``docs``.

  Args:
    docs: tokenized documents (each document is a list of tokens).

  Returns:
    A ``(dictionary, doc_term_matrix)`` tuple: the ``corpora.Dictionary``
    built from ``docs`` and a list holding the ``doc2bow`` result of every
    document, in input order.
  """
  # Term dictionary built from the tokenized documents.
  term_dictionary = corpora.Dictionary(docs)

  # One doc2bow entry per document.
  bow_corpus = []
  for tokens in docs:
    bow_corpus.append(term_dictionary.doc2bow(tokens))

  return term_dictionary, bow_corpus
  

In [12]:
def create_gensim_lsa_model(doc, num_tops, words):
  """Fit an LSI (LSA) model on ``doc`` and print its topics.

  Args:
    doc: tokenized documents, passed straight to ``prepare_corpus``.
    num_tops: number of topics to fit and to print.
    words: number of words to show per printed topic.

  Returns:
    The fitted ``LsiModel``.
  """
  term_dictionary, bow_corpus = prepare_corpus(doc)

  # Fit the LSA model on the bag-of-words corpus.
  lsa = LsiModel(bow_corpus, num_topics=num_tops, id2word=term_dictionary)

  topic_summary = lsa.print_topics(num_topics=num_tops, num_words=words)
  print(topic_summary)
  return lsa

In [14]:
from gensim.models import coherencemodel
def compute_coherence_values(dictionary, doc_term_matrix, doc, stop, start=2, step=3):
  """Fit one LSI model per candidate topic count and score its coherence.

  Topic counts are taken from ``range(start, stop, step)``. For each count
  an ``LsiModel`` is trained on ``doc_term_matrix`` and evaluated with a
  ``CoherenceModel`` using the ``'c_v'`` measure.

  Args:
    dictionary: gensim dictionary used as ``id2word`` and for coherence.
    doc_term_matrix: bag-of-words corpus the models are trained on.
    doc: tokenized texts handed to ``CoherenceModel`` as ``texts``.
    stop: exclusive upper bound of the topic-count range.
    start: first topic count to try.
    step: increment between consecutive topic counts.

  Returns:
    A ``(model_list, coherence_values)`` tuple of two parallel lists: the
    trained models and their coherence scores, one entry per topic count.
    Both lists are empty when the range yields no topic counts.
  """
  coherence_values = []
  model_list = []
  for num_topics in range(start, stop, step):
    # Generate the LSA model for this topic count.
    model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
    model_list.append(model)
    # Named `coherence_scorer` so it does not reuse the name of the
    # `coherencemodel` module imported at file level.
    coherence_scorer = CoherenceModel(
      model=model, texts=doc, dictionary=dictionary, coherence='c_v'
    )
    coherence_values.append(coherence_scorer.get_coherence())
  # Return only after the loop: returning from inside it stopped after the
  # first topic count and yielded None when the range was empty.
  return model_list, coherence_values