<a href="https://colab.research.google.com/github/geersenthil/Topic-Modeling-/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [1]:
import pandas as pd
import numpy as np

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from nltk.tokenize import RegexpTokenizer

from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

#nltk
import nltk 
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords


#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet


# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

#Spacy
import spacy


#Dataset
from sklearn.datasets import fetch_20newsgroups


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Load Data into Dataframe

In [2]:
news_group = fetch_20newsgroups()

news_group_data = news_group.data
news_group_target_names = news_group.target_names
news_group_target = news_group.target

In [3]:
news_df = pd.DataFrame({'news': news_group_data})
news_data =  pd.DataFrame({'news':news_df['news'].sample(500)})
print(news_data.head())

                                                   news
4472  From: fang@en.ecn.purdue.edu (Wen-Cheng Fang)\...
9495  From: wrat@unisql.UUCP (wharfie)\nSubject: Re:...
3295  From: yamauchi@ces.cwru.edu (Brian Yamauchi)\n...
2488  From: bryan@philips.oz.au (Bryan Ryan)\nOrgani...
6209  From: lochem@fys.ruu.nl (Gert-Jan van Lochem)\...


:Pre-process the data
Lemmatization to get base words

In [None]:
def lemmatization(texts, allowed_postags=["NOUN","ADJ","VERB","ADV"]):
  nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
  texts_out = []
  for text in texts:
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    texts_out.append(final)
  return (texts_out)


In [None]:
lemmatizated_text =  lemmatization(news_data['news'])
print(lemmatizated_text)




In [None]:
stop_words = stopwords.words('english')
print(len(stop_words))
stop_words.extend(['from', 'subject', 're', 'edu', 'use','cdw','would','line','article'])
print(len(stop_words))

179
188


In [None]:
def clean_text(headline):
  word_tokens=word_tokenize(headline)
  tokens=[w for w in word_tokens if w not in stop_words and len(w)>3]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [None]:
news_data['news_cleaned_text']=news_data['news'].apply(clean_text)

In [None]:
news_data.head()

Unnamed: 0,news,news_cleaned_text
9564,From: colling@ann-arbor.applicon.slb.com (Mich...,From colling ann-arbor.applicon.slb.com Michae...
6642,From: wtm@uhura.neoucom.edu (Bill Mayhew)\nSub...,From uhura.neoucom.edu Bill Mayhew Subject dis...
10340,From: lady@uhunix.uhcc.Hawaii.Edu (Lee Lady)\n...,From lady uhunix.uhcc.Hawaii.Edu Lady Subject ...
8461,From: lehors@koala.inria.fr (Arnaud Le_Hors)\n...,From lehors koala.inria.fr Arnaud Le_Hors Subj...
1895,From: isaaci@ccsg.tau.ac.il (barash isaac)\nSu...,From isaaci ccsg.tau.ac.il barash isaac Subjec...


In [None]:
vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)

In [None]:
vect_text=vect.fit_transform(news_data['news_cleaned_text'])

In [None]:
print(vect_text)

  (0, 542)	0.128820375810534
  (0, 641)	0.1341849796070203
  (0, 493)	0.10796410902951245
  (0, 133)	0.23130964018946212
  (0, 410)	0.11565482009473106
  (0, 481)	0.1223733015001131
  (0, 916)	0.11420942110498822
  (0, 381)	0.10910187091628049
  (0, 426)	0.10112214763907643
  (0, 494)	0.08259198812678717
  (0, 391)	0.0923746440651971
  (0, 402)	0.26273667607824425
  (0, 71)	0.128820375810534
  (0, 678)	0.0923746440651971
  (0, 294)	0.1188035574765831
  (0, 735)	0.12435445728297499
  (0, 662)	0.11283817852683287
  (0, 573)	0.11718285707681626
  (0, 502)	0.09639219368880356
  (0, 352)	0.12435445728297499
  (0, 443)	0.10582429777068574
  (0, 564)	0.22567635705366573
  (0, 976)	0.14502442805409096
  (0, 528)	0.11718285707681626
  (0, 837)	0.5636138440496093
  :	:
  (499, 921)	0.05273797120142099
  (499, 199)	0.053717003942135445
  (499, 862)	0.07871835150637775
  (499, 171)	0.04977557865198913
  (499, 676)	0.04612259830646149
  (499, 545)	0.040470646729726556
  (499, 207)	0.046122598306461

In [None]:
idf=vect.idf_

In [None]:
dd=dict(zip(vect.get_feature_names(), idf))



In [None]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=10, random_state=42)

lsa_top=lsa_model.fit_transform(vect_text)

In [None]:
l=lsa_top[0]
print("Document 0 :")
for i,topic in enumerate(l):
  print("Topic ",i," : ",topic*100)

Document 0 :
Topic  0  :  11.884855087564834
Topic  1  :  -0.5253615841945045
Topic  2  :  0.9751396924832546
Topic  3  :  -3.9240401342862294
Topic  4  :  -2.456850730774974
Topic  5  :  -2.2156586949451405
Topic  6  :  -9.226233183201517
Topic  7  :  -3.676754195348955
Topic  8  :  -0.6070734021671248
Topic  9  :  -6.778893720618219


In [None]:
vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")
         

Topic 0: 
com writes lines organization like posting host people nntp university 

Topic 1: 
windows card host window video file nntp thanks problem posting 

Topic 2: 
com netcom inc sgi hp power att host nntp posting 

Topic 3: 
windows window problem people file ms screen running com data 

Topic 4: 
nasa gov uiuc larc cso information data clipper internet makes 

Topic 5: 
cs pitt windows computer pittsburgh science uiuc uucp com system 

Topic 6: 
israel jews israeli nasa state arab org ohio adam drive 

Topic 7: 
uk co window ac mit chip mil clipper encryption cs 

Topic 8: 
windows ca uiuc israel jews cso sgi cs writes utexas 

Topic 9: 
uiuc cso polygon problem apple illinois news video anyone monitor 



