<a href="https://colab.research.google.com/github/geersenthil/Topic-Modeling-/blob/main/NMF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LsiModel
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from nltk.tokenize import RegexpTokenizer

from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

#nltk
import nltk 
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords


#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet


# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

#Spacy
import spacy


#Dataset
from sklearn.datasets import fetch_20newsgroups


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
news_group = fetch_20newsgroups()

news_group_data = news_group.data
news_group_target_names = news_group.target_names
news_group_target = news_group.target

In [None]:
news_df = pd.DataFrame({'news': news_group_data})
news_data =  pd.DataFrame({'news':news_df['news'].sample(500)})
print(news_data.head())

Pre-process the data Lemmatization to get base words

In [4]:
def lemmatization(texts, allowed_postags=["NOUN","ADJ","VERB","ADV"]):
  nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
  texts_out = []
  for text in texts:
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    final = " ".join(new_text)
    texts_out.append(final)
  return (texts_out)

In [5]:
lemmatizated_text =  lemmatization(news_data['news'])
print(lemmatizated_text)



In [7]:
stop_words = stopwords.words('english')
print(len(stop_words))
stop_words.extend(['from', 'subject', 're', 'edu', 'use','cdw','would','line','article'])
print(len(stop_words))

179
188


In [8]:
def clean_text(headline):
  word_tokens=word_tokenize(headline)
  tokens=[w for w in word_tokens if w not in stop_words and len(w)>3]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [9]:
news_data['news_cleaned_text']=news_data['news'].apply(clean_text)

In [10]:
news_data.head()

Unnamed: 0,news,news_cleaned_text
9372,From: marka@hcx1.ssd.csd.harris.com (Mark Ashl...,From marka hcx1.ssd.csd.harris.com Mark Ashley...
2856,From: bgardner@bambam.es.com (Blaine Gardner)\...,From bgardner bambam.es.com Blaine Gardner Sub...
4003,From: will@rins.ryukoku.ac.jp (William Reiken)...,From rins.ryukoku.ac.jp William Reiken Subject...
2759,From: wb8foz@skybridge.SCL.CWRU.Edu (David Les...,From wb8foz skybridge.SCL.CWRU.Edu David Leshe...
5469,From: bill@west.msi.com (Bill Poitras)\nSubjec...,From bill west.msi.com Bill Poitras Subject Re...


In [14]:
vectorizer = TfidfVectorizer(stop_words=stop_words,max_features=1000)
X = vectorizer.fit_transform(news_data['news_cleaned_text'])
words = np.array(vectorizer.get_feature_names())

print(X)
print("X = ", words)

  (0, 535)	0.14580223321997265
  (0, 633)	0.12107199416917905
  (0, 281)	0.14580223321997265
  (0, 721)	0.12306100647390808
  (0, 630)	0.1353541946062304
  (0, 110)	0.10975824160706328
  (0, 654)	0.2572703307310058
  (0, 985)	0.13116015361644845
  (0, 428)	0.1241026385652065
  (0, 442)	0.10841892687038386
  (0, 178)	0.14580223321997265
  (0, 919)	0.1286351653655029
  (0, 837)	0.12012095207114727
  (0, 766)	0.16122242016427513
  (0, 162)	0.16854345996603726
  (0, 502)	0.49403747093523676
  (0, 829)	0.13848119341821052
  (0, 800)	0.11187566368452098
  (0, 864)	0.10189489763936202
  (0, 879)	0.14791965529743037
  (0, 808)	0.13249946835312787
  (0, 617)	0.1141392270060646
  (0, 724)	0.23314121980186345
  (0, 547)	0.15018321861897396
  (0, 652)	0.16450757676100877
  :	:
  (499, 962)	0.07141375356197183
  (499, 890)	0.09432144903531477
  (499, 485)	0.11648664789180624
  (499, 486)	0.1274040572445776
  (499, 400)	0.09708547829264216
  (499, 892)	0.11965803203648034
  (499, 893)	0.129045165035

