In [10]:
import pandas as pd, numpy as np
import re, nltk
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from pprint import pprint
# Seed NumPy's global random generator so runs start from a known state.
np.random.seed(2019)

# Download the NLTK 'wordnet' package (a no-op when it is already up to date);
# presumably required by the WordNetLemmatizer imported earlier in the notebook.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Dhaval
[nltk_data]     Simaria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given).
# NOTE(review): this also hides deprecation warnings and pandas copy warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [5]:
# Load the ABC news headlines dataset.
# NOTE: this is an absolute local path — point CSV_PATH at your own copy.
CSV_PATH = 'D:\\APP\\NITW\\Natural Language Processing\\Assignment-14\\abcnews-date-text.csv'

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. Prefer its
# replacement `on_bad_lines` ('warn' = report and skip malformed rows, which is
# what error_bad_lines=False did) and fall back only on older pandas.
try:
    data = pd.read_csv(CSV_PATH, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`.
    data = pd.read_csv(CSV_PATH, error_bad_lines=False)

# Take an explicit copy: adding a column to a bare column selection of `data`
# triggers pandas' SettingWithCopyWarning.
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

In [10]:
# Inspect the data.
# Other quick checks that can be swapped in here:
#   data.head(10), data.info(), data.describe(), print(len(documents))
print(documents.loc[:, 'headline_text'])

0          aba decides against community broadcasting lic...
1             act fire witnesses must be aware of defamation
2             a g calls for infrastructure protection summit
3                   air nz staff in aust strike for pay rise
4              air nz strike to affect australian travellers
                                 ...                        
1103658    the ashes smiths warners near miss liven up bo...
1103659              timelapse: brisbanes new year fireworks
1103660             what 2017 meant to the kids of australia
1103661     what the papodopoulos meeting may mean for ausus
1103662    who is george papadopoulos the former trump ca...
Name: headline_text, Length: 1103663, dtype: object


In [11]:
# Define functions to sanitize the data
# 2. Check the hygiene of data and sanitize it.

#Write a function to perform lemmatize and stem preprocessing steps on 
#the data set.
# Build the stemmer and the lemmatizer once and reuse them for every token;
# constructing a new WordNetLemmatizer on each call is loop-invariant work that
# would otherwise be repeated for every token of every headline.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    Args:
        text: a single token (string) to normalise.

    Returns:
        The Snowball (English) stem of the verb lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalised, filtered tokens.

    The text is split with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or are three characters or shorter are discarded, and
    every surviving token is passed through ``lemmatize_stemming``.

    Args:
        text: the raw document string.

    Returns:
        A list of lemmatized and stemmed tokens, in document order.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [12]:
# Clean every headline, then preview the first ten token lists.
headlines = documents['headline_text']
processed_docs = headlines.map(preprocess)
processed_docs.head(10)

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [13]:
# Map every distinct token in the processed headlines to an integer id.
dictionary = corpora.Dictionary(processed_docs)

# Show the first eleven (id, token) pairs as a sanity check.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [14]:
# Prune the vocabulary before the bag-of-words corpus is built; the call's
# return value is not used, so `dictionary` itself is expected to be modified.
# NOTE(review): per gensim's documented semantics this should drop tokens that
# appear in fewer than 15 documents or in more than 50% of documents, then keep
# at most the 100000 most frequent — confirm against the installed version.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [15]:
# Convert each token list into its bag-of-words form using the pruned
# dictionary, then peek at one sample document.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4014, 1)]

In [16]:
# Fit a TF-IDF model on the bag-of-words corpus and wrap the corpus with it.
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first weighted document (nothing if the corpus is empty).
first_weighted_doc = next(iter(corpus_tfidf), None)
if first_weighted_doc is not None:
    pprint(first_weighted_doc)

[(0, 0.5892908867507543),
 (1, 0.38929654337861147),
 (2, 0.4964985175717023),
 (3, 0.5046520327464028)]


In [None]:
# Vectorize the data and create a Document Term Matrix (DTM)
# 3. Create DTM using the following parameters:
#    max_df=0.95, min_df=2, max_features=1000, ngram_range=(1, 2), stop_words='english'
cvec = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    ngram_range=(1, 2),
    stop_words='english',
)
X = cvec.fit_transform(documents['headline_text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on versions that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# Keep the DTM sparse. X.toarray() would materialise a dense
# (n_documents x n_features) matrix, which for roughly 1.1M headlines and 1000
# features needs several gigabytes of memory.
dtm = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)
print(dtm)

In [9]:
# Fit an LDA model on the Document Term Matrix created above
# 4. Fit an LDA model with 5 components (n_components=5)
# random_state is pinned so the fitted topics do not depend on how much of
# NumPy's global random stream earlier cells happened to consume.
lda = LatentDirichletAllocation(n_components=5, random_state=2019)

# Per-document topic distribution; keep it in a name so it can be reused, and
# leave it as the last expression so the cell still displays it.
doc_topic_dist = lda.fit_transform(X)
doc_topic_dist

In [None]:
# Visualize the topics generated
# 5. Create a report
import mglearn as mg

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on versions that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    features = np.array(cvec.get_feature_names_out())
else:
    features = np.array(cvec.get_feature_names())

# For every topic, order the feature indices from highest to lowest weight.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# Derive the topic count from the fitted model instead of repeating the literal 5.
n_topics = lda.components_.shape[0]
mg.tools.print_topics(topics=range(n_topics), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=25)