In [80]:
!pip install nltk
!pip install numpy matplotlib
!pip install pandas
!pip install gensim
!pip install sklearn



# Overview
## 1. Topic Modeling with LDA

## 2. Word Embeddings

## 3. Sentence/Document Clustering Using Word Embeddings





In [25]:
# download relevant parts of NLTK
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

In [26]:
import pandas as pd

# Here we use 20-Newsgroups dataset (http://qwone.com/~jason/20Newsgroups/) for this example. 
# This version of the dataset contains about 11k newsgroups posts from 20 different topics. 
# This is available as https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json

raw_data = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(raw_data.target_names.unique())

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


In [27]:
raw_data

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space
...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,sci.med
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,comp.sys.mac.hardware
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,comp.sys.ibm.pc.hardware
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,comp.graphics


In [28]:
text = []
for i in range(0, len(raw_data['content'])):
  text.append(raw_data['content'][i])

# Topic Modeling with LDA

In this section, we will go through how to use python pacakages (*gensim*) to perform the topic analysis.

Reference: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/


## Prepareing data for LDA Analysis

As the first step, we will follow pre-processing methods leanred previously to pre-process the data like tokenization, filtering out the stop words, lemmatizing, building the word dictionary and etc.

In [6]:
# Importing the needed packages
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer


In [7]:
# tokenization
tokenized_text = []
for sentence in text:
  tokenized_text.append(word_tokenize(sentence))

In [8]:
# filtering stop words (numbers) and punctuations, and lemmatzing
stop_words = stopwords.words("english")
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'line', 'organization', 'university', 'wa', 'ha', "'s", "n't", "'d"])

punctuations = string.punctuation  + "*" + "/" + "\\" + "_" + "-"

lemmatizer = WordNetLemmatizer()

filtered_text = []

for sent in tokenized_text:
  filtered_list = []
  for word in sent:
    # filter out tokens that have punctuations and numbers
      # word.isalpha() returns true if a string only contains letters.
    # filter out stop words
    if word.isalpha() and lemmatizer.lemmatize(word.lower()) not in stop_words and len(word) >= 2:
      filtered_list.append(lemmatizer.lemmatize(word.lower()))
  filtered_text.append(filtered_list)

## Creating the Dictionary and Corpus needed for LDA Topic Modeling

In [9]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(filtered_text)



In [10]:
id2word[0] 

'addition'

In [11]:
# Create Corpus
texts = filtered_text

# Coverting Text to Bag of Words features
corpus = [id2word.doc2bow(text) for text in texts]

In [12]:
corpus[0]
# (word_id, word_count)

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 5),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 2),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1)]

In [13]:
print(id2word[0])
id2word[0] in filtered_text[0]

addition


True

In [14]:
print(id2word[60])
id2word[57] in filtered_text[0]

done


False

In [15]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('brought', 1),
  ('bumper', 1),
  ('called', 1),
  ('car', 5),
  ('college', 1),
  ('could', 1),
  ('day', 1),
  ('door', 1),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front', 1),
  ('funky', 1),
  ('history', 1),
  ('il', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 2),
  ('looked', 1),
  ('looking', 1),
  ('made', 1),
  ('maryland', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('park', 1),
  ('please', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('saw', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thanks', 1),
  ('thing', 1),
  ('whatever', 1),
  ('wondering', 1),
  ('year', 1)]]

## Building the LDA Model

In this section we will learn the LDA model thourgh the LDA module in gensim. (https://radimrehurek.com/gensim/models/ldamodel.html)

In [16]:
import gensim
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state= 0,
                                           passes = 10,
                                           alpha='auto')

# for more details, please refers to: https://radimrehurek.com/gensim/models/ldamodel.html

## View the topics in LDA Model

In [17]:
lda_model.print_topics()

[(0,
  '0.033*"god" + 0.014*"christian" + 0.013*"jesus" + 0.009*"bible" + 0.008*"one" + 0.007*"church" + 0.006*"christ" + 0.006*"faith" + 0.005*"say" + 0.005*"religion"'),
 (1,
  '0.016*"space" + 0.005*"nasa" + 0.005*"would" + 0.004*"moon" + 0.004*"earth" + 0.004*"launch" + 0.004*"orbit" + 0.004*"year" + 0.004*"first" + 0.004*"henry"'),
 (2,
  '0.015*"detector" + 0.012*"radar" + 0.006*"value" + 0.006*"insurance" + 0.005*"brian" + 0.005*"health" + 0.005*"private" + 0.005*"widget" + 0.004*"set" + 0.003*"crohn"'),
 (3,
  '0.011*"gun" + 0.011*"law" + 0.010*"state" + 0.010*"right" + 0.008*"would" + 0.008*"people" + 0.007*"government" + 0.006*"article" + 0.006*"writes" + 0.005*"weapon"'),
 (4,
  '0.015*"writes" + 0.012*"article" + 0.010*"georgia" + 0.008*"helmet" + 0.008*"michael" + 0.004*"athens" + 0.004*"distribution" + 0.004*"like" + 0.004*"john" + 0.004*"jewish"'),
 (5,
  '0.009*"food" + 0.008*"israel" + 0.008*"disease" + 0.008*"msg" + 0.007*"medical" + 0.006*"article" + 0.006*"patient" 

### How to interpret the extracted topics?





For example, Topic 0 is represented as 

```
0.033*"god" + 0.014*"christian" + 0.013*"jesus" + 0.009*"bible" + 0.008*"one" + 0.007*"church" + 0.006*"christ" + 0.006*"faith" + 0.005*"say" + 0.005*"religion"
```

It means the top 10 keywords that contribute to this topic are: god, christian, jesus, bible and etc. And the weight of food on topic 0 is 0.033.

Looking at these keywords, can you guess what this topic could be? You may summarise it either are 'religion' or 'christian.




In [18]:
# Getting labels for all the news
doc_lda = lda_model[corpus]

In [19]:
print(text[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [20]:
doc_lda[0]

[(10, 0.6328957), (11, 0.3343146)]

In [21]:
lda_model.print_topics()[10]

(10,
 '0.016*"car" + 0.009*"would" + 0.008*"writes" + 0.008*"like" + 0.008*"article" + 0.007*"one" + 0.007*"get" + 0.006*"bike" + 0.005*"time" + 0.005*"good"')

In [22]:
print(text[1])

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [23]:
doc_lda[1]

[(1, 0.10218506),
 (6, 0.387976),
 (9, 0.05716874),
 (13, 0.16183329),
 (15, 0.047197845),
 (18, 0.23120497)]

In [24]:
lda_model.print_topics()[6]

(6,
 '0.027*"drive" + 0.015*"card" + 0.011*"disk" + 0.009*"system" + 0.008*"scsi" + 0.008*"sale" + 0.008*"hard" + 0.008*"driver" + 0.007*"controller" + 0.007*"mac"')

### Practice: 
What about the other topics? Can you try to summariza the topics based on these key words?

# Word Embeddings

In this section, we will learn how to use pre-trained embeddings from gensim to represent words with vectors.

### Load Pre-trained Word Embeddings from Gensim

In [44]:
import numpy as np
import gensim.downloader

In [30]:
# View all types of pre-trained embeddings 
list(gensim.downloader.info()['models'].keys())

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [31]:
# download and load the glove-wiki-gigaword-50
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

### How to Use the Pre-trained Word Embeddings?

Retrieve the embedding for a specific word

In [67]:
glove_vectors['atlanta']

array([-1.0255  ,  1.14    ,  0.27088 ,  1.2964  , -0.22467 , -0.55808 ,
       -1.9727  , -0.52942 ,  0.55607 , -0.48596 , -0.7555  , -0.55799 ,
       -0.99334 ,  0.13091 ,  0.83527 , -0.058354, -0.79702 , -0.5973  ,
       -0.43055 ,  0.095148, -0.42831 ,  0.5277  , -0.41006 ,  0.64514 ,
       -0.59836 , -1.0417  , -0.060947, -0.45935 ,  0.79238 , -0.80473 ,
        1.781   ,  0.52496 ,  0.036867, -0.51445 , -0.19282 , -0.31396 ,
        0.094393,  0.17953 ,  0.91322 ,  0.48565 , -0.053946,  0.3348  ,
        0.24868 ,  0.71448 ,  0.040415,  1.4561  , -0.15356 , -0.15673 ,
       -0.55824 ,  1.0741  ], dtype=float32)

In [68]:
glove_vectors['atlanta'].shape

(50,)

Find the most similar words

In [58]:
glove_vectors.most_similar('atlanta')

[('denver', 0.8360953330993652),
 ('houston', 0.833003044128418),
 ('dallas', 0.8196703195571899),
 ('seattle', 0.8138191103935242),
 ('austin', 0.808619499206543),
 ('miami', 0.8046270608901978),
 ('tampa', 0.7782285213470459),
 ('angeles', 0.7770278453826904),
 ('cincinnati', 0.7677363753318787),
 ('phoenix', 0.7639263868331909)]

In [33]:
glove_vectors.most_similar('twitter')

[('facebook', 0.9333045482635498),
 ('myspace', 0.8801369667053223),
 ('youtube', 0.8430657982826233),
 ('blog', 0.8262056708335876),
 ('blogs', 0.8064823746681213),
 ('blogging', 0.7970671653747559),
 ('tumblr', 0.7901090383529663),
 ('email', 0.778261125087738),
 ('tweets', 0.7604536414146423),
 ('e-mail', 0.7538727521896362)]

### Embeddings Capture Relational Meaning

In [50]:
# calculate the cosine similarity between two vectors
def cosine_sim(a,b):
  return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [51]:
cosine_sim(glove_vectors['king'],glove_vectors['queen'])

0.7839044

In [52]:
a = glove_vectors['king'] - glove_vectors['man'] + glove_vectors['woman']

In [53]:
cosine_sim(a,glove_vectors['queen'])

0.86095816

Could you try another example shown in the lecture?

e.g., `vector(‘paris’) - vector(‘france’) + vector(‘italy’)` and `vector(‘rome’)`

# Sentence/Document Clustering with Word Embeddings

Now that we know how to use pre-trained word embeddings, let's utilize the embeddings to cluster documents in 20-newsgroups.

## Prepareing Data

Similarly, the first step is to pre-process the input data: tokenization, lammatization and etc.

In [69]:
# Importing the needed packages
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer

# tokenization
tokenized_text = []
for sentence in text:
  tokenized_text.append(word_tokenize(sentence))

# filtering stop words (numbers) and punctuations, and lemmatzing
stop_words = stopwords.words("english")
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'line', 'organization', 'university', 'wa', 'ha', "'s", "n't", "'d"])

punctuations = string.punctuation  + "*" + "/" + "\\" + "_" + "-"

lemmatizer = WordNetLemmatizer()

filtered_text = []

for sent in tokenized_text:
  filtered_list = []
  for word in sent:
    # filter out tokens that have punctuations and numbers
      # word.isalpha() returns true if a string only contains letters.
    # filter out stop words
    if word.isalpha() and lemmatizer.lemmatize(word.lower()) not in stop_words and len(word) >= 2:
      filtered_list.append(lemmatizer.lemmatize(word.lower()))
  filtered_text.append(filtered_list)

In [70]:
filtered_text[0]

['lerxst',
 'thing',
 'car',
 'maryland',
 'college',
 'park',
 'wondering',
 'anyone',
 'could',
 'enlighten',
 'car',
 'saw',
 'day',
 'sport',
 'car',
 'looked',
 'late',
 'early',
 'called',
 'bricklin',
 'door',
 'really',
 'small',
 'addition',
 'front',
 'bumper',
 'separate',
 'rest',
 'body',
 'know',
 'anyone',
 'tellme',
 'model',
 'name',
 'engine',
 'spec',
 'year',
 'production',
 'car',
 'made',
 'history',
 'whatever',
 'info',
 'funky',
 'looking',
 'car',
 'please',
 'thanks',
 'il',
 'brought',
 'neighborhood',
 'lerxst']

## Generating Vector Representations of Sentences/Documents

A common approach to vectorize a sentence/document is to use the average of all the vectors of words in the sentence/document.

For words that are in the pre-trained word embedding models, we directly use the pre-trained embeddings. For words that are not in the pre-trained models, we call them **unknow words**. Usually, we will use a *zero vector* to represent them or just ignore them. If you want a more precise model that could cover those cases, you could train your own word2vec model over your own corpus. For more details, please refer to https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html and  https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92.


In [87]:
def vectorize(doc, model):
  vectors = []
  
  # transform every token in the input doc to vectors
  for token in doc:
    #zero_vector = np.zeros(model.vector_size)
    if token in model:
      vectors.append(model[token])
    #else:
    #  vectors.append(zero_vector)

  # average word vectors in one document
  vectors = np.asarray(vectors)
  avg_vec = vectors.mean(axis=0)

  return avg_vec

In [88]:
vectorize(filtered_text[0], glove_vectors)

array([ 0.1752914 ,  0.13909456,  0.22094107, -0.12333324,  0.23647927,
       -0.04230335, -0.63693875, -0.14860386, -0.06012844, -0.03337453,
       -0.07830777, -0.05556682, -0.4236353 , -0.04059649,  0.39091134,
        0.1679264 , -0.11425725,  0.33666366, -0.1622146 , -0.52783036,
        0.05852604,  0.16622783, -0.16341592,  0.23142605,  0.22687714,
       -1.2628195 , -0.48308963,  0.18888098,  0.35635024, -0.24269347,
        2.4111907 ,  0.14718468, -0.18373367, -0.14261353,  0.22068126,
        0.06201767,  0.15856849,  0.07977802, -0.03621079, -0.17915004,
       -0.13333684, -0.00876729, -0.10287292, -0.00261767,  0.14338624,
        0.1158571 , -0.0657627 , -0.21478257,  0.1238562 ,  0.06324318],
      dtype=float32)

In [89]:
# vectorize all the news

vectorized_text = []
for doc in filtered_text:
  vectorized_text.append(vectorize(doc, glove_vectors))


## Clustering Documents with KMeans Cluster through Sklearn

In [112]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=20, random_state = 100).fit(vectorized_text)

In [135]:
# view the cluster assigned to a given document, e.g., the first document:
km.labels_[0]

1

In [133]:
# view the center of a given cluster. For example, the center of cluster 1:
km.cluster_centers_[1]

array([ 0.05587016,  0.0814233 ,  0.1851054 , -0.05953059,  0.11577202,
        0.08937234, -0.35560544, -0.13192832,  0.00598951, -0.02030215,
       -0.01738443,  0.0956945 , -0.27545068,  0.04817316,  0.21985977,
        0.15796056,  0.00575328,  0.12508632, -0.20474372, -0.49394582,
        0.01072044,  0.04175672,  0.01448295,  0.06068192,  0.1620056 ,
       -1.01463056, -0.25587663,  0.15912723,  0.32359527, -0.19843669,
        2.06408551,  0.08222181, -0.12588641, -0.0612789 ,  0.08557905,
        0.00979518,  0.11233084,  0.10292808,  0.08265812, -0.15084695,
        0.01966269,  0.04889586, -0.03501873,  0.12977324,  0.03491078,
        0.04244387,  0.06805984, -0.10252202,  0.02710781,  0.12603377])

## Interpreting Clusters

To interpret clusters, we usually find several most repreentative documents in one cluster and try to summarize the topics from them.

In [124]:
# For example, let's take a look at the most representative docs in cluster 1
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_text - km.cluster_centers_[1], axis=1)
)
for d in most_representative_docs[:2]:
    print(text[d])
    print("\n ================================= \n")
    

From: kking@cs.uah.edu (Ken King)
Subject: Re: The Kuebelwagen??!!          
Reply-To: kking@uahcs2.uah.edu (Ken King)
Organization: Computer Science Dept. - Univ. of Alabama in Huntsville
Lines: 36

In article <C5K5Co.F09@mentor.cc.purdue.edu> thwang@mentor.cc.purdue.edu (Tommy Hwang) writes:
>	Sorry for the mis-spelling, but I forgot how to spell it after 
>my series of exams and NO-on hand reference here.
>
>	Is it still possible to get those cute WWII VW Jeep-wanna-be's?
>A replica would be great I think.  

  greetings:
  you may be in luck.  i seem to recall seeing a blurb in one of
the kit car magazines about a company in norway who pulled a
mould (sp?) off a real kubel, and has adapted it to the beetle
floorpan.  as for the suspension, all i can remember about the
vw thing i used to own is that it had about 3" more suspension
travel than a stock beetle, but i'd heard that there were after-
market parts for off-road use that were as good or better.  note
that the major differenc

In [132]:
# For example, let's take a look at the most representative docs in cluster 2
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_text - km.cluster_centers_[2], axis=1)
)
for d in most_representative_docs[:2]:
    print(text[d])
    print("\n ================================= \n")

From: pat@rwing.UUCP (Pat Myrto)
Subject: Re: White House Public Encryption Management Fact Sheet
Article-I.D.: rwing.2087
Distribution: na
Organization: Totally Unorganized
Lines: 52

In article <19APR199313020883@charon.gsfc.nasa.gov> paul@charon.gsfc.nasa.gov (Paul Olson) writes:
>In article <1qnav4$r3l@transfer.stratus.com>, cme@ellisun.sw.stratus.com (Carl Ellison) writes...
>>In article <C5LGAz.250@dove.nist.gov> clipper@csrc.ncsl.nist.gov (Clipper Chip Announcement) writes:
>> 
>>>Further, the Attorney General
>>
> [ ... good post describing what is in store for us deleted ... ]
>
>It's also interesting to note that two months ago Rush Limbaugh said that
>Clinton would have the "plumbers" out in force shortly.  Clinton and his
>henchmen firmly believe in strong ubiquitous government control.  Anytime a
>leader believes in that, the leader will use every means possible to retain
>that control and take more.
>
>WE have to take OUR government back.  Otherwise we will end up living 

### Practice:

Can you use similar technique to interpret and summarize other clsuters? Are they consistent with the Topic Modeling from LDA? Which one do you think is better?