In [1]:
import pyLDAvis
import pyLDAvis.sklearn
import numpy as np
# Switch pyLDAvis into notebook mode (presumably so prepared visualizations render inline in cell output — confirm against the pyLDAvis docs).
pyLDAvis.enable_notebook()

In [2]:
#import dataset
from sklearn.datasets import fetch_20newsgroups
#import the vectorizers that compute raw term counts (CountVectorizer) and the relative weighted term freq of words in a doc (TF-IDF, TfidfVectorizer)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#import LDA stuff
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# 20 Newsgroups holds 18846 documents in total: 11314 (60%) training and 7532 (40%) testing.
# The same parts of each post are removed from both splits.
parts_to_strip = ('headers', 'footers', 'quotes')

# No subset argument here: this call yields the training split (11314 docs, see output below).
newsgroups1 = fetch_20newsgroups(remove=parts_to_strip)
newsgroups2 = fetch_20newsgroups(subset='test', remove=parts_to_strip)

docs_raw_training = newsgroups1.data
docs_raw_testing = newsgroups2.data

# Report the size of each split.
for split_label, split_docs in (('training', docs_raw_training), ('testing', docs_raw_testing)):
    print('the length of the ' + split_label + ' set is ', len(split_docs))

the length of the training set is  11314
the length of the testing set is  7532


In [4]:
# Inspect a single training document: its numeric category, the category name, and its text.
docNum = 999

# Numeric category id of the chosen document
category_for_given_doc = newsgroups1.target[docNum]
print(category_for_given_doc)
# Human-readable name of that category
print(newsgroups1.target_names[category_for_given_doc])

# Raw text of the document
print(docs_raw_training[docNum])



# Every category name in the dataset
print("\n\n\nAll labels = " , newsgroups1.target_names)

15
soc.religion.christian
Sorry for posting this, but my e-mail keeps bouncing.  Maybe it will
help others here, anyway, and therefore I pray others will read this.  It is
actually a response from my Aunt, who has 5 kids, since I have none yet.

     I'm posting this for a good Christian relative who does not have e-mail
access.  Since this aunt and uncle have 5 kids I felt they would be more
relevant than I, who have none (yet).

     13-year-old (13YO) twins, 10YO boy, 6.5YO boy, 2YO girl

     I don't call it spanking, but they do, so yes, very rarely.

     I don't call it spanking because it's more of a reaction to something
very dangerous, such as trying to stick their finger in a fan or running
into the road.  Maybe 3-4 times for each except for the 2YO girl, who has
not been spanked yet.
     They call it that because it *does* hurt their feelings, and of course
I give all the hugs and stuff to ensure they know they're still loved.

     No, that would be too painful.  If it's 

In [5]:
# CountVectorizer produces an M x N term-frequency matrix:
#   M rows    = documents
#   N columns = vocabulary terms (the cell holds the count of that term in that document)
# max_df / min_df are document-frequency thresholds that select which terms enter the matrix:
#   max_df = 0.5 -> ignore terms that appear in more than 50% of the documents
#                   (this gets rid of words like "the", "and")
#   min_df = 10  -> ignore terms that appear in fewer than 10 documents
#                   (removes words like "supercalifragalisticexpialidocious" that show up in
#                    only one or two articles and so are not important to consider)
#   (a float such as min_df = 0.01 would instead mean "fewer than 1% of the documents")
vectorizer_settings = dict(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # alphabetic tokens of at least 3 letters
    max_df=0.5,
    min_df=10,
)
tf_vectorizer = CountVectorizer(**vectorizer_settings)

# Learn the vocabulary from the training set and build its term-frequency matrix.
dtm_tf = tf_vectorizer.fit_transform(docs_raw_training)

# Shape is (11314, 9144): 11314 documents by 9144 terms that pass the max_df / min_df filters.
# The matrix is extremely sparse, since any given term occurs in only a few documents.
print(dtm_tf.shape)

# Uncomment to print the whole vocabulary:
#print(tf_vectorizer.get_feature_names())

(11314, 9144)


In [7]:
# Build a TF-IDF counterpart of the term-frequency matrix to scale things better.
# The TF-IDF vectorizer is configured with exactly the parameters of tf_vectorizer,
# so the resulting matrix has the same shape as dtm_tf (see output below).
shared_vectorizer_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)

dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw_training)
print(dtm_tfidf.shape)

(11314, 9144)


In [13]:
# pprint is only needed for the optional debugging dump at the bottom of this cell
from pprint import pprint

# Number of distinct topics the LDA model should extract.
# The dataset has 20 true categories; interestingly, as the topic count goes down
# the learned topics become broader.
num_topics = 6

# LDA estimates a topics x words matrix (num_topics rows by ~9000 vocabulary terms).
# random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=0,
)

# Fit the model on the TF-IDF matrix built from the 20 newsgroups *training* documents.
# NOTE(review): scikit-learn describes LDA's input as a document-word count matrix;
# fitting on TF-IDF weights is this notebook's choice — confirm it is intended.
lda.fit(dtm_tfidf)
#pprint(vars(lda))



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=6, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [14]:
#normalize each row (one row per topic) so that its word weights sum to 1
#lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
#pprint([i[0] for i in lda.components_])

In [15]:
# Prepare the pyLDAvis data from three pieces that must belong together:
#   - lda:              the fitted LDA model
#   - dtm_tfidf:        the document-term matrix the model was fitted on (TF-IDF weights)
#   - tfidf_vectorizer: the vectorizer that produced dtm_tfidf (it supplies the vocabulary terms)
# tfidf_vectorizer was created from tf_vectorizer's parameters, so both currently share the same
# settings; passing the vectorizer that actually generated the matrix keeps columns and term
# labels paired even if one vectorizer's settings are changed later.
newsGroup_data = pyLDAvis.sklearn.prepare(lda, dtm_tfidf, tfidf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
# Display the prepared pyLDAvis data (newsGroup_data) as this cell's output.
pyLDAvis.display(newsGroup_data)

In [12]:
# Print every category name again (same statement as at the end of the document-inspection cell).
print("\n\n\nAll labels = " , newsgroups1.target_names)





All labels =  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
