In [1]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the list of documents
with open('newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Use CountVectorizor to find three letter tokens, remove stop_words, 
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english', 
                       token_pattern='(?u)\\b\\w\\w\\w+\\b')
# Fit and transform
X = vect.fit_transform(newsgroup_data)

# Convert sparse matrix to gensim corpus.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Mapping from word IDs to words (To be used in LdaModel's id2word parameter)
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [2]:
# Use the gensim.models.ldamodel.LdaModel constructor to estimate 
# LDA model parameters on the corpus, and save to the variable `ldamodel`

# Your code here:
ldamodel = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                          id2word = id_map,
                                          passes=25,
                                          random_state=34)

In [16]:
len(ldamodel.print_topics())

20

In [22]:
def lda_topics():
    
    # Your Code Here
    
    return len(ldamodel.print_topics(num_topics = 10, num_words=10))
lda_topics()

10

In [8]:
new_doc = ["\n\nIt's my understanding that the freezing will start to occur because \
of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
Krumins\n-- "]

In [9]:
new_doc_matrix = vect.transform(new_doc)
corpus = gensim.matutils.Sparse2Corpus(new_doc_matrix, documents_columns=False)
doc_topics = ldamodel.get_document_topics(corpus)
doc_topics[0]

<gensim.interfaces.TransformedCorpus at 0x244387e40f0>

In [13]:
print(list(doc_topics)[0])

[(17, 0.20200191), (33, 0.20199962), (58, 0.40199006)]


In [12]:

def topic_distribution():
    
    new_doc_transformed = vect.transform(new_doc)
    corpus = gensim.matutils.Sparse2Corpus(new_doc_transformed, documents_columns=False)
    doc_topics = ldamodel.get_document_topics(corpus)
    topic_dist = []
    for val in list(doc_topics):
        for v in val:
            topic_dist.append(v)
    return topic_dist

topic_distribution()

[(17, 0.20200187), (33, 0.2019996), (58, 0.40199006)]