In [None]:
!pip install pyLDAvis
!pip install gensim

In [None]:
import re
from pprint import pprint

import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

In [None]:
# NLTK stop words
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is not bundled with NLTK; on a fresh environment the first
# lookup raises LookupError, so download it once and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')

# Extra tokens filtered out on top of NLTK's English list.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
# Load the dataset from the local JSON file into a DataFrame
df = pd.read_json('newsgroups.json')

# Distinct values of the target_names column, followed by a preview of the first rows
print(df['target_names'].unique())
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

### Initial Data Cleaning

In [None]:
# Convert the content column to a plain Python list of documents
data = df.content.values.tolist()

# Remove e-mail addresses: any run of non-space characters around an '@', plus one
# trailing whitespace character. The pattern is a raw string so that \S and \s reach
# the regex engine without triggering Python's invalid-escape-sequence warning.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

In [None]:
# Collapse every run of whitespace (including newlines) into a single space.
# Raw string: '\s' in a normal literal is an invalid escape sequence and warns.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

In [None]:
# Drop every single-quote character from each document, then preview the first one
data = [sent.replace("'", "") for sent in data]
pprint(data[:1])

### Tokenization

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of documents; each item is passed through ``str()``
            before tokenizing.

    Yields:
        One list of tokens per input document, in input order.
    """
    # deacc=True asks simple_preprocess to strip accent marks from tokens;
    # punctuation is already discarded by its tokenizer.
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Materialize the generator into a list so later cells can index and re-iterate it,
# then preview the tokens of the first document.
data_words = list(sent_to_words(data))
print(data_words[:1])

### Build the bigram and trigram models

In [None]:
# Phrases learns which adjacent tokens co-occur often enough to be merged into a
# single token. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

# The trigram detector is trained on bigram-merged documents, so it can extend an
# already-joined pair with a third token.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [None]:
# Faster way to get a sentence clubbed as a trigram/bigram:
# Phraser wraps a trained Phrases model in a lighter object used only for applying it.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example (bigrams are merged first, then trigrams on top of them)
print(trigram_mod[bigram_mod[data_words[0]]])

### Define functions for stopword removal, bigrams, trigrams and lemmatization

In [None]:
def remove_stopwords(texts, stopword_list=None):
    """Remove stop words from every document.

    Args:
        texts: iterable of documents. A document is normally a list of tokens;
            a raw string is tokenized with ``gensim.utils.simple_preprocess`` first.
        stopword_list: optional iterable of words to drop. Defaults to the
            notebook-level ``stop_words`` list.

    Returns:
        A list with one list of kept tokens per input document, order preserved.
    """
    # A set gives constant-time membership tests; the plain list made every
    # lookup linear in the number of stop words.
    blocked = set(stop_words if stopword_list is None else stopword_list)

    cleaned = []
    for doc in texts:
        # Iterating a string would walk it character by character, so strings
        # are tokenized and token lists are filtered as they are.
        tokens = gensim.utils.simple_preprocess(doc) if isinstance(doc, str) else doc
        cleaned.append([word for word in tokens if word not in blocked])
    return cleaned

def make_bigrams(texts, bigram_model=None):
    """Merge detected bigrams in every document.

    Args:
        texts: iterable of token lists.
        bigram_model: optional object that maps a token list to its phrase-merged
            version via ``model[tokens]``. Defaults to the notebook-level
            ``bigram_mod``.

    Returns:
        A list with one transformed document per input document.
    """
    phraser = bigram_mod if bigram_model is None else bigram_model
    # Apply the phrase model to each document; wrapping the document in a list
    # instead would hand a nested list to the lemmatizer.
    return [phraser[doc] for doc in texts]

def make_trigrams(texts, bigram_model=None, trigram_model=None):
    """Merge detected bigrams, then trigrams, in every document.

    Args:
        texts: iterable of token lists.
        bigram_model: optional phrase model applied first via ``model[tokens]``.
            Defaults to the notebook-level ``bigram_mod``.
        trigram_model: optional phrase model applied to the bigram output.
            Defaults to the notebook-level ``trigram_mod``.

    Returns:
        A list with one transformed document per input document.
    """
    first_pass = bigram_mod if bigram_model is None else bigram_model
    second_pass = trigram_mod if trigram_model is None else trigram_model
    # The trigram model must see bigram-merged tokens, hence the nested lookup.
    return [second_pass[first_pass[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp_model=None):
    """Lemmatize documents, keeping only tokens with an allowed part-of-speech tag.

    See https://spacy.io/api/annotation for the tag scheme.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            run through the spaCy pipeline as one text.
        allowed_postags: coarse part-of-speech tags (``token.pos_``) to keep.
            The default list is only read, never mutated.
        nlp_model: optional callable mapping a string to an iterable of tokens
            exposing ``lemma_`` and ``pos_``. Defaults to the notebook-level ``nlp``.

    Returns:
        A list with one list of lemmas per input document.
    """
    pipeline = nlp if nlp_model is None else nlp_model

    texts_out = []
    for sent in texts:
        doc = pipeline(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words: pass the tokenized documents through remove_stopwords
data_words_nostops = remove_stopwords(data_words)

In [None]:
# Form Bigrams on the stop-word-filtered documents
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
# Load spaCy's small English pipeline without the parser and NER components
# (for efficiency).
# The pipeline must be named explicitly: an empty name cannot be resolved, and the
# old 'en' shortcut no longer exists in spaCy v3.
# Install it once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Do lemmatization keeping only noun, adj, vb, adv
# (the tag list is passed explicitly even though it equals the function default)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Preview the lemmas of the first document
print(data_lemmatized[:1])

In [None]:
# Create Dictionary: gensim's token <-> integer id mapping built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)  

In [None]:
# Create Corpus: `texts` is just another name for the lemmatized documents (no copy is made)
texts = data_lemmatized 

In [None]:
# Term Document Frequency: turn each document into its bag-of-words form,
# a list of (token_id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]  

In [None]:
# View the bag-of-words pairs of the first document
print(corpus[:1])

In [None]:
# Human-readable view of the first document: each token id is swapped for its word
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
# Parameter notes (per the gensim LdaModel documentation):
#   num_topics      - number of latent topics to extract
#   random_state    - fixed seed so repeated runs give the same topics
#   update_every /
#   chunksize       - online training: update the model after every chunk of 100 documents
#   passes          - number of full passes over the corpus
#   alpha='auto'    - learn an asymmetric document-topic prior from the data
#   per_word_topics - model[...] additionally returns per-word topic information
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the keywords of each topic
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# NOTE: log_perplexity returns the per-word likelihood bound (a negative number),
# not the perplexity itself; perplexity = 2 ** (-bound). A bound closer to zero
# therefore means lower perplexity, i.e. a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' is evaluated against the tokenized texts)
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize the topics

In [None]:
# Prepare the pyLDAvis data structure from the model, the corpus and the dictionary
vis = gensimvis.prepare(lda_model, corpus, id2word)

In [None]:
# Render the interactive topic visualization inline
pyLDAvis.display(vis)

In [None]:
# Produce the visualization as HTML markup.
# NOTE(review): as the last expression of the cell, the result is echoed into the
# notebook output - consider assigning it or writing it to a file instead.
pyLDAvis.prepared_data_to_html(vis)

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a per-document table of the dominant topic, its weight and its keywords.

    Args:
        ldamodel: trained LDA model. It is indexed with ``corpus`` and must
            expose ``per_word_topics`` and ``show_topic``.
        corpus: bag-of-words corpus to score. The default is bound to the
            notebook-level ``corpus`` at definition time.
        texts: documents attached as the last column. The default is bound to
            the notebook-level ``data`` at definition time.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by one unnamed column holding ``texts``.
    """
    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model returns a tuple whose first
        # element is the (topic_id, probability) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic reported for this document: nothing to record.
            # NOTE(review): skipping shifts later rows relative to `texts`.
            continue

        # Dominant topic = highest probability; on ties the first one listed wins.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic. Passing `columns` here also keeps
    # an empty result well-formed.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic per document, with the lemmatized tokens attached as the text column
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized)

# Format: reset_index() turns the row index into the first column, which is then
# renamed to Document_No together with the other four columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show the first ten documents
df_dominant_topic.head(10)