In [85]:
#IMPORTING LIBRARIES 

import wikipedia
import nltk

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

#DOWNLOADING ARTICLES FROM WIKIPEDIA BY SPECIFYING TOPIC TO PAGE OBJECT OF WIKI'S LIBRARY
Cricket_sports = wikipedia.page("Cricket")
artificial_intelligence = wikipedia.page("Artificial Intelligence")
mona_lisa = wikipedia.page("Mona Lisa")
eiffel_tower = wikipedia.page("Eiffel Tower")

#"CONTENT" USED TO RETRIEVE CONTENTS OF WEBPAGE,CONTENT OF FOUR ARTICLES STORED IN CORPUS
corpus = [Cricket_sports.content,artificial_intelligence.content, mona_lisa.content, eiffel_tower.content]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sonu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
corpus

['Cricket is a bat-and-ball game played between two teams of eleven players on a field at the centre of which is a 20-metre (22-yard) pitch with a wicket at each end, each comprising two bails balanced on three stumps. The batting side scores runs by striking the ball bowled at the wicket with the bat, while the bowling and fielding side tries to prevent this and dismiss each player (so they are "out"). Means of dismissal include being bowled, when the ball hits the stumps and dislodges the bails, and by the fielding side catching the ball after it is hit by the bat, but before it hits the ground. When ten players have been dismissed, the innings ends and the teams swap roles. The game is adjudicated by two umpires, aided by a third umpire and match referee in international matches. They communicate with two off-field scorers who record the match\'s statistical information.\nThere are various formats ranging from Twenty20, played over a few hours with each team batting for a single inn

In [87]:
#DATA PRE-PROCESSING

import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens

In [88]:
#CREATING A CORPUS OF ALL TOKENS/WORDS IN THE FOUR WIKI ARTICLES

processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

In [89]:
#TO CREATE A DICTIONARY AND BAG-OF-WORDS CORPUS
from gensim import corpora

gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]

In [90]:
#SAVING DICTIONARY AND CORPUS USING PICKLE
import pickle

pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

In [91]:
num_topics=4

In [92]:
#CREATING THE LDA MODEL
import gensim

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')

In [93]:
#TEN WORDS FOR EACH TOPIC
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.017*"intelligence" + 0.014*"machine" + 0.013*"artificial" + 0.011*"problem" + 0.010*"learning" + 0.009*"system" + 0.008*"network" + 0.007*"research" + 0.007*"knowledge" + 0.006*"computer"')
(1, '0.038*"cricket" + 0.018*"batsman" + 0.014*"wicket" + 0.011*"bowler" + 0.010*"inning" + 0.009*"played" + 0.009*"international" + 0.009*"player" + 0.007*"umpire" + 0.006*"century"')
(2, '0.036*"painting" + 0.017*"leonardo" + 0.009*"louvre" + 0.009*"portrait" + 0.007*"century" + 0.006*"french" + 0.006*"museum" + 0.005*"italian" + 0.005*"giocondo" + 0.004*"subject"')
(3, '0.026*"eiffel" + 0.008*"second" + 0.006*"french" + 0.006*"structure" + 0.006*"exposition" + 0.005*"tallest" + 0.005*"engineer" + 0.004*"design" + 0.004*"restaurant" + 0.004*"france"')


In [94]:
#ITERATING OVER NUMBER OF TOPICS,GETTING TOP WORDS FOR EACH CLUSTER & ADDING TO DATAFRAME
import pandas as pd;
import numpy as np;
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

In [95]:
get_lda_topics(lda_model, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04
0,intelligence,cricket,painting,eiffel
1,machine,batsman,leonardo,second
2,artificial,wicket,louvre,french
3,problem,bowler,portrait,structure
4,learning,inning,century,exposition
5,system,played,french,tallest
6,network,international,museum,engineer
7,research,player,italian,design
8,knowledge,umpire,giocondo,restaurant
9,computer,century,subject,france


In [96]:
#EVALUATING THE LDA MODEL

test_doc = 'Great structures are build to remember an event happened in the history.'
test_doc = preprocess_text(test_doc)
bow_test_doc = gensim_dictionary.doc2bow(test_doc)

print(lda_model.get_document_topics(bow_test_doc))

[(0, 0.084221385), (1, 0.084749185), (2, 0.088212065), (3, 0.74281734)]


In [97]:
#EVALUATING USING PERPLEXITY AND COHERENCE SCORE
print('\nPerplexity:', lda_model.log_perplexity(gensim_corpus))

from gensim.models import CoherenceModel

coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nCoherence Score:', coherence_score)


Perplexity: -7.5778464066970725

Coherence Score: 0.7320272106811812


In [98]:
#VISUALIZATION
#EACH CIRCLE CORRESPONDS TO ONE TOPIC,IF WE HOVER ON ANY WORD ON THE RIGHT,WE WILL SEE THE CIRCLE FOR TOPIC THAT CONTAINS THE WORD GETTING HIGHLIGHTED
#FOR EG. IF WE HOVER ON BATSMAN,CIRCLE 2 RELATED WHICH IS RELATED TO TOPIC 2,THAT IS, CRICKET GETS HIGHLIGHTED 

gensim_dictionary = gensim.corpora.Dictionary.load('gensim_dictionary.gensim')
gensim_corpus = pickle.load(open('gensim_corpus_corpus.pkl', 'rb'))
lda_model = gensim.models.ldamodel.LdaModel.load('gensim_model.gensim')

import pyLDAvis.gensim

lda_visualization = pyLDAvis.gensim.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

In [99]:
#SAVING THE VISULATION AS HTML FILE
pyLDAvis.save_html(lda_visualization, 'wikipedia_pyLDAvis.html')

In [100]:
#TOPIC MODELLING USING LSI(LATENT SEMANTIC INDEXING)
from gensim.models import LsiModel

lsi_model = LsiModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary)
topics = lsi_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.392*"intelligence" + 0.312*"machine" + 0.296*"artificial" + 0.242*"problem" + 0.225*"learning" + 0.213*"system" + 0.183*"network" + 0.155*"research" + 0.148*"knowledge" + 0.145*"computer"')
(1, '-0.657*"cricket" + -0.312*"batsman" + -0.240*"wicket" + -0.182*"bowler" + -0.168*"inning" + -0.159*"international" + -0.158*"played" + -0.147*"player" + -0.120*"umpire" + -0.116*"century"')
(2, '0.689*"painting" + 0.326*"leonardo" + 0.181*"eiffel" + 0.180*"louvre" + 0.177*"portrait" + 0.151*"french" + 0.125*"century" + 0.120*"museum" + 0.096*"original" + 0.092*"italian"')
(3, '-0.657*"eiffel" + 0.268*"painting" + -0.180*"second" + -0.145*"exposition" + -0.145*"structure" + 0.131*"leonardo" + -0.128*"tallest" + -0.116*"engineer" + -0.107*"design" + -0.102*"restaurant"')
