In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import nltk
from nltk.stem import WordNetLemmatizer 

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [47]:
df = pd.read_csv("blogtext.csv")
print(df.shape)
df.head()

(681284, 7)


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [48]:
df.drop(df[df['topic']=='Student'].index,inplace=True)

In [49]:
df.drop(df[df['text'].map(len)<50].index,inplace=True)

In [51]:
df.drop(['gender','age','sign','date'],axis=1,inplace=True)

In [52]:
df.reset_index()


Unnamed: 0,index,id,topic,text
0,4,3581210,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...
1,5,3581210,InvestmentBanking,I had an interesting conversation...
2,6,3581210,InvestmentBanking,Somehow Coca-Cola has a way of su...
3,7,3581210,InvestmentBanking,"If anything, Korea is a country o..."
4,8,3581210,InvestmentBanking,Take a read of this news article ...
5,9,3581210,InvestmentBanking,I surf the English news sites a l...
6,10,3581210,InvestmentBanking,"Ah, the Korean language...it look..."
7,11,3581210,InvestmentBanking,If you click on my profile you'll...
8,12,3581210,InvestmentBanking,Last night was pretty fun...mostl...
9,13,3581210,InvestmentBanking,There is so much that is differen...


In [62]:
d = dict()
for a in df['topic']:
    if a in d:
        d[a]+=1
    else:
        d[a]=1
df.drop(df[df['topic']=='indUnk'].index,inplace=True)
df.describe()

Unnamed: 0,id
count,267057.0
mean,2377485.0
std,1248855.0
min,7596.0
25%,1188176.0
50%,2621121.0
75%,3512293.0
max,4336871.0


In [64]:
df.reset_index()

Unnamed: 0,index,id,topic,text
0,4,3581210,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...
1,5,3581210,InvestmentBanking,I had an interesting conversation...
2,6,3581210,InvestmentBanking,Somehow Coca-Cola has a way of su...
3,7,3581210,InvestmentBanking,"If anything, Korea is a country o..."
4,8,3581210,InvestmentBanking,Take a read of this news article ...
5,9,3581210,InvestmentBanking,I surf the English news sites a l...
6,10,3581210,InvestmentBanking,"Ah, the Korean language...it look..."
7,11,3581210,InvestmentBanking,If you click on my profile you'll...
8,12,3581210,InvestmentBanking,Last night was pretty fun...mostl...
9,13,3581210,InvestmentBanking,There is so much that is differen...


In [65]:
def preprocess(text):
    return [w for w in gensim.utils.simple_preprocess(text) if w not in gensim.parsing.preprocessing.STOPWORDS and len(w)>3]
def lemmatize(text):
    return [WordNetLemmatizer().lemmatize(w) for w in text]

In [66]:
preproc_doc = []
for s in df['text']:
    preproc_doc.append(lemmatize(preprocess(s)))
    

In [67]:
dwords = gensim.corpora.Dictionary(preproc_doc)

In [68]:
c = 0
for k,v in dwords.iteritems():
    c+=1
    print(k,v)
    if c==10:
        break

0 audio
1 button
2 capture
3 choose
4 click
5 cool
6 enjoy
7 hour
8 instruction
9 korean


In [69]:
dwords.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [70]:
bow = [dwords.doc2bow(s) for s in preproc_doc]

In [73]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow)
corpus_tfidf = tfidf[bow]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5736997760936114),
 (1, 0.15724659945370864),
 (2, 0.1866402276540447),
 (3, 0.13914152151545042),
 (4, 0.13832689677211982),
 (5, 0.09042247351906292),
 (6, 0.11323731293037777),
 (7, 0.0769655743088099),
 (8, 0.18183009714202764),
 (9, 0.1850448032663647),
 (10, 0.033268858607848346),
 (11, 0.2175777013157439),
 (12, 0.07382144018686897),
 (13, 0.061726923248351155),
 (14, 0.28977120425655534),
 (15, 0.18312180741814743),
 (16, 0.10016692517772093),
 (17, 0.22647528497158337),
 (18, 0.10640736864076424),
 (19, 0.24500636690853603),
 (20, 0.26865962225161516),
 (21, 0.07644083792853062),
 (22, 0.2583159530088372),
 (23, 0.16110664271570171)]


In [74]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dwords, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"went" + 0.003*"today" + 0.003*"urllink" + 0.003*"haha" + 0.003*"like" + 0.003*"night" + 0.003*"gonna" + 0.003*"time" + 0.002*"perth" + 0.002*"going"
Topic: 1 Word: 0.013*"urllink" + 0.011*"test" + 0.009*"christmas" + 0.007*"level" + 0.006*"belle" + 0.005*"moderate" + 0.005*"merry" + 0.005*"personality" + 0.005*"violent" + 0.005*"blogsite"
Topic: 2 Word: 0.003*"like" + 0.003*"time" + 0.002*"night" + 0.002*"little" + 0.002*"went" + 0.002*"thing" + 0.002*"good" + 0.002*"know" + 0.002*"think" + 0.002*"going"
Topic: 3 Word: 0.017*"urllink" + 0.017*"quiz" + 0.014*"harry" + 0.012*"ross" + 0.012*"potter" + 0.006*"pirate" + 0.006*"poll" + 0.005*"portal" + 0.005*"olivia" + 0.005*"rated"
Topic: 4 Word: 0.007*"diet" + 0.006*"weight" + 0.006*"urllink" + 0.006*"pound" + 0.005*"tucker" + 0.005*"sidebar" + 0.004*"mar" + 0.003*"monty" + 0.003*"maureen" + 0.003*"python"
Topic: 5 Word: 0.008*"cheese" + 0.008*"food" + 0.007*"chicken" + 0.007*"chocolate" + 0.006*"cream" + 0.006*"salad

In [75]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_tfidf, corpus=bow, texts=preproc_doc)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,23.0,0.4821,"blog, post, urllink, link, blogger, comment, s...","[thanks, yahoo, toolbar, capture, url, popups,..."
1,1,15.0,0.2839,"urllink, bush, iraq, people, american, preside...","[interesting, conversation, morning, talking, ..."
2,2,19.0,0.6232,"life, know, love, want, thing, like, feel, tim...","[coca, cola, summing, thing, early, flagship, ..."
3,3,15.0,0.4744,"urllink, bush, iraq, people, american, preside...","[korea, country, extreme, based, think, come, ..."
4,4,15.0,0.7757,"urllink, bush, iraq, people, american, preside...","[read, news, article, urllink, joongang, ilbo,..."
5,5,15.0,0.5434,"urllink, bush, iraq, people, american, preside...","[surf, english, news, site, looking, tidbit, k..."
6,6,2.0,0.2729,"like, time, night, little, went, thing, good, ...","[korean, language, look, difficult, figure, re..."
7,7,2.0,0.3697,"like, time, night, little, went, thing, good, ...","[click, profile, startling, discovery, born, y..."
8,8,9.0,0.5489,"went, work, going, today, time, like, night, w...","[night, pretty, company, kept, recently, coupl..."
9,9,7.0,0.6604,"today, like, went, good, going, night, urllink...","[different, seen, haven, travelled, canada, ph..."


In [76]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, bow, dictionary=lda_model_tfidf.id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
