In [1]:
import pandas as pd
import numpy as np
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint
# gensim
import gensim
from gensim.corpora import Dictionary
from gensim import models
from gensim.parsing.preprocessing import STOPWORDS

# plotting tools
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

# nltk
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *


In [2]:
# topic modelling
news = pd.read_csv("Cleaned Data/headlines.csv", encoding = "ISO-8859-1")
news.head()

Unnamed: 0,date,source,headline
0,2008-10-14,business_daily,Trade financing drought hits Kenyan companies
1,2008-10-14,business_daily,House asks Treasury to table fresh budget pla
2,2008-10-14,business_daily,Kenya gets harsh human rights verdic
3,2008-10-14,business_daily,Ripple effects of global crisis set to hit Eas...
4,2008-10-14,business_daily,Manufacturers cut product sizes to beat recessio


In [3]:
news["headline1"] = news["headline"].str.lower()
news["headline1"] = news["headline1"].str.replace("[^a-z\\s]", " ")
news["headline1"] = news["headline1"].str.replace("\\s+", " ")
news["headline1"].head()

0        trade financing drought hits kenyan companies
1        house asks treasury to table fresh budget pla
2                 kenya gets harsh human rights verdic
3    ripple effects of global crisis set to hit eas...
4     manufacturers cut product sizes to beat recessio
Name: headline1, dtype: object

In [4]:
# tokenization
news["headline1"] = news["headline1"].apply(word_tokenize)
news["headline1"].head()

0    [trade, financing, drought, hits, kenyan, comp...
1    [house, asks, treasury, to, table, fresh, budg...
2          [kenya, gets, harsh, human, rights, verdic]
3    [ripple, effects, of, global, crisis, set, to,...
4    [manufacturers, cut, product, sizes, to, beat,...
Name: headline1, dtype: object

In [5]:
lemmatizer = WordNetLemmatizer()
news["headline1"] = news["headline1"].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
news["headline1"].head()

0    [trade, financing, drought, hit, kenyan, company]
1    [house, asks, treasury, to, table, fresh, budg...
2            [kenya, get, harsh, human, right, verdic]
3    [ripple, effect, of, global, crisis, set, to, ...
4    [manufacturer, cut, product, size, to, beat, r...
Name: headline1, dtype: object

In [6]:
english_stopwords = stopwords.words('english')
english_stopwords.extend(["news", "live blog", "live",  "in pictures", "download", "blog",
   "lifestyle", "editorial", "editorials", "pictures", "opinion", "read",
  "newsplex", "politics", "new", "news live", "photos",  "video", 
  "frontrow", "fact check", "financial time", "watch live", "columnists", "interview", "habari", "kenya", "kenyan", "star"])
news["headline1"] = news["headline1"].apply(lambda x: [y for y in x if y not in english_stopwords and  len(y) > 2])
news["headline1"].head()                        

0            [trade, financing, drought, hit, company]
1    [house, asks, treasury, table, fresh, budget, ...
2                   [get, harsh, human, right, verdic]
3    [ripple, effect, global, crisis, set, hit, eas...
4    [manufacturer, cut, product, size, beat, reces...
Name: headline1, dtype: object

In [7]:
# create bag of words
dictionary = Dictionary(news["headline1"]) # dictionary of 51127 words
dictionary.filter_extremes(no_below = 5, no_above = 0.5) # dictionary of 18063 words


In [8]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count == 10:
        break


0 company
1 drought
2 financing
3 hit
4 trade
5 asks
6 budget
7 fresh
8 house
9 pla


In [9]:
bow = [dictionary.doc2bow(headline) for headline in news["headline1"]]
dictionary[bow[0][0][0]]

'company'

In [10]:
lda_model = gensim.models.LdaMulticore(bow, num_topics=3, id2word=dictionary, passes=2, workers=2, per_word_topics = True, random_state = 0)

In [11]:
pprint(lda_model.print_topics())

[(0,
  '0.008*"market" + 0.008*"firm" + 0.007*"bank" + 0.007*"company" + '
  '0.006*"africa" + 0.006*"deal" + 0.006*"county" + 0.006*"state" + '
  '0.006*"plan" + 0.005*"farmer"'),
 (1,
  '0.010*"man" + 0.010*"woman" + 0.009*"police" + 0.007*"year" + 0.006*"two" + '
  '0.005*"death" + 0.005*"life" + 0.004*"killed" + 0.004*"attack" + '
  '0.004*"nairobi"'),
 (2,
  '0.012*"uhuru" + 0.009*"get" + 0.008*"ruto" + 0.008*"raila" + 0.008*"court" '
  '+ 0.007*"say" + 0.006*"governor" + 0.006*"leader" + 0.005*"president" + '
  '0.005*"world"')]


In [12]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow, dictionary,  sort_topics = False)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
