In [1]:
import pandas as pd
import numpy as np
import time
import nltk
import re
import pickle

from gensim import corpora, models, matutils, similarities
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from textacy.preprocess import preprocess_text
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
txt = pd.read_pickle('combined_blogs.pickle')
print len(txt)
txt.head()

2573


Unnamed: 0,Title,Details,Date,Processed Summary,Processed Detail,Word_count,Summary
0,Great Deals That Reward You When Dining With P...,"Singapore might be known for many things, but ...","Oct 31st, 2017",great deal reward dining paradise group,known thing stand colourful scene flavour borr...,504,Others like their Crispy BBQ Honey Pork Bun ($...
1,10 Exciting Highlights at AccorHotels Food & W...,"\n AccorHotels, the largest and most prestigio...","Oct 31st, 2017",exciting highlight accorhotels wine festival,accorhotels largest prestigious hotel operator...,920,"During the festival from 4 to 12 November, mor..."
2,Man Man Unagi – Delicious Unagi that Received ...,Since the release of the Singapore Michelin Gu...,"Oct 30th, 2017",man man unagi delicious unagi received bib gou...,release michelin guide bib gourmand awardees q...,366,Each portion features a bed of rice topped wit...
3,Kopi More – Brewing Traditional Nanyang Kopi W...,"Since I first opened my eyes, and heart, to th...","Oct 29th, 2017",kopi brewing traditional nanyang kopi espresso...,opened eye heart world coffee going looking go...,147,"Well, he makes traditional Nanyang coffee usin..."
4,11 Finger-Licking-Good Biryanis That You Will ...,"In Indian culture, eating with your hands is a...","Oct 28th, 2017",fingerlickinggood biryani love,indian culture eating hand mark respect host s...,697,"I liked the generous serving of rice, and robu..."


In [3]:
# Drop rows that are exact duplicates across all columns, then show the remaining row count.
# NOTE(review): if 'Unnamed: 0' is a per-row counter, no two rows can be identical and this
# call removes nothing (the count is unchanged at 2573) - consider deduplicating on a subset.
txt = txt.drop_duplicates()
len(txt)

2573

In [4]:
# Columns used downstream: the processed article text is tokenised for the topic model,
# and the summaries are the snippets paired with each document's topic / searched by query.
doc_summary = txt['Summary']
document = txt['Processed Detail']

In [5]:
# Tokenise every document on whitespace. corpora.Dictionary is fed a list of
# token lists (one list per document), not one long string per document.
texts = [text.split() for text in document]

In [6]:
# Peek at the first ten tokens of the first document
print texts[0][:10]

[u'known', u'thing', u'stand', u'colourful', u'scene', u'flavour', u'borrowed', u'global', u'neighbour', u'variety']


In [7]:
# convert texts to dictionary where every token in texts is indexed
dictionary = corpora.Dictionary(texts)

# token to id shows the id(index) for each token
d = dictionary.token2id
for key, value in d.items()[:3]:
    print key, value

fawn 31504
nodaiwa 18387
gai 5681


In [8]:
# doc to bag of words (bow) converts the words to its dictionary indices(ids) and their respective counts in each text doc
corpus = [dictionary.doc2bow(text) for text in texts]
print corpus[0][:10]

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 2), (5, 2), (6, 1), (7, 2), (8, 2), (9, 1)]


In [9]:
# Set up and run LDA model
number_of_topics = 10
start = time.time()

lda = models.LdaModel(corpus, num_topics=number_of_topics, iterations=1000, passes=5, id2word=dictionary)

end = time.time()
exe_time = (end - start)/60
print 'Time taken :',(exe_time),' minutes'

Time taken : 2.55165193478  minutes


In [10]:
# lda.save('foodblog.lda')
# LdaModel.load returns the loaded model; without the assignment the result was
# discarded and the following cells kept using whatever `lda` was in memory.
lda = models.LdaModel.load('foodblog.lda')

In [11]:
# Objective 1 : Identify the topics (trends)
# print_topics gives one (topic id, 'weight*"word" + ...') pair per topic, ten words each
topics = lda.print_topics(num_words=10, num_topics=number_of_topics)

In [13]:
# Display the (topic id, weighted-word string) pairs from index 5 onwards
topics[5:]

[(5,
  u'0.023*"coffee" + 0.013*"new" + 0.012*"cafe" + 0.010*"restaurant" + 0.008*"road" + 0.007*"good" + 0.006*"orchard" + 0.005*"menu" + 0.005*"place" + 0.005*"park"'),
 (6,
  u'0.020*"crab" + 0.015*"egg" + 0.008*"chilli" + 0.007*"hong" + 0.007*"salted" + 0.007*"kong" + 0.006*"mooncakes" + 0.006*"curry" + 0.006*"flavour" + 0.005*"rice"'),
 (7,
  u'0.007*"chicken" + 0.007*"sauce" + 0.007*"pizza" + 0.005*"restaurant" + 0.005*"usdnumber++" + 0.005*"durian" + 0.005*"menu" + 0.004*"seafood" + 0.004*"pasta" + 0.004*"meat"'),
 (8,
  u'0.010*"japanese" + 0.010*"ramen" + 0.010*"rice" + 0.009*"restaurant" + 0.009*"korean" + 0.008*"bowl" + 0.008*"sauce" + 0.007*"beef" + 0.005*"like" + 0.005*"soup"'),
 (9,
  u'0.012*"noodle" + 0.011*"rice" + 0.011*"chicken" + 0.010*"stall" + 0.009*"soup" + 0.008*"fish" + 0.008*"fried" + 0.008*"pork" + 0.007*"sauce" + 0.006*"mee"')]

In [36]:
# Extract topic words without the probability
topics_str = []
for _topic_id, topic_terms in topics:
    # Run the regex on the term string itself, not on str() of the whole tuple:
    # the tuple repr carries a u'' prefix (previously stripped with t[1:]) and
    # would turn any non-ASCII character into a backslash escape whose letters
    # get matched as bogus "words".
    t = re.findall('[a-z]+', topic_terms)
    # T is kept as a global on purpose: a later cell prints it as the keywords
    # of the last topic.
    T = ' '.join(t)
    topics_str.append(T)

topics_str[5:]

['coffee new cafe restaurant road good orchard menu place park',
 'crab egg chilli hong salted kong mooncakes curry flavour rice',
 'chicken sauce pizza restaurant usdnumber durian menu seafood pasta meat',
 'japanese ramen rice restaurant korean bowl sauce beef like soup',
 'noodle rice chicken stall soup fish fried pork sauce mee']

In [40]:
# Persist topics_str for the web application.
# Protocol -1 means "highest available"; per the author's note the Flask app
# cannot load the pickle otherwise.
with open('topics_str.pkl', 'wb') as out_file:
    pickle.dump(topics_str, out_file, pickle.HIGHEST_PROTOCOL)
# To read it back:
# with open('topics_str.pkl', 'rb') as f:
#    topics_str = pickle.load(f)

In [16]:
# Pick the documents that best matches the selected topics accordingly to their corresponding probabiities

# convert to lda transformed corpus. for each doc in lda_corpus, it provides the probability of that doc to the topic,
# in descending order of probability - (topic, probability)
lda_corpus = lda[corpus]

print type(lda[corpus])
print len(lda_corpus)
print lda_corpus[0]

<class 'gensim.interfaces.TransformedCorpus'>
2573
[(1, 0.22005904816650237), (7, 0.13157006706045862), (8, 0.14840992020279783), (9, 0.49877253613182454)]


In [17]:
# Determine the threshold - use the average of topic probability for all the docs
# Use the threshold to select desired documents

scores = []
for doc in lda_corpus:
    for topic in doc:
        # topic here is a tuple containing the topic and associated probability. topic[1] is the probability
        scores.append(topic[1])
            
threshold = sum(scores)/len(scores)

print threshold

0.264668534506


In [18]:
# Following select the documents with sorted probabilities (descending) that are associated with a particular topic
start = time.time()

Topics = []
# loop through the number of topics stated for the lda model
for topic in range(number_of_topics):
    cluster = []
    # link the lda transformed corpus (refer above) to the summary document
    for i,j in zip(lda_corpus, doc_summary):
        # check if first doc in corpus belongs to the topic specified and that its probability is above threshold
        if ((i[0][0] == topic) and (i[0][1] > threshold)):
            cluster.append((i[0][1], j))

    cluster_sorted = sorted(cluster, key=lambda item: -item[0])

    Topics.append(cluster_sorted)
    
end = time.time()
exe_time = (end-start)/60
print 'Time taken : ', exe_time, ' minutes'

Time taken :  1.53991643588  minutes


In [19]:
# Persist Topics (per-topic lists of (probability, summary) tuples) for the web application.
# NOTE(review): topics_str.pkl is written with protocol -1 while this file uses the
# default protocol - confirm the web app loads both correctly.
with open('Topics.pkl', 'wb') as out_file:
    pickle.dump(Topics, out_file)

In [20]:
# Check how how many documents cross the threshold for each topic
for i in range(10):
    print len(Topics[i])

589
84
91
109
4
21
12
36
12
32


In [21]:
# print first 5 summaries associated with Topic 9
print 'Topic 10' # Topics[9]
print
print 'Keywords : ', T
print
try:
    for topic in Topics[9][:5]:
        # topic is a tuple containing the probability and the summary
        print topic[1]
        print
except:
    print 'No snippets found !'

Topic 10

Keywords :  noodle rice chicken stall soup fish fried pork sauce mee

With prices beginning from as low as $2.50, the ingredients that come with the Koka instant noodle soup are generous.A bowl of Lor Mee ($2.50$3.00) consists of a generous amount of ingredients such as fish cake, egg, braised pork belly, ngo hiang and flaky fish meat.Address: #03-146, Taman Jurong Food Centre, 3 Yung Seng Road, Singapore 618499 Opening Hours: Tue-Sat 5am to 3pm.Address: #01-28, Yuhua Village Food Centre, Blk 254 Jurong East Street 24, Singapore 600254 Opening Hours: 9am to 1pm.Address: #01-48, Yuhua Village Market & Food Centre, Blk 254 Jurong East Street 24, Singapore 600254 Opening Hours: 6am to 10am, no fixed rest days.

Website Source One of my all-time favourite hawker dishes is fried carrot cake, and those living near Bukit Gombak MRT Station would have come across Qiu Ji at Meng Soon Huat Food Centre.Opening hours: Daily 9am to 4pm Source If there's a place on this list that's definit

In [22]:
# Objective 2 : Search the summaries for a food category

In [23]:
query = 'korean food'

# Case-insensitive substring search over the summaries
needle = query.lower()
nl = [summary for summary in txt['Summary'] if needle in summary.lower()]

In [24]:
# Number of summaries that contain the query string
len(nl)

9

In [25]:
# print out search results
try:
    for i in range(5):
        print nl[i]
        print
except:
    print 'No snippets found ! explore another category'

Baro Baro's rendition of a Tuna Kimbab won't blow your mind to the ends of the globe, but good enough to keep your Korean food cravings on hold.For the noodles and soup, I'm sure that we all know how it tastes like.After gobbling down like ten chewy rice cakes, it might be a tad spicy for some.There is nothing amazing about this Korean eatery, but I wouldn't mind popping by to enjoy some Kimbab.Baro Baro is having a promotion now - order any set meal and you'll receive a light bulb cooler on the house.

While K-Tower's rendition is very likeable, it doesn't scratch the itch of eating traditional Korean pancake.Besides the seafood pancake, the other item with the word "signature" in its name is the signature pork rib stew with instant noodles ($16).To quote Mr Fitness, "Just because they are in the majority doesn't mean they are right." I like it because underneath the sweetness, there is a potent taste of soy bean, which is the foundation of many Korean dishes.-Wang Dae Bak Korean Rest

In [26]:
# Objective 3 : Sentiment Analysis on eateries

In [27]:
query = "bak kut teh"

# Case-insensitive substring search over the summaries
needle = query.lower()
nl = [summary for summary in txt['Summary'] if needle in summary.lower()]

In [28]:
# Number of summaries that mention the eatery / dish being analysed
len(nl)

21

In [29]:
# Instantiate the VADER sentiment analyzer (SentimentIntensityAnalyzer from vaderSentiment);
# it is queried per snippet via polarity_scores() in the next cell.
analyzer = SentimentIntensityAnalyzer()

In [30]:
# Use the compound scores with specified threshold to segregate into positive, negative and neutral groups

sent_threshold = 0.7
try:
    pos = []
    neu = []
    neg = []
    for i in range(len(nl)):
        vs = analyzer.polarity_scores(nl[i])
        
        if vs['compound'] > sent_threshold:
            pos.append((vs['compound'], nl[i]))
        elif vs['compound'] < -sent_threshold:
            neg.append((vs['compound'], nl[i]))
        else:
            neu.append((vs['compound'], nl[i]))
        
except:
    print 'No snippets found ! explore another category'      

In [31]:
# print the percentage of each sentiment group
pos_percent = round(float(len(pos)) / float(len(nl)) * 100, 2)
neg_percent = round(float(len(neg)) / float(len(nl)) * 100, 2)
neu_percent = round(float(len(neu)) / float(len(nl)) * 100, 2)

print pos_percent, neg_percent, neu_percent

76.19 0.0 23.81


In [32]:
# Positive sentiment group
try:
    for i,j in pos[:5]:
        print 'Positive Sentiment score : ',i
        print j
        print
except:
    pass

Positive Sentiment score :  0.7463
Expanding from their Balestier and Hotel Boss outlets, Founder Bak Kut Teh has set up a new branch in Bugis (conveniently located just across the street from Bugis Junction).Look forward to mouthwatering new dishes, which are exclusive to the Bugis outlet, to complete your BKT experience!The Steamed Cod Fish ($18.80) is the most eye-catching dish on the menu which comprises around 3 dozen options.The Steamed Minced Meat ($6.80) is another classic Chinese dish that is easy to prepare, but not easy to perfect.Founder BKT's new outlet is situated in such a convenient location, so there's nothing separating you and this quintessential Chinese dish.

Positive Sentiment score :  0.9184
However, if we compare across eateries that serve Klang style bak kut teh in Singapore, Leong Kee is the undisputed winner!The Bak Kut Teh ($19.80 for 3 pax) comes with pork ribs, liver, stomach, intestine, beancurd skin, and vegetables, soaked in a broth that has a strong an

In [33]:
# Negative sentiment group
try:
    for i,j in neg[:5]:
        print 'Negative Sentiment score : ',i
        print j
        print
except:
    pass

In [34]:
# Neutral sentiment group
try:
    for i,j in neu[:5]:
        print 'Neutral Sentiment score : ',i
        print j
        print
except:
    pass

Neutral Sentiment score :  0.0
Daily: 11am - 10pm BBQ EXPRESS 53 Ang Mo Kio Avenue 3,Sat & Sun: 7am - 11pm BURGER KING 53 Ang Mo Kio Avenue 3,Mon to Fri: 9am - 10.30pm CHOCOEXPRESS 53 Ang Mo Kio Avenue 3,Daily: 10.30am - 10pm COFFEE KAKI 53 Ang Mo Kio Avenue 3,Daily: 8am - 11pm MR BEAN 53 Ang Mo Kio Avenue 3,Daily: 8.30am - 10pm NEW MANLEE BAK KUT TEH 53 Ang Mo Kio Avenue 3,Sat & Sun: 8am - 10.30pm THE COFFEE BEAN & TEA LEAF 53 Ang Mo Kio Avenue 3,

Neutral Sentiment score :  0.6369
Daily: 11am - 10pm 85 REDHILL TEOCHEW FISHBALL NOODLE #B2-39 Plaza SingapuraDaily: 10am - 10pm DAEBAK KOREAN FOOD EXPRESS #B2-44 Plaza SingapuraDaily: 10am - 10pm MUCHOS MEXICAN BAR & RESTAURANT #01-34 Plaza SingapuraDaily: 11am - 10pm OLD CHANG KEE #B2-02 Plaza SingapuraDaily: 11am - 10pm OLD STREET BAK KUT TEH #B2-07 Plaza SingapuraDaily: 10.30am - 10pm Photo Credit: sapporolionsg RIVE GAUCHE PATISSERIE #B2-57 Plaza SingapuraDaily: 11.30am - 10pm SAKAE TEPPANYAKI #B2-5254 Plaza SingapuraDaily: 11am - 10pm