# Generating topics for venue tips in London
* prepare tips data
* preprocessing tips
* generating topics
* evaluation

### 1.prepare tips data
We collect tips from Foursquare. 

There are around 11000 places and about 1700 places have more 200 tips.
All of the tips(English tips) were collected previously. 

In [None]:
# import useful libs
import graphlab as gl
import pandas as pd
import json
import re
import pyLDAvis 
import pyLDAvis.graphlab

In [65]:
# load the pre-processed data
sf = gl.SFrame.read_json('london_venue.json')
#sf.head()

### 2. preprocessing
* put all tips for one venue together, store them in a new column 'all_tips'

In [69]:
# put all tips together
def get_feature(row):
    feature=''
    if row['tips']!=None:
        for t in row['tips']:
            feature += t
    
    tags = 5*row['tags']
    for tag in tags:
        feature += tag+' '
    if row['description']!=None:
        feature += row['description']
        feature += ' '
        feature += row['description']
    return feature
sf['feature'] = sf.apply(lambda x: get_feature(x))

   * Text cleaning
   * Lemmatization

In [70]:
# get words, dropping punctuations etc.
sf['feature'] = sf['feature'].apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
docs = gl.text_analytics.tokenize(sf['feature'])

In [71]:
# lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
docs_lemmatized = docs.apply(lambda x: [wordnet_lemmatizer.lemmatize(w) for w in x])
#docs_lemmatized[0]

* Tokenization
* Bag-of-words representation
* Stop words and less frequent words removal

In [57]:
# Bag-of-words
docs_cleaned = gl.text_analytics.count_words(docs_lemmatized)
# Remove stop words
docs_cleaned = docs_cleaned.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
# Remove less freq words
docs_cleaned = docs_cleaned.dict_trim_by_values(2)

###  3. Gnerate topics
* remove docs which has less than 3 keywords
* create a model
* check and evaluate

In [104]:
# remove docs which has less than 3 keywords
ix = docs_cleaned.apply(lambda x:len(x.keys())>2)
docs_new = docs_cleaned[ix]

In [105]:
# Show how many docs have been removed
print 1 - 1.0*len(docs_new)/len(docs_cleaned)

0.559368737475


In [106]:
# create a topic model 
#associations = gl.SFrame({'word':['local','authentic', 'london', 'good', 'lunch','delicious','amazing','lover','walk'],
#                                'topic': [0, 0, 0, 1, 1,2,2,3,3]})

topic_model = gl.topic_model.create(docs_new,num_topics=20, num_iterations=200)

In [109]:
for i in range(20):
    print 'topic ',i,topic_model.get_topics(num_words=5,output_type='topic_words')[i]['words']

topic  0 ['coffee', 'staff', 'place', 'nice', 'friendly']
topic  1 ['bar', 'cocktail', 'great', 'music', 'place']
topic  2 ['art', 'free', 'museum', 'place', 'exhibition']
topic  3 ['food', 'great', 'wine', 'restaurant', 'menu']
topic  4 ['station', 'train', 'time', 'free', 'don']
topic  5 ['food', 'good', 'sushi', 'great', 'chicken']
topic  6 ['food', 'good', 'service', 'wa', 'staff']
topic  7 ['burger', 'great', 'good', 'chicken', 'wa']
topic  8 ['park', 'place', 'great', 'london', 'garden']
topic  9 ['london', 'don', 'worth', 'good', 'time']
topic  10 ['market', 'store', 'shop', 'street', 'gym']
topic  11 ['food', 'pizza', 'restaurant', 'italian', 'good']
topic  12 ['palace', 'place', 'queen', 'wa', 'buckingham']
topic  13 ['london', 'office', 'uk', 'event', 'offer']
topic  14 ['great', 'good', 'breakfast', 'food', 'place']
topic  15 ['design', 'book', 'medium', 'student', 'digital']
topic  16 ['great', 'theatre', 'wa', 'cinema', 'seat']
topic  17 ['view', 'london', 'bridge', 'tower

In [None]:
# These are the 5th topic words
print topic_model.get_topics(num_words=10,output_type='topic_words')[11]

In [None]:
docs_in_topic_5 = docs_new[topic_model.predict(docs_new)==11]

In [None]:
# select venues which are predicted to be topic 5
sf_new = sf[ix]
venue_in_topic5 = sf_new[topic_model.predict(docs_new)==11]

In [None]:
venue_in_topic5['category','name','tips'][21]

In [110]:
#Automatic Rendering of Visualisation
pyLDAvis.enable_notebook()
pyLDAvis.graphlab.prepare(topic_model, docs_new)