In [36]:
import json
import logging
import os
import pickle
import time
from collections import defaultdict
from pprint import pprint

import gensim
import numpy as np
import pandas as pd
import pyLDAvis
import requests
import spacy
from pyLDAvis import gensim as gensimvis
from pymongo import MongoClient
from tqdm import tqdm

In [26]:

# set up config to get data from mongoDB on AWS

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. Host, user name and auth database fall back to the
# values previously written here; the password has no fallback and must be
# supplied via MONGO_PASSWORD (a missing variable raises KeyError).
config = {
    'host': os.environ.get('MONGO_HOST', 'xx.xx.xx.xx:27017'),
    'username': os.environ.get('MONGO_USERNAME', 'grant'),
    'password': os.environ['MONGO_PASSWORD'],
    'authSource': os.environ.get('MONGO_AUTH_SOURCE', 'menu')
}

client = MongoClient(**config)

# Work against the database named "menu".
db = client.menu

In [27]:
# Handle to the "restaurants" collection of the connected database.
collection = db['restaurants']

In [39]:
# Only fetch restaurant documents that actually carry a 'menus' field.
cursor = collection.find(filter={'menus': {'$exists': True}})

In [None]:
# Integer-valued defaultdict: missing keys start at 0.
# NOTE(review): presumably meant to tally menu section names, but no visible
# cell ever writes to it — confirm it is still needed.
menu_types = defaultdict(int)

In [40]:
# Get restaurant menus, exclude Desserts and Beverages

# Menu sections to skip. Names are compared after stripping surrounding
# whitespace, so variants such as 'Desserts ' or 'Soft Drinks ' are caught
# without listing each spelling separately.
EXCLUDED_MENU_NAMES = {'Beverages', 'Soft Drinks', 'Desserts'}

# record['menus'] is JSON text (it is passed to json.loads below); records whose
# text is this short or shorter are skipped.
MIN_MENUS_JSON_LENGTH = 10

# A pymongo cursor can only be iterated once. Rewinding it first makes this cell
# safe to re-run: without it a second run would silently build an empty list.
cursor.rewind()

restaurants = []

for record in cursor:
    if len(record['menus']) <= MIN_MENUS_JSON_LENGTH:
        continue
    menu_items = []
    for menu in json.loads(record['menus']):
        section = menu['name']
        # Non-string names (e.g. None) are never excluded, as before.
        if isinstance(section, str) and section.strip() in EXCLUDED_MENU_NAMES:
            continue
        menu_items.extend(item['name'] for item in menu['items'])
    # One row per restaurant: [api key, comma-separated item names].
    restaurants.append([record['apiKey'], ', '.join(menu_items)])

In [41]:
# Serialize the collected `restaurants` list to data/menus.pickle.

with open('data/menus.pickle', 'wb') as menus_file:
    pickle.dump(restaurants, menus_file)

In [42]:
# Number of restaurants whose menus were collected.
len(restaurants)

9676

In [43]:
# Do LDA analysis on menus
# One row per restaurant: its API key and its comma-joined menu item names.
ng_train = pd.DataFrame(data=restaurants, columns=['apiKey', 'menu'])


In [44]:
def sent_to_words(sentences):
    """Lazily yield one token list per input sentence.

    Each sentence is coerced to ``str`` and tokenized with
    ``gensim.utils.simple_preprocess`` using ``deacc=True``.
    """
    for raw_sentence in sentences:
        text = str(raw_sentence)
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens
        

# Tokenize every restaurant's menu text; materialized because it is iterated several times below.
clean_sents = list(sent_to_words(ng_train['menu']))

In [45]:
# Detect two-word phrases; a higher min_count yields fewer phrases.
bigram = gensim.models.Phrases(clean_sents, min_count=20)
# Faster way to get a sentence clubbed as a bigram.
bigram_model = gensim.models.phrases.Phraser(bigram)

# Detect three-word phrases on top of the bigram-merged sentences.
trigram = gensim.models.Phrases(bigram[clean_sents], min_count=10)
# Faster way to get a sentence clubbed as a trigram.
trigram_model = gensim.models.phrases.Phraser(trigram)

In [46]:
# Preview the first five menus after bigram and trigram merging.
for tokens in clean_sents[:5]:
    phrased = trigram_model[bigram_model[tokens]]
    print(f'{" ".join(phrased)} \n')

imperial vegetarian rolls chicken satay fresh spring_rolls imperial shrimp rolls summer duck rolls golden tofu fried wonton fish cake kimchi soup chicken kimchi soup tofu kimchi soup shrimp kimchi soup seafood hot sour soup chicken hot sour soup tofu hot sour soup shrimp hot sour soup seafood coconut soup chicken coconut soup tofu coconut soup shrimp coconut soup seafood honey pork siagon noodle bowl five_spice chicken siagon noodle bowl bbq short_rib siagon noodle bowl vegan delight siagon noodle bowl honey pork garlic noodles five_spice chicken garlic noodles bbq short_rib garlic noodles garlic fish with garlic noodles garlic prawn with garlic noodles soft_shell_crab with garlic noodles banana fritters with ice_cream mango with sweet sticky_rice honey pork curry five_spice chicken curry bbq short_rib curry chuchi salmon curry the green sea_bass curry chicken salad papaya salad tofu salad steak salad salmon salad crispy fish basil frog string_bean chicken string_bean tofu spicy eggpla

In [47]:
# Replace each token list with its bigram- then trigram-merged form.
# NOTE: clean_sents is rebound here, so re-running this cell applies the phrase models a second time.
clean_sents = [trigram_model[bigram_model[tokens]] for tokens in clean_sents]

In [48]:
# Show the first five phrase-merged menus, one per paragraph.
for tokens in clean_sents[:5]:
    print(" ".join(tokens) + ' \n')

imperial vegetarian rolls chicken satay fresh spring_rolls imperial shrimp rolls summer duck rolls golden tofu fried wonton fish cake kimchi soup chicken kimchi soup tofu kimchi soup shrimp kimchi soup seafood hot sour soup chicken hot sour soup tofu hot sour soup shrimp hot sour soup seafood coconut soup chicken coconut soup tofu coconut soup shrimp coconut soup seafood honey pork siagon noodle bowl five_spice chicken siagon noodle bowl bbq short_rib siagon noodle bowl vegan delight siagon noodle bowl honey pork garlic noodles five_spice chicken garlic noodles bbq short_rib garlic noodles garlic fish with garlic noodles garlic prawn with garlic noodles soft_shell_crab with garlic noodles banana fritters with ice_cream mango with sweet sticky_rice honey pork curry five_spice chicken curry bbq short_rib curry chuchi salmon curry the green sea_bass curry chicken salad papaya salad tofu salad steak salad salmon salad crispy fish basil frog string_bean chicken string_bean tofu spicy eggpla

In [49]:
# get stop words, including my own stop words for things like tray, etc.
# The parser and NER components are disabled when loading the spaCy pipeline.
nlp = spacy.load('en', disable=['parser', 'ner'])
# One custom stop word per line; surrounding whitespace (incl. the newline) is stripped.
with open('data/menu_stop_words.txt') as stop_file:
    menu_stop_words = {line.strip() for line in stop_file}

In [50]:
# Add the custom menu stop words to spaCy's default stop-word set (in place).
nlp.Defaults.stop_words.update(menu_stop_words)

In [51]:


# NOUN, ADJ, VERB, ADV
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), keep_empty=False):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and passed through the
    module-level spaCy pipeline ``nlp``. A token's lemma is kept when its
    ``pos_`` tag is in ``allowed_postags``, its text is not in the module-level
    ``menu_stop_words`` set, and spaCy does not flag it as a stop word.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between calls.
        keep_empty: When False (the default), documents left with no lemmas
            are dropped, so the result may be shorter than ``texts`` and
            positions no longer correspond to input positions. Pass True to
            keep an empty list in their place and preserve that alignment.

    Returns:
        A list of lists of lemma strings.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))

        output_text = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags
            and str(token) not in menu_stop_words
            and not token.is_stop
        ]

        if output_text or keep_empty:
            texts_out.append(output_text)
    return texts_out

In [52]:
# Reduce every menu to the lemmas of its allowed, non-stop-word tokens.
# NOTE: clean_sents is rebound here, so re-running this cell lemmatizes already-lemmatized text.
clean_sents = lemmatization(texts=clean_sents)

In [53]:
# Build the token <-> integer id mapping from the lemmatized menus.
id2word = gensim.corpora.Dictionary(clean_sents)

# Bag-of-words representation of each menu.
corpus = list(map(id2word.doc2bow, clean_sents))

In [54]:
# Smoke-test lemmatization on a single hand-written document.
test = lemmatization([
    ['large', 'green', 'medium', 'apple', 'pear', 'laugh', 'entree', 'cater'],
])

In [55]:
# Display the smoke-test result.
test

[['green', 'apple', 'pear', 'laugh']]

In [56]:
# Human-readable (token, count) view of the first document's bag of words.
[(id2word[token_id], count) for token_id, count in corpus[0]]

[('banana', 1),
 ('basil', 1),
 ('bbq', 4),
 ('beef', 3),
 ('brown', 1),
 ('buddy', 1),
 ('cake', 1),
 ('chicken', 11),
 ('chuchi', 1),
 ('coconut', 4),
 ('crispy', 1),
 ('curry', 5),
 ('delight', 3),
 ('duck', 1),
 ('eggplant', 1),
 ('fish', 4),
 ('five_spice', 4),
 ('fresh', 1),
 ('fritter', 1),
 ('frog', 1),
 ('fry', 1),
 ('garlic', 9),
 ('golden', 1),
 ('green', 1),
 ('honey', 4),
 ('ice_cream', 1),
 ('imperial', 2),
 ('kimchi', 4),
 ('lava', 1),
 ('mango', 1),
 ('meggie', 1),
 ('noodle', 15),
 ('papaya', 1),
 ('pork', 4),
 ('prawn', 2),
 ('rice', 6),
 ('roll', 3),
 ('salad', 5),
 ('salmon', 3),
 ('satay', 1),
 ('sea_bass', 2),
 ('seafood', 4),
 ('shake', 1),
 ('short_rib', 4),
 ('shrimp', 4),
 ('siagon', 8),
 ('soft_shell_crab', 1),
 ('sole', 1),
 ('soup', 16),
 ('sour', 4),
 ('spicy', 3),
 ('spring_roll', 1),
 ('steak', 1),
 ('steam', 2),
 ('steamed', 1),
 ('stew', 1),
 ('sticky_rice', 1),
 ('string_bean', 2),
 ('summer', 1),
 ('sweet', 1),
 ('tenderloin', 1),
 ('tofu', 6),
 ('ve

In [57]:
# Run analysis for multiple topics

# Notebook rendering does not depend on the loop variable, so enable it once.
pyLDAvis.enable_notebook()

log_formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Every fitted model keyed by its topic count; `lda_model` on its own only ever
# holds the model from the final iteration.
lda_models = {}

for i in range(8,16):
    filen = 'lda_model' + str(i) + '.log'
    # logging.basicConfig does nothing once the root logger has a handler, so
    # calling it on each iteration would send every run's output to the first
    # log file. Attach a dedicated FileHandler for this run and detach it
    # afterwards so each topic count gets its own log.
    file_handler = logging.FileHandler(filen)
    file_handler.setFormatter(log_formatter)
    root_logger.addHandler(file_handler)
    try:
        lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=i,
                                               random_state=100,
                                               chunksize=100,
                                               passes=128,
                                               per_word_topics=True)
        lda_models[i] = lda_model
        vis = gensimvis.prepare(lda_model, corpus, id2word)
        html = 'lda_new_stop_' + str(i) + '.html'
        pyLDAvis.save_html(vis, html)
    finally:
        # Always release the log file, even if training or export fails.
        root_logger.removeHandler(file_handler)
        file_handler.close()


In [29]:
# Topic distribution of the first document, most probable topic first.
top_topics = sorted(
    lda_model.get_document_topics(corpus[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print(top_topics)

[(9, 0.91819555), (6, 0.07777809)]


In [30]:

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Name the file after the model actually being visualized: `lda_model` is
# whichever model was fitted last, which is not necessarily a 12-topic one, so
# a hard-coded 'lda-12.html' could mislabel the output.
pyLDAvis.save_html(vis, f'lda-{lda_model.num_topics}.html')