In [255]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import NMF
from collections import defaultdict
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
#from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

In [218]:
data = pd.read_csv('data/data_not_starbucks.csv').drop('Unnamed: 0', axis=1)

In [170]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 6 columns):
location.lat        474 non-null float64
location.lng        474 non-null float64
name                474 non-null object
combined_reviews    474 non-null object
num_review_words    474 non-null int64
final_address       474 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 22.3+ KB


### Filter out shops with less than 20 words in their review

In [330]:
over20_data = data[data['num_review_words'] >= 20]

### Adding custom stopwords to sklearns defaults

In [333]:
stopwords = list(stop_words.ENGLISH_STOP_WORDS)

In [334]:
stopwords += ['coffee', 'shop', 'coffeeshop', 'starbucks', 'wa', 'seattle']

### Added LemmaTokenizer to lemmatize words 

In [335]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [336]:
tf = TfidfVectorizer(strip_accents='unicode',
                     tokenizer=LemmaTokenizer(),
                     stop_words=stopwords,
                     max_features=500)

In [337]:
tfidf = tf.fit_transform(over20_data['combined_reviews'])

In [338]:
words = tf.get_feature_names()

In [349]:
nmf = NMF(n_components=20)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [350]:
H = nmf.components_
W = nmf.transform(tfidf)

In [351]:
H.shape

(20, 500)

In [352]:
W.shape

(435, 20)

In [353]:
top_words_index = np.argsort(-H)[:,0:5]
most_common_words_per_topic = np.array(words)[top_words_index]
for i, items in enumerate(most_common_words_per_topic):
    print(i, items)

0 ['good' 'friendly' 'staff' 'little' 'cute']
1 ['sandwich' 'breakfast' 'delicious' 'soup' 'egg']
2 ['brew' 'bean' 'cold' 'pour' 'espresso']
3 ['bagel' 'cheese' 'good' 'cream' 'sandwich']
4 ['crepe' 'like' 'wanted' 'stop' 'neighborhood']
5 ['cupcake' 'ice' 'cake' 'cream' 'pizza']
6 ['stand' 'drink' 'time' 'barista' 'girl']
7 ['lot' 'work' 'beer' 'place' 'wifi']
8 ['donut' 'doughnut' 'fashioned' 'pot' 'old']
9 ['croissant' 'pastry' 'good' 'bakery' 'baked']
10 ['chocolate' 'hot' 'mocha' 'dark' 'order']
11 ['great' 'service' 'love' 'place' 'people']
12 ['owner' 'neighborhood' 'local' 'business' 'new']
13 ['latte' 'best' 'try' 'tea' 'mocha']
14 ['market' 'pike' 'view' 'place' 'day']
15 ['waffle' 'beer' 'delicious' 'cafe' 'mean']
16 ['building' 'located' 'floor' 'store' 'center']
17 ['biscuit' 'gravy' 'breakfast' 'worth' 'delicious']
18 ['food' 'breakfast' 'egg' 'burrito' 'brunch']
19 ['espresso' 'white' 'best' 'shot' 'nico']


In [356]:
#def top_shops(W):
shop_dict = defaultdict(list)
for index, item in enumerate(W):
    key = np.argmax(item)
    value = item[key]
    name = over20_data['name'][index]
    shop_dict[key].append([value, name])
top_shop_names = defaultdict(list)
for feature in shop_dict:
    top_shop_names[feature] = list(np.sort(np.array(shop_dict[feature]).T)[1,-5:-1])
top_shop_names

KeyError: 11

> [0;32m/Users/ReddingSkinnyRobot/Galvanize/capstone_project/pandas/_libs/hashtable_class_helper.pxi[0m(817)[0;36mpandas._libs.hashtable.Int64HashTable.get_item[0;34m()[0m

ipdb> up
> [0;32m/Users/ReddingSkinnyRobot/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/indexes/base.py[0m(2576)[0;36mget_value[0;34m()[0m
[0;32m   2574 [0;31m                    [0;32mraise[0m [0me1[0m[0;34m[0m[0m
[0m[0;32m   2575 [0;31m            [0;32mexcept[0m [0mException[0m[0;34m:[0m  [0;31m# pragma: no cover[0m[0;34m[0m[0m
[0m[0;32m-> 2576 [0;31m                [0;32mraise[0m [0me1[0m[0;34m[0m[0m
[0m[0;32m   2577 [0;31m        [0;32mexcept[0m [0mTypeError[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m   2578 [0;31m            [0;31m# python 3[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> up
> [0;32m/Users/ReddingSkinnyRobot/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/series.py[0m(653)[0;36m__getitem__[0;34m()[0m
[0;32m    651 [0;

In [355]:
%pdb

Automatic pdb calling has been turned ON
