In [217]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import NMF
from collections import defaultdict
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [218]:
data = pd.read_csv('data/data_not_starbucks.csv').drop('Unnamed: 0', axis=1)

In [170]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 6 columns):
location.lat        474 non-null float64
location.lng        474 non-null float64
name                474 non-null object
combined_reviews    474 non-null object
num_review_words    474 non-null int64
final_address       474 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 22.3+ KB


### Added LemmaTokenizer to lemmatize words 

In [226]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [228]:
tf = TfidfVectorizer(strip_accents='unicode',
                     tokenizer=LemmaTokenizer(),
                     stop_words='english',
                     max_features=500)

In [229]:
tfidf = tf.fit_transform(data['combined_reviews'])

In [230]:
words = tf.get_feature_names()

In [231]:
nmf = NMF(n_components=20)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [232]:
H = nmf.components_
W = nmf.transform(tfidf)

In [233]:
H.shape

(20, 500)

In [234]:
W.shape

(474, 20)

In [241]:
top_words_index = np.argsort(-H)[:,0:5]
most_common_words_per_topic = np.array(words)[top_words_index]
for i, items in enumerate(most_common_words_per_topic):
    print(i, items)

0 ['coffee' 'shop' 'good' 'spot' 'bean']
1 ['food' 'breakfast' 'egg' 'biscuit' 'burrito']
2 ['bagel' 'cheese' 'good' 'sandwich' 'cream']
3 ['best' 'espresso' 'latte' 'seattle' 'try']
4 ['wa' 'location' 'nice' 'seattle' 'barista']
5 ['wifi' 'lot' 'good' 'work' 'beer']
6 ['crepe' 'like' 'wanted' 'great' 'stop']
7 ['cupcake' 'cake' 'ice' 'cream' 'velvet']
8 ['donut' 'doughnut' 'fashioned' 'pot' 'wonderful']
9 ['starbucks' 'store' 'located' 'make' 'dont']
10 ['great' 'baristas' 'service' 'staff' 'super']
11 ['chocolate' 'hot' 'mocha' 'dark' 'shot']
12 ['croissant' 'pastry' 'good' 'amazing' 'baked']
13 ['neighborhood' 'friendly' 'owner' 'shop' 'little']
14 ['sandwich' 'breakfast' 'salad' 'delicious' 'soup']
15 ['waffle' 'beer' 'delicious' 'coffee' 'cafe']
16 ['place' 'market' 'customer' 'pike' 'day']
17 ['lunch' 'dog' 'special' 'chip' 'food']
18 ['line' 'stand' 'service' 'order' 'fast']
19 ['pizza' 'salad' 'food' 'building' 'card']


In [237]:
#def top_shops(W):
shop_dict = defaultdict(list)
for index, item in enumerate(W):
    key = np.argmax(item)
    value = item[key]
    name = data['name'][index]
    shop_dict[key].append([value, name])
top_shop_names = defaultdict(list)
for feature in shop_dict:
    top_shop_names[feature] = list(np.sort(np.array(shop_dict[feature]).T)[1,-5:-1])
top_shop_names

defaultdict(list,
            {0: ['Victrola',
              'Victrola Cafe and Roastery',
              'Voxx Coffee',
              'Zoka Coffee'],
             1: ['Seattle Central Grind',
              'Solsticio',
              'Stone Way Café',
              "Terry's 14 Carrot Cafe"],
             2: ['Fat Ducks Deli & Bakery',
              'Grateful Bread Baking Company & Cafe',
              "Lama G's",
              'Meadow Brew'],
             3: ['Realfine Coffee',
              'UW: Reboot Café',
              'Uptown Espresso',
              'Vero Cafe'],
             4: ['The Living Room - GC UD',
              'The Seattle Grind',
              'Urban Coffee House',
              'Visions Espresso Service'],
             5: ['Voxx Coffee',
              'Wayward Coffeehouse',
              'Woodland Coffee',
              'Zoka Coffee Roaster & Tea Company'],
             6: ['Joe Bar',
              'Le Petite Cafe',
              'Pearls Tea & Coffee',
              "