In [454]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import NMF
from collections import defaultdict
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
#from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from geopy.distance import vincenty
from geopy.distance import great_circle

In [359]:
data = pd.read_csv('data/data_not_starbucks.csv').drop('Unnamed: 0', axis=1)

In [360]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 6 columns):
location.lat        474 non-null float64
location.lng        474 non-null float64
name                474 non-null object
combined_reviews    474 non-null object
num_review_words    474 non-null int64
final_address       474 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 22.3+ KB


### Filter out shops with less than 20 words in their review

In [361]:
over20_data = data[data['num_review_words'] >= 20].reset_index().drop('index', axis=1)

### Adding custom stopwords to sklearns defaults

In [520]:
stopwords = list(stop_words.ENGLISH_STOP_WORDS)

In [521]:
stopwords += ['coffee', 'shop', 'coffeeshop', 'starbucks', 'wa', 'seattle', 'cafe', 'caffee']

### Added LemmaTokenizer to lemmatize words 

In [522]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [523]:
tf = TfidfVectorizer(strip_accents='unicode',
                     tokenizer=LemmaTokenizer(),
                     stop_words=stopwords,
                     max_features=500)

In [524]:
tfidf = tf.fit_transform(over20_data['combined_reviews'])

In [525]:
words = tf.get_feature_names()

In [526]:
nmf = NMF(n_components=40)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=40, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [527]:
H = nmf.components_
W = nmf.transform(tfidf)

In [528]:
H.shape

(40, 500)

In [529]:
W.shape

(435, 40)

In [533]:
top_words_index = np.argsort(-H)[:,0:16]
most_common_words_per_topic = np.array(words)[top_words_index]
for i, items in enumerate(most_common_words_per_topic):
    print(i, items)

0 ['wifi' 'lot' 'place' 'work' 'outlet' 'space' 'table' 'music' 'seating'
 'great' 'atmosphere' 'plenty' 'nice' 'study' 'chair' 'big']
1 ['sandwich' 'breakfast' 'delicious' 'egg' 'turkey' 'yum' 'panini' 'snack'
 'try' 'sweet' 'nice' 'muffin' 'chai' 'coconut' 'wifi' 'bomb']
2 ['latte' 'best' 'try' 'art' 'iced' 'amazing' 'soy' 'just' 'ive' 'bean'
 'yum' 'make' 'chai' 'town' 'raspberry' 'come']
3 ['bagel' 'cheese' 'cream' 'breakfast' 'sandwich' 'good' 'egg' 'street'
 'really' 'try' 'tomato' 'love' 'best' 'lady' 'chai' 'bread']
4 ['crepe' 'like' 'wanted' 'stop' 'neighborhood' 'instead' 'strawberry'
 'great' 'regular' 'daily' 'quite' 'door' 'maple' 'usually' 'dont'
 'ordered']
5 ['cupcake' 'cake' 'velvet' 'red' 'free' 'salted' 'delicious' 'order'
 'bakery' 'hour' 'today' '10' 'ice' 'caramel' 'lavender' 'stumptown']
6 ['customer' 'service' 'place' 'right' 'rude' 'location' 'asked'
 'convenient' '3' 'say' 'time' '’' 'dont' 'think' 'star' 'making']
7 ['beer' 'wine' 'good' 'selection' 'night' '

In [531]:
#def top_shops(W):
shop_dict = defaultdict(list)
for index, item in enumerate(W):
    key = np.argmax(item)
    value = item[key]
    name = over20_data['name'][index]
    shop_dict[key].append([value, name])
top_shop_names = defaultdict(list)
for feature in shop_dict:
    top_shop_names[feature] = list(np.sort(np.array(shop_dict[feature]).T)[1,-10:-1])
top_shop_names

defaultdict(list,
            {0: ['Down Pour Coffee Bar',
              'Drip City Coffee',
              'Porchlight Coffee & Records',
              'Roy Street Coffee & Tea',
              'Seattle Sunshine Coffee',
              'Stage Door Cafe',
              'The Highlands',
              'The Maple Leaf Living Room',
              'Uptown Espresso'],
             1: ['Cafe Kopi',
              'Celesto Espresso',
              'Discovery Espresso',
              'Grand Central Bakery',
              'Midtown Espresso Cafe',
              'Monkey Grind Espresso Bar',
              'Treehouse Coffee',
              'Trinity Market',
              'World Class Coffee'],
             2: ['Bustle Caffe',
              'Fremont Coffee Company',
              'Makeda & Mingus Café',
              'Monorail Espresso',
              'Moore Coffee',
              "Natalie's Organic Coffee",
              'Rococo Coffee Roasting'],
             3: ['Cherry Street Coffee House',
         

In [544]:
a = [len(shop_dict[feature]) for feature in shop_dict]

In [550]:
np.argsort(a)

array([27, 39, 31, 34, 25,  6, 18, 17, 11, 16,  4, 38, 26, 22, 15, 32, 36,
       30, 37, 23, 21, 13, 12,  5,  2, 24,  9, 14, 35, 33, 20, 29,  1, 10,
       19,  8,  0, 28,  7,  3])

In [549]:
np.argsort?