In [119]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import NMF
from collections import defaultdict

In [4]:
data = pd.read_csv('data/seattle_only_less_columns.csv').drop('Unnamed: 0', axis=1)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 6 columns):
location.lat        599 non-null float64
location.lng        599 non-null float64
name                599 non-null object
combined_reviews    599 non-null object
num_review_words    599 non-null int64
final_address       599 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 28.2+ KB


In [96]:
tf = TfidfVectorizer(strip_accents='unicode', 
                     stop_words='english',
                     max_features=500)

In [99]:
tfidf = tf.fit_transform(data['combined_reviews'])

In [100]:
words = tf.get_feature_names()

In [111]:
nmf = NMF(n_components=20)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [112]:
H = nmf.components_
W = nmf.transform(tfidf)

In [113]:
H.shape

(20, 500)

In [121]:
W.shape

(599, 20)

In [None]:
def headlines(W):
    art_dict = defaultdict(list)
    for index, item in enumerate(W):
        key = np.argmax(item)
        value = item[key]
        name = data.headline[index]
        art_dict[key].append([value, name])
    top_article_names = defaultdict(list)
    for topic in art_dict:
        top_article_names[topic] = list(np.sort(np.array(art_dict[topic]).T)[1,-10:-1])
    return top_article_names

In [141]:
#def top_shops(W):
shop_dict = defaultdict(list)
for index, item in enumerate(W):
    key = np.argmax(item)
    value = item[key]
    name = data['name'][index]
    shop_dict[key].append([value, name])
top_shop_names = defaultdict(list)
for feature in shop_dict:
    top_shop_names[feature] = list(np.sort(np.array(shop_dict[feature]).T)[1,-10:-1])
top_shop_names

defaultdict(list,
            {0: ["Tully's Coffee",
              'Uptown Espresso',
              'Uptown Espresso - California Ave',
              'Uptown Espresso Magnolia',
              'Victrola',
              'Voxx Coffee',
              'Wayward Coffeehouse',
              'Woodland Coffee',
              'Zoka Coffee Roaster & Tea Company'],
             1: ['Starbucks',
              'Starbucks',
              'Starbucks',
              'Starbucks',
              'Starbucks',
              'Starbucks',
              'Starbucks',
              'Starbucks Reserve Bar',
              'Starbucks Reserve Roastery & Tasting Room'],
             2: ["Natalie's Organic Coffee",
              'Royal Drummer',
              'Seattle Aquarium Cafe',
              'Seattle Grind',
              'Solsticio',
              'The Station',
              'Third Ave Cafe',
              'Treehouse Coffee',
              'Trinity Market'],
             3: ['Monorail Espresso',
              '

In [168]:
list(np.sort(np.array(shop_dict[1]).T)[1,-10:-1])

['Starbucks',
 'Starbucks',
 'Starbucks',
 'Starbucks',
 'Starbucks',
 'Starbucks',
 'Starbucks',
 'Starbucks Reserve Bar',
 'Starbucks Reserve Roastery & Tasting Room']

In [138]:
shop_dict

defaultdict(list,
            {0: [[0.11067636006636829, 'Uptown Espresso Magnolia'],
              [0.13736629632996109, 'Toast'],
              [0.16464847246367659, 'Uptown Espresso - California Ave'],
              [0.17088924078412648, 'Ballard Coffee Works'],
              [0.15906708316472731, 'Bauhaus Books & Coffee'],
              [0.21816810742290382, 'The Dane'],
              [0.17693596448121973, 'Woodland Coffee'],
              [0.15036861985868064, 'The Highlands'],
              [0.14511642941639022, 'Stage Door Cafe'],
              [0.10139884193704202, "Tully's Coffee"],
              [0.14470067947368953, 'Fremont Coffee Company'],
              [0.17970805850185129, 'La Marzocco Cafe'],
              [0.1654874341048973, 'Drip City Coffee'],
              [0.15866769566128666, 'Dubsea Coffee'],
              [0.18995386576938197, 'Caffe Appassionato'],
              [0.32411650584750329, 'MiiR Flagship'],
              [0.090629631241330944, 'Portside Coffee Comp

In [132]:
np.argmax(W[598])

11

In [118]:
top_words_index = np.argsort(-H)[:,0:6]
most_common_words_per_feature = np.array(words)[top_words_index]
most_common_words_per_feature

array([['place', 'beer', 'wifi', 'work', 'space', 'lots'],
       ['starbucks', 'location', 'clover', 'nice', 'just', 'code'],
       ['breakfast', 'sandwich', 'sandwiches', 'delicious', 'egg',
        'biscuits'],
       ['best', 'espresso', 'latte', 'seattle', 'try', 'town'],
       ['drive', 'service', 'parking', 'window', 'lot', 'order'],
       ['coffee', 'shop', 'cup', 'beans', 'brew', 'pour'],
       ['bagel', 'bagels', 'cheese', 'cream', 'sandwiches', 'egg'],
       ['great', 'service', 'baristas', 'love', 'super', 'awesome'],
       ['cafe', 'blend', 'love', 'try', 'parking', 'closed'],
       ['donuts', 'doughnuts', 'donut', 'doughnut', 'old', 'pot'],
       ['good', 'coffee', 'free', 'way', 'location', 'tasty'],
       ['friendly', 'staff', 'baristas', 'drink', 'super', 'morning'],
       ['crepe', 'crepes', 'like', 'wanted', 'lunch', 'great'],
       ['cupcake', 'cupcakes', 'ice', 'cream', 'cake', 'free'],
       ['pastries', 'croissant', 'baked', 'croissants', 'amazing', '

In [87]:
feature_top_words = []
for feature in range(10):
    top_words = []
    top_10_indices = np.argsort(-km.cluster_centers_[feature][0:10])
    for i in top_10_indices:
        top_words.append(tfidf.get_feature_names()[i])
    feature_top_words.append(top_words)

In [89]:
top_words

['amazing',
 'almond',
 'add',
 '10',
 'actually',
 'ambiance',
 'absolutely',
 'ambience',
 'afternoon',
 '12']