In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.decomposition import NMF
from collections import defaultdict
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
#from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from geopy.distance import vincenty
from geopy.distance import great_circle

In [2]:
data = pd.read_csv('data/data_not_starbucks.csv').drop('Unnamed: 0', axis=1)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 6 columns):
location.lat        474 non-null float64
location.lng        474 non-null float64
name                474 non-null object
combined_reviews    474 non-null object
num_review_words    474 non-null int64
final_address       474 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 22.3+ KB


### Filter out shops with less than 20 words in their review

In [566]:
over20_data = data[data['num_review_words'] >= 20].reset_index().drop('index', axis=1)

### Adding custom stopwords to sklearns defaults

In [567]:
stopwords = list(stop_words.ENGLISH_STOP_WORDS)

In [568]:
stopwords += ['coffee', 'shop', 'coffeeshop', 'starbucks', 'wa', 'seattle', 'cafe', 'caffee']

### Added LemmaTokenizer to lemmatize words 

In [569]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [629]:
tf = TfidfVectorizer(strip_accents='unicode',
                     tokenizer=LemmaTokenizer(),
                     stop_words=stopwords,
                     max_features=500)

In [630]:
tfidf = tf.fit_transform(over20_data['combined_reviews'])

In [631]:
words = tf.get_feature_names()

In [647]:
nmf = NMF(n_components=40)
nmf.fit(tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=40, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [660]:
H = nmf.components_
W = nmf.transform(tfidf)

In [661]:
H.shape

(40, 500)

In [662]:
W.shape

(435, 40)

In [654]:
top_words_index = np.argsort(-H)[:,0:10]
most_common_words_per_topic = np.array(words)[top_words_index]
for i, items in enumerate(most_common_words_per_topic):
    print(i, items)

0 ['wifi' 'lot' 'place' 'outlet' 'good' 'space' 'table' 'music' 'seating'
 'atmosphere']
1 ['sandwich' 'breakfast' 'delicious' 'egg' 'turkey' 'yum' 'try' 'panini'
 'snack' 'good']
2 ['espresso' 'white' 'nico' 'best' 'shot' 'velvet' 'place' 'milk' 'good'
 'cappuccino']
3 ['bagel' 'cheese' 'cream' 'breakfast' 'sandwich' 'good' 'egg' 'street'
 'really' 'love']
4 ['crepe' 'like' 'wanted' 'stop' 'neighborhood' 'instead' 'strawberry'
 'great' 'regular' 'quite']
5 ['cupcake' 'cake' 'velvet' 'free' 'red' 'salted' 'delicious' 'today' 'hour'
 'order']
6 ['customer' 'place' 'service' 'rude' 'asked' 'right' 'people' 'make' '’'
 'making']
7 ['work' 'best' 'place' 'great' 'nice' 'town' 'time' 'better' 'come' 'super']
8 ['donut' 'doughnut' 'fashioned' 'pot' 'old' 'maple' 'pumpkin' 'cake'
 'lemon' 'apple']
9 ['croissant' 'pastry' 'good' 'bakery' 'baked' 'macaroon' 'bread' 'almond'
 'quiche' 'ham']
10 ['chocolate' 'hot' 'dark' 'favorite' 'rich' 'cooky' 'far' 'chip'
 'delicious' 'cookie']
11 ['great' 'l

In [656]:
#def top_shops(W):
shop_dict = defaultdict(list)
for index, item in enumerate(W):
    key = np.argmax(item)
    value = item[key]
    name = over20_data['name'][index]
    shop_dict[key].append([value, name])
top_shop_names = defaultdict(list)
for feature in shop_dict:
    top_shop_names[feature] = list(np.sort(np.array(shop_dict[feature]).T)[1,-10:-1])
top_shop_names

defaultdict(list,
            {0: ['Down Pour Coffee Bar',
              'Drip City Coffee',
              'Porchlight Coffee & Records',
              'Roy Street Coffee & Tea',
              'The Maple Leaf Living Room',
              'Uptown Espresso',
              'Voxx Coffee',
              'Wayward Coffeehouse',
              'Woodland Coffee'],
             1: ['Cherry Street Coffee House',
              'Discovery Espresso',
              'Grand Central Bakery',
              'Konvene Coffee',
              'Midtown Espresso Cafe',
              'Monkey Grind Espresso Bar',
              'Treehouse Coffee',
              'Trinity Market',
              'World Class Coffee'],
             2: ['Caffe Delia',
              'Espresso Vivace',
              'Espresso Vivace',
              'Espresso Vivace Sidewalk Bar',
              'Gourmet Latte',
              'Moonshot Coffee',
              'Sureshot'],
             3: ['Cherry Street Coffee House',
              'Cherry St

### Add W matrix to main data matrix

In [676]:
columns = ['feature{}'.format(n) for n in range(0,40)]

W_df = pd.DataFrame(W, columns=columns)

In [677]:
df_with_features = pd.concat([over20_data, W_df], axis=1)

In [681]:
df_with_features.shape

(435, 46)

In [684]:
df_with_features = df_with_features.drop(['combined_reviews', 'num_review_words'], axis=1)

In [689]:
output_columns = ['lat', 'lng', 'name', 'address', 'feature0',
       'feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
       'feature7', 'feature8', 'feature9', 'feature10', 'feature11',
       'feature12', 'feature13', 'feature14', 'feature15', 'feature16',
       'feature17', 'feature18', 'feature19', 'feature20', 'feature21',
       'feature22', 'feature23', 'feature24', 'feature25', 'feature26',
       'feature27', 'feature28', 'feature29', 'feature30', 'feature31',
       'feature32', 'feature33', 'feature34', 'feature35', 'feature36',
       'feature37', 'feature38', 'feature39']

In [693]:
df_with_features.columns = output_columns

In [724]:
df_with_features.to_csv('data/df_with_features.csv')

In [725]:
df_with_features

Unnamed: 0,lat,lng,name,address,feature0,feature1,feature2,feature3,feature4,feature5,...,feature30,feature31,feature32,feature33,feature34,feature35,feature36,feature37,feature38,feature39
0,47.579130,-122.410511,Alki Cafe,2726 Alki Ave SW,0.000000,0.019318,0.000000,0.001701,0.006822,0.003499,...,0.000000,0.000000,0.008312,0.000967,0.026012,0.000000,0.087595,0.000000,0.365975,0.000000
1,47.579352,-122.409126,Tully's Coffee,2676 Alki Ave SW,0.058326,0.000000,0.010739,0.000000,0.002277,0.000000,...,0.000000,0.000000,0.000000,0.030510,0.007146,0.008403,0.048431,0.000000,0.104535,0.000000
2,47.580447,-122.406728,Ampersand Cafe,2536 Alki Ave SW,0.000000,0.206793,0.000000,0.000000,0.000000,0.000000,...,0.006540,0.000000,0.157125,0.000000,0.000000,0.042855,0.051240,0.000606,0.053707,0.000000
3,47.680561,-122.404709,Jibe Espresso Bar,7001 Seaview Ave NW #170,0.000000,0.078934,0.032120,0.000000,0.000000,0.000000,...,0.018206,0.000000,0.000799,0.049792,0.017265,0.102750,0.105620,0.032958,0.000000,0.098956
4,47.675598,-122.398264,The Scoop at Walter's,6408 32nd Ave NW,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.081314,0.605016,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,47.675674,-122.398387,Pico Café,6415 32nd Ave NW,0.000000,0.051616,0.000000,0.000000,0.009567,0.000000,...,0.000000,0.000000,0.000000,0.020297,0.000000,0.042304,0.000000,0.125295,0.000000,0.000000
6,47.675592,-122.398240,Walter's,6408 32nd Ave NW,0.005489,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.030476,0.613082,0.000000,0.000000,0.014427,0.000000,0.006457,0.000000
7,47.659875,-122.398059,Discovery Espresso,3103 W Jameson St,0.045896,0.154715,0.033706,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.011493,0.008990,0.000000,0.135539,0.000000,0.045460,0.016415,0.000000
8,47.639363,-122.399467,Uptown Espresso Magnolia,3223 W McGraw St,0.079794,0.000000,0.011205,0.000000,0.000000,0.007741,...,0.012501,0.000000,0.000000,0.041876,0.000000,0.089588,0.017854,0.000000,0.000000,0.067171
9,47.668781,-122.391557,Firehouse Coffee,2622 NW Market St,0.040098,0.056687,0.011331,0.006736,0.000000,0.000000,...,0.028326,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
