In [4]:
import pandas as pd
pd.options.display.max_rows = 1000

import pickle

import numpy as np

import re
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('white')

In [7]:
# Load the per-neighbourhood DataFrame from the interim data directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# should only ever be pointed at a pickle produced by a trusted source.
with open('../data/interim/04_neighborhoods_lemmas.pkl', 'rb') as picklefile:
    df_neighborhoods = pickle.load(picklefile)

In [5]:
# Hand-curated keyword lists, one per neighbourhood "persona".
# NOTE(review): most entries look like lemmatised tokens (e.g. 'paint'/'lady',
# 'staple', 'cub', 'rise'/'bowl') so they can be matched against the 'lemmas'
# text -- confirm against the lemmatiser output before adding new words.
touristy = [
    'tourist', 'attraction', 'fisherman', 'wharf', 'paint', 'lady', 'seaworld',
    'world', 'universal', 'hollywood', 'theater', 'disney', 'rise', 'bowl',
    'empire', 'broadway', 'bourbon', 'mardi', 'gras', 'prudential', 'newbury',
    'copley', 'opry', 'smithsonian', 'mall', 'space', 'needle', 'starbucks',
    'pike',
]
hipster = ['hip', 'hipster', 'coffee', 'bike']
# 'dodger' (singular) is the form that appears in the TF-IDF vocabulary.
sportsfans = ['petco', 'dodger', 'staple', 'rise', 'bowl', 'superdome', 'cub',
              'wrigley', 'sox', 'bronco']
outdoorsy = ['runyon', 'trail', 'kayak', 'water', 'hike']
nightlife = ['restaurant', 'bar', 'nightlife', 'club']
campus = ['ucsf', 'sdsu', 'ucla', 'barclays', 'tulane', 'loyola', 'university', 'ut']
artsy = ['art', 'music', 'artist', 'musician']
transportation = ['bart', 'muni', 'cta', 'metra', 'divvy', 'metro', 'walkability', 'walkable']

In [88]:
# Pull the 'lemmas' column out as a plain Python list; this is the corpus that
# gets vectorised below (one entry per row of df_neighborhoods).
descriptions = df_neighborhoods['lemmas'].tolist()

In [89]:
# Sanity check: number of documents in the corpus.
len(descriptions)

895

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

from scipy.cluster.hierarchy import linkage, ward, dendrogram

In [91]:
# TF-IDF over unigrams and bigrams. min_df=20 drops terms seen in fewer than
# 20 documents and max_features caps the vocabulary at 1000 entries.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                   #max_df=0.5,
                                   min_df=20,
                                   max_features=1000)

# Dense (n_documents, n_features) array; the PCA / clustering cells below
# operate on this dense form.
tfidf = tfidf_vectorizer.fit_transform(descriptions).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when it exists and fall back to the old name on older installs. Either way
# tfidf_features ends up as a plain list indexed by column position.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()

In [92]:
# 10 terms with the LOWEST idf weight, i.e. the terms that occur in the most
# documents. (argsort is ascending, so the first ten indices are the smallest
# idf values; these are idf weights, not per-document tf-idf scores.)
top = tfidf_vectorizer.idf_.argsort()[:10].tolist()
[(tfidf_features[i], tfidf_vectorizer.idf_[i]) for i in top]

[(u'shop', 1.0813456394539525),
 (u'store', 1.1221171913400827),
 (u'home', 1.1399293671041821),
 (u'grocery', 1.1619938572882842),
 (u'easy', 1.1619938572882842),
 (u'safe', 1.1725480449669743),
 (u'bar', 1.1765347612107957),
 (u'food', 1.1858993781418385),
 (u'access', 1.1926424920267287),
 (u'well', 1.2062666809662723)]

In [93]:
# 10 terms with the HIGHEST idf weight, i.e. the rarest terms that still passed
# the min_df cut-off. (The ascending argsort is reversed before slicing.)
bottom = tfidf_vectorizer.idf_.argsort()[::-1].tolist()[:10]
[(tfidf_features[i], tfidf_vectorizer.idf_[i]) for i in bottom]

[(u'frenchman', 4.7534179752515069),
 (u'museum botanical', 4.6624461970457807),
 (u'tokyo', 4.3967430313127753),
 (u'dodger', 4.3967430313127753),
 (u'km', 4.2715798883587688),
 (u'rockefeller', 4.2144214745188204),
 (u'botanic', 4.2144214745188204),
 (u'natural history', 4.1870225003307056),
 (u'arboretum', 4.1870225003307056),
 (u'esplanade', 4.134378766845284)]

In [94]:
# (number of documents, number of TF-IDF features)
tfidf.shape

(895, 1000)

In [95]:
# Parallel label lists, in df_neighborhoods row order, used to annotate each
# document in the plots below.
neighborhoods, cities, states = (
    df_neighborhoods[column].tolist()
    for column in ('neighborhood', 'city', 'state')
)

### PCA

In [96]:
from sklearn.decomposition import PCA

In [97]:
# Configure a 10-component PCA; random_state is pinned so reruns are repeatable.
pca = PCA(n_components=10, random_state=16)

In [108]:
# Fit the PCA on the dense TF-IDF matrix and display the projected coordinates.
# The projection itself is only shown here, not stored in a variable.
pca.fit_transform(tfidf)

array([[ 0.28677069,  0.07824793, -0.03499322, ..., -0.08307502,
        -0.12625819, -0.09051519],
       [ 0.15088896, -0.15224201,  0.09720958, ..., -0.12142037,
        -0.00597597, -0.06882234],
       [ 0.18864988,  0.08508183, -0.01415991, ...,  0.00050788,
         0.00056209,  0.14980713],
       ..., 
       [ 0.12143806,  0.11429275,  0.16148524, ...,  0.08007493,
        -0.20605281, -0.02155959],
       [ 0.11607367,  0.04981465,  0.13954229, ..., -0.02044357,
         0.03145779,  0.02195166],
       [ 0.23436029, -0.18537359,  0.08339857, ...,  0.23429249,
        -0.07463868,  0.08113307]])

In [109]:
# Share of the total variance captured by each of the fitted components.
pca.explained_variance_ratio_

array([ 0.041677  ,  0.02417318,  0.0199639 ,  0.01636349,  0.0136757 ,
        0.0124156 ,  0.01137654,  0.01097487,  0.01061261,  0.01016486])

In [110]:
# Combined share of variance captured by all retained components.
sum(pca.explained_variance_ratio_)

0.17139774339738689

In [111]:
# One (neighborhood, city, state) tuple per document, used as dendrogram leaf
# labels. Materialise the zip: on Python 3 zip() returns a one-shot iterator,
# which cannot be indexed or measured with len() as the labels argument needs.
zipped = list(zip(neighborhoods, cities, states))

In [112]:
# Vocabulary in column order. Copy the list already extracted into
# `tfidf_features` instead of calling the vectorizer's get_feature_names()
# again, which no longer exists in scikit-learn >= 1.2.
terms = list(tfidf_features)

In [113]:
# Square (n_documents x n_documents) matrix of pairwise cosine distances.
dist = 1 - cosine_similarity(tfidf)

# ward() treats any 2-D array as raw observation vectors, so passing the square
# distance matrix would cluster rows of distances rather than the documents.
# Ward linkage is also only well defined for Euclidean distances, so feed it
# the TF-IDF vectors directly. TfidfVectorizer L2-normalises rows unless `norm`
# is overridden, in which case Euclidean distance ranks pairs the same way
# cosine distance does.
linkage_matrix = ward(tfidf)

In [114]:
from scipy.spatial.distance import squareform

# `dist` is a square matrix of precomputed cosine distances. linkage() treats a
# 2-D array as observation vectors and would compute a second round of cosine
# distances between the rows of `dist`. Pass the condensed (upper-triangle)
# form so the precomputed distances are used as-is; `metric` is ignored for
# condensed input and is therefore dropped. checks=False tolerates the
# floating-point noise left on the diagonal by 1 - cosine_similarity.
linkage_matrix = linkage(squareform(dist, checks=False), method='complete')

In [None]:
# Draw the full dendrogram for `linkage_matrix`, one leaf per document, on a
# very tall figure so the leaf labels do not overlap.
fig, ax = plt.subplots(figsize=(20, 100))
dend = dendrogram(linkage_matrix,
                  p=12,  # only consulted when truncate_mode is set; a no-op as written
                  #truncate_mode='lastp',
                  labels=zipped,
                  orientation='left',
                  ax=ax)  # draw on the Axes created above; `ax` stays an Axes

# Hide ticks and tick labels on the x (distance) axis. These must be real
# booleans: current matplotlib treats the string 'off' as truthy, which would
# leave the ticks switched on.
ax.tick_params(axis='x',
               which='both',       # both major and minor ticks
               bottom=False,       # no ticks along the bottom edge
               top=False,          # no ticks along the top edge
               labelbottom=False)  # no tick labels along the bottom edge

fig.tight_layout()

# Save the figure to disk.
# NOTE(review): the file name says "ward"; confirm `linkage_matrix` holds the
# ward linkage (not the complete-linkage result) when this cell is run.
fig.savefig('ward_clusters.png', dpi=300)

In [16]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

In [17]:
# Density-based clustering of the dense TF-IDF rows using cosine distance with
# brute-force neighbour search. eps and min_samples are not passed, so the
# library defaults apply.
db = DBSCAN(metric='cosine', algorithm='brute').fit(tfidf)

In [18]:
# Per-sample cluster assignment from the fitted DBSCAN model.
labels = db.labels_

# Boolean mask, aligned with `labels`, that is True only at core-sample positions.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [19]:
# Count distinct cluster ids, leaving out the -1 label used for noise points.
n_clusters_ = len(set(labels) - {-1})

In [20]:
# Report how many clusters were found (the noise label is already excluded
# from n_clusters_).
print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 1
