## Take the reviews data and cluster it by neighborhood. 
Then, run TF-IDF on it to see what terms are most common in each neighborhood

In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
from sklearn.externals import joblib
import seaborn
import nltk
import joblib
from collections import defaultdict
import itertools

%matplotlib inline
pd.set_option('display.max_columns', 500)

In [111]:
df = pd.read_csv('with_clusters.csv')

In [112]:
df.review_date = df.review_date.apply(lambda x: pd.to_datetime(x))
df.first_review = df.first_review.apply(lambda x: pd.to_datetime(x))
df.last_review = df.last_review.apply(lambda x: pd.to_datetime(x))

In [113]:
n = df.groupby(['neighbourhood_cleansed'])['comments'].apply(lambda x: ' ;; '.join(x)).reset_index()

In [114]:
n['chars'] = n.comments.apply(lambda x: len(x))
n['reviews'] = n.comments.apply(lambda x: x.count(';;') + 1)
n['words'] = n.comments.apply(lambda x: x.count(' '))
n['words_per_review'] = n.words / n.reviews

# Text Blob Sentiment Analysis

In [115]:
from textblob import TextBlob

polarity_list = []
subjectivity_list = []
for comment in df.comments:
    comment = comment.decode('utf-8')
    blob = TextBlob(comment)
    pol = (100*blob.sentiment.polarity + 100) / 2
    subj = (100*blob.sentiment.subjectivity + 100) / 2
    
    polarity_list.append(pol)
    subjectivity_list.append(subj)
    
df['positivity'] = polarity_list
df['subjectivity'] = subjectivity_list

grouped = df.groupby('neighbourhood_cleansed', as_index=False)

In [116]:
n['positivity'] = grouped['positivity'].mean()['positivity']
n['subjectivity']= grouped['subjectivity'].mean()['subjectivity']
n['stars'] = grouped['review_scores_rating'].mean()['review_scores_rating']/20

Write out that csv of neighborhoods for use later

In [117]:
subset = n[['neighbourhood_cleansed', 'chars', 'reviews', 'words_per_review', 'positivity', 'subjectivity', 'stars']]
subset.to_csv('neighborhood_info.csv', index = False)

# Filter Out Stop Words

Create lists of stopwords to exclude from our analysis:

In [8]:
from nltk.corpus import stopwords

In [9]:
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']
stop.remove('not')

Create a list of common names to add to our stop words

In [10]:
names = []
with open('/Users/dyawitz/nltk_data/corpora/names/female.txt') as fp:
    for line in fp:
        names.append(line.lower().replace('\n', ''))
with open('/Users/dyawitz/nltk_data/corpora/names/male.txt') as fp:
    for line in fp:
        names.append(line.lower().replace('\n', ''))
names.remove('park')

In [11]:
stop_names = stop + names

Add a list of common nyc words I'm not interested in:

In [15]:
foreign = ['est', 'et', 'es', 'de', 'que', 'la', 'le', 'much', 'bien', 'pour','un','en',\
      'appartement', 're', 'un', 'und', 'nous', 'tres', 'el', 'pas', 'muy', 'vie', 'las', 'los', 'die',
        'ich', 'ein', 'die', 'nos', 'ich', 'su', 'ist', 'wir', 'para', 'il', 'chi'\
      ]

In [16]:
nyc = ['nyc', 'brooklyn', 'thanks','great','located','train','subway','new york', \
       'ny', 'york', 'nights', 'mrs', 'ms',
      ]

In [18]:
full_stop = stop + names + nyc + foreign

# LDA 

In [None]:
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# vectorizer = CountVectorizer(ngram_range=(1,2), max_df=0.9, min_df=0, stop_words=stop)

In [None]:
# %time vectorized_vocab = vectorizer.fit_transform(list(n.comments))

In [None]:
# vectorized_vocab.shape

In [None]:
# %time vectorizer.fit(list(n.comments))
# all_features_names = vectorizer.get_feature_names()

In [None]:
# len(all_features_names)

In [None]:
# lda_model = LatentDirichletAllocation(n_topics = 20)
# lda_model.fit_transform(vectorized_vocab)

In [None]:
# def print_top_words(model, feature_names, n_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print("\nTopic #%d:" % topic_idx)
#         print(", ".join([feature_names[i]
#                         for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [None]:
# print_top_words(lda_model, all_features_names, 40)

# TF-IDF

In [20]:
# from spacy.en import English
# nlp = English()
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=0.1, stop_words=full_stop, strip_accents = 'unicode', \
                                   norm = 'l2', use_idf=True, ngram_range=(1,3))

In [22]:
%time tfidf_matrix = tfidf_vectorizer.fit_transform(list(n.comments))

print(tfidf_matrix.shape)

CPU times: user 4min 50s, sys: 30.1 s, total: 5min 20s
Wall time: 5min 53s
(204, 75964)


In [24]:
nabes = list(n.neighbourhood_cleansed)

In [34]:
terms = tfidf_vectorizer.get_feature_names()

nabe_dict = dict(zip(nabes, tfidf_matrix))

terms_dict = {}

for nabe, tfs in zip(nabes, tfidf_matrix):
    words = [terms[ind] for ind in tfs.indices]
    freqs = [freq for freq in tfs.data]
    
    foo = zip(words, freqs)
    foo.sort(key = lambda t: t[1], reverse=True)
    
    terms_dict[nabe] = foo

In [41]:
joblib.dump(terms_dict1, 'terms_dictionary.pkl')

['terms_dictionary.pkl']

In [38]:
terms_dict["Bayswater"]

[(u'astoria', 0.35518923128993368),
 (u'mrs', 0.16438837917172017),
 (u'jfk', 0.14793891427467892),
 (u'beach', 0.14183313933433728),
 (u'airport', 0.10998978210350553),
 (u'near jfk', 0.10148263751140962),
 (u'hospital', 0.095752764799175216),
 (u'fridge', 0.087103762466864215),
 (u'rockaway', 0.077060880264631509),
 (u'ms', 0.072030204881516874),
 (u'close jfk', 0.068189919884524455),
 (u'leave', 0.066330230679730956),
 (u'rotation', 0.065608894473311433),
 (u'st', 0.06365303438761942),
 (u'informed', 0.059308234158491714),
 (u'city new', 0.058684232484912946),
 (u'booking', 0.058186423010112572),
 (u'jfk airport', 0.057925609493701485),
 (u'touch', 0.056415531180017339),
 (u'hair', 0.055008649115024892),
 (u'informed would', 0.054174072081364537),
 (u'basement', 0.051198345909603329),
 (u'near airport', 0.050131726189867734),
 (u'home people', 0.048975271255459848),
 (u'locating', 0.048425649945536851),
 (u'dogs', 0.046692967634941039),
 (u'flight', 0.046549138408090052),
 (u'defian

In [37]:
terms_dict1['Bayswater']

[(u'astoria', 0.010176440592496838),
 (u'mrs', 0.0047098516153275781),
 (u'jfk', 0.0042385619827698291),
 (u'beach', 0.004063626904569715),
 (u'airport', 0.0031512905931664653),
 (u'near jfk', 0.0029075544549990967),
 (u'hospital', 0.0027433892604439014),
 (u'fridge', 0.0024955887905380903),
 (u'rockaway', 0.0022078526062587855),
 (u'ms', 0.0020637199449433487),
 (u'close jfk', 0.0019536928701127782),
 (u'leave', 0.0019004113653656915),
 (u'rotation', 0.0018797445365173436),
 (u'st', 0.0018237076631667732),
 (u'informed', 0.001699225844679733),
 (u'city new', 0.0016813477239446778),
 (u'booking', 0.0016670851053166009),
 (u'jfk airport', 0.0016596126004609821),
 (u'touch', 0.0016163477126336801),
 (u'hair', 0.0015760394755200437),
 (u'informed would', 0.0015521282111721272),
 (u'basement', 0.001466871401734267),
 (u'near airport', 0.0014363119386186336),
 (u'home people', 0.0014031786285372387),
 (u'locating', 0.0013874315615766525),
 (u'dogs', 0.0013377889005775768),
 (u'flight', 0.00

In [108]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=0.1, stop_words=full_stop, \
                                   strip_accents = 'unicode', norm = 'l1', use_idf=True, \ 
                                   ngram_range=(1,3))

SyntaxError: unexpected character after line continuation character (<ipython-input-108-ca61fbfa3f45>, line 1)

In [33]:
%time tfidf_matrix1 = tfidf_vectorizer_1.fit_transform(list(n.comments))

print(tfidf_matrix1.shape)

CPU times: user 4min 48s, sys: 36.9 s, total: 5min 25s
Wall time: 5min 54s
(204, 75964)


In [40]:
terms1 = tfidf_vectorizer_1.get_feature_names()

nabe_dict1 = dict(zip(nabes, tfidf_matrix1))

terms_dict1 = {}

for nabe, tfs in zip(nabes, tfidf_matrix1):
    words = [terms1[ind] for ind in tfs.indices]
    freqs = [freq for freq in tfs.data]
    
    foo = zip(words, freqs)
    foo.sort(key = lambda t: t[1], reverse=True)
    
    terms_dict1[nabe] = foo

In [39]:
nabes

['Allerton',
 'Arden Heights',
 'Arrochar',
 'Arverne',
 'Astoria',
 'Bath Beach',
 'Battery Park City',
 'Bay Ridge',
 'Bay Terrace',
 'Bay Terrace, Staten Island',
 'Baychester',
 'Bayside',
 'Bayswater',
 'Bedford-Stuyvesant',
 'Belle Harbor',
 'Bellerose',
 'Belmont',
 'Bensonhurst',
 'Bergen Beach',
 'Boerum Hill',
 'Borough Park',
 'Briarwood',
 'Brighton Beach',
 'Bronxdale',
 'Brooklyn Heights',
 'Brownsville',
 'Bushwick',
 'Cambria Heights',
 'Canarsie',
 'Carroll Gardens',
 'Castleton Corners',
 'Chelsea',
 'Chinatown',
 'City Island',
 'Civic Center',
 'Claremont Village',
 'Clason Point',
 'Clifton',
 'Clinton Hill',
 'Co-op City',
 'Cobble Hill',
 'College Point',
 'Columbia St',
 'Concord',
 'Concourse',
 'Concourse Village',
 'Coney Island',
 'Corona',
 'Country Club',
 'Crown Heights',
 'Cypress Hills',
 'DUMBO',
 'Ditmars Steinway',
 'Dongan Hills',
 'Downtown Brooklyn',
 'Dyker Heights',
 'East Elmhurst',
 'East Flatbush',
 'East Harlem',
 'East Morrisania',
 'East N

# Clustering

In [None]:
# from sklearn.cluster import KMeans

# num_clusters = 15

# km = KMeans(n_clusters=num_clusters)

# %time km.fit(tfidf_matrix)

In [None]:
# inertia_list = []

# for i in range(3,30):
#     km = KMeans(n_clusters=i)
#     %time km.fit(tfidf_matrix)
#     inertia_list.append(km.inertia_)


In [None]:
# plt.plot(range(3,30), inertia_list)

Import the vocab list (stemmed and not) so that I can reference it later on

In [None]:
from __future__ import print_function

def get_top_terms(model=km, terms=terms, num_clusters=num_clusters):
    print("Top terms per cluster:")
    print()

    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :20]: #replace 11 with n words per cluster
            print(' %s' % terms[ind], end=',')
        print() #add whitespace
        print() #add whitespace

In [None]:
# get_top_terms(model=km, terms=terms, num_clusters=num_clusters)