## Motivation for this Notebook

If I was a business owner, I would want to know how my customers are generally feeling. After reading a couple of reviews, you can start to pick up on some trends but who has the time to go through all of the comments to get a full picture of what people are saying about the company? Well luckily we have the power of NLP and Machine Learning algorithms that can do this compiling and grouping for us. Here I try to get a better look into 'average' reviews for a particular business and what's being said in them by implementing kMeans clustering.

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import cProfile, pstats, sys
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
import textblob as tb

In [10]:
review_df = None
profiler = cProfile.Profile()
snowball = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def profile():
    profiler.disable()
    ps = pstats.Stats(profiler, stream=sys.stdout)
    ps.print_stats()
    
def readDataset():
    #return pd.read_json('yelp_academic_dataset_review.json', lines = True)
    return pd.read_json('filtered.json')

def tokenize(text):
    return [snowball.stem(word) for word in tokenizer.tokenize(text.lower())]

def vectorize(review_df):
    vectorizer = TfidfVectorizer(stop_words = 'english', tokenizer = tokenize, \
                    min_df = 0.0025, max_df = 0.05, max_features = 1000, ngram_range = (1, 3))
    return vectorizer.fit_transform(review_df['text'])


In [13]:
profiler.enable()

review_df = readDataset()
#review_df = review_df[review_df['business_id'] == 'HQl28KMwrEKHqhFrrDqVNQ']

X = vectorize(review_df)

profile()

  'stop_words.' % sorted(inconsistent))


         4297478 function calls (4294039 primitive calls) in 1.450 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       16    0.000    0.000    0.000    0.000 {method 'read' of '_io.StringIO' objects}
        2    0.004    0.002    0.004    0.002 {method 'read' of '_io.TextIOWrapper' objects}
        2    0.001    0.000    0.001    0.000 {method 'close' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 {method 'readline' of '_io.BufferedReader' objects}
        1    0.000    0.000    0.000    0.000 {method 'seek' of '_io.BufferedReader' objects}
        2    0.001    0.000    0.001    0.000 {method 'read' of '_io.FileIO' objects}
        1    0.001    0.001    0.001    0.001 {method 'readlines' of '_io._IOBase' objects}
        3    0.001    0.000    0.001    0.000 {built-in method io.open}
       12    0.000    0.000    0.000    0.000 {built-in method _locale.setlocale}
        2    0.000 

        1    0.000    0.000    0.010    0.010 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:330(build_analyzer)
        1    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:358(_validate_vocabulary)
        1    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:394(_validate_params)
        2    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:680(_document_frequency)
        1    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:864(__init__)
        1    0.018    0.018    0.062    0.062 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:897(_sort_features)
        1    0.028    0.028    0.037    0.037 /opt/conda/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:911(_limit_features)
        1   

        1    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/frame.py:2869(__getitem__)
       41    0.000    0.000    0.001    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/frame.py:3184(_box_col_values)
        6    0.000    0.000    0.006    0.001 /opt/conda/lib/python3.6/site-packages/pandas/core/internals/construction.py:60(arrays_to_mgr)
        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/internals/construction.py:274(<listcomp>)
        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/internals/construction.py:278(<listcomp>)
        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/internals/construction.py:281(<listcomp>)
        6    0.000    0.000    0.016    0.003 /opt/conda/lib/python3.6/site-packages/pandas/core/frame.py:441(__init__)
       16    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6

        2    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:590(dtype)
        8    0.000    0.000    0.001    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:625(astype)
       14    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:1175(name)
       10    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:1182(name)
        1    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:1378(nlevels)
        2    0.001    0.000    0.001    0.001 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:1646(is_unique)
       41    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:1755(is_floating)
        2    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py:200

        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:150(ensure_python_int)
      382    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:180(<lambda>)
      382    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:178(classes)
      152    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:188(<lambda>)
      152    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:183(classes_and_not_datetimelike)
      198    0.000    0.000    0.001    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:194(is_object_dtype)
       96    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/common.py:224(is_sparse)
       66    0.000    0.000    0.000    0.000 /opt/conda/lib/python3

       78    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/_dtype.py:321(_name_get)
       20    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/numeric.py:268(full)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(bincount)
       22    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(copyto)
      152    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/numerictypes.py:286(issubclass_)
       76    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/numerictypes.py:360(issubdtype)
        6    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/numeric.py:1816(isscalar)
        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/numpy/core/numeric.py:2313(_array_equal_dispatcher)
        4    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6

       19    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/sre_compile.py:250(_optimize_charset)
        5    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/contextlib.py:79(__enter__)
        5    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/contextlib.py:85(__exit__)
        5    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/contextlib.py:157(helper)
        5    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/contextlib.py:59(__init__)
        8    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/posixpath.py:232(expanduser)
        2    0.000    0.000    0.001    0.000 /opt/conda/lib/python3.6/genericpath.py:16(exists)
       25    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/_collections_abc.py:72(_check_methods)
        9    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/_collections_abc.py:252(__subclasshook__)
       14    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/_collect

In [None]:
def vectorize_reviews(reviews):
    vectorizer = TfidfVectorizer(stop_words = 'english', tokenizer = tokenize, \
                        min_df = 0.0025, max_df = 0.05, max_features = 1000, ngram_range = (1, 3))
    X = vectorizer.fit_transform(reviews)
    words = vectorizer.get_feature_names()
    return X, words

def print_clusters():
    num_words = 20
    X, words = vectorize_reviews(review_df['text'])
    
    kmeans = KMeans(n_clusters = 3)
    kmeans.fit(X)
    
    common_words = kmeans.cluster_centers_.argsort()[:,-1:-num_words-1:-1]
    for num, centroid in enumerate(common_words):
        print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

def calc_polarity(text):
    blob = tb.TextBlob(text)
    return blob.sentiment.polarity

def calc_subjectivity(text):
    blob = tb.TextBlob(text)
    return blob.sentiment.subjectivity

def get_pol_sub():
    review_df['polarity'] = review_df['text'].apply(calc_polarity)
    review_df['subjectivity'] = review_df['text'].apply(calc_subjectivity)
    
    print('\nMean Polarity: ' + str(review_df['polarity'].mean())\
          + '\nMean Subjectivity: ' + str(review_df['subjectivity'].mean()))

In [None]:
#print_clusters()
cProfile.run('print_clusters()')
#cProfile.run('get_pol_sub()')
get_pol_sub()

In [20]:
X

<453x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 9247 stored elements in Compressed Sparse Row format>