<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Acknowledgements" data-toc-modified-id="Acknowledgements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Acknowledgements</a></span></li><li><span><a href="#Prepare-data-and-model" data-toc-modified-id="Prepare-data-and-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare data and model</a></span></li><li><span><a href="#Make-feature-matrix-(word2vec,-votes,-stars)" data-toc-modified-id="Make-feature-matrix-(word2vec,-votes,-stars)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Make feature matrix (word2vec, votes, stars)</a></span></li><li><span><a href="#Create-Label-y-(Business-categories)" data-toc-modified-id="Create-Label-y-(Business-categories)-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Label y (Business categories)</a></span></li><li><span><a href="#Join-x,y-(feature-matrix,-category)-using-business_id" data-toc-modified-id="Join-x,y-(feature-matrix,-category)-using-business_id-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Join x,y (feature matrix, category) using business_id</a></span></li><li><span><a href="#Category-Prediction" data-toc-modified-id="Category-Prediction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Category Prediction</a></span></li><li><span><a href="#Cluster-with-metadata-(useful,-cool,-funny,-stars)" data-toc-modified-id="Cluster-with-metadata-(useful,-cool,-funny,-stars)-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Cluster with metadata (useful, cool, funny, stars)</a></span></li></ul></div>

# Acknowledgements
Thanks to the tutorial: https://www.kaggle.com/c/word2vec-nlp-tutorial/overview/part-3-more-fun-with-word-vectors

# Prepare data and model

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

import nltk
import nltk.data
nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daviderickson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_reviews(size='small'): 
    if size == 'small':
        filename = r'../../data/small-review.json'
    elif size == 'intermediate':
        filename = r'../../data/intermediate-review.json'
    elif size == 'full':
        filename = r'../../data/review.json'
    new_list = []
    for line in open(filename):
       new_list.append(json.loads(line))
    return pd.DataFrame.from_records(new_list)

dfreviews = load_reviews(size='intermediate')

In [3]:
dfreviews.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
1,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
3,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
4,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ


In [4]:
dfreviews.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')

In [5]:
dfreviews['text'][0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [6]:
# For simplicity, drop anything that isn't a letter
# Numbers and symbols may have interesting meaning and could be explore later

def lettersOnly(string):
    return re.sub("[^a-zA-Z]", " ", string) 

dfreviews['text'] = dfreviews['text'].apply(lettersOnly)


In [7]:
dfreviews['text'][0]

'Total bill for this horrible service  Over   Gs  These crooks actually had the nerve to charge us     for   pills  I checked online the pills can be had for    cents EACH  Avoid Hospital ERs at all costs '

In [8]:
def review_to_wordlist(string, remove_stopwords=False):
    string = re.sub("[^a-zA-Z]", " ", string) # keep only letters. more complex model possible later
    words =  string.lower().split() # make everything lowercase. split into words
    if remove_stopwords:
        stops = set(stopwords.words('english')) # create a fast lookup for stopwords
        words = [w for w in words if not w in stops] # remove stopwords
    return( words) # return a list of words
    
# dfreviews['text'] = dfreviews['text'].apply(review_to_words) # apply to reviews in dataframe


In [9]:
# Word2Vec expects single sentences, each one as a list of words

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [10]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences")
for review in dfreviews["text"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences


In [11]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2020-01-17 16:15:09,870 : INFO : 'pattern' package not found; tag filters are not available for English
2020-01-17 16:15:09,880 : INFO : collecting all words and their counts
2020-01-17 16:15:09,881 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2020-01-17 16:15:10,135 : INFO : PROGRESS: at sentence #10000, processed 1088334 words, keeping 25539 word types
2020-01-17 16:15:10,330 : INFO : PROGRESS: at sentence #20000, processed 2172597 words, keeping 35463 word types
2020-01-17 16:15:10,504 : INFO : PROGRESS: at sentence #30000, processed 3251616 words, keeping 42649 word types
2020-01-17 16:15:10,724 : INFO : PROGRESS: at sentence #40000, processed 4373996 words, keeping 48893 word types
2020-01-17 16:15:10,931 : INFO : PROGRESS: at sentence #50000, processed 5471587 words, keeping 53964 word types
2020-01-17 16:15:11,146 : INFO : PROGRESS: at sentence #60000, processed 6570064 words, keeping 58362 word types
2020-01-17 16:15:11,371 : INFO : PROGRESS: at sentence #70000, processed 7667364 words, keeping 62704 word types
2020-01-17 16:15:11,560 : INFO : PROGRESS: at sentence #80000, processed 8768955 words, keeping 66443 word types
2020-01-17 16:15:11,742 : INFO : PROGRESS: at sentence #90000, processed 9872097 words, keeping 

2020-01-17 16:15:58,024 : INFO : EPOCH 4 - PROGRESS: at 94.18% examples, 608041 words/s, in_qsize 7, out_qsize 0
2020-01-17 16:15:58,591 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-17 16:15:58,606 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-17 16:15:58,626 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-17 16:15:58,627 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-17 16:15:58,627 : INFO : EPOCH - 4 : training on 10978770 raw words (7805011 effective words) took 12.7s, 615534 effective words/s
2020-01-17 16:15:59,669 : INFO : EPOCH 5 - PROGRESS: at 9.28% examples, 706344 words/s, in_qsize 7, out_qsize 0
2020-01-17 16:16:00,672 : INFO : EPOCH 5 - PROGRESS: at 17.98% examples, 686679 words/s, in_qsize 7, out_qsize 0
2020-01-17 16:16:01,675 : INFO : EPOCH 5 - PROGRESS: at 26.31% examples, 671041 words/s, in_qsize 7, out_qsize 0
2020-01-17 16:16:02,695 : INFO : EPOCH 5 - PRO

In [12]:
model.most_similar('pizza')

  """Entry point for launching an IPython kernel.


[('crust', 0.7152116894721985),
 ('pepperoni', 0.6815378665924072),
 ('pizzas', 0.6730896234512329),
 ('margherita', 0.6281450390815735),
 ('calzone', 0.6170759797096252),
 ('mozzarella', 0.5520058870315552),
 ('slice', 0.5371947288513184),
 ('dough', 0.5306512713432312),
 ('meatball', 0.5304919481277466),
 ('lasagna', 0.5257267951965332)]

In [13]:
model.most_similar('service')

  """Entry point for launching an IPython kernel.


[('waitstaff', 0.5360804200172424),
 ('staff', 0.47411614656448364),
 ('communication', 0.4119015634059906),
 ('servers', 0.4111485779285431),
 ('bartenders', 0.4075550138950348),
 ('vibes', 0.39751946926116943),
 ('execution', 0.3916454017162323),
 ('value', 0.38852739334106445),
 ('environment', 0.3866685628890991),
 ('baristas', 0.38619837164878845)]

In [14]:
model.most_similar('bad')

  """Entry point for launching an IPython kernel.


[('terrible', 0.6086654663085938),
 ('horrible', 0.5806093215942383),
 ('good', 0.5596156120300293),
 ('awful', 0.5373344421386719),
 ('poor', 0.522222101688385),
 ('disappointing', 0.49767547845840454),
 ('alright', 0.4742405414581299),
 ('ok', 0.4618035554885864),
 ('greatest', 0.45794445276260376),
 ('okay', 0.4555124044418335)]

In [15]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # WV.Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = int(0.)
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 1000th review
       if counter%1000. == 0.:
           print ("Review %d of %d" % (counter, len(reviews)))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
       counter = counter + 1
    return reviewFeatureVecs

In [16]:
# ****************************************************************
# Calculate average feature vectors
# using the functions we defined above. Notice that we now use stop word
# removal.

clean_reviews = []
for review in dfreviews["text"]:
    clean_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

reviewDataVecs = getAvgFeatureVecs( clean_reviews, model, num_features )

Review 0 of 100000




Review 1000 of 100000
Review 2000 of 100000
Review 3000 of 100000
Review 4000 of 100000
Review 5000 of 100000
Review 6000 of 100000
Review 7000 of 100000
Review 8000 of 100000




Review 9000 of 100000
Review 10000 of 100000
Review 11000 of 100000
Review 12000 of 100000
Review 13000 of 100000
Review 14000 of 100000
Review 15000 of 100000
Review 16000 of 100000
Review 17000 of 100000
Review 18000 of 100000
Review 19000 of 100000
Review 20000 of 100000
Review 21000 of 100000
Review 22000 of 100000
Review 23000 of 100000
Review 24000 of 100000
Review 25000 of 100000
Review 26000 of 100000
Review 27000 of 100000
Review 28000 of 100000
Review 29000 of 100000
Review 30000 of 100000
Review 31000 of 100000
Review 32000 of 100000
Review 33000 of 100000
Review 34000 of 100000
Review 35000 of 100000
Review 36000 of 100000
Review 37000 of 100000
Review 38000 of 100000
Review 39000 of 100000
Review 40000 of 100000
Review 41000 of 100000
Review 42000 of 100000
Review 43000 of 100000
Review 44000 of 100000
Review 45000 of 100000
Review 46000 of 100000
Review 47000 of 100000
Review 48000 of 100000
Review 49000 of 100000
Review 50000 of 100000
Review 51000 of 100000
Review 52000

# Make feature matrix (word2vec, votes, stars)

In [17]:
reviewDataVecs.shape[1]

300

In [18]:
# Add non-text data back to feature matrix
review_features = ['cool', 'funny', 'useful', 'stars' , 'business_id']
all_features_labels = ['w2v{}'.format(idx) for idx in range(reviewDataVecs.shape[1])] + review_features
all_features = np.append(reviewDataVecs, dfreviews[review_features].to_numpy(), 1)


In [19]:
# Create df 
all_features_df = pd.DataFrame(data=all_features, columns=all_features_labels)

# Convert all but business_id to numerical
business_ids = all_features_df['business_id']
all_features_df = all_features_df.iloc[:,:-1].astype('float64')
all_features_df['business_id'] = business_ids
del business_ids

# Group by business_id
all_features_business = all_features_df.groupby(by='business_id').mean()

In [20]:
all_features_business.head()

Unnamed: 0_level_0,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,w2v9,...,w2v294,w2v295,w2v296,w2v297,w2v298,w2v299,cool,funny,useful,stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--I7YYLada0tSLkORTHb5Q,-0.010113,-0.005191,-0.001253,0.006725,0.00777,0.011561,-0.003707,0.011003,0.006204,-0.009752,...,0.022868,0.014176,-0.003008,-0.017205,0.010185,-0.012379,0.352941,0.352941,0.823529,3.647059
--U98MNlDym2cLn36BBPgQ,-0.001456,-0.007032,-0.008054,0.001862,0.002557,0.00522,-0.010104,-0.006789,-0.000528,-0.015444,...,0.009817,0.021803,-0.004733,-0.010605,0.008614,-0.014502,0.0,0.0,2.0,3.0
--j-kaNMCo1-DYzddCsA5Q,-0.021356,-0.027066,-0.010531,-0.00617,0.007331,-0.021164,0.012739,0.019847,-0.022999,-0.004694,...,-0.002237,-0.013663,-0.014634,0.002697,0.016767,-0.00421,0.0,0.0,0.0,5.0
--wIGbLEhlpl_UeAIyDmZQ,0.010092,-0.001865,0.007459,-0.008281,0.003777,-0.016988,-0.018937,-0.01491,-0.012829,-0.024872,...,-0.000234,0.007773,0.011672,0.019025,-0.007226,-0.003846,0.666667,0.166667,3.0,3.833333
-000aQFeK6tqVLndf7xORg,-0.003168,0.002194,-0.016079,-0.011542,-0.00464,-0.028274,-0.013379,-0.01007,-0.022016,-0.04127,...,0.008823,0.011324,-0.013361,0.022844,-0.001799,0.006115,0.666667,0.0,0.0,5.0


In [21]:
all_features_business.describe()

Unnamed: 0,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,w2v9,...,w2v294,w2v295,w2v296,w2v297,w2v298,w2v299,cool,funny,useful,stars
count,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,...,13942.0,13942.0,13942.0,13942.0,13942.0,13942.0,13943.0,13943.0,13943.0,13943.0
mean,-0.009644,-0.007059,-0.004529,1.4e-05,0.007524,-0.002177,-0.006919,0.005827,-0.006069,-0.012597,...,0.007251,0.008102,0.001327,0.005025,0.004145,-0.003701,0.486991,0.423987,1.434996,3.615964
std,0.016698,0.01229,0.01447,0.01264,0.009614,0.019429,0.015465,0.019105,0.014467,0.014996,...,0.012946,0.013961,0.012986,0.020609,0.014878,0.012345,1.299472,1.070148,2.371442,1.277067
min,-0.087107,-0.071613,-0.087,-0.078661,-0.05182,-0.083455,-0.091149,-0.091423,-0.077116,-0.084884,...,-0.07756,-0.065941,-0.072834,-0.085656,-0.085916,-0.085738,0.0,0.0,0.0,1.0
25%,-0.020139,-0.014487,-0.013132,-0.008281,0.002227,-0.015653,-0.015611,-0.006464,-0.015406,-0.023257,...,-0.001222,0.000183,-0.006621,-0.011344,-0.0046,-0.010337,0.0,0.0,0.15251,3.0
50%,-0.010494,-0.006518,-0.003756,0.000771,0.007629,-0.00222,-0.007273,0.005804,-0.006396,-0.011889,...,0.008154,0.009284,0.001527,0.006263,0.003867,-0.004363,0.076923,0.0,1.0,4.0
75%,0.000484,0.00064,0.004853,0.008433,0.012787,0.011312,0.001634,0.017845,0.003437,-0.002331,...,0.016365,0.016875,0.009624,0.021208,0.011911,0.003083,0.555556,0.5,1.833333,5.0
max,0.075227,0.056094,0.062206,0.05771,0.069714,0.096254,0.097832,0.109534,0.067639,0.060146,...,0.065252,0.089294,0.06296,0.090201,0.12949,0.103323,56.0,28.0,75.0,5.0


# Create Label y (Business categories)

In [22]:
def load_business_df(): 
    filename = r'../../data/business.json'
    new_list = []
    for line in open(filename):
       new_list.append(json.loads(line))
    return pd.DataFrame.from_records(new_list)

dfbusiness = load_business_df()

In [23]:
dfbusiness.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


# Join x,y (feature matrix, category) using business_id

In [24]:
dfbusiness.columns

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'postal_code',
       'review_count', 'stars', 'state'],
      dtype='object')

In [25]:
len(dfbusiness['stars'].unique())

9

In [26]:
# Add business details to features df
keep_cols = ['business_id', 'categories', 'review_count']
all_features_business = all_features_business.merge(dfbusiness[keep_cols], how='left', on='business_id') 

In [27]:
all_features_business.head()

Unnamed: 0,business_id,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,...,w2v296,w2v297,w2v298,w2v299,cool,funny,useful,stars,categories,review_count
0,--I7YYLada0tSLkORTHb5Q,-0.010113,-0.005191,-0.001253,0.006725,0.00777,0.011561,-0.003707,0.011003,0.006204,...,-0.003008,-0.017205,0.010185,-0.012379,0.352941,0.352941,0.823529,3.647059,"Nightlife, Sports Bars, Restaurants, Bars, Ame...",96
1,--U98MNlDym2cLn36BBPgQ,-0.001456,-0.007032,-0.008054,0.001862,0.002557,0.00522,-0.010104,-0.006789,-0.000528,...,-0.004733,-0.010605,0.008614,-0.014502,0.0,0.0,2.0,3.0,"Pizza, Restaurants",4
2,--j-kaNMCo1-DYzddCsA5Q,-0.021356,-0.027066,-0.010531,-0.00617,0.007331,-0.021164,0.012739,0.019847,-0.022999,...,-0.014634,0.002697,0.016767,-0.00421,0.0,0.0,0.0,5.0,"Hair Removal, Nail Technicians, Beauty & Spas,...",4
3,--wIGbLEhlpl_UeAIyDmZQ,0.010092,-0.001865,0.007459,-0.008281,0.003777,-0.016988,-0.018937,-0.01491,-0.012829,...,0.011672,0.019025,-0.007226,-0.003846,0.666667,0.166667,3.0,3.833333,"Electronics, Professional Services, Local Serv...",14
4,-000aQFeK6tqVLndf7xORg,-0.003168,0.002194,-0.016079,-0.011542,-0.00464,-0.028274,-0.013379,-0.01007,-0.022016,...,-0.013361,0.022844,-0.001799,0.006115,0.666667,0.0,0.0,5.0,"Automotive, Auto Repair",7


In [28]:
all_features_business['categories'][0]

'Nightlife, Sports Bars, Restaurants, Bars, American (Traditional)'

In [29]:
all_features_business['categories'][0].split(',')

['Nightlife',
 ' Sports Bars',
 ' Restaurants',
 ' Bars',
 ' American (Traditional)']

In [30]:
# # Create list of all categories
# all_cats = []
# for string in all_features_business['categories']:
# #     print(string)
#     string = str(string)
#     cats = string.split(',')
#     for cat in cats:
#         if cat not in all_cats:
#             all_cats.append(cat)
# print(all_cats)

In [31]:
# len(all_cats)

In [32]:
# Make binary for each cat for each row
# for cat in all_cats:
#     all_features_business[cat] = all_features_business['categories'].str.contains(cat)

In [33]:
all_features_business.head()

Unnamed: 0,business_id,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,...,w2v296,w2v297,w2v298,w2v299,cool,funny,useful,stars,categories,review_count
0,--I7YYLada0tSLkORTHb5Q,-0.010113,-0.005191,-0.001253,0.006725,0.00777,0.011561,-0.003707,0.011003,0.006204,...,-0.003008,-0.017205,0.010185,-0.012379,0.352941,0.352941,0.823529,3.647059,"Nightlife, Sports Bars, Restaurants, Bars, Ame...",96
1,--U98MNlDym2cLn36BBPgQ,-0.001456,-0.007032,-0.008054,0.001862,0.002557,0.00522,-0.010104,-0.006789,-0.000528,...,-0.004733,-0.010605,0.008614,-0.014502,0.0,0.0,2.0,3.0,"Pizza, Restaurants",4
2,--j-kaNMCo1-DYzddCsA5Q,-0.021356,-0.027066,-0.010531,-0.00617,0.007331,-0.021164,0.012739,0.019847,-0.022999,...,-0.014634,0.002697,0.016767,-0.00421,0.0,0.0,0.0,5.0,"Hair Removal, Nail Technicians, Beauty & Spas,...",4
3,--wIGbLEhlpl_UeAIyDmZQ,0.010092,-0.001865,0.007459,-0.008281,0.003777,-0.016988,-0.018937,-0.01491,-0.012829,...,0.011672,0.019025,-0.007226,-0.003846,0.666667,0.166667,3.0,3.833333,"Electronics, Professional Services, Local Serv...",14
4,-000aQFeK6tqVLndf7xORg,-0.003168,0.002194,-0.016079,-0.011542,-0.00464,-0.028274,-0.013379,-0.01007,-0.022016,...,-0.013361,0.022844,-0.001799,0.006115,0.666667,0.0,0.0,5.0,"Automotive, Auto Repair",7


In [34]:
The below fcn is supposed to remove all spaces, but it is not doing it. idk why. 

SyntaxError: invalid syntax (<ipython-input-34-f8816c10a295>, line 1)

In [43]:
def stringDFColToBinaryCols(df, series_name):
    # Create list of all categories
    all_cats = []
    for string in df[series_name]:
        string = str(string)
        cats = string.strip().replace(' ', '').split(',')
        for cat in cats:
            if cat not in all_cats:
                all_cats.append(cat)
    # Make binary for each cat for each row
    for cat in all_cats:
        df[cat] = df[series_name].str.strip().replace(' ', '').str.contains(cat)
    
    return df, all_cats
        
all_features_business, all_cats = stringDFColToBinaryCols(all_features_business, 'categories')

  if sys.path[0] == '':


In [55]:
all_features_business[all_features_business['categories'].str.strip().replace(' ', '').str.contains('Golf')==True]['Golf']

37       True
54       True
267      True
525      True
610      True
822      True
1007     True
1161     True
1598     True
1646     True
1877     True
2171     True
2261     True
3065     True
4327     True
4390     True
4842     True
4991     True
5212     True
5904     True
5958     True
6190     True
6415     True
6660     True
6850     True
7016     True
7079     True
7146     True
7196     True
7675     True
         ... 
8382     True
8529     True
8547     True
8719     True
8824     True
9098     True
9739     True
10166    True
10241    True
10289    True
10515    True
10592    True
10636    True
10686    True
10944    True
11226    True
11362    True
11407    True
11501    True
11790    True
12294    True
12310    True
12494    True
12668    True
12812    True
13035    True
13289    True
13359    True
13446    True
13647    True
Name: Golf, Length: 61, dtype: object

In [44]:
print(all_cats)

['Nightlife', 'SportsBars', 'Restaurants', 'Bars', 'American(Traditional)', 'Pizza', 'HairRemoval', 'NailTechnicians', 'Beauty&Spas', 'NailSalons', 'Waxing', 'DaySpas', 'Electronics', 'ProfessionalServices', 'LocalServices', 'ElectronicsRepair', 'Computers', 'Shopping', 'Automotive', 'AutoRepair', 'Chinese', 'EyelashService', 'TobaccoShops', 'VapeShops', 'CarDealers', 'UsedCarDealers', 'Dentists', 'GeneralDentistry', 'CosmeticDentists', 'PediatricDentists', 'Health&Medical', 'Tex-Mex', 'Mexican', 'Arts&Entertainment', 'Festivals', 'Food', 'FoodTrucks', 'FarmersMarket', 'Portuguese', 'Bakeries', 'ChickenShop', 'Barbeque', 'EventPlanning&Services', 'EventPhotography', 'Photographers', 'SessionPhotography', 'SkinCare', 'Antiques', 'IceCream&FrozenYogurt', 'Donuts', 'SpecialtyFood', 'WebDesign', 'GraphicDesign', 'Marketing', 'RecyclingCenter', 'Caterers', 'Southern', 'ComfortFood', 'Breakfast&Brunch', 'French', 'American(New)', 'Burgers', 'Sandwiches', 'Coffee&Tea', 'Brasseries', 'Gyms', '

In [45]:
all_features_business.columns

Index(['business_id', 'w2v0', 'w2v1', 'w2v2', 'w2v3', 'w2v4', 'w2v5', 'w2v6',
       'w2v7', 'w2v8',
       ...
       'RockClimbing', 'BalloonServices', 'ATVRentals/Tours', 'MassageSchools',
       'Pool&Billiards', 'PettingZoos', 'Toxicologists', 'WaterParks',
       'AirportLounges', 'Australian'],
      dtype='object', length=1397)

In [58]:
all_features_business[all_features_business['Pool&Billiards']==True]

Unnamed: 0,business_id,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,...,RockClimbing,BalloonServices,ATVRentals/Tours,MassageSchools,Pool&Billiards,PettingZoos,Toxicologists,WaterParks,AirportLounges,Australian


In [57]:
all_features_business[all_features_business['Golf']==True]

Unnamed: 0,business_id,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,...,RockClimbing,BalloonServices,ATVRentals/Tours,MassageSchools,Pool&Billiards,PettingZoos,Toxicologists,WaterParks,AirportLounges,Australian
37,-AnLtSHWJjYXFu-UvpPMHA,-0.014482,-0.003012,-0.020955,0.006187,0.008828,0.013181,-0.001462,0.010961,0.001663,...,False,False,False,False,False,False,False,False,False,False
54,-Ek4ibmfqFFQhcfe_HBzlA,-0.016133,-0.005260,-0.021706,-0.009130,-0.003861,-0.014865,-0.011565,0.008397,-0.018524,...,False,False,False,False,False,False,False,False,False,False
267,0IDJEwsntLMgAKBDHRZjGA,0.012588,-0.013008,0.009313,-0.015913,0.001041,-0.015291,-0.021732,-0.018582,-0.026432,...,False,False,False,False,False,False,False,False,False,False
525,1SWheh84yJXfytovILXOAQ,-0.031541,-0.011232,-0.043121,-0.014846,-0.008221,-0.005549,0.002744,0.007301,-0.010419,...,False,False,False,False,False,False,False,False,False,False
610,1qwkbN3wu3AGDxMTXVz2gA,-0.004550,-0.011639,-0.002545,0.003994,0.010406,0.011735,-0.004884,0.003749,-0.009131,...,False,False,False,False,False,False,False,False,False,False
822,2r2-SUffzmo22lvKSiIKvg,-0.031969,-0.008633,-0.004458,0.009101,0.016207,0.012636,0.012241,0.037767,-0.014736,...,False,False,False,False,False,False,False,False,False,False
1007,3fdtp-bzoE4ZgTakkcEBzQ,-0.016544,-0.001947,0.005464,0.010125,0.013476,0.004892,-0.018859,-0.001401,-0.007172,...,False,False,False,False,False,False,False,False,False,False
1161,4PNCZxJ7fwsf9TH30kwGPQ,0.024196,-0.001454,0.009570,0.001588,0.002894,0.009317,-0.008664,0.007551,-0.006987,...,False,False,False,False,False,False,False,False,False,False
1598,6F5jq6nJmZZjmwxOIhSrAQ,-0.007809,-0.005780,-0.005660,0.002666,0.004219,0.022762,0.005441,0.012965,-0.015723,...,False,False,False,False,False,False,False,False,False,False
1646,6Tv6vdqL5Aj28bTCd_YdHg,0.005856,-0.008655,0.002249,0.008781,0.000095,0.005929,-0.022200,-0.019227,-0.020648,...,False,False,False,False,False,False,False,False,False,False


In [47]:
all_features_business.head()

Unnamed: 0,business_id,w2v0,w2v1,w2v2,w2v3,w2v4,w2v5,w2v6,w2v7,w2v8,...,RockClimbing,BalloonServices,ATVRentals/Tours,MassageSchools,Pool&Billiards,PettingZoos,Toxicologists,WaterParks,AirportLounges,Australian
0,--I7YYLada0tSLkORTHb5Q,-0.010113,-0.005191,-0.001253,0.006725,0.00777,0.011561,-0.003707,0.011003,0.006204,...,False,False,False,False,False,False,False,False,False,False
1,--U98MNlDym2cLn36BBPgQ,-0.001456,-0.007032,-0.008054,0.001862,0.002557,0.00522,-0.010104,-0.006789,-0.000528,...,False,False,False,False,False,False,False,False,False,False
2,--j-kaNMCo1-DYzddCsA5Q,-0.021356,-0.027066,-0.010531,-0.00617,0.007331,-0.021164,0.012739,0.019847,-0.022999,...,False,False,False,False,False,False,False,False,False,False
3,--wIGbLEhlpl_UeAIyDmZQ,0.010092,-0.001865,0.007459,-0.008281,0.003777,-0.016988,-0.018937,-0.01491,-0.012829,...,False,False,False,False,False,False,False,False,False,False
4,-000aQFeK6tqVLndf7xORg,-0.003168,0.002194,-0.016079,-0.011542,-0.00464,-0.028274,-0.013379,-0.01007,-0.022016,...,False,False,False,False,False,False,False,False,False,False


In [None]:
[ele for ele in all_features_business.columns if ele not in all_cats]

In [None]:
# Clean

# Remove rows with NaNs
print('Before: ', len(all_features_business), '. ')
all_features_business = all_features_business.dropna(axis=0)
print('After: ', len(all_features_business))

In [None]:
# Create final y and x 

y_df = all_features_business[all_cats]
x_cols = [ele for ele in all_features_business.columns if ele not in all_cats+['categories', 'business_id']]
# May also want to remove from x_cols: 'cool', 'funny', 'useful', 'stars', 'categories', 'review_count' 

x_df = all_features_business[x_cols]

x = x_df.values
y = y_df.values

# Classifier wants 1/0, not T/F
y = y.astype(int)

In [None]:
y

# Category Prediction

In [None]:
# Multilabel Classification
# RandomForestClassifier supports multilabel classification

# Most other classifiers will require use of 
    # sklearn.multioutput.MultiOutputClassifier to run a separate classifier model for each targe
    
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1)

In [None]:
rfc.fit(x,y)

In [None]:
x[0]

In [None]:
predict0 = rfc.predict(x[0].reshape(1,-1))
predict0

In [None]:
y[0]==1

In [None]:
working on figuring out how to produce the list of items the model predicted to be 1. ie do boolean indexing

In [None]:
all_cats_ser = pd.Series(data=all_cats)
y0cats = all_cats_ser.loc[y[0]==1]
list(y0cats)

In [None]:
all_features_business.iloc[0]

In [None]:
plt.figure(figsize=(15,5))
plt.bar(range(predict0.shape[1]), predict0[0])

In [None]:
plt.figure(figsize=(15,5))
plt.bar(range(predict0.shape[1]), y[0])

In [None]:
break

In [None]:
# Cluster users using K-means
from sklearn.cluster import MiniBatchKMeans

max_clusters = 200 # 10
kmeans_cost = []
for num_clusters in range(1,max_clusters):
    k_means_clutering = MiniBatchKMeans(n_clusters=num_clusters)
    k_means_clutering.fit(reviewDataVecs[~np.isnan(reviewDataVecs).any(axis=1)]) # Drop rows that have NAN
    kmeans_cost.append(k_means_clutering.inertia_)
    

In [None]:
# Determine the best value of K to use (the number of clusters)
# plot the cost against K values 
plt.plot(range(1, max_clusters), kmeans_cost, color ='g', linewidth ='3') 
plt.title('20 Clusters of Reviews:', fontsize=20)
plt.xlabel("Value of K") 
plt.ylabel("Sqaured Error (Cost)") 
plt.show() # clear the plot 

In [None]:
# Cluster users using K-means
# Interpret user clusters

from sklearn.cluster import KMeans

num_clusters = 20
k_means_clutering = KMeans(n_clusters=num_clusters)
k_means_clutering.fit(reviewDataVecs[~np.isnan(reviewDataVecs).any(axis=1)])

In [None]:
cluster_centers_df = pd.DataFrame(data=k_means_clutering.cluster_centers_, columns=all_features_df.iloc[:,:-4].columns)
cluster_centers_df.head()

In [None]:
cluster_centers_df = cluster_centers_df.merge(right=all_features_df.dropna()[['cool', 'funny', 'useful', 'stars']].groupby(by=k_means_clutering.labels_).mean(), right_index=True, left_index=True)


In [None]:
cluster_centers_df.head()

In [None]:
cluster_centers_df.iloc[0]['stars']

In [None]:
from wordcloud import WordCloud

for cluster in range(len(cluster_centers_df)):
#     series = user_cluster_centers_df.iloc[cluster,:-4] #Use relevant row, drop non-word cols
#     wc_dict = series.to_dict()
    series = model.wv.similar_by_vector(cluster_centers_df.iloc[cluster,:-4].values, topn=100)
    wc_dict = {}
    for key, val in series:
        wc_dict[key] = val
    wordcloud = WordCloud().generate_from_frequencies(wc_dict)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title('Cluster{0:3d}: {1:.2f}stars, {2:.2f}useful'.format(cluster, cluster_centers_df.iloc[cluster]['stars'], 
                                                    cluster_centers_df.iloc[cluster]['useful']), fontsize=20)
    plt.axis('off')
    plt.show()
    

In [None]:
print('The most similar words to the avg vector describing each user cluster:\n')
for cluster in range(len(cluster_centers_df)):
    print('Cluster {}'.format(cluster))
    display(
        model.wv.similar_by_vector(cluster_centers_df.iloc[cluster,:].values, topn=10)
    )

# Cluster with metadata (useful, cool, funny, stars)

In [None]:
# Cluster users using K-means
from sklearn.cluster import MiniBatchKMeans

max_clusters = 200 # 10
kmeans_cost = []
for num_clusters in range(1,max_clusters):
    k_means_clutering = MiniBatchKMeans(n_clusters=num_clusters)
    k_means_clutering.fit(all_features_df.dropna()) # Drop rows that have NAN
    kmeans_cost.append(k_means_clutering.inertia_)
    

In [None]:
# Determine the best value of K to use (the number of clusters)
# plot the cost against K values 
plt.plot(range(1, max_clusters), kmeans_cost, color ='g', linewidth ='3') 
plt.title('20 Clusters of Reviews:', fontsize=20)
plt.xlabel("Value of K") 
plt.ylabel("Sqaured Error (Cost)") 
plt.show() # clear the plot 

In [None]:
# Cluster users using K-means
# Interpret user clusters

from sklearn.cluster import KMeans

num_clusters = 20
k_means_clutering = KMeans(n_clusters=num_clusters)
k_means_clutering.fit(all_features_df.dropna())

In [None]:
cluster_centers_df = pd.DataFrame(data=k_means_clutering.cluster_centers_, columns=all_features_df.iloc[:,:].columns)
# cluster_centers_df = cluster_centers_df.merge(right=all_features_df.dropna()[['cool', 'funny', 'useful', 'stars']].groupby(by=k_means_clutering.labels_).mean(), right_index=True, left_index=True)
cluster_centers_df.head()


In [None]:
from wordcloud import WordCloud

for cluster in range(len(cluster_centers_df)):
#     series = user_cluster_centers_df.iloc[cluster,:-4] #Use relevant row, drop non-word cols
#     wc_dict = series.to_dict()
    series = model.wv.similar_by_vector(cluster_centers_df.iloc[cluster,:-4].values, topn=100)
    wc_dict = {}
    for key, val in series:
        wc_dict[key] = val
    wordcloud = WordCloud().generate_from_frequencies(wc_dict)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title('Cluster{0:3d}: {1:.2f}stars, {2:.2f}useful'.format(cluster, cluster_centers_df.iloc[cluster]['stars'], 
                                                    cluster_centers_df.iloc[cluster]['useful']), fontsize=20)
    plt.axis('off')
    plt.show()
    

In [None]:
for col in ['cool', 'funny', 'useful', 'stars']:
    plt.bar(x=range(len(cluster_centers_df)), height=cluster_centers_df[col])
    plt.title('{}'.format(col))
    plt.show()
# display(cluster_centers_df[['cool', 'funny', 'useful', 'stars']])