In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from yelp_functions import get_processed_inputs

# read file with reviews data for restaurants
reviews = pd.read_csv('processed_data/reviews_restaurants.csv')

# create a BoW vectorizer using a random sample of 10% of the reviews
vectorizer_inputs = reviews.sample(frac=0.1, random_state=0)[['text']]
vectorizer_inputs = get_processed_inputs(vectorizer_inputs)
vectorizer = CountVectorizer(max_features=4096, ngram_range=(1,2))
vectorizer_inputs = vectorizer.fit_transform(vectorizer_inputs)

In [3]:
# zip the id's and prices together
restaurants_zip = list(zip(list(reviews.groupby('business_id').mean().index),
                           list(reviews.groupby('business_id').mean()['price_range'])))

# check results of sampled zip
restaurants_zip

[('--0iUa4sNDFiZFrAdIWhZQ', 1),
 ('--7PUidqRWpRSpXebiyxTg', 2),
 ('--8IbOsAAxjKRoYsBFL-PA', 2),
 ('--ZVrH2X2QXBFdCilbirsw', 1),
 ('--epgcb7xHGuJ-4PUeSLAw', 1),
 ('--lqIzK-ZVTtgwiQM63XgQ', 1),
 ('--onnLZrsCazmcy2P_7fcw', 2),
 ('--qLiYw2ErSmvVwumb2kdw', 3),
 ('-09Oc2D14vRnmirPh0vlXw', 2),
 ('-0G_6-KFGpCpxTUlVXCMYQ', 2),
 ('-0M0b-XhtFagyLmsBtOe8w', 2),
 ('-0PN_KFPtbnLQZEeb23XiA', 2),
 ('-0TffRSXXIlBYVbb5AwfTg', 2),
 ('-0__F9fnKt8uioCKztF5Ww', 2),
 ('-0dKgi_Hpcis921nOpM85Q', 2),
 ('-0eUa8TsXFFy0FCxHYmrjg', 1),
 ('-0epFLgYq2C1Jo_W4FOBKw', 1),
 ('-0gWtMKg8_iV6vC5wRFDiA', 1),
 ('-0i2KNr7WrCsDF5m0IViJg', 2),
 ('-0iIxySkp97WNlwK66OGWg', 1),
 ('-0jK77zdE3-plqXuwXtilQ', 2),
 ('-0jzoPt3UeXn6FUXVQvyPg', 2),
 ('-0m4IwD1FIOqkA8dh4mVfQ', 1),
 ('-1B9pP_CrRBJYPICE5WbRA', 2),
 ('-1MhPXk1FglglUAmuPLIGg', 1),
 ('-1PG6k_iezwJmRZLB7f6og', 2),
 ('-1PvWminK3Er7fqpTjk19A', 2),
 ('-1WM2044r3jVZC6oQ2QeVA', 1),
 ('-1XSzguS6XLN-V6MVZMg2A', 3),
 ('-1b2kNOowsPrPpBOK4lNkQ', 2),
 ('-1iLbEf1NwY-OJp5Hg-3Sg', 1),
 ('-1owB

In [5]:
from yelp_functions import sum_to_one
from tqdm.notebook import tqdm

restaurants_zip = restaurants_zip[:200]

# create empty list for restaurant vectors and price ratings
restaurants_vectors = []
restaurants_price = []

# create vectors for restaurants
for restaurant_id, price_range in tqdm(restaurants_zip):

    # set up the initial conditions
    r_vector = np.zeros(4096)
    temp_df = reviews[reviews['business_id'] == restaurant_id]
    bow_inputs = get_processed_inputs(temp_df[['text']])

    # get individual BoW representations
    bow_inputs = vectorizer.transform(bow_inputs)
    bow_inputs = bow_inputs.toarray()
    bow_inputs = sum_to_one(bow_inputs)

    # add up the individual BoW representations
    for vec in bow_inputs:
        r_vector += vec

    restaurants_vectors.append(r_vector)
    restaurants_price.append(price_range)

# check results for restaurants vectors
len(restaurants_vectors), restaurants_vectors[0].shape[0], restaurants_vectors[0].sum()

  0%|          | 0/200 [00:00<?, ?it/s]

(200, 4096, 14.0)

In [19]:
import pickle

ids = list(reviews.groupby('business_id').mean().index)

normalized_restaurants = sum_to_one(restaurants_vectors)
data_to_save = [ids, normalized_restaurants, restaurants_price]

with open("processed_data/restaurant_vectors", "wb") as pickle_file:
    pickle.dump(data_to_save, pickle_file)