In [6]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from random import shuffle
import json
from pymongo import MongoClient
from collections import Counter
from geopy.geocoders import Nominatim
geolocator = Nominatim()
pd.set_option('display.max_columns', 200)
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
def create_pandas_df_from_json(path):
    '''
    Loads a JSON-lines file (one JSON object per line) into a dataframe.

    INPUT: filepath (or URL) string
    OUTPUT: pandas dataframe with one row per line of the file
    '''
    # Read from the argument. The previous body read the notebook-level
    # `file_path` variable, so `path` was silently ignored.
    return pd.read_json(path, lines=True)

def is_food(item):
    '''
    Tells whether a business's categories mark it as food-related.

    INPUT: cell from pandas dataframe (iterable of category strings);
           a missing cell (None or float NaN) counts as "no categories"
    OUTPUT: boolean
    '''
    restaurants_and_related_categories = {'Restaurants', 'Italian', 'Food', 'Bars', 'Fast Food', 'Coffee & Tea', 'Sandwiches'}
    # Missing cells are not iterable; treat them as non-food instead of raising.
    if item is None or isinstance(item, float):
        return False
    # True as soon as one category is in the food-related set.
    return not restaurants_and_related_categories.isdisjoint(item)
    
def flatten_dict(row):
    '''
    Flattens one level of nesting in a dictionary.

    INPUT: dict whose values may themselves be dicts;
           a missing cell (None or float NaN) is treated as an empty dict
    OUTPUT: new dict; a nested entry {'a': {'b': 1}} becomes {'a|b': 1},
            every other entry is copied unchanged
    '''
    out = {}
    # Missing cells have no .items(); return the empty result instead of raising.
    if row is None or isinstance(row, float):
        return out
    for key, value in row.items():
        # isinstance also recognises dict subclasses (e.g. OrderedDict).
        if isinstance(value, dict):
            for k, v in value.items():
                out[key + "|" + k] = v
        else:
            out[key] = value
    return out

def make_exists_function(key):
    '''
    Builds a lookup function bound to one key.

    INPUT: key to look up
    OUTPUT: function that takes a row and returns row[key] when the key is
            present in the row, otherwise the string "N/A"
    '''
    def get_key_if_exists(row):
        return row[key] if key in row else "N/A"
    return get_key_if_exists

def add_restaurant_count_column(dataframe):
    '''
    Adds a `restaurant_count` column: how many rows share each `name`.

    INPUT: pandas dataframe with `name` and `address` columns
    OUTPUT: new dataframe, the input left-joined with the per-name counts
            (rows with a null `address` are not counted); row order of the
            input is preserved and the input itself is not modified
    '''
    restaurant_frequency = (
        dataframe.groupby('name')['address']
        .count()
        .rename('restaurant_count')
        # Turn `name` back into an ordinary column. Keeping it as both the
        # index name and a column makes the merge key ambiguous for pandas.
        .reset_index()
    )
    # Merge into the argument; the previous body merged into the notebook-level
    # `previously_open_US_restaurants` and ignored `dataframe` here.
    return dataframe.merge(restaurant_frequency, how='left', on='name')

def closed_on_google(row):
    '''
    INPUT: cell holding the list of Google results for one restaurant
    OUTPUT: the `permanently_closed` value of the first result, or False when
            there is no first result, it lacks that field, or the cell is
            not subscriptable (e.g. None/NaN)
    '''
    try:
        return row[0]['permanently_closed']
    # Only the "data is absent" cases fall back to False. Anything else
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    except (IndexError, KeyError, TypeError):
        return False
    
def fix_percent(row):
    '''
    INPUT: cell holding a percentage such as "50%" (anything str() can render)
    OUTPUT: float, the numeric value with any leading/trailing '%' removed, divided by 100
    '''
    return float(str(row).strip('%')) / 100

def summaries_from_google(dataframe, key, default_val=0):
    '''
    Averages one field over each row's list of Google results.

    INPUT: dataframe  - pandas dataframe with a `yelp_business_id` column and a
                        `results` column holding a list of dicts per row
           key        - field to average inside each result dict
           default_val- value used when none of a row's results has `key`
    OUTPUT: pandas dataframe with columns `business_id` and `avg_<key>`,
            one row per input row, in input order
    '''
    avg_column = 'avg_' + key
    summaries = []
    # Iterate the two columns positionally and read ids from `dataframe` itself.
    # The previous body took ids from the notebook-level `nearby_df` and
    # indexed by label, which only worked with a 0..n-1 index.
    for business_id, results in zip(dataframe['yelp_business_id'], dataframe['results']):
        # Results that lack the field are skipped, not counted as zero.
        values = [result[key] for result in results if key in result]
        average = sum(values) / len(values) if values else default_val
        summaries.append({'business_id': business_id, avg_column: average})
    # Explicit columns keep the expected schema even for an empty input.
    return pd.DataFrame(summaries, columns=['business_id', avg_column])

def get_price(row):
    '''
    INPUT: attributes mapping of one restaurant
    OUTPUT: its 'RestaurantsPriceRange2' value, or 1.5 when that key is absent
    '''
    try:
        price = row['RestaurantsPriceRange2']
    except KeyError:
        price = 1.5
    return price

In [3]:
#location of the Yelp business file (read as JSON lines by create_pandas_df_from_json)
file_path = 'https://s3-us-west-2.amazonaws.com/businesspredictiondata/business.json'
#one row per business; this download happens every time the cell runs
yelp_business_data = create_pandas_df_from_json(file_path)

In [None]:
#filters businesses that were open when this dataset was published Jan. 2018
#.copy() makes each filtered frame independent, so the columns added below are
#not written onto a slice of the parent frame (avoids SettingWithCopyWarning)
open_businesses = yelp_business_data[yelp_business_data['is_open'] == 1].copy()

#creates column that says if business is restaurant and creates df of just open restaurants
open_businesses['is_food'] = open_businesses['categories'].apply(is_food)
open_restaurants = open_businesses[open_businesses['is_food'] == True].copy()

#creates column that says if business is in USA and creates df of just
#restaurants open in the US as of January 2018
#(the list holds the 50 state codes plus DC)
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
      "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
      "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
      "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
      "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
open_restaurants['in_US'] = open_restaurants['state'].isin(states)
previously_open_US_restaurants = open_restaurants[open_restaurants['in_US'] == True].copy()

#creates dummy columns for the 50 most common (flattened) business attributes:
#  'Attribute|has_<attr>'     whether the attribute is present for the restaurant
#  'Attribute|<attr> value:'  the attribute's value, or "N/A" when it is absent
#work on an independent copy so the assignments below never target a slice of another frame
previously_open_US_restaurants = previously_open_US_restaurants.copy()
previously_open_US_restaurants['flat_attributes'] = previously_open_US_restaurants['attributes'].apply(flatten_dict)
all_attributes = []
for row in previously_open_US_restaurants['flat_attributes']:
    all_attributes.extend(row.keys())
#most_common returns (attribute, count) pairs ordered by frequency; keep the names only
unique_attributes = [attribute for attribute, _count in Counter(all_attributes).most_common(50)]

for key in unique_attributes:
    #key=key freezes the current attribute inside the lambda instead of relying on the loop variable
    previously_open_US_restaurants['Attribute|has_'+key] = previously_open_US_restaurants['flat_attributes'].apply(lambda x, key=key: key in x)

    f = make_exists_function(key)
    previously_open_US_restaurants['Attribute|' +key + ' value:'] = previously_open_US_restaurants['flat_attributes'].apply(f)
    
#every category label of every remaining restaurant, repeats included
#(built directly; the previous list comprehension was run only for its side
#effect and produced a throwaway list of None)
all_categories = [category
                  for categories in previously_open_US_restaurants['categories']
                  for category in categories]

#the 50 most frequent labels, most frequent first
most_common_categories = [category for category, _count in Counter(all_categories).most_common(50)]

#one boolean column per common category
for key in most_common_categories:
    previously_open_US_restaurants[f"Category|{key}_true"] = previously_open_US_restaurants['categories'].apply(lambda x, key=key: key in x)

previously_open_US_restaurants = add_restaurant_count_column(previously_open_US_restaurants)
    
#connects to a MongoDB server on localhost; the 'restaurants' database handle is reused by later cells
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_places = restaurants['google_places']
#NOTE(review): start_time is never read in the visible part of the notebook -- confirm it is still needed
start_time = time.time()

#loads every document of the google_places collection into memory
google_df = pd.DataFrame(list(google_places.find()))

google_df = google_df[['queried_name', 'yelp_business_id', 'results']]

#flag taken from the first Google result of each restaurant (False when unavailable)
google_df['closed_on_google'] = google_df['results'].apply(closed_on_google)

#inner join: restaurants without a google_places document are dropped here
restaurants_with_google_data = previously_open_US_restaurants.merge(google_df, how='inner', left_on='business_id', right_on='yelp_business_id')

#removes rows without any matching data from Google
restaurants_with_google_data = restaurants_with_google_data[restaurants_with_google_data['results'].map(len) > 0]

#zip-code level census data; addressed relative to the notebook like the other
#files under ../data/ rather than through an absolute path into one user's home directory
zip_code_df = pd.read_csv('../data/zip_code_data.csv')

#zip codes become strings so they can be joined to Yelp's postal_code;
#zfill restores leading zeros that are lost when the column is parsed as integers
zip_code_df['Zip Code'] = zip_code_df['Zip Code'].apply(str).str.zfill(5)

#left join: restaurants whose postal code has no census row are kept with missing values
restaurants_with_economic_data = restaurants_with_google_data.merge(zip_code_df, how='left', left_on='postal_code', right_on='Zip Code')

#missing values in the trailing 19 columns become 0
#NOTE(review): 19 is presumably the number of joined census columns excluding the CSV's index column -- confirm against the file
restaurants_with_economic_data.iloc[:,-19:] = restaurants_with_economic_data.iloc[:,-19:].fillna(0).copy()

#percentages such as "92.8%" are converted to fractions
percent_columns = ['Educational Attainment: Percent high school graduate or higher', 'Individuals below poverty level']
for col in percent_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(fix_percent)

num_columns = ['2016 ACS 5-Year Population Estimate',
 'American Indian and Alaska Native alone',
 'Asian alone',
 'Black or African American alone',
 'Census 2010 Total Population',
 'Foreign Born Population',
 'Hispanic or Latino (of any race)',
 'Median Age',
 'Median Household Income',
 'Native Hawaiian and Other Pacific Islander alone',
 'Some Other Race alone',
 'Total housing units',
 'Two or More Races',
 'Veterans',
 'White alone',
 'White alone, Not Hispanic or Latino']

#count-like columns are converted to Python ints (fractional parts are truncated)
for col in num_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(int)

#adds nearby data using google maps api data: among nearby restaurants: 1) count 2) avg_rating 3) avg_price
#this cell loads the raw documents and derives the count; the averages are computed from nearby_df afterwards
maps_nearby = restaurants['maps_nearby']
nearby_documents = list(maps_nearby.find())
nearby_df = pd.DataFrame(nearby_documents)
#number of entries in each restaurant's list of nearby results
nearby_df['num_nearby_restaurants'] = nearby_df['results'].map(len)

In [114]:
#average price level / rating of each restaurant's nearby results;
#1.5 and 3.5 are the fallback averages used when no nearby result carries the field
nearby_prices = summaries_from_google(nearby_df, 'price_level', 1.5)
nearby_ratings = summaries_from_google(nearby_df, 'rating', 3.5)
nearby_prices_and_rating = nearby_prices.merge(nearby_ratings, how='outer', on='business_id')
nearby_prices_rating_num = nearby_prices_and_rating.merge(nearby_df, how='outer', left_on='business_id', right_on='yelp_business_id')
#keep only the id and the three nearby features
trimmed_nearby_data = nearby_prices_rating_num[['business_id','avg_price_level','avg_rating','num_nearby_restaurants']]

#left join: restaurants without nearby data keep missing values in the three feature columns
restaurants_with_nearby_data = restaurants_with_economic_data.merge(trimmed_nearby_data, how='left', on='business_id')

#restaurant's own star rating minus the average rating of its nearby restaurants
restaurants_with_nearby_data['relative rating'] = restaurants_with_nearby_data['stars'] - restaurants_with_nearby_data['avg_rating']

#NOTE(review): in file order this CSV is written before the price_level / relative_price
#columns are added in the following cells, so it will not contain them on a top-to-bottom run -- confirm intended
#the dataframe index is written as the first, unnamed CSV column
restaurants_with_nearby_data.to_csv('../data/featurized_dataframe.csv')

In [115]:
#restaurant's own Yelp price range; get_price falls back to 1.5 when the attribute is absent
restaurants_with_nearby_data['price_level'] = restaurants_with_nearby_data['attributes'].apply(get_price)

In [116]:
#restaurant's own price level minus the average price level of its nearby restaurants
restaurants_with_nearby_data['relative_price'] = restaurants_with_nearby_data['price_level'] - restaurants_with_nearby_data['avg_price_level']

In [117]:
#preview of the first rows of the featurized frame
restaurants_with_nearby_data.head()

Unnamed: 0.1,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,is_food,in_US,flat_attributes,Attribute|has_BusinessAcceptsCreditCards,Attribute|BusinessAcceptsCreditCards value:,Attribute|has_RestaurantsPriceRange2,Attribute|RestaurantsPriceRange2 value:,Attribute|has_RestaurantsTakeOut,Attribute|RestaurantsTakeOut value:,Attribute|has_BusinessParking|garage,Attribute|BusinessParking|garage value:,Attribute|has_BusinessParking|street,Attribute|BusinessParking|street value:,Attribute|has_BusinessParking|lot,Attribute|BusinessParking|lot value:,Attribute|has_BusinessParking|valet,Attribute|BusinessParking|valet value:,Attribute|has_BusinessParking|validated,Attribute|BusinessParking|validated value:,Attribute|has_BikeParking,Attribute|BikeParking value:,Attribute|has_OutdoorSeating,Attribute|OutdoorSeating value:,Attribute|has_RestaurantsGoodForGroups,Attribute|RestaurantsGoodForGroups value:,Attribute|has_RestaurantsDelivery,Attribute|RestaurantsDelivery value:,Attribute|has_RestaurantsReservations,Attribute|RestaurantsReservations value:,Attribute|has_GoodForKids,Attribute|GoodForKids value:,Attribute|has_HasTV,Attribute|HasTV value:,Attribute|has_WiFi,Attribute|WiFi value:,Attribute|has_Ambience|romantic,Attribute|Ambience|romantic value:,Attribute|has_Ambience|intimate,Attribute|Ambience|intimate value:,Attribute|has_Ambience|classy,Attribute|Ambience|classy value:,Attribute|has_Ambience|hipster,Attribute|Ambience|hipster value:,Attribute|has_Ambience|touristy,Attribute|Ambience|touristy value:,Attribute|has_Ambience|trendy,Attribute|Ambience|trendy value:,Attribute|has_Ambience|upscale,Attribute|Ambience|upscale value:,Attribute|has_Ambience|casual,Attribute|Ambience|casual value:,Attribute|has_Ambience|divey,Attribute|Ambience|divey value:,Attribute|has_Alcohol,Attribute|Alcohol value:,Attribute|has_RestaurantsAttire,Attribute|RestaurantsAttire 
value:,Attribute|has_GoodForMeal|dessert,Attribute|GoodForMeal|dessert value:,Attribute|has_GoodForMeal|latenight,Attribute|GoodForMeal|latenight value:,Attribute|has_GoodForMeal|lunch,Attribute|GoodForMeal|lunch value:,Attribute|has_GoodForMeal|dinner,Attribute|GoodForMeal|dinner value:,Attribute|has_GoodForMeal|breakfast,Attribute|GoodForMeal|breakfast value:,Attribute|has_GoodForMeal|brunch,Attribute|GoodForMeal|brunch value:,Attribute|has_Caters,Attribute|Caters value:,Attribute|has_NoiseLevel,Attribute|NoiseLevel value:,Attribute|has_RestaurantsTableService,Attribute|RestaurantsTableService value:,Attribute|has_WheelchairAccessible,Attribute|WheelchairAccessible value:,Attribute|has_HappyHour,Attribute|HappyHour value:,Attribute|has_GoodForDancing,Attribute|GoodForDancing value:,Attribute|has_DriveThru,Attribute|DriveThru value:,Attribute|has_Music|dj,Attribute|Music|dj value:,Attribute|has_Music|background_music,Attribute|Music|background_music value:,Attribute|has_Music|no_music,Attribute|Music|no_music value:,Attribute|has_Music|karaoke,Attribute|Music|karaoke value:,Attribute|has_Music|live,Attribute|Music|live value:,Attribute|has_Music|video,Attribute|Music|video value:,Attribute|has_Music|jukebox,Attribute|Music|jukebox value:,Attribute|has_CoatCheck,Attribute|CoatCheck value:,Attribute|has_DogsAllowed,Attribute|DogsAllowed value:,Attribute|has_Smoking,Attribute|Smoking value:,Category|Restaurants_true,Category|Food_true,Category|Nightlife_true,Category|Bars_true,Category|Fast Food_true,Category|American (Traditional)_true,Category|Sandwiches_true,Category|Pizza_true,Category|Mexican_true,Category|Burgers_true,Category|American (New)_true,Category|Breakfast & Brunch_true,Category|Coffee & Tea_true,Category|Grocery_true,Category|Italian_true,Category|Specialty Food_true,Category|Shopping_true,Category|Chinese_true,Category|Event Planning & Services_true,Category|Chicken 
Wings_true,Category|Salad_true,Category|Bakeries_true,Category|Desserts_true,Category|Convenience Stores_true,Category|Ice Cream & Frozen Yogurt_true,Category|Sports Bars_true,Category|Seafood_true,Category|Beer_true,Category|Wine & Spirits_true,Category|Caterers_true,Category|Delis_true,Category|Cafes_true,Category|Drugstores_true,Category|Japanese_true,Category|Arts & Entertainment_true,Category|Juice Bars & Smoothies_true,Category|Pubs_true,Category|Steakhouses_true,Category|Sushi Bars_true,Category|Asian Fusion_true,Category|Barbeque_true,Category|Diners_true,Category|Lounges_true,Category|Automotive_true,Category|Gas Stations_true,Category|Cocktail Bars_true,Category|Mediterranean_true,Category|Wine Bars_true,Category|Food Trucks_true,Category|Tex-Mex_true,restaurant_count,queried_name,yelp_business_id,results,closed_on_google,Unnamed: 0,2016 ACS 5-Year Population Estimate,American Indian and Alaska Native alone,Asian alone,Black or African American alone,Census 2010 Total Population,Educational Attainment: Percent high school graduate or higher,Foreign Born Population,Hispanic or Latino (of any race),Individuals below poverty level,Median Age,Median Household Income,Native Hawaiian and Other Pacific Islander alone,Some Other Race alone,Total housing units,Two or More Races,Veterans,White alone,"White alone, Not Hispanic or Latino",Zip Code,avg_price_level,avg_rating,num_nearby_restaurants,relative rating,price_level,relative_price
0,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.47569,Brick House Tavern + Tap,,44221,116,3.5,OH,True,True,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",True,True,True,2.0,True,True,True,False,True,False,True,True,True,False,True,False,True,True,True,True,True,True,True,False,True,False,True,True,True,True,True,free,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,True,True,False,True,full_bar,True,casual,True,False,True,True,True,False,True,True,True,False,True,False,True,False,True,average,True,True,False,,True,True,True,False,True,False,True,False,True,True,True,False,True,False,True,False,True,False,True,False,True,False,False,,True,outdoor,True,False,True,True,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,Brick House Tavern + Tap,PfOCPjBrlQAnz__NXj9h_w,"[{'geometry': {'location': {'lat': 41.119429, ...",False,0.0,29544,65,721,887,29587,0.928,1317,899,0.108,37,49664,9,150,14319,365,1933,27347,26642,44221,1.25,4.077778,18,-0.577778,2.0,0.75
1,2612 Brandt School Rd,"{'BusinessParking': {'garage': False, 'street'...",EsMcGiZaQuG1OOvL9iUFug,"[Coffee & Tea, Ice Cream & Frozen Yogurt, Food]",Wexford,{},1,40.615102,-80.091349,Any Given Sundae,,15090,15,5.0,PA,True,True,"{'BusinessParking|garage': False, 'BusinessPar...",True,True,True,1.0,True,True,True,False,True,False,True,True,True,False,True,False,True,False,True,True,False,,False,,False,,False,,False,,True,free,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,True,False,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,Any Given Sundae,EsMcGiZaQuG1OOvL9iUFug,"[{'geometry': {'location': {'lat': 40.6152352,...",False,1.0,23811,64,2385,487,21202,0.988,2448,259,0.031,42,111636,0,34,9268,310,1235,20531,20355,15090,1.666667,4.216667,6,0.783333,1.0,-0.666667
2,600 E 4th St,"{'GoodForMeal': {'dessert': False, 'latenight'...",fNMVV_ZX7CJSDWQGdOM8Nw,"[Restaurants, American (Traditional)]",Charlotte,"{'Friday': '7:00-15:00', 'Tuesday': '7:00-15:0...",1,35.221647,-80.839345,Showmars Government Center,Uptown,28202,7,3.5,NC,True,True,"{'GoodForMeal|dessert': False, 'GoodForMeal|la...",True,True,True,1.0,True,True,False,,False,,False,,False,,False,,True,True,True,True,True,True,True,False,True,False,True,True,True,False,True,free,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,,True,casual,True,False,True,False,True,False,True,False,True,False,True,False,False,,False,,True,False,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,Showmars Government Center,fNMVV_ZX7CJSDWQGdOM8Nw,"[{'geometry': {'location': {'lat': 35.221647, ...",False,2.0,11993,58,427,2591,11195,0.945,973,845,0.128,30,84688,0,97,7576,588,597,8232,7646,28202,1.4,3.511111,11,-0.011111,1.0,-0.4
3,38295 Chestnut Ridge Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...",Dj0S-Oe4ytRJzMGUPgYUkw,"[Soup, Salad, Sandwiches, Restaurants]",Elyria,"{'Monday': '6:30-21:00', 'Tuesday': '6:30-21:0...",1,41.343078,-82.06714,Panera Bread,,44035,4,2.0,OH,True,True,"{'GoodForMeal|dessert': False, 'GoodForMeal|la...",True,True,False,,True,True,False,,False,,False,,False,,False,,False,,False,,False,,True,False,True,False,True,True,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,True,False,True,False,True,False,True,False,True,False,True,False,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,False,,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,134,Panera Bread,Dj0S-Oe4ytRJzMGUPgYUkw,"[{'geometry': {'location': {'lat': 41.3431885,...",False,3.0,63663,124,559,8418,64263,0.872,1201,3200,0.202,39,43133,65,450,29564,3007,4980,51040,49005,44035,1.5,3.933333,6,-1.933333,1.5,0.0
4,96 S Main St,"{'BusinessParking': {'garage': False, 'street'...",8y56fOiKhtCnqaiYB2S2Qg,"[Nightlife, Pubs, Bars]",Munroe Falls,{},1,41.136622,-81.439259,Brewster's Pub,,44308,4,4.0,OH,True,True,"{'BusinessParking|garage': False, 'BusinessPar...",True,True,True,1.0,False,,True,False,True,False,True,False,True,False,True,False,True,True,True,True,True,True,False,,True,False,False,,True,True,False,,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,full_bar,False,,False,,False,,False,,False,,False,,False,,False,,True,average,False,,False,,True,True,True,False,False,,True,False,True,False,True,False,True,False,True,False,True,False,True,True,True,False,False,,True,outdoor,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,1,Brewster's Pub,8y56fOiKhtCnqaiYB2S2Qg,"[{'geometry': {'location': {'lat': 41.136622, ...",False,4.0,963,0,7,204,1392,0.797,52,60,0.758,38,9775,0,0,448,86,88,666,633,44308,1.4,4.1,7,-0.1,1.0,-0.4


In [60]:
#NOTE: this cell rebuilds nearby_df and restaurants_with_nearby_data, repeating steps
#that also appear in earlier cells of the notebook. To keep a top-to-bottom run
#consistent with those cells it uses the same fallback averages (1.5 for price
#level, 3.5 for rating) and adds the same price_level / relative_price columns.
maps_nearby = restaurants['maps_nearby']
nearby_df = pd.DataFrame(list(maps_nearby.find()))
nearby_df['num_nearby_restaurants'] = nearby_df['results'].apply(lambda x: len(x))

nearby_prices = summaries_from_google(nearby_df, 'price_level', 1.5)
nearby_ratings = summaries_from_google(nearby_df, 'rating', 3.5)
nearby_prices_and_rating = nearby_prices.merge(nearby_ratings, how='outer', on='business_id')
nearby_prices_rating_num = nearby_prices_and_rating.merge(nearby_df, how='outer', left_on='business_id', right_on='yelp_business_id')
trimmed_nearby_data = nearby_prices_rating_num[['business_id','avg_price_level','avg_rating','num_nearby_restaurants']]

restaurants_with_nearby_data = restaurants_with_economic_data.merge(trimmed_nearby_data, how='left', on='business_id')

#restaurant's own values relative to the averages of its nearby restaurants
restaurants_with_nearby_data['relative rating'] = restaurants_with_nearby_data['stars'] - restaurants_with_nearby_data['avg_rating']
restaurants_with_nearby_data['price_level'] = restaurants_with_nearby_data['attributes'].apply(get_price)
restaurants_with_nearby_data['relative_price'] = restaurants_with_nearby_data['price_level'] - restaurants_with_nearby_data['avg_price_level']

In [5]:
#loads the review file (one JSON object per line); the encoding is passed
#explicitly so reading does not depend on the platform's default encoding
with open('../data/review.json', encoding='utf-8') as f:
    reviews = [json.loads(line) for line in f]

reviews_df = pd.DataFrame(reviews)

#reviews are split into three buckets by star rating
five_star_reviews = reviews_df[reviews_df['stars'] == 5]
two_to_four_star_reviews = reviews_df[reviews_df['stars'].isin([2,3,4])]
one_star_reviews = reviews_df[reviews_df['stars'] == 1]

review_series = [five_star_reviews, two_to_four_star_reviews, one_star_reviews]

#each bucket becomes one row per business: all of its review texts joined with ':::' inside braces
for i, star_bucket in enumerate(review_series):
    review_series[i] = (
        star_bucket.groupby('business_id')['text']
        .apply(lambda x: "{%s}" % ':::'.join(x))
        #business_id becomes an ordinary column; leaving it as both the index name
        #and a column makes it an ambiguous merge key for pandas
        .reset_index()
    )

#left-joins the three text columns onto the restaurant frame, one bucket at a time
review_text_columns = ['five_star_review_text', 'two_to_four_star_review_text', 'one_star_review_text']
restaurants_with_stars = restaurants_with_economic_data
for bucket_text, text_column in zip(review_series, review_text_columns):
    restaurants_with_stars = restaurants_with_stars.merge(
        bucket_text.rename(columns={'text': text_column}), how='left', on='business_id')

#restaurants with no review in a bucket get the placeholder text "Empty"
restaurants_with_stars[review_text_columns] = restaurants_with_stars[review_text_columns].fillna("Empty")

In [8]:
#TF-IDF features (top 100 terms, English stop words removed) per review bucket,
#fitted on and computed for every restaurant.
#Column names come from vocabulary_ (term -> column index), ordered by column index.
#This yields the same names as get_feature_names(), which newer scikit-learn
#releases no longer provide.
tfidf_five_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_five_star.fit_transform(restaurants_with_stars['five_star_review_text'])
five_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_five_star.vocabulary_, key=tfidf_five_star.vocabulary_.get))

tfidf_two_to_four_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_two_to_four_star.fit_transform(restaurants_with_stars['two_to_four_star_review_text'])
two_to_four_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_two_to_four_star.vocabulary_, key=tfidf_two_to_four_star.vocabulary_.get))

tfidf_one_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_one_star.fit_transform(restaurants_with_stars['one_star_review_text'])
one_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_one_star.vocabulary_, key=tfidf_one_star.vocabulary_.get))

In [39]:
#splits the restaurants by the Google "permanently closed" flag
#NOTE: open_restaurants here rebinds a name used earlier in the notebook for a different frame
closed_flag = restaurants_with_stars['closed_on_google']
closed_restaurants = restaurants_with_stars.loc[closed_flag == True]
open_restaurants = restaurants_with_stars.loc[closed_flag == False]

In [34]:
#TF-IDF features (top 50 terms) per review bucket: the vocabulary is fitted on the
#closed restaurants only, then every restaurant is transformed with it.
#These assignments reuse the tfidf_* / *_tfidf_df names.
#Column names come from vocabulary_ (term -> column index), ordered by column index.
#This yields the same names as get_feature_names(), which newer scikit-learn
#releases no longer provide.
tfidf_five_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star.fit(closed_restaurants['five_star_review_text'])
feature_matrix = tfidf_five_star.transform(restaurants_with_stars['five_star_review_text'])
five_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_five_star.vocabulary_, key=tfidf_five_star.vocabulary_.get))

tfidf_two_to_four_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_two_to_four_star.fit(closed_restaurants['two_to_four_star_review_text'])
feature_matrix = tfidf_two_to_four_star.transform(restaurants_with_stars['two_to_four_star_review_text'])
two_to_four_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_two_to_four_star.vocabulary_, key=tfidf_two_to_four_star.vocabulary_.get))

tfidf_one_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_one_star.fit(closed_restaurants['one_star_review_text'])
feature_matrix = tfidf_one_star.transform(restaurants_with_stars['one_star_review_text'])
one_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=sorted(tfidf_one_star.vocabulary_, key=tfidf_one_star.vocabulary_.get))

In [53]:
#five-star TF-IDF features for every restaurant, using the top 50 terms of the closed restaurants' reviews
tfidf_five_star_closed = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star_closed.fit(closed_restaurants['five_star_review_text'])
r_closed = tfidf_five_star_closed.transform(restaurants_with_stars['five_star_review_text'])
#column names from vocabulary_ (term -> column index) in column order; works whether or not
#the installed scikit-learn still provides get_feature_names()
r_closed_df = pd.DataFrame(r_closed.toarray(), columns=sorted(tfidf_five_star_closed.vocabulary_, key=tfidf_five_star_closed.vocabulary_.get))

In [54]:
#five-star TF-IDF features for every restaurant, using the top 50 terms of the open restaurants' reviews
tfidf_five_star_open = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star_open.fit(open_restaurants['five_star_review_text'])
r_open = tfidf_five_star_open.transform(restaurants_with_stars['five_star_review_text'])
#column names from vocabulary_ (term -> column index) in column order; works whether or not
#the installed scikit-learn still provides get_feature_names()
r_open_df = pd.DataFrame(r_open.toarray(), columns=sorted(tfidf_five_star_open.vocabulary_, key=tfidf_five_star_open.vocabulary_.get))

In [55]:
#preview: five-star TF-IDF weights using the vocabulary fitted on closed restaurants
r_closed_df.head()

Unnamed: 0,amazing,awesome,best,came,cheese,chicken,chocolate,come,definitely,delicious,dinner,don,eat,excellent,experience,favorite,food,fresh,friendly,good,got,great,just,like,little,love,lunch,make,menu,new,nice,order,ordered,perfect,place,really,recommend,restaurant,right,salad,sauce,service,staff,super,sweet,time,try,ve,vegas,went
0,0.199602,0.174033,0.100815,0.14014,0.036385,0.254869,0.045659,0.103533,0.068707,0.092219,0.091163,0.029109,0.096313,0.045432,0.048815,0.10562,0.416197,0.0,0.127257,0.290894,0.199842,0.444953,0.062299,0.126019,0.02875,0.037744,0.033324,0.074086,0.126948,0.074219,0.052609,0.029795,0.062284,0.063962,0.257407,0.13022,0.056799,0.115025,0.030455,0.094945,0.03493,0.182737,0.067435,0.106202,0.068567,0.115663,0.109931,0.09346,0.0,0.030852
1,0.04197,0.045742,0.158987,0.0,0.0,0.0,0.792052,0.0,0.130022,0.041552,0.0,0.137714,0.0,0.095528,0.0,0.04759,0.0,0.089573,0.240824,0.183498,0.045022,0.136917,0.039298,0.238481,0.090679,0.079364,0.0,0.093467,0.0,0.093636,0.041482,0.0,0.0,0.050435,0.176493,0.0,0.0,0.0,0.048028,0.0,0.055086,0.115271,0.042538,0.095704,0.0,0.040534,0.086681,0.084222,0.0,0.048655
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334568,0.0,0.298005,0.0,0.0,0.0,0.0,0.0,0.0,0.553799,0.0,0.287862,0.0,0.0,0.245489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.506319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305082,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307118,0.0,0.0,0.261911,0.0,0.0,0.346924,0.607272,0.0,0.0,0.382965,0.358237,0.0,0.0,0.0,0.0,0.270094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
#preview: five-star TF-IDF weights using the vocabulary fitted on open restaurants
r_open_df.head()

Unnamed: 0,amazing,awesome,bar,best,came,cheese,chicken,come,definitely,delicious,dinner,don,eat,excellent,experience,favorite,food,fresh,friendly,good,got,great,just,like,little,love,lunch,make,menu,nice,order,ordered,people,perfect,pizza,place,really,recommend,restaurant,right,salad,sauce,service,staff,time,try,ve,vegas,wait,went
0,0.206302,0.16917,0.091026,0.099179,0.14049,0.035033,0.260038,0.099978,0.069407,0.095189,0.088822,0.028182,0.093361,0.045376,0.046893,0.100853,0.409024,0.0,0.119683,0.287873,0.200807,0.449178,0.062766,0.124221,0.029195,0.037294,0.032937,0.071969,0.12766,0.051751,0.028422,0.06178,0.087049,0.063721,0.095057,0.265197,0.130921,0.05843,0.112362,0.029666,0.095446,0.035561,0.177916,0.063749,0.111383,0.11472,0.0916,0.0,0.048047,0.03054
1,0.073488,0.075327,0.097276,0.264969,0.0,0.0,0.0,0.0,0.222517,0.07266,0.0,0.225873,0.0,0.161637,0.0,0.076983,0.0,0.151643,0.383698,0.307636,0.07664,0.234154,0.067075,0.398248,0.155996,0.132848,0.0,0.153819,0.0,0.06913,0.0,0.0,0.0,0.085119,0.0,0.308046,0.0,0.0,0.0,0.079257,0.0,0.095005,0.19013,0.068126,0.066127,0.153244,0.13984,0.0,0.0,0.081591
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325494,0.0,0.309903,0.0,0.0,0.0,0.0,0.0,0.0,0.548324,0.0,0.272754,0.0,0.0,0.249674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290565,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310882,0.0,0.0,0.284575,0.0,0.0,0.379175,0.645818,0.0,0.0,0.414502,0.0,0.0,0.0,0.0,0.0,0.0,0.299504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
