In [6]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from random import shuffle
import json
from pymongo import MongoClient
from collections import Counter
from geopy.geocoders import Nominatim
geolocator = Nominatim()
pd.set_option('display.max_columns', 200)
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def create_pandas_df_from_json(path):
    '''
    INPUT: filepath string
    OUTPUT: pandas database
    '''
    return pd.read_json(file_path, lines=True)

def is_food(item):
    '''
    INPUT: cell from pandas dataframe
    OUTPUT: boolean
    '''
    restaurants_and_related_categories = ['Restaurants', 'Italian','Food', 'Bars','Fast Food', 'Coffee & Tea', 'Sandwiches']
    if len(set(restaurants_and_related_categories) & set(item)) >= 1:
        return True
    else:
        return False
    
def flatten_dict(row):
    out = {}
    for key, value in row.items():
        if type(value) != dict:
            out[key] = value
        else:
            sub_key = key
            for k, v in value.items():
                out[sub_key + "|" + k] = v
    return out

def make_exists_function(key):
    def get_key_if_exists(row):
        if key in row:
            return row[key]
        else:
            return "N/A"
    return get_key_if_exists

def add_restaurant_count_column(dataframe):
    restaurant_frequency = dataframe.groupby(['name']).count().sort_values('address', ascending=False)

    restaurant_frequency = pd.DataFrame(restaurant_frequency['address'])

    restaurant_frequency.columns = ['restaurant_count']

    restaurant_frequency['name'] = restaurant_frequency.index

    restaurant_frequency = restaurant_frequency[['name', 'restaurant_count']]

    return previously_open_US_restaurants.merge(restaurant_frequency, how='left', left_on='name', right_on='name')

def closed_on_google(row):
    try:
        return row[0]['permanently_closed']
    except:
        return False
    
def fix_percent(row):
    row = str(row).strip('%')
    row = float(row)
    return row/100

In [3]:
file_path = 'https://s3-us-west-2.amazonaws.com/businesspredictiondata/business.json'
yelp_business_data = create_pandas_df_from_json(file_path)

In [4]:
#filters businesses that were open when this dataset was published Jan. 2018
open_businesses = yelp_business_data[yelp_business_data['is_open'] == 1]

#creates column that says if business is restaurant and creates df of just open restaurants
open_businesses['is_food'] = open_businesses['categories'].apply(is_food)
open_restaurants = open_businesses[open_businesses['is_food'] == True]

#creates column that says if business is in USA and creates df of just
#restaurants open in the US as of January 2018
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
      "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
      "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
      "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
      "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
open_restaurants['in_US'] = open_restaurants['state'].isin(states)
previously_open_US_restaurants = open_restaurants[open_restaurants['in_US'] == True]

#creates dummy columns for 
previously_open_US_restaurants['flat_attributes'] = previously_open_US_restaurants['attributes'].apply(flatten_dict)
all_attributes = []
for row in previously_open_US_restaurants['flat_attributes']:
    all_attributes.extend(row.keys())
unique_attributes = list(dict(Counter(all_attributes).most_common(50)).keys())

for key in unique_attributes:
    previously_open_US_restaurants['Attribute|has_'+key] = previously_open_US_restaurants['flat_attributes'].apply(lambda x: key in x)
    
    f = make_exists_function(key)
    previously_open_US_restaurants['Attribute|' +key + ' value:'] = previously_open_US_restaurants['flat_attributes'].apply(f)
    
all_categories = []
[all_categories.extend(item) for item in list(previously_open_US_restaurants['categories'])]

most_common_categories = list(dict(Counter(all_categories).most_common(50)).keys())

for key in most_common_categories:
    previously_open_US_restaurants[f"Category|{key}_true"] = previously_open_US_restaurants['categories'].apply(lambda x: key in x)

previously_open_US_restaurants = add_restaurant_count_column(previously_open_US_restaurants)
    
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_places = restaurants['google_places']
start_time = time.time()

google_df = pd.DataFrame(list(google_places.find()))

google_df = google_df[['queried_name', 'yelp_business_id', 'results']]

google_df['closed_on_google'] = google_df['results'].apply(closed_on_google)

restaurants_with_google_data = previously_open_US_restaurants.merge(google_df, how='inner', left_on='business_id', right_on='yelp_business_id')

#removes rows without any matching data from Google
restaurants_with_google_data = restaurants_with_google_data[restaurants_with_google_data['results'].map(len) > 0]

zip_code_df = pd.read_csv('/Users/ElliottC/g/projects/yelp/predicting_restaurant_closure/data/zip_code_data.csv')

zip_code_df['Zip Code'] = zip_code_df['Zip Code'].apply(str)

restaurants_with_economic_data = restaurants_with_google_data.merge(zip_code_df, how='left', left_on='postal_code', right_on='Zip Code')

restaurants_with_economic_data.iloc[:,-19:] = restaurants_with_economic_data.iloc[:,-19:].fillna(0).copy()

percent_columns = ['Educational Attainment: Percent high school graduate or higher', 'Individuals below poverty level']
for col in percent_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(fix_percent)

num_columns = ['2016 ACS 5-Year Population Estimate',
 'American Indian and Alaska Native alone',
 'Asian alone',
 'Black or African American alone',
 'Census 2010 Total Population',
 'Foreign Born Population',
 'Hispanic or Latino (of any race)',
 'Median Age',
 'Median Household Income',
 'Native Hawaiian and Other Pacific Islander alone',
 'Some Other Race alone',
 'Total housing units',
 'Two or More Races',
 'Veterans',
 'White alone',
 'White alone, Not Hispanic or Latino']
    
for col in num_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(int)

restaurants_with_economic_data.to_csv('../data/featurized_dataframe.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.

In [5]:
reviews = []
with open('../data/review.json') as f:
    for line in f:
        reviews.append(json.loads(line))

reviews_df = pd.DataFrame(reviews)

five_star_reviews = reviews_df[reviews_df['stars'] == 5]
two_to_four_star_reviews = reviews_df[reviews_df['stars'].isin([2,3,4])]
one_star_reviews = reviews_df[reviews_df['stars'] == 1]

review_series = [five_star_reviews, two_to_four_star_reviews, one_star_reviews]

for i in range(len(review_series)):
    review_series[i] = review_series[i].groupby('business_id')['text'].apply(lambda x: "{%s}" % ':::'.join(x))
    review_series[i] = pd.DataFrame(review_series[i])
    review_series[i]['business_id'] = review_series[i].index

restaurants_with_stars = restaurants_with_economic_data.merge(review_series[0], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'five_star_review_text'}, axis='columns')
restaurants_with_stars = restaurants_with_stars.merge(review_series[1], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'two_to_four_star_review_text'}, axis='columns')
restaurants_with_stars = restaurants_with_stars.merge(review_series[2], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'one_star_review_text'}, axis='columns')

restaurants_with_stars[['five_star_review_text', 'two_to_four_star_review_text', 'one_star_review_text']] = restaurants_with_stars[['five_star_review_text', 'two_to_four_star_review_text', 'one_star_review_text']].fillna("Empty")

In [8]:
tfidf_five_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_five_star.fit_transform(restaurants_with_stars['five_star_review_text'])
five_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_five_star.get_feature_names())

tfidf_two_to_four_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_two_to_four_star.fit_transform(restaurants_with_stars['two_to_four_star_review_text'])
two_to_four_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_two_to_four_star.get_feature_names())

tfidf_one_star = TfidfVectorizer(stop_words='english', max_features=100)
feature_matrix = tfidf_one_star.fit_transform(restaurants_with_stars['one_star_review_text'])
one_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_one_star.get_feature_names())

In [39]:
closed_restaurants = restaurants_with_stars[restaurants_with_stars['closed_on_google'] == True]
open_restaurants = restaurants_with_stars[restaurants_with_stars['closed_on_google'] == False]

In [34]:
tfidf_five_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star.fit(closed_restaurants['five_star_review_text'])
feature_matrix = tfidf_five_star.transform(restaurants_with_stars['five_star_review_text'])
five_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_five_star.get_feature_names())

tfidf_two_to_four_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_two_to_four_star.fit(closed_restaurants['two_to_four_star_review_text'])
feature_matrix = tfidf_two_to_four_star.transform(restaurants_with_stars['two_to_four_star_review_text'])
two_to_four_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_two_to_four_star.get_feature_names())

tfidf_one_star = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_one_star.fit(closed_restaurants['one_star_review_text'])
feature_matrix = tfidf_one_star.transform(restaurants_with_stars['one_star_review_text'])
one_star_tfidf_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_one_star.get_feature_names())

In [53]:
tfidf_five_star_closed = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star_closed.fit(closed_restaurants['five_star_review_text'])
r_closed = tfidf_five_star_closed.transform(restaurants_with_stars['five_star_review_text'])
r_closed_df = pd.DataFrame(r_closed.toarray(), columns=tfidf_five_star_closed.get_feature_names())

In [54]:
tfidf_five_star_open = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_five_star_open.fit(open_restaurants['five_star_review_text'])
r_open = tfidf_five_star_open.transform(restaurants_with_stars['five_star_review_text'])
r_open_df = pd.DataFrame(r_open.toarray(), columns=tfidf_five_star_open.get_feature_names())

In [55]:
r_closed_df.head()

Unnamed: 0,amazing,awesome,best,came,cheese,chicken,chocolate,come,definitely,delicious,dinner,don,eat,excellent,experience,favorite,food,fresh,friendly,good,got,great,just,like,little,love,lunch,make,menu,new,nice,order,ordered,perfect,place,really,recommend,restaurant,right,salad,sauce,service,staff,super,sweet,time,try,ve,vegas,went
0,0.199602,0.174033,0.100815,0.14014,0.036385,0.254869,0.045659,0.103533,0.068707,0.092219,0.091163,0.029109,0.096313,0.045432,0.048815,0.10562,0.416197,0.0,0.127257,0.290894,0.199842,0.444953,0.062299,0.126019,0.02875,0.037744,0.033324,0.074086,0.126948,0.074219,0.052609,0.029795,0.062284,0.063962,0.257407,0.13022,0.056799,0.115025,0.030455,0.094945,0.03493,0.182737,0.067435,0.106202,0.068567,0.115663,0.109931,0.09346,0.0,0.030852
1,0.04197,0.045742,0.158987,0.0,0.0,0.0,0.792052,0.0,0.130022,0.041552,0.0,0.137714,0.0,0.095528,0.0,0.04759,0.0,0.089573,0.240824,0.183498,0.045022,0.136917,0.039298,0.238481,0.090679,0.079364,0.0,0.093467,0.0,0.093636,0.041482,0.0,0.0,0.050435,0.176493,0.0,0.0,0.0,0.048028,0.0,0.055086,0.115271,0.042538,0.095704,0.0,0.040534,0.086681,0.084222,0.0,0.048655
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334568,0.0,0.298005,0.0,0.0,0.0,0.0,0.0,0.0,0.553799,0.0,0.287862,0.0,0.0,0.245489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.506319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305082,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307118,0.0,0.0,0.261911,0.0,0.0,0.346924,0.607272,0.0,0.0,0.382965,0.358237,0.0,0.0,0.0,0.0,0.270094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
r_open_df.head()

Unnamed: 0,amazing,awesome,bar,best,came,cheese,chicken,come,definitely,delicious,dinner,don,eat,excellent,experience,favorite,food,fresh,friendly,good,got,great,just,like,little,love,lunch,make,menu,nice,order,ordered,people,perfect,pizza,place,really,recommend,restaurant,right,salad,sauce,service,staff,time,try,ve,vegas,wait,went
0,0.206302,0.16917,0.091026,0.099179,0.14049,0.035033,0.260038,0.099978,0.069407,0.095189,0.088822,0.028182,0.093361,0.045376,0.046893,0.100853,0.409024,0.0,0.119683,0.287873,0.200807,0.449178,0.062766,0.124221,0.029195,0.037294,0.032937,0.071969,0.12766,0.051751,0.028422,0.06178,0.087049,0.063721,0.095057,0.265197,0.130921,0.05843,0.112362,0.029666,0.095446,0.035561,0.177916,0.063749,0.111383,0.11472,0.0916,0.0,0.048047,0.03054
1,0.073488,0.075327,0.097276,0.264969,0.0,0.0,0.0,0.0,0.222517,0.07266,0.0,0.225873,0.0,0.161637,0.0,0.076983,0.0,0.151643,0.383698,0.307636,0.07664,0.234154,0.067075,0.398248,0.155996,0.132848,0.0,0.153819,0.0,0.06913,0.0,0.0,0.0,0.085119,0.0,0.308046,0.0,0.0,0.0,0.079257,0.0,0.095005,0.19013,0.068126,0.066127,0.153244,0.13984,0.0,0.0,0.081591
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325494,0.0,0.309903,0.0,0.0,0.0,0.0,0.0,0.0,0.548324,0.0,0.272754,0.0,0.0,0.249674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290565,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310882,0.0,0.0,0.284575,0.0,0.0,0.379175,0.645818,0.0,0.0,0.414502,0.0,0.0,0.0,0.0,0.0,0.0,0.299504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
