In [1]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from random import shuffle
import json
from pymongo import MongoClient
from collections import Counter
from geopy.geocoders import Nominatim
geolocator = Nominatim()
pd.set_option('display.max_columns', 200)
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def create_pandas_df_from_json(path):
    '''
    INPUT: filepath string
    OUTPUT: pandas database
    '''
    return pd.read_json(file_path, lines=True)

def is_food(item):
    '''
    INPUT: cell from pandas dataframe
    OUTPUT: boolean
    '''
    restaurants_and_related_categories = ['Restaurants', 'Italian','Food', 'Bars','Fast Food', 'Coffee & Tea', 'Sandwiches']
    if len(set(restaurants_and_related_categories) & set(item)) >= 1:
        return True
    else:
        return False
    
def flatten_dict(row):
    out = {}
    for key, value in row.items():
        if type(value) != dict:
            out[key] = value
        else:
            sub_key = key
            for k, v in value.items():
                out[sub_key + "|" + k] = v
    return out

def make_exists_function(key):
    def get_key_if_exists(row):
        if key in row:
            return row[key]
        else:
            return "N/A"
    return get_key_if_exists

def add_restaurant_count_column(dataframe):
    restaurant_frequency = dataframe.groupby(['name']).count().sort_values('address', ascending=False)

    restaurant_frequency = pd.DataFrame(restaurant_frequency['address'])

    restaurant_frequency.columns = ['restaurant_count']

    restaurant_frequency['name'] = restaurant_frequency.index

    restaurant_frequency = restaurant_frequency[['name', 'restaurant_count']]

    return previously_open_US_restaurants.merge(restaurant_frequency, how='left', left_on='name', right_on='name')

def closed_on_google(row):
    try:
        return row[0]['permanently_closed']
    except:
        return False
    
def fix_percent(row):
    row = str(row).strip('%')
    row = float(row)
    return row/100

def summaries_from_google(dataframe, key, default_val=0):
    summaries = []
    key_errors = 0
    for i in range(len(dataframe)):
        total = 0
        count = 0
        for j in range(len(dataframe['results'][i])):
            try:
                total += dataframe['results'][i][j][key]
                count += 1
            except KeyError:
                key_errors += 1
        try:
            summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_'+key: (total / count)})
        except ZeroDivisionError:
            summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_'+key: default_val})
    return pd.DataFrame(summaries)

def get_price(row):
    try:
        return row['RestaurantsPriceRange2']
    except KeyError:
        return 1.5
    
def concat_unique_columns(df1, df2, suffix):
    cols = list(set(list(df1.columns) + list(df2.columns)))
    df_dict = {'df1':[], 'df2':[]}
    for col in cols:
        if col in df1.columns:
            df_dict['df1'].append(col)
        else:
            df_dict['df2'].append(col)
    combined_df = pd.concat([df1[df_dict['df1']],df2[df_dict['df2']]],axis=1)
    combined_df.columns = [suffix + str(col) for col in combined_df.columns]
    return combined_df

In [3]:
file_path = 'https://s3-us-west-2.amazonaws.com/businesspredictiondata/business.json'
yelp_business_data = create_pandas_df_from_json(file_path)

In [6]:
previously_open_US_restaurants

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,is_food,in_US,flat_attributes,Attribute|BusinessAcceptsCreditCards value:,Attribute|RestaurantsPriceRange2 value:,Attribute|RestaurantsTakeOut value:,Attribute|BusinessParking|garage value:,Attribute|BusinessParking|street value:,Attribute|BusinessParking|validated value:,Attribute|BusinessParking|lot value:,Attribute|BusinessParking|valet value:,Attribute|BikeParking value:,Attribute|OutdoorSeating value:,Attribute|RestaurantsGoodForGroups value:,Attribute|RestaurantsDelivery value:,Attribute|RestaurantsReservations value:,Attribute|GoodForKids value:,Attribute|HasTV value:,Attribute|WiFi value:,Attribute|Ambience|romantic value:,Attribute|Ambience|intimate value:,Attribute|Ambience|classy value:,Attribute|Ambience|hipster value:,Attribute|Ambience|divey value:,Attribute|Ambience|touristy value:,Attribute|Ambience|trendy value:,Attribute|Ambience|upscale value:,Attribute|Ambience|casual value:,Attribute|Alcohol value:,Attribute|RestaurantsAttire value:,Attribute|GoodForMeal|dessert value:,Attribute|GoodForMeal|latenight value:,Attribute|GoodForMeal|lunch value:,Attribute|GoodForMeal|dinner value:,Attribute|GoodForMeal|breakfast value:,Attribute|GoodForMeal|brunch value:,Attribute|Caters value:,Attribute|NoiseLevel value:,Attribute|RestaurantsTableService value:,Attribute|WheelchairAccessible value:,Attribute|HappyHour value:,Attribute|GoodForDancing value:,Attribute|DriveThru value:,Attribute|Music|dj value:,Attribute|Music|background_music value:,Attribute|Music|no_music value:,Attribute|Music|karaoke value:,Attribute|Music|live value:,Attribute|Music|video value:,Attribute|Music|jukebox value:,Attribute|CoatCheck value:,Attribute|DogsAllowed value:,Attribute|Smoking value:,Category|Restaurants_true,Category|Food_true,Category|Nightlife_true,Category|Bars_true,Category|Fast Food_true,Category|American (Traditional)_true,Category|Sandwiches_true,Category|Pizza_true,Category|Mexican_true,Category|Burgers_true,Category|American (New)_true,Category|Breakfast & Brunch_true,Category|Coffee & Tea_true,Category|Grocery_true,Category|Italian_true,Category|Specialty Food_true,Category|Shopping_true,Category|Chinese_true,Category|Event Planning & Services_true,Category|Salad_true,Category|Chicken Wings_true,Category|Bakeries_true,Category|Desserts_true,Category|Convenience Stores_true,Category|Ice Cream & Frozen Yogurt_true,Category|Sports Bars_true,Category|Seafood_true,Category|Beer_true,Category|Wine & Spirits_true,Category|Caterers_true,Category|Delis_true,Category|Cafes_true,Category|Drugstores_true,Category|Japanese_true,Category|Juice Bars & Smoothies_true,Category|Arts & Entertainment_true,Category|Pubs_true,Category|Steakhouses_true,Category|Sushi Bars_true,Category|Asian Fusion_true,Category|Barbeque_true,Category|Diners_true,Category|Lounges_true,Category|Automotive_true,Category|Gas Stations_true,Category|Cocktail Bars_true,Category|Mediterranean_true,Category|Wine Bars_true,Category|Food Trucks_true,Category|Tex-Mex_true,restaurant_count,restaurant_count > 1,restaurant_count > 5,restaurant_count > 25
0,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.475690,Brick House Tavern + Tap,,44221,116,3.5,OH,True,True,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",True,2,True,False,False,False,True,False,True,True,True,False,False,True,True,free,False,False,False,False,False,False,False,False,True,full_bar,casual,False,True,False,True,False,False,False,average,True,,True,False,False,False,True,False,False,False,False,False,False,,outdoor,True,False,True,True,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,False,False
1,2612 Brandt School Rd,"{'BusinessParking': {'garage': False, 'street'...",EsMcGiZaQuG1OOvL9iUFug,"[Coffee & Tea, Ice Cream & Frozen Yogurt, Food]",Wexford,{},1,40.615102,-80.091349,Any Given Sundae,,15090,15,5.0,PA,True,True,"{'BusinessParking|garage': False, 'BusinessPar...",True,1,True,False,False,False,True,False,False,True,,,,,,free,,,,,,,,,,,,,,,,,,False,,,,,,,,,,,,,,,,,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,False,False
2,600 E 4th St,"{'GoodForMeal': {'dessert': False, 'latenight'...",fNMVV_ZX7CJSDWQGdOM8Nw,"[Restaurants, American (Traditional)]",Charlotte,"{'Friday': '7:00-15:00', 'Tuesday': '7:00-15:0...",1,35.221647,-80.839345,Showmars Government Center,Uptown,28202,7,3.5,NC,True,True,"{'GoodForMeal|dessert': False, 'GoodForMeal|la...",True,1,True,,,,,,True,True,True,False,False,True,False,free,False,False,False,False,False,False,False,False,False,,casual,False,False,False,False,False,False,,,False,,,,,,,,,,,,,,,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,False,False
3,38295 Chestnut Ridge Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...",Dj0S-Oe4ytRJzMGUPgYUkw,"[Soup, Salad, Sandwiches, Restaurants]",Elyria,"{'Monday': '6:30-21:00', 'Tuesday': '6:30-21:0...",1,41.343078,-82.067140,Panera Bread,,44035,4,2.0,OH,True,True,"{'GoodForMeal|dessert': False, 'GoodForMeal|la...",True,,True,,,,,,,,,False,False,True,,,,,,,,,,,,,,False,False,False,False,False,False,,,,,,,,,,,,,,,,,,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,134,True,True,True
4,96 S Main St,"{'BusinessParking': {'garage': False, 'street'...",8y56fOiKhtCnqaiYB2S2Qg,"[Nightlife, Pubs, Bars]",Munroe Falls,{},1,41.136622,-81.439259,Brewster's Pub,,44308,4,4.0,OH,True,True,"{'BusinessParking|garage': False, 'BusinessPar...",True,1,,False,False,False,False,False,True,True,True,,False,,True,,False,False,False,False,False,False,False,False,False,full_bar,,,,,,,,,average,,,True,False,,False,False,False,False,False,False,True,False,,outdoor,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,False,False
5,13603 Madison Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",tRVx2c89coruPRwYhGTcTw,"[Nightlife, Izakaya, Comfort Food, Cocktail Ba...",Lakewood,"{'Monday': '12:00-2:00', 'Tuesday': '12:00-2:0...",1,41.476846,-81.786971,Yuzu,,44107,78,3.5,OH,True,True,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",True,2,True,False,True,False,True,False,True,True,True,True,False,False,True,free,False,False,False,True,False,False,False,False,True,full_bar,casual,False,True,False,True,False,False,True,average,True,True,True,False,,False,False,False,False,False,False,True,False,True,no,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,1,False,False,False
6,"610 Jetton St, Ste 130","{'BusinessParking': {'garage': False, 'street'...",zjySmTfL9WiMDVgp8-Jp3w,"[Desserts, Ice Cream & Frozen Yogurt, Food]",Davidson,{},1,35.501227,-80.860949,TCBY,,28036,3,3.5,NC,True,True,"{'BusinessParking|garage': False, 'BusinessPar...",True,2,True,False,False,False,False,False,True,,,,,,,free,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,13,True,True,False
7,719 E Thunderbird Rd,"{'RestaurantsTableService': False, 'GoodForMea...",rDMptJYWtnMhpQu_rRXHng,"[Fast Food, Burgers, Restaurants]",Phoenix,{},1,33.607070,-112.064382,McDonald's,,85022,10,1.0,AZ,True,True,"{'RestaurantsTableService': False, 'GoodForMea...",True,1,True,False,False,False,True,False,True,False,True,False,False,True,True,free,False,False,False,False,False,False,False,False,False,none,casual,False,False,True,False,True,False,False,loud,False,,,,True,,,,,,,,,,,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,473,True,True,True
8,"777 E Thunderbird Rd, Ste 107","{'RestaurantsTableService': True, 'GoodForMeal...",1WBkAuQg81kokZIPMpn9Zg,"[Burgers, Restaurants]",Phoenix,"{'Monday': '11:00-22:00', 'Tuesday': '11:00-22...",1,33.607310,-112.063404,Charr An American Burger Bar,,85022,232,3.0,AZ,True,True,"{'RestaurantsTableService': True, 'GoodForMeal...",True,2,True,False,False,False,True,False,True,True,True,True,False,True,True,free,False,False,False,False,False,False,False,False,True,full_bar,casual,False,False,False,True,False,False,True,average,True,,,,False,,,,,,,,,,,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,False,False
9,6730 S Las Vegas Blvd,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",Pd52CjgyEU3Rb8co6QfTPw,"[Nightlife, Bars, Barbeque, Sports Bars, Ameri...",Las Vegas,"{'Monday': '8:30-22:30', 'Tuesday': '8:30-22:3...",1,36.066914,-115.170848,Flight Deck Bar & Grill,Southeast,89119,13,4.0,NV,True,True,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",True,2,True,False,False,False,True,False,True,False,True,False,False,True,True,free,False,False,False,False,False,False,False,False,True,full_bar,casual,False,False,True,False,False,False,True,average,True,True,True,False,,False,True,False,False,False,False,False,False,,no,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,1,False,False,False


In [4]:
#filters businesses that were open when this dataset was published Jan. 2018
open_businesses = yelp_business_data[yelp_business_data['is_open'] == 1].drop_duplicates(['name','address'])

#creates column that says if business is restaurant and creates df of just open restaurants
open_businesses['is_food'] = open_businesses['categories'].apply(is_food)
open_restaurants = open_businesses[open_businesses['is_food'] == True]

#creates column that says if business is in USA and creates df of just
#restaurants open in the US as of January 2018
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
      "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
      "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
      "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
      "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
open_restaurants['in_US'] = open_restaurants['state'].isin(states)
previously_open_US_restaurants = open_restaurants[(open_restaurants['in_US'] == True) & (open_restaurants['longitude'] < -20)]

#creates dummy columns for 0
previously_open_US_restaurants['flat_attributes'] = previously_open_US_restaurants['attributes'].apply(flatten_dict)
all_attributes = []
for row in previously_open_US_restaurants['flat_attributes']:
    all_attributes.extend(row.keys())
unique_attributes = list(dict(Counter(all_attributes).most_common(50)).keys())

for key in unique_attributes:
    f = make_exists_function(key)
    previously_open_US_restaurants['Attribute|' +key + ' value:'] = previously_open_US_restaurants['flat_attributes'].apply(f)
    
all_categories = []
[all_categories.extend(item) for item in list(previously_open_US_restaurants['categories'])]

most_common_categories = list(dict(Counter(all_categories).most_common(50)).keys())

for key in most_common_categories:
    previously_open_US_restaurants[f"Category|{key}_true"] = previously_open_US_restaurants['categories'].apply(lambda x: key in x)

previously_open_US_restaurants = add_restaurant_count_column(previously_open_US_restaurants)

previously_open_US_restaurants['restaurant_count > 1'] = previously_open_US_restaurants['restaurant_count'] > 1
previously_open_US_restaurants['restaurant_count > 5'] = previously_open_US_restaurants['restaurant_count'] > 5
previously_open_US_restaurants['restaurant_count > 25'] = previously_open_US_restaurants['restaurant_count'] > 25

client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_places = restaurants['google_places']
start_time = time.time()

google_df = pd.DataFrame(list(google_places.find()))

google_df = google_df[['queried_name', 'yelp_business_id', 'results']]

google_df['closed_on_google'] = google_df['results'].apply(closed_on_google)

restaurants_with_google_data = previously_open_US_restaurants.merge(google_df, how='inner', left_on='business_id', right_on='yelp_business_id')

#removes rows without any matching data from Google
restaurants_with_google_data = restaurants_with_google_data[restaurants_with_google_data['results'].map(len) > 0]

zip_code_df = pd.read_csv('../data/zip_code_data.csv')

zip_code_df['Zip Code'] = zip_code_df['Zip Code'].apply(str)

restaurants_with_economic_data = restaurants_with_google_data.merge(zip_code_df, how='left', left_on='postal_code', right_on='Zip Code')

restaurants_with_economic_data.iloc[:,-19:] = restaurants_with_economic_data.iloc[:,-19:].fillna(0).copy()

percent_columns = ['Educational Attainment: Percent high school graduate or higher', 'Individuals below poverty level']
for col in percent_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(fix_percent)

num_columns = ['2016 ACS 5-Year Population Estimate',
 'American Indian and Alaska Native alone',
 'Asian alone',
 'Black or African American alone',
 'Census 2010 Total Population',
 'Foreign Born Population',
 'Hispanic or Latino (of any race)',
 'Median Age',
 'Median Household Income',
 'Native Hawaiian and Other Pacific Islander alone',
 'Some Other Race alone',
 'Total housing units',
 'Two or More Races',
 'Veterans',
 'White alone']
    
for col in num_columns:
    restaurants_with_economic_data[col] = restaurants_with_economic_data[col].apply(int)

#adds nearby data using google maps api data: among nearby restaurants: 1) count 2) avg_rating 3) avg_price
maps_nearby = restaurants['maps_nearby']
nearby_df = pd.DataFrame(list(maps_nearby.find()))
nearby_df['num_nearby_restaurants'] = nearby_df['results'].apply(lambda x: len(x))

nearby_prices = summaries_from_google(nearby_df, 'price_level', 1.5)
nearby_ratings = summaries_from_google(nearby_df, 'rating', 3.5)
nearby_prices_and_rating = nearby_prices.merge(nearby_ratings, how='outer', on='business_id')
nearby_prices_rating_num = nearby_prices_and_rating.merge(nearby_df, how='outer', left_on='business_id', right_on='yelp_business_id')
trimmed_nearby_data = nearby_prices_rating_num[['business_id','avg_price_level','avg_rating','num_nearby_restaurants']]

restaurants_with_nearby_data = restaurants_with_economic_data.merge(trimmed_nearby_data, how='left', on='business_id')

restaurants_with_nearby_data['relative rating'] = restaurants_with_nearby_data['stars'] - restaurants_with_nearby_data['avg_rating']

restaurants_with_nearby_data['price_level'] = restaurants_with_nearby_data['attributes'].apply(get_price)

restaurants_with_nearby_data['relative_price'] = restaurants_with_nearby_data['price_level'] - restaurants_with_nearby_data['avg_price_level']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#i

In [30]:
reviews = []
with open('../data/review.json') as f:
    for line in f:
        reviews.append(json.loads(line))

reviews_df = pd.DataFrame(reviews)

five_star_reviews = reviews_df[reviews_df['stars'] == 5]
two_to_four_star_reviews = reviews_df[reviews_df['stars'].isin([2,3,4])]
one_star_reviews = reviews_df[reviews_df['stars'] == 1]

review_series = [five_star_reviews, two_to_four_star_reviews, one_star_reviews]

for i in range(len(review_series)):
    review_series[i] = review_series[i].groupby('business_id')['text'].apply(lambda x: "{%s}" % ':::'.join(x))
    review_series[i] = pd.DataFrame(review_series[i])
    review_series[i]['business_id'] = review_series[i].index

restaurants_with_stars = restaurants_with_nearby_data.merge(review_series[0], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'five_star_review_text'}, axis='columns')
restaurants_with_stars = restaurants_with_stars.merge(review_series[1], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'two_to_four_star_review_text'}, axis='columns')
restaurants_with_stars = restaurants_with_stars.merge(review_series[2], how='left', on='business_id')
restaurants_with_stars = restaurants_with_stars.rename({'text': 'one_star_review_text'}, axis='columns')

restaurants_with_stars[['five_star_review_text', 'two_to_four_star_review_text', 'one_star_review_text']] = restaurants_with_stars[['five_star_review_text', 'two_to_four_star_review_text', 'one_star_review_text']].fillna("Empty")

closed_restaurants = restaurants_with_stars[restaurants_with_stars['closed_on_google'] == True]
open_restaurants = restaurants_with_stars[restaurants_with_stars['closed_on_google'] == False]

tfidf_five_closed = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_five_closed.fit(closed_restaurants['five_star_review_text'])
feature_matrix = tfidf_five_closed.transform(restaurants_with_stars['one_star_review_text'])
tfidf_five_closed_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_five_closed.get_feature_names())

tfidf_five_open = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_five_open.fit(open_restaurants['five_star_review_text'])
feature_matrix = tfidf_five_open.transform(restaurants_with_stars['five_star_review_text'])
tfidf_five_open_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_five_open.get_feature_names())

tfidf_two_to_four_closed = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_two_to_four_closed.fit(closed_restaurants['two_to_four_star_review_text'])
feature_matrix = tfidf_two_to_four_closed.transform(restaurants_with_stars['one_star_review_text'])
tfidf_two_to_four_closed_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_two_to_four_closed.get_feature_names())

tfidf_two_to_four_open = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_two_to_four_open.fit(open_restaurants['two_to_four_star_review_text'])
feature_matrix = tfidf_two_to_four_open.transform(restaurants_with_stars['two_to_four_star_review_text'])
tfidf_two_to_four_open_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_two_to_four_open.get_feature_names())

tfidf_one_closed = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_one_closed.fit(closed_restaurants['one_star_review_text'])
feature_matrix = tfidf_one_closed.transform(restaurants_with_stars['one_star_review_text'])
tfidf_one_closed_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_one_closed.get_feature_names())

tfidf_one_open = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_one_open.fit(open_restaurants['one_star_review_text'])
feature_matrix = tfidf_one_open.transform(restaurants_with_stars['one_star_review_text'])
tfidf_one_open_df = pd.DataFrame(feature_matrix.toarray(), columns=tfidf_one_open.get_feature_names())

unique_one_star_df = concat_unique_columns(tfidf_one_closed_df, tfidf_one_open_df, 'one_star: ')
unique_two_to_four_star_df = concat_unique_columns(tfidf_two_to_four_closed_df, tfidf_two_to_four_open_df, '2-4_star: ')
unique_five_star_df = concat_unique_columns(tfidf_five_closed_df, tfidf_five_open_df, 'five-star: ')

all_tfidf_reviews_df = pd.concat([unique_one_star_df, unique_two_to_four_star_df, unique_five_star_df], axis=1)

restaurants_with_reviews = pd.concat([restaurants_with_stars,all_tfidf_reviews_df],axis=1)

In [31]:
restaurants_with_reviews = restaurants_with_reviews.drop_duplicates(['name','address'])

In [32]:
restaurants_with_reviews.to_csv('../data/featurized_dataframe.csv')