# Recommendation Engine for Restaurants based on Yelp Reviews
## Author: Gary Corcoran
## Date: Dec. 9th, 2017
# Reference
https://nick-morgan.github.io/Python-Recommendation-Engine-Yelp/

In [2]:
import pandas as pd
import numpy as np
import time
import json

def json_to_csv(directory, filenames, create_sample=False):
    """
    Converts specified json file to csv.
    
    Loops through specified JSON files and converts them to csv files.
    Option to also create a sample csv, which uses np.random.seed(9001)
    to create a sample dataset with 10% of the observations.
    
    @param  directory:     directory of json files
    @param  file_names:    list of json filenames
    @param  create_sample: create random sample dataset flag
    """
    start = time.time()
    json_data = []
    
    for filename in filenames:
        with open(directory + filename, encoding='utf8') as file:
            print('{} opened'.format(filename))
            for line in file:
                # use rstrip because some of the files having trailing
                # blank spaces
                json_data.append(json.loads(line.rstrip()))
        
        df = pd.DataFrame.from_dict(json_data)
        csv_filename = filename[:len(filename)-5] + '.csv'
        df.to_csv(directory + csv_filename)
        print('{} created'.format(csv_filename))
        
        if create_sample:
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]
            
            csv_sample_filename = filename[:len(filename)-5] + '_sample.csv'
            sample.to_csv(directory + csv_sample_filename)
            print('{} created'.format(csv_sample_filename))
        
    print('This function took {} minutes to run.'.format(
        (time.time()-start)/60)
         )
    

In [3]:
file_list = ['business.json', 'review.json', 'user.json']
json_to_csv('datasets/yelp/', file_list, create_sample=True)

user.json opened
user.csv created
user_sample.csv created
This function took 1.905851928393046 minutes to run.


In [4]:
# create a mask for restaurants
mask_restaurants = business['categories'].str.contains('Restaurants')

# create a mask for food
mask_food = business['categories'].str.contains('Food')

# apply both masks
restaurants_and_food = business[mask_restaurants & mask_food]

# number of businessses that have food and restaurant in their category
restaurants_and_food['categories'].count()

NameError: name 'business' is not defined