# Setup

In [1]:
import pandas as pd
import numpy as np
import os

# To serialise models
from sklearn.externals import joblib

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

import seaborn as sns
sns.set()

# to make this notebook's output stable across runs
np.random.seed(42)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as IMAGES_PATH/<fig_id>.<fig_extension>.

    The images folder is created when it does not exist yet.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file extension and as the ``format`` given to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    os.makedirs(IMAGES_PATH, exist_ok=True)
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)


PROCESSED_PATH = os.path.join(PROJECT_ROOT_DIR, "processed")
def save_processed(df, filename, extension="csv"):
    """Write ``df`` without its index to PROCESSED_PATH/<filename>.<extension>.

    The folder is created when it does not exist yet. ``extension`` only
    affects the file name; the content is always produced by ``df.to_csv``.
    """
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    target = os.path.join(PROCESSED_PATH, filename + "." + extension)
    print("Saving processed dataset", filename)
    df.to_csv(target, index=False)
    
SUBMISSIONS_PATH = os.path.join(PROJECT_ROOT_DIR, "submissions")
def save_submission(df, filename, extension="csv"):
    """Write ``df`` without its index to SUBMISSIONS_PATH/<filename>.<extension>.

    The folder is created when it does not exist yet. ``extension`` only
    affects the file name; the content is always produced by ``df.to_csv``.
    """
    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    target = os.path.join(SUBMISSIONS_PATH, filename + "." + extension)
    print("Saving submission", filename)
    df.to_csv(target, index=False)

MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")
def save_model(model, filename):
    """Serialise ``model`` with joblib to MODELS_PATH/<filename>.pkl.

    The models folder is created when it does not exist yet.
    """
    os.makedirs(MODELS_PATH, exist_ok=True)
    target = os.path.join(MODELS_PATH, filename + "." + "pkl")
    print("Saving model", filename)
    joblib.dump(model, target)

# Ignore useless warnings
# NOTE: no category or module filter is given, so from this point on every
# warning raised through the `warnings` module is silenced, not only noisy ones.
import warnings
warnings.filterwarnings(action="ignore")

# Download Zomato Data

In [2]:
def load_data(base_path, file, sub_dir=None, ext="csv", encoding=None):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    base_path : str
        Folder that contains the file (or that contains ``sub_dir``).
    file : str
        File name without extension.
    sub_dir : str, optional
        Sub-folder of ``base_path`` that holds the file.
    ext : str
        File extension without the leading dot; only used to build the name,
        the file is always parsed with ``pd.read_csv``.
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. ``None`` keeps the pandas
        default.

    Returns
    -------
    pandas.DataFrame
    """
    filename = file + "." + ext
    if sub_dir is not None:
        csv_path = os.path.join(base_path, sub_dir, filename)
    else:
        csv_path = os.path.join(base_path, filename)
    # `encoding` used to be accepted but silently dropped; forward it to pandas.
    return pd.read_csv(csv_path, encoding=encoding)

In [3]:
# Load the city lookup table from Raw/zomato_city_ids.csv and preview its first rows.
# Later cells read its "City" and "Zomato ID" columns.
city_ids = load_data("Raw/", "zomato_city_ids")
city_ids.head()

Unnamed: 0,Country,City,Zomato ID
0,USA,New York City,280
1,USA,San Francisco,306
2,USA,Washington DC,283
3,USA,Chicago,292
4,USA,Los Angeles,281


In [4]:
# Dimensions of the lookup table as (rows, columns).
city_ids.shape

(29, 3)

In [5]:
# Zomato API settings shared by the download cells below.
#
# The API key is read from the ZOMATO_API_KEY environment variable so that the
# credential is never stored in (or committed with) the notebook. The key that
# used to be hard-coded here has been exposed and should be revoked or rotated.
API_KEY = os.environ.get("ZOMATO_API_KEY", "")
if not API_KEY:
    # Warnings are filtered out earlier in this notebook, so use print to stay visible.
    print("WARNING: ZOMATO_API_KEY is not set; Zomato API requests will be sent without a key.")

# Values sent as entity_type / sort / order with every search request.
ENTITY_TYPE = "city"
SORT_BY = "rating"
SORT_ORDER = "desc"

# Folders for the downloaded JSON files and for the CSV files built from them.
RAW_FOLDER = "Raw/"
PROCESSED_FOLDER = "Processed/"

In [6]:
import requests
import json

# Base URL shared by the Zomato v2.1 endpoints used by the three functions below.
_ZOMATO_API_ROOT = "https://developers.zomato.com/api/v2.1/"
# Seconds to wait for the API before giving up instead of blocking the kernel forever.
_REQUEST_TIMEOUT = 30


def _fetch_and_store(endpoint, file_name, params=None):
    """GET one API endpoint, save the decoded JSON under RAW_FOLDER and return it.

    Raises the HTTP error reported by ``raise_for_status`` for non-success
    responses, so that an error payload is neither written to disk nor
    mistaken for data by the caller.
    """
    headers = {"user-key": API_KEY}
    response = requests.get(_ZOMATO_API_ROOT + endpoint, headers=headers,
                            params=params, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    json_data = response.json()

    os.makedirs(RAW_FOLDER, exist_ok=True)
    file_path = os.path.join(RAW_FOLDER, file_name)
    with open(file_path, 'w') as outfile:
        json.dump(json_data, outfile)
    return json_data


def get_categories():
    """Download all Zomato categories and return their ids.

    The request takes no city parameter. The raw response is written to
    RAW_FOLDER/categories.json.

    Returns
    -------
    list
        The ``id`` of every entry in the response's ``categories`` list.
    """
    json_data = _fetch_and_store("categories", "categories.json")
    # return all the category id's for restaurant search function.
    return [category["categories"]["id"] for category in json_data["categories"]]


def get_establishment_type_for_city(city_id):
    """Download the establishment types of ``city_id`` and return their ids.

    The raw response is written to RAW_FOLDER/establishment_types_<city_id>.json.

    Returns
    -------
    list
        The ``id`` of every entry in the response's ``establishments`` list.
    """
    file_name = "establishment_types_" + str(city_id) + "." + "json"
    json_data = _fetch_and_store("establishments", file_name, params={"city_id": city_id})
    # return all the establishment type id's for restaurant search function.
    return [establishment["establishment"]["id"] for establishment in json_data["establishments"]]


def get_cuisine_type_for_city(city_id):
    """Download the cuisine types of ``city_id`` and return their ids.

    The raw response is written to RAW_FOLDER/cuisine_types_<city_id>.json.

    Returns
    -------
    list
        The ``cuisine_id`` of every entry in the response's ``cuisines`` list.
    """
    file_name = "cuisine_types_" + str(city_id) + "." + "json"
    json_data = _fetch_and_store("cuisines", file_name, params={"city_id": city_id})
    # return all the cuisine type id's for restaurant search function.
    return [cuisine["cuisine"]["cuisine_id"] for cuisine in json_data["cuisines"]]

In [7]:
def download_with_criteria(headers, city_id, file_path, iterable_list, iterable_name, is_item_list=False):
    """Page through the search endpoint and write every result page to ``file_path``.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with every request.
    city_id
        Sent as ``entity_id`` (together with ``entity_type`` = ENTITY_TYPE).
    file_path : str
        Destination file. It receives a JSON array with one element per
        non-empty result page.
    iterable_list : iterable
        Values for the search parameter named by ``iterable_name``.
    iterable_name : str
        Name of the search parameter, e.g. "establishment_type".
    is_item_list : bool
        When false, one paginated search is run per value of ``iterable_list``.
        When true, the whole ``iterable_list`` is sent as the parameter value of
        a single paginated search. That mode was introduced because searching
        once per cuisine exceeded the allowed number of API calls for a city.

    Pagination of a search stops at the first page that reports no results,
    or at the first response with an HTTP error status. The error case is
    reported on stdout and the pages collected so far are still written.
    """
    page_size = 20
    request_timeout = 30  # seconds; prevents a stalled connection from hanging the kernel

    # Both modes share one loop: either each value is searched on its own, or
    # the complete list is the single "value" to search with.
    search_values = [iterable_list] if is_item_list else iterable_list

    pages = []
    for value in search_values:
        start = 0
        while True:
            params = {"entity_id": city_id, "entity_type": ENTITY_TYPE, "start": start,
                      "count": page_size, iterable_name: value, "sort": SORT_BY,
                      "order": SORT_ORDER}
            response = requests.get("https://developers.zomato.com/api/v2.1/search",
                                    headers=headers, params=params, timeout=request_timeout)

            if not response.ok:
                # Keep the best-effort behaviour, but make truncated downloads visible.
                print("Search request failed with HTTP status", response.status_code,
                      "for", iterable_name, "=", value, "at start", start)
                break

            json_data = response.json()
            if int(json_data.get("results_shown", 0)) == 0:
                break

            # Each page is kept as its own JSON document; they are joined into
            # one JSON array below so that json.load() can read the file back.
            pages.append(json.dumps(json_data))
            start += page_size

    with open(file_path, 'w') as outfile:
        outfile.write("[" + ",".join(pages) + "]")

In [8]:
def get_restaurants(city_id, categories, establishment_types, cuisine_types):
    """Download the restaurants of one city into RAW_FOLDER, one file per search criterion.

    Only the establishment-type download is active at the moment, so
    ``categories`` and ``cuisine_types`` are accepted but not used. Each
    download is written to RAW_FOLDER/restaurants_by_<criterion>_<city_id>.json
    by ``download_with_criteria``.
    """
    headers = {"user-key": API_KEY}

    os.makedirs(RAW_FOLDER, exist_ok=True)

    # City name for the progress messages; raises IndexError for an id that is
    # not present in the lookup table.
    is_requested_city = city_ids["Zomato ID"] == city_id
    city_name = city_ids.loc[is_requested_city, "City"].values[0]

    # One row per download:
    # (label, file-name part, search parameter, values, send values as one list).
    # The category and cuisine rows are disabled; uncomment a row to enable it.
    downloads = [
        # ("Category", "category", "category", categories, False),
        ("Establishment Type", "establishment_type", "establishment_type", establishment_types, False),
        # ("Cuisine Type", "cuisine_type", "cuisines", cuisine_types, True),
    ]

    for label, name_part, search_param, values, send_as_list in downloads:
        print("Downloading restaurants for", city_name, "by", label)
        file_name = "restaurants_by_" + name_part + "_" + str(city_id) + "." + "json"
        target = os.path.join(RAW_FOLDER, file_name)
        download_with_criteria(headers, city_id, target, values, search_param, is_item_list=send_as_list)
        print("Done")
    

In [9]:
# Fetch the category ids; get_categories also writes categories.json into RAW_FOLDER.
# NOTE(review): the download loop below passes None for the categories argument,
# so this list is not consumed by any visible cell.
categories = get_categories()

In [11]:
# For every city id in `cities`: fetch its establishment types, then download
# the restaurants for each of those types.
# NOTE(review): `cities` is not defined in any cell visible in this notebook (the
# execution counter also jumps from In [9] to In [11]). Define it explicitly
# before this loop so the notebook survives Restart & Run All.
for city_id in cities:
    establishments = get_establishment_type_for_city(city_id)
    #cuisines = get_cuisine_type_for_city(city_id)
    get_restaurants(city_id, None, establishments, None)

Downloading restaurants for Cape Town by Establishment Type
Done
Downloading restaurants for Rio De Janeiro by Establishment Type
Done
Downloading restaurants for Sao Paulo by Establishment Type
Done
Downloading restaurants for Santiago by Establishment Type
Done
Downloading restaurants for Dubai by Establishment Type
Done
Downloading restaurants for Doha by Establishment Type
Done


In [12]:
# For each city, we need to combine the restaurant data from the categories-based, establishment type-based and cuisine-based json files. 

def build_csv(city_id, source_type):
    """Flatten one downloaded search file into a DataFrame of restaurants.

    Reads RAW_FOLDER/restaurants_by_<source_type>_<city_id>.json, which is
    expected to be a JSON array of result pages, each with a ``restaurants``
    list whose entries wrap the record under the ``restaurant`` key.

    Parameters
    ----------
    city_id
        Id used in the file name.
    source_type : str
        Search criterion used in the file name, e.g. "establishment_type".

    Returns
    -------
    pandas.DataFrame
        One row per restaurant entry, restricted to the columns listed in
        ``restaurant_cols``. Fields nested under ``location`` and
        ``user_rating`` are lifted to top-level columns.

    Raises
    ------
    KeyError
        If a restaurant lacks ``location`` / ``user_rating`` or one of the
        nested fields read below.
    """
    file_name = "restaurants_by_" + source_type + "_" + str(city_id) + ".json"
    source_path = os.path.join(RAW_FOLDER, file_name)

    # Context manager so the file handle is closed as soon as parsing is done.
    with open(source_path) as source_file:
        pages = json.load(source_file)

    restaurant_cols = ["name",
                       "cuisines",
                       "aggregate_rating",
                       "rating_text",
                       "votes",
                       "currency",
                       "average_cost_for_two",
                       "price_range",
                       "locality",
                       "locality_verbose",
                       "city",
                       "zipcode",
                       "country_id",
                       "latitude",
                       "longitude",
                       "has_online_delivery",
                       "has_table_booking",
                       ]

    location_fields = ["locality", "locality_verbose", "city", "latitude",
                       "longitude", "zipcode", "country_id"]
    rating_fields = ["aggregate_rating", "votes", "rating_text"]

    records = []
    for page in pages:
        for entry in page["restaurants"]:
            restaurant = entry["restaurant"]
            location = restaurant["location"]
            user_rating = restaurant["user_rating"]
            # Lift the nested fields next to the top-level ones.
            for field in location_fields:
                restaurant[field] = location[field]
            for field in rating_fields:
                restaurant[field] = user_rating[field]
            records.append(restaurant)

    return pd.DataFrame(records, columns=restaurant_cols)
    

def process_restaurants_json(city_id):
    """Build PROCESSED_FOLDER/restaurants_<city_id>.csv from the city's raw downloads.

    Each enabled source is converted with ``build_csv``; the resulting frames
    are concatenated and written without the index, encoded as utf-8-sig.
    """
    # Only the establishment-type download is enabled for now; "category" and
    # "cuisine_type" are the other source names used by the raw file naming.
    enabled_sources = ("establishment_type",)
    frames = [build_csv(city_id, source_name) for source_name in enabled_sources]

    os.makedirs(PROCESSED_FOLDER, exist_ok=True)
    output_path = os.path.join(PROCESSED_FOLDER, "restaurants_" + str(city_id) + ".csv")

    combined = pd.concat(frames)
    combined.to_csv(output_path, encoding='utf-8-sig', index=False)
    

In [13]:
# Reload the city lookup table (same call as in the earlier cell) before processing.
city_ids = load_data("Raw/", "zomato_city_ids")

In [None]:
# Build the processed CSV for every Zomato ID in the lookup table.
# NOTE(review): build_csv opens Raw/restaurants_by_establishment_type_<id>.json for
# each id, so this presumably needs that file to exist for every listed city — confirm.
for city in city_ids["Zomato ID"]:
    process_restaurants_json(city)