In [1]:
import pymongo
from pymongo import MongoClient
import pandas as pd
import json
import datetime
from datetime import datetime
from pandas import read_excel

In [2]:
########## Saving json file from link ##########

from urllib.request import urlopen

url = "https://raw.githubusercontent.com/Papagoat/brain-assessment/main/restaurant_data.json"
# Download the raw payload; the context manager closes the HTTP response.
with urlopen(url) as source:
    out = source.read()

# The payload is a sequence of result objects, each holding a "restaurants" list.
data = json.loads(out)

# Flatten every result *before* opening the output file, so that a malformed
# entry raises here instead of leaving a truncated restaurant_data.json on disk.
merged = []
for result in data:
    # combine all results shown
    merged.extend(result["restaurants"])

with open("restaurant_data.json", "w") as file:
    json.dump(merged, file)

In [3]:
# Connect to the MongoDB server on localhost:27017 and remove any existing
# 'Restaurants' database before it is repopulated.
mongodb = MongoClient(host='localhost', port=27017)
mongodb.drop_database('Restaurants')

In [4]:
########## Load data files & save them as collections of a database in mongodb ##########

db = mongodb.Restaurants

# Read the merged restaurant list inside a context manager so the file handle
# is closed deterministically (json.load(open(...)) leaves it open).
with open("restaurant_data.json") as restaurant_file:
    restaurant_data = json.load(restaurant_file)
# create and populate collection
res = db.restaurants
res.insert_many(restaurant_data)

# Each spreadsheet row becomes one document keyed by its column headers.
country_code = read_excel("Country-Code.xlsx").to_dict(orient='records')
# create and populate collection
ccode = db.countrycode
ccode.insert_many(country_code)

<pymongo.results.InsertManyResult at 0x7fe883196c80>

## 1. Generating restaurants.csv
Extract fields:
* Restaurant Id
* Restaurant Name
* Country
* City
* User Rating Votes
* User Aggregate Rating (float)
* Cuisines

In [5]:
column_names = ["Restaurant Id", "Restaurant Name", "Country", "City",
                "User Rating Votes", "User Aggregate Rating", "Cuisines"]

# One empty list per output column; the lists are filled in lock-step below.
restaurants_dict = {name: [] for name in column_names}

# Projection: fetch only the fields needed for restaurants.csv.
restaurant_fields = {"_id":0, "restaurant.name":1,
                     "restaurant.location.country_id":1, "restaurant.location.city":1,
                     "restaurant.user_rating.votes":1, "restaurant.user_rating.aggregate_rating":1,
                     "restaurant.cuisines":1}

# retrieve restaurant id
restaurants_dict["Restaurant Id"] = res.distinct("restaurant.R.res_id")
# map restaurant id to other fields
# (loop variable is `res_id`, not `id`, so the builtin id() is not shadowed)
for res_id in restaurants_dict["Restaurant Id"]:
    restaurant = res.find_one({"restaurant.R.res_id":res_id}, restaurant_fields)['restaurant']
    restaurants_dict["Restaurant Name"].append(restaurant["name"])
    country_id = restaurant["location"]["country_id"]
    # find_one returns None when no country code matches. Fall back to "NA" for
    # a missing match (or a match without a "Country" field) only, instead of a
    # bare `except:` that would also hide connection errors and interrupts.
    country_doc = ccode.find_one({"Country Code":country_id}, {"_id":0, "Country":1})
    country = (country_doc or {}).get("Country", "NA")
    restaurants_dict["Country"].append(country)
    restaurants_dict["City"].append(restaurant["location"]["city"])
    # votes / aggregate_rating are converted explicitly to numeric types
    restaurants_dict["User Rating Votes"].append(int(restaurant["user_rating"]["votes"]))
    restaurants_dict["User Aggregate Rating"].append(float(restaurant["user_rating"]["aggregate_rating"]))
    restaurants_dict["Cuisines"].append(restaurant["cuisines"])

In [6]:
# Build the tabular view: each key of restaurants_dict becomes one column.
df1 = pd.DataFrame(data=restaurants_dict)

In [7]:
# save as csv locally
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains only the requested restaurant fields (no unnamed first column).
df1.to_csv("restaurants.csv", index=False)

## 2. Generating restaurant_events.csv
Extract fields (start <= April 2019 <= end):
* Event Id
* Restaurant Id
* Restaurant Name
* Photo URL
* Event Title
* Event Start Date
* Event End Date

In [8]:
# filter events that were running at some point during April 2019
# Dates are compared as strings, so the bounds are chosen to work on prefixes:
#   start_date <  '2019-05'  -> started in April 2019 or earlier
#   end_date   >= '2019-04'  -> ended in April 2019 or later
# ({"$lte": '2019-04'} would reject every start date inside April, because
# e.g. '2019-04-10' sorts after '2019-04'.)
# $elemMatch requires both bounds to hold for the SAME event; two separate
# dotted conditions could be satisfied by two different events of a restaurant.
# NOTE(review): assumes zomato_events is an array of {"event": {...}} items and
# that dates are ISO-style 'YYYY-MM-DD...' strings -- confirm against the data.
# The projection is unchanged, so each returned document still carries all of
# the restaurant's events, not only the matching ones.
out = res.find({
    "restaurant.zomato_events":{
        "$elemMatch":{
            "event.start_date":{"$lt":'2019-05'},
            "event.end_date":{"$gte":'2019-04'}
        }
    }
}, {"_id":0, "restaurant.R.res_id":1, "restaurant.name":1, "restaurant.zomato_events.event.event_id":1,
"restaurant.zomato_events.event.photos":1, "restaurant.zomato_events.event.title":1,
"restaurant.zomato_events.event.start_date":1, "restaurant.zomato_events.event.end_date":1})

## 3. Determine the thresholds for the different rating texts based on aggregate rating.
Return aggregates for the following ratings only:
* Excellent
* Very Good
* Good
* Average
* Poor