In [31]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from datetime import datetime
from pprint import pprint
import json
import pytz

# read in URI string from file to avoid showing password in plain text
with open("mongo-config.txt", 'r') as config:
        URI = config.read()

# Create a new client and connect to the server
client = MongoClient(URI, server_api=ServerApi('1'))

# Send a ping to test connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
    
RESTAURANT_PATH = "restaurants.json"
HISTORICAL_PATH = "historical-events.json"

Pinged your deployment. You successfully connected to MongoDB!


In [33]:
# convenience function to print out num_to_print sample records and number of docs returned by cursor
def print_sample_and_total(cursor, num_to_print=5):
    num_docs = 0
    docs_to_print = []
    for doc in cursor:
        if(num_docs < num_to_print):
            docs_to_print.append(doc)
        num_docs = num_docs + 1

    # print number of docs returned by find() query
    print("\nTotal number of documents returned: " + str(num_docs) )
    print("\nSample docs:\n")

    for doc in docs_to_print:
        # easier to read JSON formatting
        pprint(doc)

## 1. Load the restaurants data into a collection

restaurants.json file is loaded into an array of JSON records and uploaded to remote MongoDB collection dh3382.rest_data. insert_many statement is commented out to avoid mistakenly uploading duplicate records. Note that for larger datasets, the dataset can be chunked and inserted in chunks to avoid loading the full set in memory

In [4]:
# read in restaurant data file, convert to list of JSON records to upload to DB
rest_data = []
with open(RESTAURANT_PATH, 'r') as file:
    for line in file:
        rest_data.append(json.loads(line) )
        
# find or create database 
db = client["dh3382"]

# find or create collection
rest_collection = db["rest_data"]

# commented out to avoid reinserting data
# rest_collection.insert_many(rest_data)

## 2. Count the number of documents in the collection

I first print out the length of the array of JSON records to ensure it matches up with the records uploaded to dh3382.rest_data. I then call count_documents on the rest_data collection object with an empty query to return a full count

In [34]:
# first check number of records in JSON array 
print("Number of records in rest_data array: " + str(len(rest_data) ) )

# count number of records uploaded to dh3382.rest_data and make sure it matches
print("Number of records in MongoDB dh3382.rest_data collection: " + str(rest_collection.count_documents({}) ) )

Number of records in rest_data array: 3772
Number of records in MongoDB dh3382.rest_data collection: 3772


## 3. Find all the documents in the collection

I use find() with an empty query to return a cursor pointing to the entire dh3382.rest_data collection. I then print out 5 documents to check that full documents were indeed returned, and count the number of documents returned to ensure that it matches up with the total count calculated in the previous cell

In [6]:
# returns a cursor to all documents in dh3382.rest_data collection
all_docs_cursor = rest_collection.find({})

# convenience function defined above, prints 5 documents by default and total number returned by query
print_sample_and_total(all_docs_cursor)


Total number of documents returned: 3772

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd2ed'),
 'address': {'building': '21',
             'coord': [-73.9774394, 40.7604522],
             'street': 'West   52 Street',
             'zipcode': '10019'},
 'borough': 'Manhattan',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1400025600000}, 'grade': 'A', 'score': 12},
            {'date': {'$date': 1376352000000}, 'grade': 'A', 'score': 12},
            {'date': {'$date': 1333497600000}, 'grade': 'A', 'score': 12}],
 'name': '21 Club',
 'restaurant_id': '40364362'}
{'_id': ObjectId('6566166d88b337b1d7dbd2b9'),
 'address': {'building': '6409',
             'coord': [-74.00528899999999, 40.628886],
             'street': '11 Avenue',
             'zipcode': '11219'},
 'borough': 'Brooklyn',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1405641600000}, 'grade': 'A', 'score': 12},
            {'date': {'$date': 1375142400000}, 'grade': 'A', 'score': 12},
           

## 4. Display: restaurant_id, name, borough and cuisine, but exclude field _id, for all the documents in the collection

I query with a dict defined as key: field, val: boolean as the second parameter in order to exclude the \_id field and include only the restaurant_id, name, borough, and cuisine fields. I again print out 5 sample documents as well as the total number of documents returned to confirm the correct formatting and size of the query return val

In [7]:
# define fields that will be returned by query
fields = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# query dh3382.rest_data
all_docs_4_fields = rest_collection.find({}, fields)

# convenience function defined above, prints 5 documents by default and total number returned by query
print_sample_and_total(all_docs_4_fields)


Total number of documents returned: 3772

Sample docs:

{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': '21 Club',
 'restaurant_id': '40364362'}
{'borough': 'Brooklyn',
 'cuisine': 'American ',
 'name': 'Regina Caterers',
 'restaurant_id': '40356649'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Cafe Metro',
 'restaurant_id': '40363298'}
{'borough': 'Bronx',
 'cuisine': 'Bakery',
 'name': 'Morris Park Bake Shop',
 'restaurant_id': '30075445'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'P & S Deli Grocery',
 'restaurant_id': '40362264'}


## 5. Display all the restaurants in the Bronx

I define a dict using the mongoDB $eq operator, then execute the query to return all documents where the borough field is equal to "Bronx" and print out a sample as well as the total number of documents returned

In [37]:
# define filter with equals function
query_bronx = {"borough": {"$eq": "Bronx"} }

# execute query with filter
all_docs_bronx = rest_collection.find(query_bronx)

# convenience function defined above, prints 5 documents by default and total number returned by query
print_sample_and_total(all_docs_bronx)


Total number of documents returned: 309

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd2b1'),
 'address': {'building': '1007',
             'coord': [-73.856077, 40.848447],
             'street': 'Morris Park Ave',
             'zipcode': '10462'},
 'borough': 'Bronx',
 'cuisine': 'Bakery',
 'grades': [{'date': {'$date': 1393804800000}, 'grade': 'A', 'score': 2},
            {'date': {'$date': 1378857600000}, 'grade': 'A', 'score': 6},
            {'date': {'$date': 1358985600000}, 'grade': 'A', 'score': 10},
            {'date': {'$date': 1322006400000}, 'grade': 'A', 'score': 9},
            {'date': {'$date': 1299715200000}, 'grade': 'B', 'score': 14}],
 'name': 'Morris Park Bake Shop',
 'restaurant_id': '30075445'}
{'_id': ObjectId('6566166d88b337b1d7dbd38a'),
 'address': {'building': '2300',
             'coord': [-73.8786113, 40.8502883],
             'street': 'Southern Boulevard',
             'zipcode': '10460'},
 'borough': 'Bronx',
 'cuisine': 'American ',
 'grades'

## 7. Display the first 5 restaurants in the Bronx
I query the first 10 restaurants in the Bronx using the previous query paired with the limit function. I save the result of the query to a list in memory and print out the first 5 results, displaying only the name of the restaurants for readability

In [38]:
# rerun query with limit function added
ten_docs_bronx = rest_collection.find(query_bronx).limit(10)

# append each document from query in order to a list
ten_docs_bronx_list = []
for doc in ten_docs_bronx:
    ten_docs_bronx_list.append(doc)

# print out the restaurant name from the first 5 entries in the list
for doc in ten_docs_bronx_list[:5]:
    print(doc["name"])

Morris Park Bake Shop
Cool Zone
Bronx Grill
Marina Delray
The New Starling Athletic Club Of The Bronx


## 8. Display the second 5 restaurants (skipping the first 5) in the Bronx
I simply print out the remaining documents in the list from the previous question

In [39]:
# print out the last 5 items in the list from previous question
for doc in ten_docs_bronx_list[5:]:
    print(doc["name"])

Terrace Cafe
Beaver Pond
Mcdonald'S
Ihop
Tony'S Pier Restaurant


## 9. Find the restaurants with a score more than 85
Interpreted as find the restaurants with at least one score strictly greater than 85, use dot notation to evaluate all scores of each restaurant and the $gt operator to return restaurants with at least one score greater than 85. Use my convenience function to print out the results, in this case all of them because there were only four restaurants in total

In [11]:
# query all restaurants with at least one score greater than 85
rest_over_85 = rest_collection.find({"grades.score": {"$gt": 85}})

# print results
print_sample_and_total(rest_over_85)


Total number of documents returned: 4

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd40f'),
 'address': {'building': '65',
             'coord': [-73.9782725, 40.7624022],
             'street': 'West   54 Street',
             'zipcode': '10019'},
 'borough': 'Manhattan',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1408665600000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1395964800000}, 'grade': 'C', 'score': 131},
            {'date': {'$date': 1380067200000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1365379200000}, 'grade': 'B', 'score': 25},
            {'date': {'$date': 1350259200000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1318982400000}, 'grade': 'A', 'score': 13}],
 'name': "Murals On 54/Randolphs'S",
 'restaurant_id': '40372466'}
{'_id': ObjectId('6566166d88b337b1d7dbd613'),
 'address': {'building': '130',
             'coord': [-73.984758, 40.7457939],
             'street': 'Madison Avenue',
       

## 10. Find the restaurants that achieved a score, more than 80 but less than 100

As per clarification on Slack, interpreted as documents with at least one score greater than 80 and at least one score less than 100. Similar to query from Q9, just added the less than condition

In [12]:
# define query
query_80_100 = { "grades.score": { "$gt": 80, "$lt": 100 } }

# execute query
rest_80_to_100 = rest_collection.find(query_80_100)

# print results
print_sample_and_total(rest_80_to_100)


Total number of documents returned: 4

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd40f'),
 'address': {'building': '65',
             'coord': [-73.9782725, 40.7624022],
             'street': 'West   54 Street',
             'zipcode': '10019'},
 'borough': 'Manhattan',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1408665600000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1395964800000}, 'grade': 'C', 'score': 131},
            {'date': {'$date': 1380067200000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1365379200000}, 'grade': 'B', 'score': 25},
            {'date': {'$date': 1350259200000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1318982400000}, 'grade': 'A', 'score': 13}],
 'name': "Murals On 54/Randolphs'S",
 'restaurant_id': '40372466'}
{'_id': ObjectId('6566166d88b337b1d7dbd613'),
 'address': {'building': '130',
             'coord': [-73.984758, 40.7457939],
             'street': 'Madison Avenue',
       

## 11. Find the restaurants which locate in latitude (edit: longitude) value less than -95.754168

In class, professor clarified this question should read 'longitude', and that coordinates are recorded as [\<longitude\>, \<latitude\>], I define a query to find all longitude values less than -95.754168. I then execute the query and print the results with my convenience function

In [42]:
# define query
query_lat = {"address.coord.0" : {"$lt": -95.754168} }

# execute query
rest_lat_95 = rest_collection.find(query_lat)

# print results
print_sample_and_total(rest_lat_95)


Total number of documents returned: 3

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbdc64'),
 'address': {'building': '15259',
             'coord': [-119.6368672, 36.2504996],
             'street': '10 Avenue',
             'zipcode': '11357'},
 'borough': 'Queens',
 'cuisine': 'Italian',
 'grades': [{'date': {'$date': 1409788800000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1395792000000}, 'grade': 'A', 'score': 8},
            {'date': {'$date': 1362355200000}, 'grade': 'A', 'score': 10},
            {'date': {'$date': 1348704000000}, 'grade': 'A', 'score': 10},
            {'date': {'$date': 1334880000000}, 'grade': 'A', 'score': 7},
            {'date': {'$date': 1322006400000}, 'grade': 'C', 'score': 34}],
 'name': "Cascarino'S",
 'restaurant_id': '40668681'}
{'_id': ObjectId('6566166d88b337b1d7dbd8f9'),
 'address': {'building': '3707',
             'coord': [-101.8945214, 33.5197474],
             'street': '82 Street',
             'zipcode': '11372'},

## 12. Find the restaurants that do not prepare any cuisine of 'American' and their grade score more than 70 and latitude (edit: longitude) less than -65.754168.

Chained queries to meet all criteria. Noticed some restaurants had "American " cuisine with an extra whitespace at end, so manually removed to avoid mistakenly filtering out cuisine such as 'South & Central American'. Then executed query and printed results. Note that again, professor clarified this question should read longitude, not latitude

In [43]:
# define query
query_cuisine_score_lat = {
    "$and": [
        {"cuisine": {"$ne": "American", "$ne": "American "}},
        {"grades.score": {"$gt": 70}},
        {"address.coord.0": {"$lt": 60}}
    ]
}

# execute query
rest_cuisine_score_lat = rest_collection.find(query_cuisine_score_lat)

# print results
print_sample_and_total(rest_cuisine_score_lat)


Total number of documents returned: 5

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd613'),
 'address': {'building': '130',
             'coord': [-73.984758, 40.7457939],
             'street': 'Madison Avenue',
             'zipcode': '10016'},
 'borough': 'Manhattan',
 'cuisine': 'Pizza/Italian',
 'grades': [{'date': {'$date': 1419379200000}, 'grade': 'Z', 'score': 31},
            {'date': {'$date': 1402963200000}, 'grade': 'C', 'score': 98},
            {'date': {'$date': 1386806400000}, 'grade': 'C', 'score': 32},
            {'date': {'$date': 1369180800000}, 'grade': 'B', 'score': 21},
            {'date': {'$date': 1335916800000}, 'grade': 'A', 'score': 11}],
 'name': 'Bella Napoli',
 'restaurant_id': '40393488'}
{'_id': ObjectId('6566166d88b337b1d7dbd4b0'),
 'address': {'building': '345',
             'coord': [-73.9864626, 40.7266739],
             'street': 'East 6 Street',
             'zipcode': '10003'},
 'borough': 'Manhattan',
 'cuisine': 'Indian',
 'grades': [

## 13. Find the restaurants which do not prepare any cuisine of 'American' and achieved a score more than 70 and located in the longitude less than -65.754168. (without using $and operator)

Largely the same as 12, just changed the syntax to remove the $and operator and extra brackets. Professor clarified that 'longitude' in this question is correct

In [15]:
# define query
query_cuisine_score_lat = {
        "cuisine": {"$ne": "American", "$ne": "American "},
        "grades.score": {"$gt": 70},
        "address.coord.0": {"$lt": 60}
}

# execute query
rest_cuisine_score_lat = rest_collection.find(query_cuisine_score_lat)

# print results
print_sample_and_total(rest_cuisine_score_lat)


Total number of documents returned: 5

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd613'),
 'address': {'building': '130',
             'coord': [-73.984758, 40.7457939],
             'street': 'Madison Avenue',
             'zipcode': '10016'},
 'borough': 'Manhattan',
 'cuisine': 'Pizza/Italian',
 'grades': [{'date': {'$date': 1419379200000}, 'grade': 'Z', 'score': 31},
            {'date': {'$date': 1402963200000}, 'grade': 'C', 'score': 98},
            {'date': {'$date': 1386806400000}, 'grade': 'C', 'score': 32},
            {'date': {'$date': 1369180800000}, 'grade': 'B', 'score': 21},
            {'date': {'$date': 1335916800000}, 'grade': 'A', 'score': 11}],
 'name': 'Bella Napoli',
 'restaurant_id': '40393488'}
{'_id': ObjectId('6566166d88b337b1d7dbd4b0'),
 'address': {'building': '345',
             'coord': [-73.9864626, 40.7266739],
             'street': 'East 6 Street',
             'zipcode': '10003'},
 'borough': 'Manhattan',
 'cuisine': 'Indian',
 'grades': [

## 14. Find the restaurants which do not prepare any cuisine of 'American ' and achieved a grade point 'A' and not in the borough of Brooklyn, sorted by cuisine in descending order.

Query filters out any documents with "American" or "American " cuisine or in Brooklyn. Also filters out any documents if they have not achieved at least one rating of A. I then execute the query and sort on "cuisine", passing -1 as the second parameter to specify descending order.

In [16]:
# define query
query_not_american_brooklyn = {
        "cuisine": {"$ne": "American", "$ne": "American "},
        "grades.grade": {"$eq": 'A'},
        "borough": {"$ne": "Brooklyn"}
}

# execute query. Sorted by cuisine, -1 for descending order
rest_not_american_brooklyn = rest_collection.find(query_not_american_brooklyn)\
                                            .sort([("cuisine", -1)])

# print results
print_sample_and_total(rest_not_american_brooklyn)


Total number of documents returned: 2017

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd9bd'),
 'address': {'building': '89',
             'coord': [-73.9995899, 40.7168015],
             'street': 'Baxter Street',
             'zipcode': '10013'},
 'borough': 'Manhattan',
 'cuisine': 'Vietnamese/Cambodian/Malaysia',
 'grades': [{'date': {'$date': 1408579200000}, 'grade': 'A', 'score': 13},
            {'date': {'$date': 1377907200000}, 'grade': 'A', 'score': 13},
            {'date': {'$date': 1365638400000}, 'grade': 'C', 'score': 3},
            {'date': {'$date': 1350432000000}, 'grade': 'A', 'score': 4},
            {'date': {'$date': 1337040000000}, 'grade': 'A', 'score': 10}],
 'name': 'Thai Son',
 'restaurant_id': '40559606'}
{'_id': ObjectId('6566166d88b337b1d7dbda76'),
 'address': {'building': '8278',
             'coord': [-73.88143509999999, 40.7412552],
             'street': 'Broadway',
             'zipcode': '11373'},
 'borough': 'Queens',
 'cuisine': 'Vietnames

## 15. Find the restaurant Id, name, borough and cuisine for those restaurants which contain 'Wil' as first three letters for its name.

I define a new fields argument equal to Q4 because I am paranoid about the way Jupyter handles variables. I also define a query using Mongo's regex operator, then execute the query and print the results

In [17]:
# define query to find name's starting with "Wil"
query_wil = {"name": {"$regex": "^Wil"}}

# define fields parameter as per criteria
fields_wil = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_wil = rest_collection.find(query_wil, fields_wil)

# print results
print_sample_and_total(rest_wil)


Total number of documents returned: 3

Sample docs:

{'borough': 'Brooklyn',
 'cuisine': 'Delicatessen',
 'name': "Wilken'S Fine Food",
 'restaurant_id': '40356483'}
{'borough': 'Bronx',
 'cuisine': 'American ',
 'name': 'Wild Asia',
 'restaurant_id': '40357217'}
{'borough': 'Bronx',
 'cuisine': 'Pizza',
 'name': 'Wilbel Pizza',
 'restaurant_id': '40871979'}


## 16. Find the restaurant Id, name, borough and cuisine for those restaurants which contain 'ces' as last three letters for its name.

Similar to Q15, but I change the regex to search for names ending in "ces". I then add the optional num_to_print argument in my convenience function in order to print all 6 documents returned by the query

In [44]:
# define query to find name's ending with "ces"
query_ces = {"name": {"$regex": "ces$"}}

# define fields parameter as per criteria
fields_ces = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_ces = rest_collection.find(query_ces, fields_ces)

# print results, specify 6 as num_to_print to show all 6 documents
print_sample_and_total(rest_ces, 6)


Total number of documents returned: 6

Sample docs:

{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Good Shepherd Services',
 'restaurant_id': '40403989'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Pieces',
 'restaurant_id': '40399910'}
{'borough': 'Queens',
 'cuisine': 'American ',
 'name': 'S.M.R Restaurant Services',
 'restaurant_id': '40403857'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Re: Sources',
 'restaurant_id': '40876068'}
{'borough': 'Queens',
 'cuisine': 'Ice Cream, Gelato, Yogurt, Ices',
 'name': "The Ice Box-Ralph'S Famous Italian Ices",
 'restaurant_id': '40690899'}
{'borough': 'Brooklyn',
 'cuisine': 'Jewish/Kosher',
 'name': 'Alices',
 'restaurant_id': '40782042'}


## 17. Find the restaurant Id, name, borough and cuisine for those restaurants which contain 'Reg' as three letters somewhere in its name.

Similar to Q15 and Q16, just changed the regex statement as per criteria and added 7 to convenience function to print all results. 

In [19]:
# define query to find name's starting with "Wil"
query_reg = {"name": {"$regex": "Reg"}}

# define fields parameter as per criteria
fields_reg = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_reg = rest_collection.find(query_reg, fields_reg)

# print results, specify 7 as num_to_print to show all 7 documents
print_sample_and_total(rest_reg, 7)


Total number of documents returned: 7

Sample docs:

{'borough': 'Brooklyn',
 'cuisine': 'American ',
 'name': 'Regina Caterers',
 'restaurant_id': '40356649'}
{'borough': 'Manhattan',
 'cuisine': 'Café/Coffee/Tea',
 'name': 'Caffe Reggio',
 'restaurant_id': '40369418'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Regency Hotel',
 'restaurant_id': '40382679'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Regency Whist Club',
 'restaurant_id': '40402377'}
{'borough': 'Queens',
 'cuisine': 'American ',
 'name': 'Rego Park Cafe',
 'restaurant_id': '40523342'}
{'borough': 'Queens',
 'cuisine': 'Pizza',
 'name': 'Regina Pizza',
 'restaurant_id': '40801325'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Regal Entertainment Group',
 'restaurant_id': '40891782'}


## 18. Find the restaurants which belong to the borough Bronx and prepared either American or Chinese dish

I use the in operator to find all restaurants which serve American or Chinese cuisine, accounting for the "American " restaurants, and filter all restaurants not in the Bronx. I then execute the query and print out 5 sample documents as well as the total number of restaurants returned by the query using my convenience function

In [20]:
# define query using in operator
query_bronx_us_chinese = {
    "borough": {"$eq": "Bronx"},
    "cuisine": {"$in": ["American", "American ", "Chinese"]}
}

# execute query
rest_bronx_us_chinese = rest_collection.find(query_bronx_us_chinese)

# print results
print_sample_and_total(rest_bronx_us_chinese)


Total number of documents returned: 91

Sample docs:

{'_id': ObjectId('6566166d88b337b1d7dbd38a'),
 'address': {'building': '2300',
             'coord': [-73.8786113, 40.8502883],
             'street': 'Southern Boulevard',
             'zipcode': '10460'},
 'borough': 'Bronx',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1400630400000}, 'grade': 'A', 'score': 2},
            {'date': {'$date': 1369180800000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1337212800000}, 'grade': 'A', 'score': 12},
            {'date': {'$date': 1307491200000}, 'grade': 'A', 'score': 5}],
 'name': 'Cool Zone',
 'restaurant_id': '40368022'}
{'_id': ObjectId('6566166d88b337b1d7dbd3be'),
 'address': {'building': '2375',
             'coord': [-73.85534559999999, 40.8426433],
             'street': 'East Tremont Avenue',
             'zipcode': '10462'},
 'borough': 'Bronx',
 'cuisine': 'American ',
 'grades': [{'date': {'$date': 1413244800000}, 'grade': 'A', 'score': 13},
     

## 19. Find the restaurant Id, name, borough and cuisine for those restaurants which belong to the boroughs of Staten Island or Queens or Bronx or Brooklyn.

I use the in operator to construct a query which will find all restaurants in any of the four specified boroughs, then add the fields argument, execute and print. For this dataset, we could return the same set of documents by just excluding the borough of Manhattan instead of including these four boroughs, but the query as is would apply to datasets which include restaurants outside of NYC as well

In [21]:
query_boroughs = {"borough": {"$in": ["Staten Island", "Queens", "Bronx", "Brooklyn"]}}

# define fields parameter as per criteria
fields_boroughs = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_boroughs = rest_collection.find(query_boroughs, fields_boroughs)

# print results
print_sample_and_total(rest_boroughs)


Total number of documents returned: 1889

Sample docs:

{'borough': 'Brooklyn',
 'cuisine': 'American ',
 'name': 'Regina Caterers',
 'restaurant_id': '40356649'}
{'borough': 'Bronx',
 'cuisine': 'Bakery',
 'name': 'Morris Park Bake Shop',
 'restaurant_id': '30075445'}
{'borough': 'Brooklyn',
 'cuisine': 'Polish',
 'name': 'Polish National Home',
 'restaurant_id': '40364404'}
{'borough': 'Queens',
 'cuisine': 'Italian',
 'name': 'Parkside Restaurant',
 'restaurant_id': '40365841'}
{'borough': 'Bronx',
 'cuisine': 'American ',
 'name': 'Cool Zone',
 'restaurant_id': '40368022'}


## 20. Find the restaurant Id, name, borough and cuisine for those restaurants which are not belonging to the borough Staten Island or Queens or Bronx or Brooklyn

Only difference from Q19 is the use of the nin operator instead of the in operator. As a sanity check, the total number of documents returned by the two queries is 1883 + 1889 = 3772, which is the total number of documents in the DB, as expected

In [22]:
query_not_boroughs = {"borough": {"$nin": ["Staten Island", "Queens", "Bronx", "Brooklyn"]}}

# execute query
rest_not_boroughs = rest_collection.find(query_not_boroughs, fields_boroughs)

# print results
print_sample_and_total(rest_not_boroughs)


Total number of documents returned: 1883

Sample docs:

{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': '21 Club',
 'restaurant_id': '40364362'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Cafe Metro',
 'restaurant_id': '40363298'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'P & S Deli Grocery',
 'restaurant_id': '40362264'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': '1 East 66Th Street Kitchen',
 'restaurant_id': '40359480'}
{'borough': 'Manhattan',
 'cuisine': 'Irish',
 'name': 'Dj Reynolds Pub And Restaurant',
 'restaurant_id': '30191841'}


## 21. Find the restaurant Id, name, borough and cuisine for those restaurants which achieved a score below 10.

I define a query which will return all restaurants which received at least one score less than 10. I then define the fields to be displayed, execute the query and print my results

In [23]:
# define query for any score below 10
query_below_10 = {"grades.score": {"$lt": 10}}

# define fields parameter
fields_below_10 = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_below_10 = rest_collection.find(query_below_10, fields_below_10)

# print results
print_sample_and_total(rest_below_10)


Total number of documents returned: 3245

Sample docs:

{'borough': 'Brooklyn',
 'cuisine': 'American ',
 'name': 'Regina Caterers',
 'restaurant_id': '40356649'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'Cafe Metro',
 'restaurant_id': '40363298'}
{'borough': 'Bronx',
 'cuisine': 'Bakery',
 'name': 'Morris Park Bake Shop',
 'restaurant_id': '30075445'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': 'P & S Deli Grocery',
 'restaurant_id': '40362264'}
{'borough': 'Manhattan',
 'cuisine': 'American ',
 'name': '1 East 66Th Street Kitchen',
 'restaurant_id': '40359480'}


## 22. Find the restaurant Id, name, borough and cuisine for those restaurants which prepared dish except 'American' and 'Chinese' or restaurant's name begins with letter 'Wil'.

Query simply combines elements from previous questions, again taking care to avoid the "American " restaurants. I use the $or operator to return all restaurants that do not prepare American or Chinese cuisines and all restaurants whose names begin with 'Wil'

In [54]:
# define query, restaurants whose name begins with "Wil" and are not American or Chinese
query_wil_not_us_chinese = {
    "$or": [
        {"cuisine": {"$nin": ["American", "American ", "Chinese"] } },
        {"name": {"$regex": "^Wil"} }
    ]
}

# define fields parameter
fields_wil_not_us_chinese = {"_id": 0, "restaurant_id": 1, "name": 1, "borough": 1, "cuisine": 1}

# execute query
rest_wil_not_us_chinese = rest_collection.find(query_wil_not_us_chinese, fields_wil_not_us_chinese)

# print results
print_sample_and_total(rest_wil_not_us_chinese)


Total number of documents returned: 2403

Sample docs:

{'borough': 'Bronx',
 'cuisine': 'Bakery',
 'name': 'Morris Park Bake Shop',
 'restaurant_id': '30075445'}
{'borough': 'Manhattan',
 'cuisine': 'Irish',
 'name': 'Dj Reynolds Pub And Restaurant',
 'restaurant_id': '30191841'}
{'borough': 'Brooklyn',
 'cuisine': 'Polish',
 'name': 'Polish National Home',
 'restaurant_id': '40364404'}
{'borough': 'Manhattan',
 'cuisine': 'Italian',
 'name': "Arturo'S",
 'restaurant_id': '40365387'}
{'borough': 'Queens',
 'cuisine': 'Italian',
 'name': 'Parkside Restaurant',
 'restaurant_id': '40365841'}


## 23. Find the restaurant Id, name, and grades for those restaurants which achieved a grade of "A" and scored 11 on an ISODate "2014-08-11T00:00:00Z" among many of survey dates.

In [63]:
# Convert ISODate from UTC timezone to unix timestamp
datetime = datetime.strptime("2014-08-11T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")

# Set the UTC timezone
utc_timezone = pytz.timezone('UTC')
utc_datetime = utc_timezone.localize(datetime)

# Convert to Unix timestamp and multiply by 1000 to convert to milliseconds
unix_timestamp = int(utc_datetime.timestamp() ) * 1000

query_isodate_11 = {
    "grades": {
        "$elemMatch": {
            "date.$date": unix_timestamp,
            "grade": 'A',
            "score": 11
        }
    }
}

# define fields parameter
fields_isodate = {"_id": 0, "restaurant_id": 1, "name": 1, "grades": 1}

# execute query
rest_isodate_11 = rest_collection.find(query_isodate_11, fields_isodate)

# print results
print_sample_and_total(rest_isodate_11)


Total number of documents returned: 1

Sample docs:

{'grades': [{'date': {'$date': 1407715200000}, 'grade': 'A', 'score': 11},
            {'date': {'$date': 1386633600000}, 'grade': 'A', 'score': 9},
            {'date': {'$date': 1370822400000}, 'grade': 'A', 'score': 12},
            {'date': {'$date': 1339113600000}, 'grade': 'A', 'score': 13},
            {'date': {'$date': 1327449600000}, 'grade': 'A', 'score': 8},
            {'date': {'$date': 1315872000000}, 'grade': 'A', 'score': 12}],
 'name': 'Don Filippo Restaurant',
 'restaurant_id': '40372417'}


## 24. Find the restaurant Id, name and grades for those restaurants where the 2nd element of grades array contains a grade of "A" and score 9 on an ISODate "2014-08-11T00:00:00Z".

In [67]:
# manually select 2nd element of grades array, query on same date as Q23 and specified criteria
query_isodate_9 = {
    "grades.1.date.$date": unix_timestamp,
    "grades.1.grade": 'A',
    "grades.1.score": 9
    }

# execute query
rest_isodate_9 = rest_collection.find(query_isodate_9, fields_isodate)

# print results
print_sample_and_total(rest_isodate_9)


Total number of documents returned: 1

Sample docs:

{'grades': [{'date': {'$date': 1421020800000}, 'grade': 'A', 'score': 10},
            {'date': {'$date': 1407715200000}, 'grade': 'A', 'score': 9},
            {'date': {'$date': 1389657600000}, 'grade': 'A', 'score': 13},
            {'date': {'$date': 1360195200000}, 'grade': 'A', 'score': 10},
            {'date': {'$date': 1335744000000}, 'grade': 'A', 'score': 11}],
 'name': 'Club Macanudo (Cigar Bar)',
 'restaurant_id': '40526406'}


## 25. Find the restaurant Id, name, address and geographical location for those restaurants where 2nd element of coordinates array contains a value which is more than 42 and up to 52.

Assuming 52 is inclusive, I find the second element of the coordinates array using dot notation, specify the fields to be displayed, then execute the query and print the results. I am interpreting "address and geographical location" as street address and coordinates, which are nested under "address"

In [68]:
query_geo = {
    "address.coord.1": {'$gt': 42, '$lte': 52},
}

# define fields parameter
fields_geo = {"_id": 0, "restaurant_id": 1, "name": 1, "address": 1}

# execute query
rest_geo = rest_collection.find(query_geo, fields_geo)

# print results, specify 7 as num_to_print to print all results
print_sample_and_total(rest_geo, 7)


Total number of documents returned: 7

Sample docs:

{'address': {'building': '1',
             'coord': [-0.7119979, 51.6514664],
             'street': 'Pennplaza E, Penn Sta',
             'zipcode': '10001'},
 'name': 'T.G.I. Fridays',
 'restaurant_id': '40388936'}
{'address': {'building': '47',
             'coord': [-78.877224, 42.89546199999999],
             'street': 'Broadway @ Trinity Pl',
             'zipcode': '10006'},
 'name': "T.G.I. Friday'S",
 'restaurant_id': '40387990'}
{'address': {'building': '3000',
             'coord': [-87.86567699999999, 42.61150920000001],
             'street': '47 Avenue',
             'zipcode': '11101'},
 'name': "Di Luvio'S Deli",
 'restaurant_id': '40402284'}
{'address': {'building': '21972199',
             'coord': [-78.589606, 42.8912372],
             'street': 'Broadway',
             'zipcode': '10024'},
 'name': 'La Caridad 78',
 'restaurant_id': '40568285'}
{'address': {'building': '0',
             'coord': [-88.0778799, 42.

## EC 1. (historical-events.json): Count the number of events per year
The trick to this question is that the raw data includes 2 separate date formats: YYY and YYY(Y)/MM/DD. Given we only actually need the year, I solve this problem by extracting only the year from each entry and uploading it as an integer. This is more efficient than converting everything to a datetime object, which would require further conversions when uploading and querying the database, and allows for a simpler query as well. In this first cell, I modify the JSON records accordingly before uploading to the database, and then print out the number of records in the JSON array and count the number of documents in the database to ensure I have uploaded the full historical events file. Note that, as with Q1, the dataset can be chunked and inserted in chunks to accomodate datasets that do not fit in memory. I also again commented out the insert_many function in order to avoid mistakenly reinserting the data

In [69]:
# read in restaurant data file, convert to list of JSON records to upload to DB
historical_events = []
i = 0
with open(HISTORICAL_PATH, 'r') as file:
    for line in file:
        line_json = json.loads(line)
        # extract only year from dates formatted YYY(Y)/MM/DD
        if('/' in line_json["date"]):
            line_json["date"] = int(line_json["date"].split('/')[0])
            historical_events.append(line_json)

# find or create collection
historical_collection = db["historical_events"]

# upload to db, commented out to avoid reinserting data
# historical_collection.insert_many(historical_events)

# ensure correct number of events uploaded to DB
print("Events in historical_events JSON array: " + str(len(historical_events) ) )
print("Documents uploaded to dh3382.historical_events: " + str(historical_collection.count_documents({}) ) )

Events in historical_events JSON array: 30781
Documents uploaded to dh3382.historical_events: 30781


### EC 1 continued

In this cell, I define an aggregate query to group by date (in this case, an integer representing only the year), and count each entry by date. I use the $project feature to create an alias, "Year", for the group's \_id field for readability

In [71]:
# define agg query that counts events by year, renames _id field 'Year'
agg_query_events_per_year = [
    {
        "$group": {
            "_id": "$date",
            "count": {"$sum": 1}
        }
    },
    {
        "$project": {
            "Year":"$_id",
            "count": 1,
            "_id": 0
        }
    }
]
    
# execute aggregate query
result = historical_collection.aggregate(agg_query_events_per_year)

# print 20 sample results and total number of documents returned
# Note that the total number can be verified by counting distinct date entries, which also comes to 1321
print_sample_and_total(result, 20)


Total number of documents returned: 1321

Sample docs:

{'Year': 1888, 'count': 67}
{'Year': 1946, 'count': 154}
{'Year': 864, 'count': 6}
{'Year': 1957, 'count': 150}
{'Year': 161, 'count': 5}
{'Year': 1504, 'count': 22}
{'Year': 1514, 'count': 11}
{'Year': 872, 'count': 1}
{'Year': 1361, 'count': 6}
{'Year': 1590, 'count': 15}
{'Year': -241, 'count': 6}
{'Year': 706, 'count': 3}
{'Year': 1967, 'count': 283}
{'Year': 380, 'count': 15}
{'Year': 1595, 'count': 10}
{'Year': 1258, 'count': 9}
{'Year': 1485, 'count': 8}
{'Year': 1743, 'count': 9}
{'Year': 1777, 'count': 24}
{'Year': 976, 'count': 6}
