<a href="https://colab.research.google.com/github/datapreparation-javeriana/etl-tutorial/blob/master/etl-tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MongoDB tutorial

Data Management course  
Universidad Javeriana  
February, 2023

Dataset: AirBnB reviews

In [None]:
pip install pymongo[srv]

In [None]:
from datetime import datetime

import pymongo

In [None]:
# Names of the database and collection queried throughout this tutorial.
DB_NAME: str = "sample_airbnb"
COLLECTION: str = "listingsAndReviews"

In [None]:
# Connect to the cluster using version 1 of the Stable API.
# Replace the <username>, <password> and <cluster_id> placeholders with your
# own values before running this cell (and avoid committing real credentials).
client = pymongo.MongoClient(
    "mongodb+srv://<username>:<password>@cluster0.<cluster_id>.mongodb.net/?retryWrites=true&w=majority",
    server_api=pymongo.server_api.ServerApi("1"),
)

In [None]:
# Select the tutorial database by name.
db = client.get_database(DB_NAME)

In [None]:
# Select the collection that every query below runs against.
col = db.get_collection(COLLECTION)

### Basic queries

In [None]:
# Fetch a single document; the empty filter means any document qualifies.
# mongosh: findOne({})
# SQL:     SELECT * FROM airbnb LIMIT 1
col.find_one(filter={})

In [None]:
# List the distinct values stored under a key.
# SQL: SELECT DISTINCT(property_type) FROM airbnb
col.distinct(key="property_type")

In [None]:
# Count every document in the collection (an empty filter matches all of them).
# mongosh: countDocuments({})
# SQL:     SELECT COUNT(*) FROM airbnb
col.count_documents(filter={})

In [None]:
# Equality filter: show the first document whose _id equals the given value.
# SQL: SELECT * FROM airbnb WHERE _id = "10030955"
col.find(filter={"_id": "10030955"})[0]

In [None]:
# Filter with a query operator ($in). The full list of query operators is at
# https://docs.mongodb.com/manual/reference/operator/query/#std-label-query-selectors
# SQL: SELECT * FROM airbnb WHERE bedrooms IN (1, 2)
col.find(filter={"bedrooms": {"$in": [1, 2]}})[0]

In [None]:
# Several keys in one filter document are combined with an implicit AND.
# SQL: SELECT * FROM airbnb WHERE bedrooms > 8 AND bathrooms >= 5
col.find(
    filter={
        "bedrooms": {"$gt": 8},
        "bathrooms": {"$gte": 5},
    }
)[0]

In [None]:
# $or takes a list of filters; a document matches if any one of them matches.
# SQL: SELECT * FROM airbnb WHERE price < 50 OR minimum_nights <= 3
col.find(
    filter={
        "$or": [
            {"price": {"$lt": 50}},
            {"minimum_nights": {"$lte": 3}},
        ]
    }
)[0]

In [None]:
# Combining AND with OR: top-level keys are ANDed together and $or supplies the
# alternatives. $regex reference:
# https://docs.mongodb.com/manual/reference/operator/query/regex/#mongodb-query-op.-regex
# SQL equivalent: SELECT * FROM airbnb WHERE property_type = "House" AND ( bed_type = "Real Bed" OR summary LIKE "%beach%")
# The pattern is deliberately unanchored so that it matches "beach" anywhere in
# the summary, as LIKE "%beach%" does. An anchored "^beach$" would only match
# when the whole summary is the single word "beach".
col.find({
    "property_type": "House",
    "$or": [{"bed_type": "Real Bed"}, {"summary": {"$regex": "beach"}}]
})[0]

### Queries on embedded/nested documents

In [None]:
# Dot notation reaches into an embedded document: match on the "country"
# field nested inside "address".
col.find(filter={"address.country": "Spain"})[0]

### Queries on arrays

In [None]:
# Exact array match: "amenities" must hold exactly these two elements, in this
# order. Such a strict filter may match nothing, so find_one() is used here: it
# returns None for an empty result, whereas indexing a cursor with [0] raises
# IndexError and would stop a "Run All".
col.find_one({"amenities": ["TV", "Wifi"]})

In [None]:
# $all: the array must contain every listed element, in any order; additional
# elements are allowed.
col.find(filter={"amenities": {"$all": ["TV", "Wifi"]}})[0]

In [None]:
# A single value compared against an array field matches when the array
# contains that value.
col.find(filter={"amenities": "Pool"})[0]

More about queries on arrays: https://docs.mongodb.com/manual/tutorial/query-arrays/

### Queries on arrays of embedded documents

In [None]:
# First listing with at least one review dated on or after 2019-03-01.
col.find(
    filter={"reviews.date": {"$gte": datetime(2019, 3, 1)}},
)[0]

In [None]:
# A numeric path segment addresses an array position: the first review
# (index 0) must be dated on or before 2010-01-01.
col.find(
    filter={"reviews.0.date": {"$lte": datetime(2010, 1, 1)}},
)[0]

More about queries on arrays of embedded documents: https://docs.mongodb.com/manual/tutorial/query-array-of-documents/

### Project fields

In [None]:
# Inclusion projection: only the listed fields are returned, plus _id, which
# is included unless it is explicitly excluded.
# SQL: SELECT _id, name, property_type FROM airbnb LIMIT 5
for doc in col.find(
    filter={},
    projection={"name": 1, "property_type": 1},
).limit(5):
    print(doc)

In [None]:
# Same projection, but with _id set to 0 so that it is left out of the result.
# SQL: SELECT name, property_type FROM airbnb LIMIT 5
for doc in col.find(
    filter={},
    projection={"_id": 0, "name": 1, "property_type": 1},
).limit(5):
    print(doc)

In [None]:
# Exclusion projection: every field is returned except those set to 0.
# SQL: SELECT * EXCEPT(summary, description, host, images) FROM airbnb
col.find(
    filter={},
    projection={"summary": 0, "description": 0, "host": 0, "images": 0},
)[0]

In [None]:
# Dot notation in a projection selects a single field of an embedded document
# (here only "host_name" out of "host").
for doc in col.find(
    filter={},
    projection={"_id": 0, "name": 1, "property_type": 1, "host.host_name": 1},
).limit(5):
    print(doc)

In [None]:
# For houses, return the name, the property type and only the last review:
# a $slice of -1 keeps the final element of the "reviews" array.
for doc in col.find(
    filter={"property_type": "House"},
    projection={
        "_id": 0,
        "name": 1,
        "property_type": 1,
        "reviews": {"$slice": -1},
    },
).limit(3):
    print(doc)

More about project fields: https://docs.mongodb.com/manual/tutorial/project-fields-from-query-results/

### Query for null or missing fields

In [None]:
# Query for documents that lack the "name" attribute. There are no such
# documents, so the cursor is empty and indexing it with [0] raises IndexError.
# The error is caught and printed so that the demonstration is visible without
# aborting a "Restart & Run All".
try:
    print(col.find({"name": {"$exists": False}}, {"_id": 0, "name": 1})[0])
except IndexError as err:
    print(f"IndexError: {err}")

In [None]:
# Listings whose "reviews" field exists, is an array and has exactly one
# element; only the name and the reviews are returned.
for doc in col.find(
    filter={"reviews": {"$exists": True, "$type": "array", "$size": 1}},
    projection={"_id": 0, "name": 1, "reviews": 1},
).limit(3):
    print(doc)