## Functions

In [1]:
#!pip3 install rich

import pprint

def print_(cursor, mode="pprint"):
    """
    Print collection.
    """
    if isinstance(cursor, pymongo.cursor.Cursor):
        obj = list(cursor)
    else:
        obj = cursor
    if mode in ("json", "json_color"):
        import json

        parsed = json.loads(json.dumps(obj))
        if mode == "json":
            print(json.dumps(parsed, indent=2))
        else:
            import rich

            rich.print_json(json.dumps(parsed, indent=2))
    elif mode == "pprint":
        pprint.pprint(obj)
    else:
        raise ValueError(f"Invalid mode='{mode}'")

# Connect to DB

In [44]:
import pymongo
from bson.objectid import ObjectId

# Connect to MongoDB instance.
client = pymongo.MongoClient("localhost", 27017)

# Create a db.
db = client["book"]
print("db=", db)
print("type(db)=", type(db))

db= Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'book')
type(db)= <class 'pymongo.database.Database'>


In [54]:
# Show all the collections.
print("collections=", db.list_collection_names())

# Clean all collections.
for db_name in db.list_collection_names():
    print("Dropping %s" % db_name)
    db[db_name].drop()

print("collections=", db.list_collection_names())

collections= ['towns']
Dropping towns
collections= []


# `Towns` collection

## Insert

In [55]:
dict_ = {
    "name": "New York",
    "population": 22200000,
    "lastCensus": "2022-11-01",
    "famousFor": ["the MOMA", "food", "Derek Jeter"],
    "mayor": {"name": "Bill de Blasio", "party": "D"},
}
print_(dict_)

# Inserting an object in a DB creates a DB.
val = db.towns.insert_one(dict_)
print("val=", val)
print("obj_id=", val.inserted_id)

{'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
 'lastCensus': '2022-11-01',
 'mayor': {'name': 'Bill de Blasio', 'party': 'D'},
 'name': 'New York',
 'population': 22200000}
val= <pymongo.results.InsertOneResult object at 0x7f22246dd820>
obj_id= 63fb625c96c3a56b758fd461


In [48]:
# Show all the collections.
db.list_collection_names()

['towns']

In [56]:
# _id is like the primary key.
for obj in db.towns.find():
    print_(obj, mode="pprint")

{'_id': ObjectId('63fb625c96c3a56b758fd461'),
 'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
 'lastCensus': '2022-11-01',
 'mayor': {'name': 'Bill de Blasio', 'party': 'D'},
 'name': 'New York',
 'population': 22200000}


In [57]:
# Insert more data in the collection.
def insert_city(name, population, lastCensus, famousFor, mayor):
    db.towns.insert_one(
        {
            "name": name,
            "population": population,
            "lastCensus": lastCensus,
            "famousFor": famousFor,
            "mayor": mayor,
        }
    )


insert_city(
    "Punxsutawney",
    6200,
    "2016-01-31",
    ["Punxsutawney Phil"],
    {"name": "Richard Alexander"},
)

insert_city(
    "Portland",
    582000,
    "2016-09-20",
    ["beer", "food", "Portlandia"],
    {"name": "Ted Wheeler", "party": "D"},
)

# Note that `mayor` field doesn't have a strict schema.

In [58]:
# Print all the documents in db["towns"].
for obj in db.towns.find():
    print_(obj, mode="pprint")

{'_id': ObjectId('63fb625c96c3a56b758fd461'),
 'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
 'lastCensus': '2022-11-01',
 'mayor': {'name': 'Bill de Blasio', 'party': 'D'},
 'name': 'New York',
 'population': 22200000}
{'_id': ObjectId('63fb626596c3a56b758fd462'),
 'famousFor': ['Punxsutawney Phil'],
 'lastCensus': '2016-01-31',
 'mayor': {'name': 'Richard Alexander'},
 'name': 'Punxsutawney',
 'population': 6200}
{'_id': ObjectId('63fb626596c3a56b758fd463'),
 'famousFor': ['beer', 'food', 'Portlandia'],
 'lastCensus': '2016-09-20',
 'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
 'name': 'Portland',
 'population': 582000}


## Query

In [59]:
# Find by ObjectId.
# db.towns.find_one({"_id": ObjectId("6368352a657571ee34691dd9")})
db.towns.find_one({"_id": val.inserted_id})

{'_id': ObjectId('63fb625c96c3a56b758fd461'),
 'name': 'New York',
 'population': 22200000,
 'lastCensus': '2022-11-01',
 'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
 'mayor': {'name': 'Bill de Blasio', 'party': 'D'}}

In [10]:
# Retrieve only the field `name`.
object_id = ObjectId(str(val.inserted_id))
db.towns.find_one({"_id": object_id}, {"name": 1})

{'_id': ObjectId('63fb619796c3a56b758fd458'), 'name': 'New York'}

In [11]:
# Retrieve all fields excluding `name`.
db.towns.find_one({"_id": ObjectId(str(val.inserted_id))}, {"name": 0})

{'_id': ObjectId('63fb619796c3a56b758fd458'),
 'population': 22200000,
 'lastCensus': '2022-11-01',
 'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
 'mayor': {'name': 'Bill de Blasio', 'party': 'D'}}

In [12]:
# Find all towns with name starting with P.
# This is going to do a table scan.
print_(db.towns.find({"name": {"$regex": r"^P"}}))

[{'_id': ObjectId('63fb619796c3a56b758fd459'),
  'famousFor': ['Punxsutawney Phil'],
  'lastCensus': '2016-01-31',
  'mayor': {'name': 'Richard Alexander'},
  'name': 'Punxsutawney',
  'population': 6200},
 {'_id': ObjectId('63fb619796c3a56b758fd45a'),
  'famousFor': ['beer', 'food', 'Portlandia'],
  'lastCensus': '2016-09-20',
  'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
  'name': 'Portland',
  'population': 582000}]


In [13]:
# Find all towns with name starting with P, but print only name.
print_(db.towns.find({"name": {"$regex": r"^P"}}, {"_id": 0, "name": 1}))

[{'name': 'Punxsutawney'}, {'name': 'Portland'}]


In [14]:
# Find all towns with name that begins with P and have population less than 100,000.
print_(db.towns.find({"name": {"$regex": r"^P"}, "population": {"$lt": 100000}}))

[{'_id': ObjectId('63fb619796c3a56b758fd459'),
  'famousFor': ['Punxsutawney Phil'],
  'lastCensus': '2016-01-31',
  'mayor': {'name': 'Richard Alexander'},
  'name': 'Punxsutawney',
  'population': 6200}]


In [15]:
# Projection.
print_(db.towns.find({"famousFor": "food"}, {"_id": 0, "name": 1, "famousFor": 1}))

[{'famousFor': ['the MOMA', 'food', 'Derek Jeter'], 'name': 'New York'},
 {'famousFor': ['beer', 'food', 'Portlandia'], 'name': 'Portland'}]


In [16]:
# Query for matching values.
print_(
    db.towns.find(
        {"famousFor": {"$all": ["food", "beer"]}}, {"_id": 0, "name": 1, "famousFor": 1}
    )
)

[{'famousFor': ['beer', 'food', 'Portlandia'], 'name': 'Portland'}]


In [17]:
# Query for lack of matching values.
print_(
    db.towns.find(
        {"famousFor": {"$nin": ["food", "beer"]}}, {"_id": 0, "name": 1, "famousFor": 1}
    )
)

[{'famousFor': ['Punxsutawney Phil'], 'name': 'Punxsutawney'}]


In [18]:
# Find results with nested search criteria, e.g., mayor.party = "D".
print_(db.towns.find({"mayor.party": "D"}))

[{'_id': ObjectId('63fb619796c3a56b758fd458'),
  'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
  'lastCensus': '2022-11-01',
  'mayor': {'name': 'Bill de Blasio', 'party': 'D'},
  'name': 'New York',
  'population': 22200000},
 {'_id': ObjectId('63fb619796c3a56b758fd45a'),
  'famousFor': ['beer', 'food', 'Portlandia'],
  'lastCensus': '2016-09-20',
  'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
  'name': 'Portland',
  'population': 582000}]


## Updating

In [19]:
print_(db.towns.find())

[{'_id': ObjectId('63fb619796c3a56b758fd458'),
  'famousFor': ['the MOMA', 'food', 'Derek Jeter'],
  'lastCensus': '2022-11-01',
  'mayor': {'name': 'Bill de Blasio', 'party': 'D'},
  'name': 'New York',
  'population': 22200000},
 {'_id': ObjectId('63fb619796c3a56b758fd459'),
  'famousFor': ['Punxsutawney Phil'],
  'lastCensus': '2016-01-31',
  'mayor': {'name': 'Richard Alexander'},
  'name': 'Punxsutawney',
  'population': 6200},
 {'_id': ObjectId('63fb619796c3a56b758fd45a'),
  'famousFor': ['beer', 'food', 'Portlandia'],
  'lastCensus': '2016-09-20',
  'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
  'name': 'Portland',
  'population': 582000}]


In [20]:
object_id_for_Portland = str(db.towns.find_one({"name": "Portland"})["_id"])
print("object_id_for_Portland=", object_id_for_Portland)

object_id_for_Portland= 63fb619796c3a56b758fd45a


In [21]:
# There are multiple cities called Portland in US (e.g., in Oregon and in Maine).
db.towns.update_one(
    {"_id": ObjectId(object_id_for_Portland)}, {"$set": {"state": "OR"}}
)

print_(db.towns.find({"_id": ObjectId(object_id_for_Portland)}))

# Note that we need to specify $set.
# Mongo thinks in terms of documents and not attributes. So if you
# specify:
# db.towns.update_one({"_id": ObjectId("63696c28657571ee34691de3")},
#                     {"state": "OR"})
# the entire document will be replaced with the document `{"state": "OR"}`

[{'_id': ObjectId('63fb619796c3a56b758fd45a'),
  'famousFor': ['beer', 'food', 'Portlandia'],
  'lastCensus': '2016-09-20',
  'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
  'name': 'Portland',
  'population': 582000,
  'state': 'OR'}]


In [22]:
# Increment the population.
db.towns.update_one(
    {"_id": ObjectId(object_id_for_Portland)}, {"$inc": {"population": 1000}}
)
print_(db.towns.find({"_id": ObjectId(object_id_for_Portland)}))

[{'_id': ObjectId('63fb619796c3a56b758fd45a'),
  'famousFor': ['beer', 'food', 'Portlandia'],
  'lastCensus': '2016-09-20',
  'mayor': {'name': 'Ted Wheeler', 'party': 'D'},
  'name': 'Portland',
  'population': 583000,
  'state': 'OR'}]


# `countries` collection

## Insert

In [23]:
db.countries.drop()

# Note:
# 1) we define the _id directly
# 2) the schema is not strict
db.countries.insert_one(
    {
        "_id": "us",
        "name": "United States",
        "exports": {"foods": [{"name": "bacon", "tasty": True}, {"name": "burgers"}]},
    }
)

db.countries.insert_one(
    {
        "_id": "ca",
        "name": "Canada",
        "exports": {
            "foods": [
                {"name": "bacon", "tasty": False},
                {"name": "syrup", "tasty": True},
            ]
        },
    }
)

db.countries.insert_one(
    {
        "_id": "mx",
        "name": "Mexico",
        "exports": {"foods": [{"name": "salsa", "tasty": True, "condiment": True}]},
    }
)

assert db.countries.count_documents({}) == 3

In [24]:
for obj in db["countries"].find():
    print_(obj)

{'_id': 'us',
 'exports': {'foods': [{'name': 'bacon', 'tasty': True}, {'name': 'burgers'}]},
 'name': 'United States'}
{'_id': 'ca',
 'exports': {'foods': [{'name': 'bacon', 'tasty': False},
                       {'name': 'syrup', 'tasty': True}]},
 'name': 'Canada'}
{'_id': 'mx',
 'exports': {'foods': [{'condiment': True, 'name': 'salsa', 'tasty': True}]},
 'name': 'Mexico'}


## Query

In [25]:
# Find the country that exports tasty bacon.

# This doesn't return what we want, since we want the AND of the condition and not OR.
print_(
    db.countries.find(
        {
            "exports.foods.name": "bacon",
            "exports.foods.tasty": True,
        },
        {"_id": 0, "name": 1},
    )
)

[{'name': 'United States'}, {'name': 'Canada'}]


In [26]:
# Using $elemMatch.
print_(
    db.countries.find(
        {
            "exports.foods": {
                "$elemMatch": {
                    "name": "bacon",
                    "tasty": True,
                }
            }
        },
        {"_id": 0, "name": 1},
    )
)

[{'name': 'United States'}]


In [27]:
# This performs an AND.
print_(db.countries.find({"_id": "mx", "name": "United States"}))

# This performs an OR.
print_(
    db.countries.find({"$or": [{"_id": "mx"}, {"name": "United States"}]}, {"_id": 1})
)

[]
[{'_id': 'us'}, {'_id': 'mx'}]


## References

In [28]:
object_id_for_Pun = ObjectId(str(db.towns.find_one({"name": "Punxsutawney"})["_id"]))
print("object_id_for_Pun=", object_id_for_Pun)

object_id_for_Pun= 63fb619796c3a56b758fd459


In [29]:
# Mongo is not built to perform joins.
# It is useful to have documents reference each other.
db.towns.update_one(
    {"_id": object_id_for_Pun},
    {"$set": {"country": {"$ref": "countries", "$id": "us"}}},
)

print_(db.towns.find_one({"_id": object_id_for_Pun}))

{'_id': ObjectId('63fb619796c3a56b758fd459'),
 'country': DBRef('countries', 'us'),
 'famousFor': ['Punxsutawney Phil'],
 'lastCensus': '2016-01-31',
 'mayor': {'name': 'Richard Alexander'},
 'name': 'Punxsutawney',
 'population': 6200}


In [30]:
var = db.towns.find_one({"_id": object_id_for_Pun})
print("var=", var)
print('var["country"]=', var["country"])
# Dereference.
print(var["country"].id)

var= {'_id': ObjectId('63fb619796c3a56b758fd459'), 'name': 'Punxsutawney', 'population': 6200, 'lastCensus': '2016-01-31', 'famousFor': ['Punxsutawney Phil'], 'mayor': {'name': 'Richard Alexander'}, 'country': DBRef('countries', 'us')}
var["country"]= DBRef('countries', 'us')
us


## Delete

In [31]:
# Find all contries where the bacon is not tasty.
bad_bacon = {
    "exports.foods": {
        "$elemMatch": {
            "name": "bacon",
            "tasty": False,
        }
    }
}
print_(db.countries.find(bad_bacon))

[{'_id': 'ca',
  'exports': {'foods': [{'name': 'bacon', 'tasty': False},
                        {'name': 'syrup', 'tasty': True}]},
  'name': 'Canada'}]


In [32]:
print_(db.countries.find())

[{'_id': 'us',
  'exports': {'foods': [{'name': 'bacon', 'tasty': True}, {'name': 'burgers'}]},
  'name': 'United States'},
 {'_id': 'ca',
  'exports': {'foods': [{'name': 'bacon', 'tasty': False},
                        {'name': 'syrup', 'tasty': True}]},
  'name': 'Canada'},
 {'_id': 'mx',
  'exports': {'foods': [{'condiment': True, 'name': 'salsa', 'tasty': True}]},
  'name': 'Mexico'}]


In [33]:
print("count=", db.countries.count_documents({}))
db.countries.delete_many(bad_bacon)
print("count=", db.countries.count_documents({}))

count= 3
count= 2


## Query with code

# Indexing

In [34]:
import random

random.seed(1)


def populatePhones(area, start, stop):
    for i in range(start, stop):
        country = 1 + random.randint(1, 8)
        num = int(country * 1e10 + area * 1e7 + i)
        # +4 800-5550000
        full_number = "+%s %s-%s" % (country, area, i)
        # print(num, full_number)
        # assert 0
        db.phones.insert_one(
            {
                "_id": num,
                "components": {
                    "country": country,
                    "area": area,
                    "number": i,
                },
                "display": full_number,
            }
        )


# Generate 100,000 phone numbers (it may take a while), between 1-800-555-0000 and 1-800-565-0000.
db.phones.drop()
populatePhones(800, 5550000, 5650000)

In [35]:
print(db.phones.count_documents({}))

100000


In [36]:
print_(db.phones.find().limit(2))
print_(db.phones.find().limit(2))

[{'_id': 48005550000,
  'components': {'area': 800, 'country': 4, 'number': 5550000},
  'display': '+4 800-5550000'},
 {'_id': 38005550001,
  'components': {'area': 800, 'country': 3, 'number': 5550001},
  'display': '+3 800-5550001'}]
[{'_id': 48005550000,
  'components': {'area': 800, 'country': 4, 'number': 5550000},
  'display': '+4 800-5550000'},
 {'_id': 38005550001,
  'components': {'area': 800, 'country': 3, 'number': 5550001},
  'display': '+3 800-5550001'}]


In [37]:
# Print information about the indices.
for collection in db.list_collection_names():
    print("# collection=", collection)
    print_(db[collection].index_information())

# collection= phones
{'_id_': {'key': [('_id', 1)], 'v': 2}}
# collection= towns
{'_id_': {'key': [('_id', 1)], 'v': 2}}
# collection= countries
{'_id_': {'key': [('_id', 1)], 'v': 2}}


In [38]:
print_(db.phones.find_one({"display": "+4 800-5550000"}))

{'_id': 48005550000,
 'components': {'area': 800, 'country': 4, 'number': 5550000},
 'display': '+4 800-5550000'}


In [39]:
# db.phones.find({"display": "+4 800-5550000"}).explain()
db.phones.find({"display": "+4 800-5550000"}).explain()["executionStats"][
    "executionTimeMillis"
]

65

In [40]:
# Create an indesx on `display`.
db.phones.create_index([("display", pymongo.ASCENDING)], unique=True, dropDups=True)

print_(db["phones"].index_information())

{'_id_': {'key': [('_id', 1)], 'v': 2},
 'display_1': {'key': [('display', 1)], 'unique': True, 'v': 2}}


In [41]:
# Show that the query now it's very fast.
print_(
    db.phones.find({"display": "+4 800-5550000"}).explain()["executionStats"][
        "executionTimeMillis"
    ]
)

2


# Aggregated queries.

In [42]:
db.phones.count_documents({"components.number": {"$gt": 5599999}})

50000

In [43]:
db.phones.distinct('components.number', {"components.number": {"$gt": 5599999}})

[5600000,
 5600001,
 5600002,
 5600003,
 5600004,
 5600005,
 5600006,
 5600007,
 5600008,
 5600009,
 5600010,
 5600011,
 5600012,
 5600013,
 5600014,
 5600015,
 5600016,
 5600017,
 5600018,
 5600019,
 5600020,
 5600021,
 5600022,
 5600023,
 5600024,
 5600025,
 5600026,
 5600027,
 5600028,
 5600029,
 5600030,
 5600031,
 5600032,
 5600033,
 5600034,
 5600035,
 5600036,
 5600037,
 5600038,
 5600039,
 5600040,
 5600041,
 5600042,
 5600043,
 5600044,
 5600045,
 5600046,
 5600047,
 5600048,
 5600049,
 5600050,
 5600051,
 5600052,
 5600053,
 5600054,
 5600055,
 5600056,
 5600057,
 5600058,
 5600059,
 5600060,
 5600061,
 5600062,
 5600063,
 5600064,
 5600065,
 5600066,
 5600067,
 5600068,
 5600069,
 5600070,
 5600071,
 5600072,
 5600073,
 5600074,
 5600075,
 5600076,
 5600077,
 5600078,
 5600079,
 5600080,
 5600081,
 5600082,
 5600083,
 5600084,
 5600085,
 5600086,
 5600087,
 5600088,
 5600089,
 5600090,
 5600091,
 5600092,
 5600093,
 5600094,
 5600095,
 5600096,
 5600097,
 5600098,
 5600099,
