# Introduction to MongoDB in Python
## 1. Flexibly Structured Data

In [6]:
import os
from pymongo import MongoClient

# The connection string is read from the environment so that no credentials
# are written into the notebook itself.
# os.environ.get returns None when the variable is unset; presumably MongoClient
# then falls back to its default host -- TODO confirm that is intended.
MONGODB_URI = os.environ.get('MONGODB_URI')

client = MongoClient(MONGODB_URI)
# Handle to the "nobel" database, used by the cells below.
db = client.nobel

In [7]:
# Count every document in each collection: an empty query document matches all
# documents. The query is named `match_all` rather than `filter` so that the
# builtin filter() is not shadowed for the remainder of the kernel session.
match_all = {}
print(db.prizes.count_documents(match_all))
print(db.laureates.count_documents(match_all))

652
955


In [8]:
# Names of the databases visible through this client connection.
db_names = client.list_database_names()
# Bare last expression so the notebook displays the list.
db_names

['admin', 'config', 'local', 'nobel']

In [9]:
# Names of the collections inside the "nobel" database; `db` is the handle to
# that same database created in the connection cell.
nobel_coll_names = db.list_collection_names()
nobel_coll_names

['prizes', 'laureates']

In [10]:
# Fetch one sample document from each collection to inspect its structure.
prize = db["prizes"].find_one()
laureate = db["laureates"].find_one()

# Print both samples, then the Python type a fetched document is returned as.
for sample in (prize, laureate, type(laureate)):
    print(sample)

{'_id': ObjectId('5ffa97fa9d6167d9d146dd5d'), 'year': '2020', 'category': 'chemistry', 'laureates': [{'id': '991', 'firstname': 'Emmanuelle', 'surname': 'Charpentier', 'motivation': '"for the development of a method for genome editing"', 'share': '2'}, {'id': '992', 'firstname': 'Jennifer A.', 'surname': 'Doudna', 'motivation': '"for the development of a method for genome editing"', 'share': '2'}]}
{'_id': ObjectId('5ffa982f9d6167d9d146dfe9'), 'id': '1', 'firstname': 'Wilhelm Conrad', 'surname': 'Röntgen', 'born': '1845-03-27', 'died': '1923-02-10', 'bornCountry': 'Prussia (now Germany)', 'bornCountryCode': 'DE', 'bornCity': 'Lennep (now Remscheid)', 'diedCountry': 'Germany', 'diedCountryCode': 'DE', 'diedCity': 'Munich', 'gender': 'male', 'prizes': [{'year': '1901', 'category': 'physics', 'share': '1', 'motivation': '"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him"', 'affiliations': [{'name': 'Munich Un

In [11]:
# Top-level field names of each sample document.
prize_fields = [*prize.keys()]
laureate_fields = [*laureate.keys()]

# One list per line, prizes first.
print(prize_fields, laureate_fields, sep="\n")

['_id', 'year', 'category', 'laureates']
['_id', 'id', 'firstname', 'surname', 'born', 'died', 'bornCountry', 'bornCountryCode', 'bornCity', 'diedCountry', 'diedCountryCode', 'diedCity', 'gender', 'prizes']


## 2. Working with Distinct Values and Sets

In [13]:
# Unique values of the top-level "category" field across prize documents.
prize_categories = db.prizes.distinct("category")
set(prize_categories)

{'chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics'}

In [14]:
# The same categories, read this time from the "prizes" entries embedded in
# each laureate document (dot notation reaches into the embedded field).
laureate_prize_categories = db["laureates"].distinct("prizes.category")
set(laureate_prize_categories)

{'chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics'}

In [17]:
# Check that both collections report exactly the same set of prize categories.
categories_from_prizes = set(db.prizes.distinct("category"))
categories_from_laureates = set(db.laureates.distinct("prizes.category"))
categories_from_prizes == categories_from_laureates


True

In [21]:
# Countries that appear as some laureate's "diedCountry" but never as any
# laureate's "bornCountry".
died_in = set(db.laureates.distinct("diedCountry"))
born_in = set(db.laureates.distinct("bornCountry"))
countries = died_in.difference(born_in)
countries

{'Barbados',
 'East Germany (now Germany)',
 'Gabon',
 'Greece',
 'Israel',
 'Jamaica',
 'Northern Rhodesia (now Zambia)',
 'Philippines',
 'Puerto Rico',
 'Singapore',
 'Tunisia',
 'Yugoslavia (now Serbia)'}

In [22]:
# Display one laureate document to see how affiliations are nested inside
# each entry of its "prizes" list.
db["laureates"].find_one()

{'_id': ObjectId('5ffa982f9d6167d9d146dfe9'),
 'id': '1',
 'firstname': 'Wilhelm Conrad',
 'surname': 'Röntgen',
 'born': '1845-03-27',
 'died': '1923-02-10',
 'bornCountry': 'Prussia (now Germany)',
 'bornCountryCode': 'DE',
 'bornCity': 'Lennep (now Remscheid)',
 'diedCountry': 'Germany',
 'diedCountryCode': 'DE',
 'diedCity': 'Munich',
 'gender': 'male',
 'prizes': [{'year': '1901',
   'category': 'physics',
   'share': '1',
   'motivation': '"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him"',
   'affiliations': [{'name': 'Munich University',
     'city': 'Munich',
     'country': 'Germany'}]}]}

In [24]:
# Unique affiliation countries; the dotted path reaches through the nested
# prizes -> affiliations structure of each laureate document.
affiliation_countries = db.laureates.distinct("prizes.affiliations.country")
set(affiliation_countries)

{'Argentina',
 'Australia',
 'Austria',
 'Belgium',
 'Canada',
 'China',
 'Czechoslovakia (now Czech Republic)',
 'Denmark',
 'Finland',
 'France',
 'Germany',
 'Germany (now France)',
 'Hungary',
 'India',
 'Ireland',
 'Israel',
 'Italy',
 'Japan',
 'Norway',
 'Portugal',
 'Russia',
 'Spain',
 'Sweden',
 'Switzerland',
 'Tunisia',
 'USA',
 'USSR (now Russia)',
 'United Kingdom',
 'the Netherlands'}

In [25]:
# Show a sample laureate document again as a reference for the field names
# used by the filtered queries that follow.
db["laureates"].find_one()

{'_id': ObjectId('5ffa982f9d6167d9d146dfe9'),
 'id': '1',
 'firstname': 'Wilhelm Conrad',
 'surname': 'Röntgen',
 'born': '1845-03-27',
 'died': '1923-02-10',
 'bornCountry': 'Prussia (now Germany)',
 'bornCountryCode': 'DE',
 'bornCity': 'Lennep (now Remscheid)',
 'diedCountry': 'Germany',
 'diedCountryCode': 'DE',
 'diedCity': 'Munich',
 'gender': 'male',
 'prizes': [{'year': '1901',
   'category': 'physics',
   'share': '1',
   'motivation': '"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him"',
   'affiliations': [{'name': 'Munich University',
     'city': 'Munich',
     'country': 'Germany'}]}]}

In [28]:
# In which countries have USA-born laureates had affiliations for their prizes?
# The second argument restricts the distinct() to documents matching the filter.
usa_born = {"bornCountry": "USA"}
db.laureates.distinct("prizes.affiliations.country", usa_born)

['Australia', 'Denmark', 'USA', 'United Kingdom']

In [41]:
# Confirm via an assertion that "literature" is the only prize category with no prizes shared by three or more laureates.
# "laureates.2" addresses the third element of the "laureates" array, so it
# exists only for prizes that have at least three laureates.
criteria = {"laureates.2": {"$exists": True}}
triple_play_categories = set(db.prizes.distinct("category", criteria))
all_categories = set(db.prizes.distinct("category"))
assert {'literature'} == all_categories - triple_play_categories


In [44]:
# What is the approximate ratio of the number of laureates who won an unshared ({"share": "1"}) prize in physics after World War II ({"year": {"$gte": "1945"}}) to the number of laureates who won a shared prize in physics after World War II?
# Both counts use the same category/year conditions and differ only in "share".
# $elemMatch requires a single prize entry to satisfy all conditions at once.
# Years are stored as strings in the sample documents, so the comparison is on strings.
postwar_physics = {"category": "physics", "year": {"$gte": "1945"}}
a = db.laureates.count_documents(
    {"prizes": {"$elemMatch": {"share": "1", **postwar_physics}}}
)
b = db.laureates.count_documents(
    {"prizes": {"$elemMatch": {"share": {"$ne": "1"}, **postwar_physics}}}
)
a/b

0.12080536912751678

In [51]:
# What is this ratio for prize categories other than physics, chemistry, and medicine?
# Conditions common to both filters; only the "share" condition differs.
other_postwar = {"category": {"$nin": ["physics", "chemistry", "medicine"]}, "year": {"$gte": "1945"}}
unshared = {"prizes": {"$elemMatch": {"share": "1", **other_postwar}}}
shared = {"prizes": {"$elemMatch": {"share": {"$ne": "1"}, **other_postwar}}}
count_unshared, count_shared = [
    db.laureates.count_documents(query) for query in (unshared, shared)
]
ratio = count_unshared / count_shared
ratio

1.348623853211009

In [52]:
# Among laureates recorded with "gender": "org" (presumably organizations),
# what fraction of the matches have a prize year in or after 1945?
# NOTE(review): a laureate with prizes on both sides of 1945 matches both
# filters and is therefore counted twice in the denominator -- confirm this
# is acceptable for the intended ratio.
org_laureates = {"gender": "org"}
before = {**org_laureates, "prizes.year": {"$lt": "1945"}}
in_or_after = {**org_laureates, "prizes.year": {"$gte": "1945"}}
count_before = db.laureates.count_documents(before)
count_in_or_after = db.laureates.count_documents(in_or_after)
total_matches = count_before + count_in_or_after
ratio = count_in_or_after / total_matches
ratio

0.8461538461538461

In [53]:
# How many laureates in total have a first name beginning with "G" and a surname beginning with "S"?
# NOTE(review): Regex is also used by the cells below; consider moving this
# import to the imports cell at the top of the notebook.
from bson.regex import Regex
# "^" anchors each pattern to the start of the field value.
initials_query = {"firstname": Regex("^G"), "surname": Regex("^S")}
db.laureates.count_documents(initials_query)

10

In [54]:
# Use a regular expression object to filter for laureates with "Germany" in their "bornCountry" value.
# The pattern is unanchored, so it matches "Germany" anywhere in the value.
criteria = {"bornCountry": Regex("Germany")}
matching_born_countries = db.laureates.distinct("bornCountry", criteria)
set(matching_born_countries)

{'Bavaria (now Germany)',
 'East Friesland (now Germany)',
 'Germany',
 'Germany (now France)',
 'Germany (now Poland)',
 'Germany (now Russia)',
 'Hesse-Kassel (now Germany)',
 'Mecklenburg (now Germany)',
 'Prussia (now Germany)',
 'Schleswig (now Germany)',
 'West Germany (now Germany)',
 'Württemberg (now Germany)'}

In [55]:
# Use a regular expression object to filter for laureates with a "bornCountry" value starting with "Germany".
# The leading "^" anchors the match to the start of the value.
criteria = {"bornCountry": Regex("^Germany")}
matching_born_countries = db.laureates.distinct("bornCountry", criteria)
set(matching_born_countries)

{'Germany',
 'Germany (now France)',
 'Germany (now Poland)',
 'Germany (now Russia)'}

In [57]:
# Use a regular expression object to filter for laureates born in what was at the time Germany but is now another country.
# Raw string: the backslash must reach the regex engine unchanged so that "\("
# matches a literal parenthesis. In a normal string literal "\(" is an invalid
# escape sequence, which Python warns about.
criteria = {"bornCountry": Regex(r"^Germany \(now")}
set(db.laureates.distinct("bornCountry", criteria))

{'Germany (now France)', 'Germany (now Poland)', 'Germany (now Russia)'}

In [58]:
# Use a regular expression object to filter for laureates born in what is now Germany but at the time was another country.
# Raw string: the backslash must reach the regex engine unchanged so that "\)"
# matches a literal parenthesis. In a normal string literal "\)" is an invalid
# escape sequence, which Python warns about. "$" anchors the match to the end.
criteria = {"bornCountry": Regex(r"now Germany\)$")}
set(db.laureates.distinct("bornCountry", criteria))

{'Bavaria (now Germany)',
 'East Friesland (now Germany)',
 'Hesse-Kassel (now Germany)',
 'Mecklenburg (now Germany)',
 'Prussia (now Germany)',
 'Schleswig (now Germany)',
 'West Germany (now Germany)',
 'Württemberg (now Germany)'}

In [60]:
# We can filter on "transistor" as a substring of a laureate's "prizes.motivation" field value to find these laureates.
criteria = {"prizes.motivation": Regex("transistor")}
first, last = "firstname", "surname"
# Collect a (first name, surname) pair for each matching laureate document.
name_pairs = []
for doc in db.laureates.find(criteria):
    name_pairs.append((doc[first], doc[last]))
print(name_pairs)

[('William B.', 'Shockley'), ('John', 'Bardeen'), ('Walter H.', 'Brattain')]
