In [1]:
from pymongo import MongoClient

In [2]:
#Quiz: Kicking the tires on MongoDB

"""
Your task is to sucessfully run the exercise to see how pymongo works
and how easy it is to start using it.
You don't actually have to change anything in this exercise,
but you can change the city name in the add_city function if you like.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB (see Instructor comments for link to installation information)
and uncomment the get_db function.
"""

def add_city(db):
    # Changes to this function will be reflected in the output. 
    # All other functions are for local use only.
    # Try changing the name of the city to be inserted
    db.cities.insert_one({"name" : "Bergen"})
    
def get_city(db):
    return db.cities.find_one()

def get_db():
    # For local use
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # 'examples' here is the database name. It will be created if it does not exist.
    db = client.examples
    return db

if __name__ == "__main__":
    # For local use
    db = get_db() # uncomment this line if you want to run this locally
    add_city(db)
    print(get_city(db))

{'name': 'Bergen', '_id': ObjectId('58206f494dd9620a3dc336ee')}


In [3]:
db.cities

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'examples'), 'cities')

In [4]:
#Quiz: Finding Porsche

#!/usr/bin/env python
"""
Your task is to complete the 'porsche_query' function and in particular the query
to find all autos where the manufacturer field matches "Porsche".
Please modify only 'porsche_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB and download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials at
the following link:
https://www.udacity.com/wiki/ud032
"""

def porsche_query():
    # Please fill in the query to find all autos manuafactured by Porsche.
    query = {"manufacturer" : "Porsche"}
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db(db_name):
    # For local use
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find_porsche(db, query):
    # For local use
    return db.autos.find(query)


if __name__ == "__main__":
    # For local use
    db = get_db('examples')
    query = porsche_query()
    results = find_porsche(db, query)

    print("Printing first 3 results\n")
    import pprint
    for car in results[:3]:
        pprint.pprint(car)

Printing first 3 results

{'_id': ObjectId('58207e32d27f3a7c119c7f02'),
 'assembly': ['Finland', 'Germany', 'Stuttgart', 'Uusikaupunki'],
 'bodyStyle': 'roadster',
 'class': 'sports car',
 'layout': 'rear mid-engine rear-wheel-drive layout',
 'manufacturer': 'Porsche',
 'modelYears': [],
 'name': 'Porsche Boxster',
 'productionYears': []}


In [5]:
db.collection_names()

['autos', 'cities']

In [6]:
db.example_car.drop()
db.cars.drop()

In [7]:
client = MongoClient('localhost:27017')
db = client['examples']
    
db.examples.cars.find_one('manufacturer')

In [8]:
db.examples.find({"manufacturer" : "Porsche"})

<pymongo.cursor.Cursor at 0x105706470>

In [9]:
for f in db.cars.find({"manufacturer" : "Porsche"}):
    print(f)

In [10]:
db.cars.find()

<pymongo.cursor.Cursor at 0x10570e128>

In [11]:
#Quiz: Inserting multiple documents

#!/usr/bin/env python
""" 
Add a single line of code to the insert_autos function that will insert the
automobile data into the 'autos' collection. The data variable that is
returned from the process_file function is a list of dictionaries, as in the
example in the previous video.
"""

# from autos import process_file


def insert_autos(infile, db):
    data = process_file(infile)
    # Add your code here. Insert the data in one command.
    db.autos.insert_many(data)
  
if __name__ == "__main__":
    # Code here is for local use on your own computer.
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('data/autos-small.csv', db)
    print(db.autos.find_one())

{'productionYears': [], 'assembly': ['Finland', 'Germany', 'Stuttgart', 'Uusikaupunki'], 'bodyStyle': 'roadster', 'class': 'sports car', '_id': ObjectId('58207e32d27f3a7c119c7f02'), 'layout': 'rear mid-engine rear-wheel-drive layout', 'name': 'Porsche Boxster', 'modelYears': [], 'manufacturer': 'Porsche'}


In [56]:
#importing data needed for next quiz

import datetime

db.cities.insert_one({
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2001, 7, 1, 0, 0),
 'governmentType': ['Council\u2013manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', u'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']
})

<pymongo.results.InsertOneResult at 0x1057de048>

In [57]:
#Quiz: Range queries

#!/usr/bin/env python
"""
Your task is to write a query that will return all cities
that are founded in 21st century.
Please modify only 'range_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""

from datetime import datetime
    
def range_query():
    # Modify the below line with your query.
    # You can use datetime(year, month, day) to specify date in the query
    query = {"foundingDate" : {"$gte" : datetime(2001,1,1), 
                               "$lte" : datetime(2100,12,31)}} 
    return query

# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db

if __name__ == "__main__":
    # For local use
    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    print("Found cities:", cities.count())
    import pprint
    pprint.pprint(cities[0])


Found cities: 1
{'_id': ObjectId('5820f8574dd9620c1de02036'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2001, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}


In [46]:
db.cities.find()[2]

{'_id': ObjectId('5820f2524dd9620c1de02032'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2000, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}

In [63]:
#Quiz: Using $in operator

#!/usr/bin/env python
"""
Your task is to write a query that will return all cars manufactured by
"Ford Motor Company" that are assembled in Germany, United Kingdom, or Japan.
Please modify only 'in_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""


def in_query():
    # Modify the below line with your query; try to use the $in operator.
    query = {"manufacturer" : "Ford Motor Company", 
             "assembly" : {"$in" : ["Germany", "United Kingdom", "Japan"]}}
    
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":

    db = get_db()
    query = in_query()
    autos = db.autos.find(query, {"name":1, "manufacturer":1, "assembly": 1, "_id":0})

    print("Found autos:", autos.count())
    import pprint
    for a in autos:
        pprint.pprint(a)


Found autos: 0


In [96]:
db.autos.insert_one({
"_id" : "52fd438b5a98d65507d288cf",
"engine" : "Crawler-transporter__1",
"dimensions" : {
"width" : 34.7472,
"length" : 39.9288,
"weight" : 2721000
},
"transmission" : "16 traction motors powered by four  generators",
"modelYears" : [ ],
"productionYears" : [ ],
"manufacturer" : "Marion Power Shovel Company",
"name" : "Crawler-transporter"
})

DuplicateKeyError: E11000 duplicate key error collection: examples.autos index: _id_ dup key: { : "52fd438b5a98d65507d288cf" }

In [113]:
#Quiz: Dot notation

#!/usr/bin/env python
"""
Your task is to write a query that will return all cars with width dimension
greater than 2.5. Please modify only the 'dot_query' function, as only that
will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine, you will need to install
MongoDB, download and insert the dataset. For instructions related to MongoDB
setup and datasets, please see the Course Materials.
"""


def dot_query():
    # Edit the line below with your query - try to use dot notation.
    # You can check out example_auto.txt for an example of the document
    # structure in the collection.
    query = {'dimensions.width' : {'$gt' : 2.5}}
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":
    db = get_db()
    query = dot_query()
    cars = db.autos.find(query)

    print("Printing first 3 results\n")
    import pprint
    for car in cars[:3]:
        pprint.pprint(car)


Printing first 3 results

{'_id': '52fd438b5a98d65507d288cf',
 'dimensions': {'length': 39.9288, 'weight': 2721000, 'width': 34.7472},
 'engine': 'Crawler-transporter__1',
 'manufacturer': 'Marion Power Shovel Company',
 'modelYears': [],
 'name': 'Crawler-transporter',
 'productionYears': [],
 'transmission': '16 traction motors powered by four  generators'}


In [112]:
query = {'dimensions.width' : {'$gt' : 2.5}}

for car in db.autos.find(query):
    print(car)

{'productionYears': [], 'transmission': '16 traction motors powered by four  generators', 'engine': 'Crawler-transporter__1', 'dimensions': {'width': 34.7472, 'length': 39.9288, 'weight': 2721000}, 'modelYears': [], 'name': 'Crawler-transporter', '_id': '52fd438b5a98d65507d288cf', 'manufacturer': 'Marion Power Shovel Company'}


In [110]:
for car in db.autos.find({},{"dimensions.width" : 1}):
    print(car)

{'_id': ObjectId('58207e32d27f3a7c119c7f02')}
{'_id': ObjectId('5820eb624dd9620c1de02019')}
{'_id': ObjectId('5820eb624dd9620c1de0201a')}
{'_id': ObjectId('5820eb624dd9620c1de0201b')}
{'_id': ObjectId('5820eb624dd9620c1de0201c'), 'dimensions': {'width': 2.0}}
{'_id': ObjectId('5820eb624dd9620c1de0201d'), 'dimensions': {'width': 1.65}}
{'_id': ObjectId('5820eb624dd9620c1de0201e'), 'dimensions': {'width': 1.55}}
{'_id': ObjectId('5820eb624dd9620c1de0201f')}
{'_id': ObjectId('5820eb624dd9620c1de02020')}
{'_id': ObjectId('5820eb624dd9620c1de02021'), 'dimensions': {}}
{'_id': ObjectId('5820eb624dd9620c1de02022')}
{'_id': ObjectId('5820eb624dd9620c1de02023')}
{'_id': ObjectId('5820eb624dd9620c1de02024')}
{'_id': ObjectId('5820eb624dd9620c1de02025'), 'dimensions': {'width': 1.1}}
{'_id': ObjectId('5820eb624dd9620c1de02026')}
{'_id': ObjectId('5820eb624dd9620c1de02027'), 'dimensions': {}}
{'_id': ObjectId('5820eb624dd9620c1de02028')}
{'_id': ObjectId('5820eb624dd9620c1de02029')}
{'_id': Object

## Problem set

In [170]:
print(re.search('_field','la_field'))

<_sre.SRE_Match object; span=(2, 8), match='_field'>


In [182]:
#Quiz: Preparing data

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it,
clean it, come up with a data model, insert it into MongoDB and then run some
queries against your database. The set contains data about Arachnid class
animals.

Your task in this exercise is to parse the file, process only the fields that
are listed in the FIELDS dictionary as keys, and return a list of dictionaries
of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parenthesis from the 'rdf-schema#label'
  field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the
  same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the
  cleanup is up to you, e.g. removing "*" prefixes etc. If there is a singular
  synonym, the value should still be formatted in a list.
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:

[ { 'label': 'Argiope',
    'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
    'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
    'name': 'Argiope',
    'synonym': ["One", "Two"],
    'classification': {
                      'family': 'Orb-weaver spider',
                      'class': 'Arachnid',
                      'phylum': 'Arthropod',
                      'order': 'Spider',
                      'kingdom': 'Animal',
                      'genus': None
                      }
  },
  { 'label': ... , }, ...
]

  * Note that the value associated with the classification key is a dictionary
    with taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'data/arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}


def process_file(filename, fields):

    process_fields = fields.keys()
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = next(reader)

        for line in reader:
            # YOUR CODE HERE
            record = dict()
            classification = dict()
            label_val = re.sub(' \(.*?\)$','',line['rdf-schema#label'])
            
            for key in fields.keys():
                if re.search('_label',key):
                    field_val = line[key].strip()
                    
                    if field_val == 'NULL':
                        classification[fields[key]] = None
                    else:
                        classification[fields[key]] = field_val
                            
                elif key == "rdf-schema#label":
                    if label_val == 'NULL':
                        record['label'] = None
                    else:
                        record['label'] = label_val
                
                elif key == 'name' and (line['name'] == 'NULL' or re.search('\W',line['name'])):
                    record['name'] = label_val
                
                elif key == 'synonym' and line['synonym'] != ('' or 'NULL'):

                    synonym_val = re.sub('\*','',line['synonym']).strip('{}').split('|')
                    record['synonym'] = list(map(str.strip,synonym_val))
               
                else:
                    field_val = line[key].strip()
                    
                    if field_val == 'NULL':
                        record[fields[key]] = None
                    else:
                        record[fields[key]] = field_val

            record['classification'] = classification  
            data.append(record)
        
    return data


def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return [v]


def test():
    data = process_file(DATAFILE, FIELDS)
    print("Your first entry:")
    pprint.pprint(data[0])
    first_entry = {
        "synonym": None, 
        "name": "Argiope", 
        "classification": {
            "kingdom": "Animal", 
            "family": "Orb-weaver spider", 
            "order": "Spider", 
            "phylum": "Arthropod", 
            "genus": None, 
            "class": "Arachnid"
        }, 
        "uri": "http://dbpedia.org/resource/Argiope_(spider)", 
        "label": "Argiope", 
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }

    assert len(data) == 76
    assert data[0] == first_entry
    assert data[17]["name"] == "Ogdenia"
    assert data[48]["label"] == "Hydrachnidiae"
    assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
    
    return data

if __name__ == "__main__":
    tmp_data = test()

Your first entry:
{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular '
                'spiders that often have a strikingly coloured abdomen. These '
                'spiders are distributed throughout the world. Most countries '
                'in tropical or temperate climates host one or more species '
                'that are similar in appearance. The etymology of the name is '
                'from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}


In [195]:
# db.arachnid.find_one()

{'_id': ObjectId('5825f1864dd9620c1de02040'),
 'classification': {'class': 'Arachnid',
  'family': 'Orb-weaver spider',
  'genus': None,
  'kingdom': 'Animal',
  'order': 'Spider',
  'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}

In [188]:
# db.arachnid.insert_many(tmp_data)

<pymongo.results.InsertManyResult at 0x106b56360>

In [None]:
#Quiz: Inserting into DB

"""
Complete the insert_data function to insert the data into MongoDB.
"""

import json

def insert_data(data, db):

    # Your code here. Insert the data into a collection 'arachnid'
    db.arachnid.insert_many()

    pass


if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open('arachnid.json') as f:
        data = json.loads(f.read())
        insert_data(data, db)
        print db.arachnid.find_one()