In [4]:
from pymongo import MongoClient

In [179]:
#Quiz: Kicking the tires on MongoDB

"""
Your task is to sucessfully run the exercise to see how pymongo works
and how easy it is to start using it.
You don't actually have to change anything in this exercise,
but you can change the city name in the add_city function if you like.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB (see Instructor comments for link to installation information)
and uncomment the get_db function.
"""

def add_city(db):
    # Changes to this function will be reflected in the output. 
    # All other functions are for local use only.
    # Try changing the name of the city to be inserted
    db.cities.insert_one({"name" : "Bergen"})
    
def get_city(db):
    return db.cities.find_one()

def get_db():
    # For local use
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # 'examples' here is the database name. It will be created if it does not exist.
    db = client.examples
    return db

if __name__ == "__main__":
    # For local use
    db = get_db() # uncomment this line if you want to run this locally
    add_city(db)
    print(get_city(db))

{'name': 'Bergen', '_id': ObjectId('58206f494dd9620a3dc336ee')}


In [180]:
db.cities

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'examples'), 'cities')

In [73]:
db.autos.find_one({"manufacturer" : "Porsche"})

{'_id': ObjectId('58207e32d27f3a7c119c7f02'),
 'assembly': ['Finland', 'Germany', 'Stuttgart', 'Uusikaupunki'],
 'bodyStyle': 'roadster',
 'class': 'sports car',
 'layout': 'rear mid-engine rear-wheel-drive layout',
 'manufacturer': 'Porsche',
 'modelYears': [],
 'name': 'Porsche Boxster',
 'productionYears': []}

In [4]:
#Quiz: Finding Porsche

#!/usr/bin/env python
"""
Your task is to complete the 'porsche_query' function and in particular the query
to find all autos where the manufacturer field matches "Porsche".
Please modify only 'porsche_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB and download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials at
the following link:
https://www.udacity.com/wiki/ud032
"""

def porsche_query():
    # Please fill in the query to find all autos manuafactured by Porsche.
    query = {"manufacturer" : "Porsche"}
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db(db_name):
    # For local use
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find_porsche(db, query):
    # For local use
    return db.autos.find(query)


if __name__ == "__main__":
    # For local use
    db = get_db('examples')
    query = porsche_query()
    results = find_porsche(db, query)

    print("Printing first 3 results\n")
    import pprint
    for car in results[:3]:
        pprint.pprint(car)

Printing first 3 results

{'_id': ObjectId('58207e32d27f3a7c119c7f02'),
 'assembly': ['Finland', 'Germany', 'Stuttgart', 'Uusikaupunki'],
 'bodyStyle': 'roadster',
 'class': 'sports car',
 'layout': 'rear mid-engine rear-wheel-drive layout',
 'manufacturer': 'Porsche',
 'modelYears': [],
 'name': 'Porsche Boxster',
 'productionYears': []}


In [7]:
db.collection_names()

['examples', 'autos', 'cities', 'arachnid']

In [6]:
db.example_car.drop()
db.cars.drop()

In [7]:
client = MongoClient('localhost:27017')
db = client['examples']
    
db.examples.cars.find_one('manufacturer')

In [8]:
db.examples.find({"manufacturer" : "Porsche"})

<pymongo.cursor.Cursor at 0x105706470>

In [9]:
for f in db.cars.find({"manufacturer" : "Porsche"}):
    print(f)

In [10]:
db.cars.find()

<pymongo.cursor.Cursor at 0x10570e128>

In [11]:
#Quiz: Inserting multiple documents

#!/usr/bin/env python
""" 
Add a single line of code to the insert_autos function that will insert the
automobile data into the 'autos' collection. The data variable that is
returned from the process_file function is a list of dictionaries, as in the
example in the previous video.
"""

# from autos import process_file


def insert_autos(infile, db):
    data = process_file(infile)
    # Add your code here. Insert the data in one command.
    db.autos.insert_many(data)
  
if __name__ == "__main__":
    # Code here is for local use on your own computer.
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('data/autos-small.csv', db)
    print(db.autos.find_one())

{'productionYears': [], 'assembly': ['Finland', 'Germany', 'Stuttgart', 'Uusikaupunki'], 'bodyStyle': 'roadster', 'class': 'sports car', '_id': ObjectId('58207e32d27f3a7c119c7f02'), 'layout': 'rear mid-engine rear-wheel-drive layout', 'name': 'Porsche Boxster', 'modelYears': [], 'manufacturer': 'Porsche'}


In [181]:
#importing data needed for next quiz

import datetime

db.cities.insert_one({
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2001, 7, 1, 0, 0),
 'governmentType': ['Council\u2013manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', u'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']
})

<pymongo.results.InsertOneResult at 0x1068ebaf8>

In [215]:
#Quiz: Range queries

#!/usr/bin/env python
"""
Your task is to write a query that will return all cities
that are founded in 21st century.
Please modify only 'range_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""

from datetime import datetime
    
def range_query():
    # Modify the below line with your query.
    # You can use datetime(year, month, day) to specify date in the query
    query = {"foundingDate" : {"$gte" : datetime(2001,1,1), 
                               "$lte" : datetime(2100,12,31)}} 
    return query

# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db

if __name__ == "__main__":
    # For local use
    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    print("Found cities:", cities.count())
    import pprint
    pprint.pprint(cities[0])


Found cities: 2
{'_id': ObjectId('5820f8574dd9620c1de02036'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2001, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}


In [216]:
db.cities.find()[2]

{'_id': ObjectId('5820f2524dd9620c1de02032'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2000, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}

In [184]:
db.cities.find_one({'country': 'United States'})

{'_id': ObjectId('5820f2524dd9620c1de02032'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2000, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}

In [63]:
#Quiz: Using $in operator

#!/usr/bin/env python
"""
Your task is to write a query that will return all cars manufactured by
"Ford Motor Company" that are assembled in Germany, United Kingdom, or Japan.
Please modify only 'in_query' function, as only that will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine,
you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.
"""


def in_query():
    # Modify the below line with your query; try to use the $in operator.
    query = {"manufacturer" : "Ford Motor Company", 
             "assembly" : {"$in" : ["Germany", "United Kingdom", "Japan"]}}
    
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":

    db = get_db()
    query = in_query()
    autos = db.autos.find(query, {"name":1, "manufacturer":1, "assembly": 1, "_id":0})

    print("Found autos:", autos.count())
    import pprint
    for a in autos:
        pprint.pprint(a)


Found autos: 0


In [96]:
db.autos.insert_one({
"_id" : "52fd438b5a98d65507d288cf",
"engine" : "Crawler-transporter__1",
"dimensions" : {
"width" : 34.7472,
"length" : 39.9288,
"weight" : 2721000
},
"transmission" : "16 traction motors powered by four  generators",
"modelYears" : [ ],
"productionYears" : [ ],
"manufacturer" : "Marion Power Shovel Company",
"name" : "Crawler-transporter"
})

DuplicateKeyError: E11000 duplicate key error collection: examples.autos index: _id_ dup key: { : "52fd438b5a98d65507d288cf" }

In [113]:
#Quiz: Dot notation

#!/usr/bin/env python
"""
Your task is to write a query that will return all cars with width dimension
greater than 2.5. Please modify only the 'dot_query' function, as only that
will be taken into account.

Your code will be run against a MongoDB instance that we have provided.
If you want to run this code locally on your machine, you will need to install
MongoDB, download and insert the dataset. For instructions related to MongoDB
setup and datasets, please see the Course Materials.
"""


def dot_query():
    # Edit the line below with your query - try to use dot notation.
    # You can check out example_auto.txt for an example of the document
    # structure in the collection.
    query = {'dimensions.width' : {'$gt' : 2.5}}
    return query


# Do not edit code below this line in the online code editor.
# Code here is for local use on your own computer.
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":
    db = get_db()
    query = dot_query()
    cars = db.autos.find(query)

    print("Printing first 3 results\n")
    import pprint
    for car in cars[:3]:
        pprint.pprint(car)


Printing first 3 results

{'_id': '52fd438b5a98d65507d288cf',
 'dimensions': {'length': 39.9288, 'weight': 2721000, 'width': 34.7472},
 'engine': 'Crawler-transporter__1',
 'manufacturer': 'Marion Power Shovel Company',
 'modelYears': [],
 'name': 'Crawler-transporter',
 'productionYears': [],
 'transmission': '16 traction motors powered by four  generators'}


In [112]:
query = {'dimensions.width' : {'$gt' : 2.5}}

for car in db.autos.find(query):
    print(car)

{'productionYears': [], 'transmission': '16 traction motors powered by four  generators', 'engine': 'Crawler-transporter__1', 'dimensions': {'width': 34.7472, 'length': 39.9288, 'weight': 2721000}, 'modelYears': [], 'name': 'Crawler-transporter', '_id': '52fd438b5a98d65507d288cf', 'manufacturer': 'Marion Power Shovel Company'}


In [110]:
for car in db.autos.find({},{"dimensions.width" : 1}):
    print(car)

{'_id': ObjectId('58207e32d27f3a7c119c7f02')}
{'_id': ObjectId('5820eb624dd9620c1de02019')}
{'_id': ObjectId('5820eb624dd9620c1de0201a')}
{'_id': ObjectId('5820eb624dd9620c1de0201b')}
{'_id': ObjectId('5820eb624dd9620c1de0201c'), 'dimensions': {'width': 2.0}}
{'_id': ObjectId('5820eb624dd9620c1de0201d'), 'dimensions': {'width': 1.65}}
{'_id': ObjectId('5820eb624dd9620c1de0201e'), 'dimensions': {'width': 1.55}}
{'_id': ObjectId('5820eb624dd9620c1de0201f')}
{'_id': ObjectId('5820eb624dd9620c1de02020')}
{'_id': ObjectId('5820eb624dd9620c1de02021'), 'dimensions': {}}
{'_id': ObjectId('5820eb624dd9620c1de02022')}
{'_id': ObjectId('5820eb624dd9620c1de02023')}
{'_id': ObjectId('5820eb624dd9620c1de02024')}
{'_id': ObjectId('5820eb624dd9620c1de02025'), 'dimensions': {'width': 1.1}}
{'_id': ObjectId('5820eb624dd9620c1de02026')}
{'_id': ObjectId('5820eb624dd9620c1de02027'), 'dimensions': {}}
{'_id': ObjectId('5820eb624dd9620c1de02028')}
{'_id': ObjectId('5820eb624dd9620c1de02029')}
{'_id': Object

## Problem set

In [170]:
print(re.search('_field','la_field'))

<_sre.SRE_Match object; span=(2, 8), match='_field'>


In [48]:
#Quiz: Preparing data

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with another type of infobox data, audit it,
clean it, come up with a data model, insert it into MongoDB and then run some
queries against your database. The set contains data about Arachnid class
animals.

Your task in this exercise is to parse the file, process only the fields that
are listed in the FIELDS dictionary as keys, and return a list of dictionaries
of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out redundant description in parenthesis from the 'rdf-schema#label'
  field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the
  same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the
  cleanup is up to you, e.g. removing "*" prefixes etc. If there is a singular
  synonym, the value should still be formatted in a list.
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:

[ { 'label': 'Argiope',
    'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
    'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
    'name': 'Argiope',
    'synonym': ["One", "Two"],
    'classification': {
                      'family': 'Orb-weaver spider',
                      'class': 'Arachnid',
                      'phylum': 'Arthropod',
                      'order': 'Spider',
                      'kingdom': 'Animal',
                      'genus': None
                      }
  },
  { 'label': ... , }, ...
]

  * Note that the value associated with the classification key is a dictionary
    with taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

DATAFILE = 'data/arachnid.csv'
FIELDS ={'rdf-schema#label': 'label',
         'URI': 'uri',
         'rdf-schema#comment': 'description',
         'synonym': 'synonym',
         'name': 'name',
         'family_label': 'family',
         'class_label': 'class',
         'phylum_label': 'phylum',
         'order_label': 'order',
         'kingdom_label': 'kingdom',
         'genus_label': 'genus'}


def process_file(filename, fields):

    process_fields = fields.keys()
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for i in range(3):
            l = next(reader)

        for line in reader:
            # YOUR CODE HERE
            record = dict()
            classification = dict()
            label_val = re.sub(' \(.*?\)$','',line['rdf-schema#label'])
            
            for key in fields.keys():
                if re.search('_label',key):
                    field_val = line[key].strip()
                    
                    if field_val == 'NULL':
                        classification[fields[key]] = None
                    else:
                        classification[fields[key]] = field_val
                            
                elif key == "rdf-schema#label":
                    if label_val == 'NULL':
                        record['label'] = None
                    else:
                        record['label'] = label_val
                
                elif key == 'name' and (line['name'] == 'NULL' or re.search('\W',line['name'])):
                    record['name'] = label_val
                
                elif key == 'synonym' and line['synonym'] != ('' or 'NULL'):

                    synonym_val = re.sub('\*','',line['synonym']).strip('{}').split('|')
                    record['synonym'] = list(map(str.strip,synonym_val))
               
                else:
                    field_val = line[key].strip()
                    
                    if field_val == 'NULL':
                        record[fields[key]] = None
                    else:
                        record[fields[key]] = field_val

            record['classification'] = classification  
            data.append(record)
        
    return data


def parse_array(v):
    if (v[0] == "{") and (v[-1] == "}"):
        v = v.lstrip("{")
        v = v.rstrip("}")
        v_array = v.split("|")
        v_array = [i.strip() for i in v_array]
        return v_array
    return [v]


def test():
    data = process_file(DATAFILE, FIELDS)
    print("Your first entry:")
    pprint.pprint(data[0])
    first_entry = {
        "synonym": None, 
        "name": "Argiope", 
        "classification": {
            "kingdom": "Animal", 
            "family": "Orb-weaver spider", 
            "order": "Spider", 
            "phylum": "Arthropod", 
            "genus": None, 
            "class": "Arachnid"
        }, 
        "uri": "http://dbpedia.org/resource/Argiope_(spider)", 
        "label": "Argiope", 
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }

    assert len(data) == 76
    assert data[0] == first_entry
    assert data[17]["name"] == "Ogdenia"
    assert data[48]["label"] == "Hydrachnidiae"
    assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
    
    return data

if __name__ == "__main__":
    tmp_data = test()

Your first entry:
{'classification': {'class': 'Arachnid',
                    'family': 'Orb-weaver spider',
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': 'Spider',
                    'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular '
                'spiders that often have a strikingly coloured abdomen. These '
                'spiders are distributed throughout the world. Most countries '
                'in tropical or temperate climates host one or more species '
                'that are similar in appearance. The etymology of the name is '
                'from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}


In [79]:
db.arachnid.find_one({"classification.family" : "Orb-weaver spider"})

{'_id': ObjectId('5825f1864dd9620c1de02040'),
 'classification': {'class': 'Arachnid',
  'family': 'Orb-weaver spider',
  'genus': None,
  'kingdom': 'Animal',
  'order': 'Spider',
  'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}

In [188]:
# db.arachnid.insert_many(tmp_data)

<pymongo.results.InsertManyResult at 0x106b56360>

In [50]:
#Quiz: Inserting into DB

"""
Complete the insert_data function to insert the data into MongoDB.
"""

import json

def insert_data(data, db):

    # Your code here. Insert the data into a collection 'arachnid'
    db.arachnid.insert_many(data)


if __name__ == "__main__":
    
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open('data/arachnid.json') as f:
        data = json.loads(f.read())
        insert_data(data, db)
        print(db.arachnid.find_one())

{'name': 'Argiope', 'uri': 'http://dbpedia.org/resource/Argiope_(spider)', 'label': 'Argiope', 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.', 'synonym': None, 'classification': {'phylum': 'Arthropod', 'order': 'Spider', 'genus': None, 'class': 'Arachnid', 'kingdom': 'Animal', 'family': 'Orb-weaver spider'}, '_id': ObjectId('5825f1864dd9620c1de02040')}


In [55]:
db.arachnid.find_one()#.classification.find_one()

{'_id': ObjectId('5825f1864dd9620c1de02040'),
 'classification': {'class': 'Arachnid',
  'family': 'Orb-weaver spider',
  'genus': None,
  'kingdom': 'Animal',
  'order': 'Spider',
  'phylum': 'Arthropod'},
 'description': 'The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced.',
 'label': 'Argiope',
 'name': 'Argiope',
 'synonym': None,
 'uri': 'http://dbpedia.org/resource/Argiope_(spider)'}

## Analyzing data

In [207]:
#Needed for quiz below
import json
with open('data/tweet.json') as f:
    
    data = json.loads(f.read())
    
    db.tweets.insert_one(data)

In [69]:
db.twitter.user.find({'favourites_count': 1}).count()

0

In [87]:
db.twitter.find_one({},{"source" : 1})

{'_id': '5304e2e3cc9e684aa98bef97', 'source': 'web'}

In [106]:
db.twitter.find_one({'source': 'web'})

{'_id': '5304e2e3cc9e684aa98bef97',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:25 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': '22819398300',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'First week of school is over :P',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun May 03 19:51:04 +0000 2009',
  'description': '',
  'favourites_count': 1,
  'follow_request_sent': None,
  'followers_count': 169,
  'following': None,
  'friends_count': 145,
  'geo_enabled': False,
  'id': 37486277,
  'lang': 'en',
  'listed_count': 77,
  'location': 'Ireland :)',
  'name': 'Catherine Mullane',
  'notifications': None,
  'profile_background_color': 'FF6699',
  'profile_background_image_url': 'http://a3.twimg.com/profile_bac

In [82]:
#db.twitter.find().count()

for rec in db.twitter.find():
    pprint.pprint(rec)

{'_id': '5304e2e3cc9e684aa98bef97',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:25 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': '22819398300',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'First week of school is over :P',
 'truncated': False,
 'user': {'contributors_enabled': False,
          'created_at': 'Sun May 03 19:51:04 +0000 2009',
          'description': '',
          'favourites_count': 1,
          'follow_request_sent': None,
          'followers_count': 169,
          'following': None,
          'friends_count': 145,
          'geo_enabled': False,
          'id': 37486277,
          'lang': 'en',
          'listed_count': 77,
          'location': 'Ireland :)',
          'name': 'Catherine Mullane',
          'notifications': 

In [115]:
for doc in db.twitter.aggregate([{"$group" : {"_id" : "$source","count" : {"$sum" :1} } } ]):
    print(doc)

{'count': 1, '_id': 'web'}


In [123]:
#Quiz: Using group

#!/usr/bin/env python
"""
The tweets in our twitter collection have a field called "source". This field describes the application
that was used to create the tweet. Following the examples for using the $group operator, your task is 
to modify the 'make-pipeline' function to identify most used applications for creating tweets. 
As a check on your query, 'web' is listed as the most frequently used application.
'Ubertwitter' is the second most used. The number of counts should be stored in a field named 'count'
(see the assertion at the end of the script).

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline
that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 
pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. 
If you want to run this code locally on your machine, you have to install MongoDB, 
download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset 
used in examples in this lesson. 
If you attempt some of the same queries that we looked at in the lesson examples,
your results will be different.
"""


def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
#     pipeline = []
    pipeline = [{"$group" : {"_id" : "$source",
                            "count" : {"$sum" :1} } },
                 {"$sort" : {"count" : -1} } ]
    return pipeline

def tweet_sources(db, pipeline):
    return [doc for doc in db.tweets.aggregate(pipeline)]

if __name__ == '__main__':
    db = get_db('twitter')
    pipeline = make_pipeline()
    result = tweet_sources(db, pipeline)
    import pprint
    pprint.pprint(result[0])
    assert result[0] == {u'count': 868, u'_id': u'web'}


{'_id': 'web', 'count': 1}


AssertionError: 

In [134]:
#example use of $match and $project operators
for rec in db.tweets.aggregate([
        { "$match" : { "user.friends_count": {"$gt" : 0},
                    "user.followers_count": {"$gt" : 0} } },
        { "$project" : { "ratio" :{ "$divide" : ["$user.followers_count",
                                                "$user.friends_count"]},
                       "screen_name": "$user.screen_name"} },
        { "$sort" : { "ratio" : -1} },
        { "$limit" : 1 } ]):
        print(rec)

{'ratio': 1.1655172413793105, 'screen_name': 'Catherinemull', '_id': '5304e2e3cc9e684aa98bef97'}


In [131]:
#example use of $project operator
for rec in db.tweets.aggregate([
        { "$project" : { "ratio" :{ "$divide" : ["$user.followers_count",
                                                "$user.friends_count"]},
                       "screen_name": "$user.screen_name"} }]):
    print(rec)

{'ratio': 1.1655172413793105, 'screen_name': 'Catherinemull', '_id': '5304e2e3cc9e684aa98bef97'}


In [137]:
#importing data needed for quiz below

with open('data/tweet2.json','r') as f:
    data = json.load(f)
    
    db.tweets.insert_one(data)

DuplicateKeyError: E11000 duplicate key error collection: twitter.tweets index: _id_ dup key: { : "5304e2e3cc9e684aa98bef97" }

In [168]:
#Quiz: Using match and project

#!/usr/bin/env python
"""
Write an aggregation query to answer this question:

Of the users in the "Brasilia" timezone who have tweeted 100 times or more,
who has the largest number of followers?

The following hints will help you solve this problem:
- Time zone is found in the "time_zone" field of the user object in each tweet.
- The number of tweets for each user is found in the "statuses_count" field.
  To access these fields you will need to use dot notation (from Lesson 4)
- Your aggregation query should return something like the following:
{u'ok': 1.0,
 u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),
                  u'followers': 2597,
                  u'screen_name': u'marbles',
                  u'tweets': 12334}]}
Note that you will need to create the fields 'followers', 'screen_name' and 'tweets'.

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson,
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used 
in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [ { "$match" : {"user.time_zone": "Brasilia", "user.statuses_count" : {"$gte" : 100} } },
               {"$project" : {"followers" : "$user.followers_count",
                              "screen_name" : "$user.screen_name",
                             "tweets" : "$user.statuses_count"} },
                {"$sort" : {"followers": -1} },
               {"$limit": 1} ]
    return pipeline

#  u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'),
#                   u'followers': 2597,
#                   u'screen_name': u'marbles',
#                   u'tweets': 12334}]}


def aggregate(db, pipeline):
    return [doc for doc in db.tweets.aggregate(pipeline)]


if __name__ == '__main__':
    db = get_db('twitter')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result)
    assert len(result) == 1
    assert result[0]["followers"] == 17209



[]


AssertionError: 

In [173]:
#Unwind operator example
for rec in db.tweets.aggregate([
        { "$unwind" : "$entities.user_mentions" },
        { "$group" : { "_id" : "$user.screen_name",
                    "count" : { "$sum" : 1 } } },
        { "$sort" : { "count" : -1 } },
        { "$limit" : 1 } ] ):
    print(rec)

In [140]:
db.tweets.find_one({"user.time_zone" : None})

{'_id': '5304e2e3cc9e684aa98bef97',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:25 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': '22819398300',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'First week of school is over :P',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun May 03 19:51:04 +0000 2009',
  'description': '',
  'favourites_count': 1,
  'follow_request_sent': None,
  'followers_count': 169,
  'following': None,
  'friends_count': 145,
  'geo_enabled': False,
  'id': 37486277,
  'lang': 'en',
  'listed_count': 77,
  'location': 'Ireland :)',
  'name': 'Catherine Mullane',
  'notifications': None,
  'profile_background_color': 'FF6699',
  'profile_background_image_url': 'http://a3.twimg.com/profile_bac

In [197]:
db.cities.find()[2]

{'_id': ObjectId('5820f2524dd9620c1de02032'),
 'areaCode': ['916'],
 'areaLand': 109271000.0,
 'country': 'United States',
 'elevation': 13.716,
 'foundingDate': datetime.datetime(2000, 7, 1, 0, 0),
 'governmentType': ['Council–manager government'],
 'homepage': ['http://elkgrovecity.org/'],
 'isPartOf': ['California', 'Sacramento County California'],
 'lat': 38.4383,
 'leaderTitle': 'Chief Of Police',
 'lon': -121.382,
 'motto': 'Proud Heritage Bright Future',
 'name': 'City of Elk Grove',
 'population': 155937,
 'postalCode': '95624 95757 95758 95759',
 'timeZone': ['Pacific Time Zone'],
 'utcOffset': ['-7', '-8']}

In [198]:
#Quiz: Using Unwind

#!/usr/bin/env python
"""
For this exercise, let's return to our cities infobox dataset. The question we would like you to answer
is as follows:  Which region or district in India contains the most cities? (Make sure that the count of
cities is stored in a field named 'count'; see the assertions at the end of the script.)

As a starting point, use the solution for the example question we looked at -- "Who includes the most
user mentions in their tweets?"

One thing to note about the cities data is that the "isPartOf" field contains an array of regions or 
districts in which a given city is found. See the example document in Instructor Comments below.

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline 
that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 
pipeline should be a list of one or more dictionary objects. Please review the lesson examples if you 
are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the cities collection used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results may be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [{ "$unwind" : "$isPartOf" },
                { "$match" : {"country" : "India" } },
                { "$group" : {"_id" : "$isPartOf",
                             "count" : {"$sum" : 1} } },
                { "$sort" : {"count" : -1 } } 
                ]
    
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]

if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    print("Printing the first result:")
    import pprint
    pprint.pprint(result[0])
    assert result[0]["_id"] == "Uttar Pradesh"
    assert result[0]["count"] == 623




Printing the first result:


IndexError: list index out of range

In [203]:
#Example of $group with $avg operator
#goal: find average number of retweets for any tweet using a particular hashtag

db.tweets.aggregate([
        { "$unwind" : "$entities.hashtags" },
        { "$group" : { "_id" : "$entities.hashtags.text",
                     "retweet_avg" : {"$avg" : "$retweet_count"}
                     } },
        { "$sort" : { "retweet_avg" : -1} } ] )

<pymongo.command_cursor.CommandCursor at 0x106b02588>

In [204]:
#Example of $addToSet operator

db.tweets.aggregate([
        { "$unwind" : "$entities.hashtags" },
        { "$group" : { "_id" : "$user.screen_name",
                      "unique_hashtags" : {
                            "$addToSet" : "$entities.hashtags.text"
                }
                     } },
        { "$sort" : { "_id" : -1} } ] )

<pymongo.command_cursor.CommandCursor at 0x106b02ac8>

In [208]:
db.tweets.find_one()

{'_id': '5304e2e3cc9e684aa98bef97',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:25 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': '22819398300',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'First week of school is over :P',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun May 03 19:51:04 +0000 2009',
  'description': '',
  'favourites_count': 1,
  'follow_request_sent': None,
  'followers_count': 169,
  'following': None,
  'friends_count': 145,
  'geo_enabled': False,
  'id': 37486277,
  'lang': 'en',
  'listed_count': 77,
  'location': 'Ireland :)',
  'name': 'Catherine Mullane',
  'notifications': None,
  'profile_background_color': 'FF6699',
  'profile_background_image_url': 'http://a3.twimg.com/profile_bac

In [210]:
#Quiz: Using push

#!/usr/bin/env python
"""
$push is similar to $addToSet. The difference is that rather than accumulating only unique values 
it aggregates all values into an array.

Using an aggregation query, count the number of tweets for each user. In the same $group stage, 
use $push to accumulate all the tweet texts for each user. Limit your output to the 5 users
with the most tweets. 
Your result documents should include only the fields:
"_id" (screen name of user), 
"count" (number of tweets found for the user),
"tweet_texts" (a list of the tweet texts found for the user).  

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used in 
examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [ {"$group" :{"_id" : "$user.screen_name", 
                            "count" : {"$sum" : 1},
                            "tweet_texts" : {"$push" : "$text" } } },
                {"$sort" : {"count" : -1} },
                {"$limit" : 5}
               ]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.twitter.aggregate(pipeline)]


if __name__ == '__main__':
    db = get_db('twitter')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    import pprint
    pprint.pprint(result)
    assert len(result) == 5
    assert result[0]["count"] > result[4]["count"]
    sample_tweet_text = u'Take my money! #liesguystell http://movie.sras2.ayorganes.com'
    assert result[4]["tweet_texts"][0] == sample_tweet_text
    

[{'_id': 'Catherinemull',
  'count': 1,
  'tweet_texts': ['First week of school is over :P']}]


AssertionError: 

In [233]:
#Example for using the same operator in multiple stages

for doc in db.tweets.aggregate([
        { "$unwind" : "$entities.user_mentions" },
        { "$group" : { "_id" : "$user.screen_name",
                      "mset" : {
                        "$addToSet" : "$entities.user_mentions.screen_name"
                } } },
        { "$unwind" : "$mset"},
        { "$group" : { "_id" : "$_id", 
                      "count" : { "$sum" : 1} } },
        { "$sort" : { "count" : -1 } },
        { "$limit" : 10 } ] ):
    print(doc)

In [235]:
db.tweets.find_one()

{'_id': '5304e2e3cc9e684aa98bef97',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Thu Sep 02 18:11:25 +0000 2010',
 'entities': {'hashtags': [], 'urls': [], 'user_mentions': []},
 'favorited': False,
 'geo': None,
 'id': '22819398300',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_user_id': None,
 'place': None,
 'retweet_count': None,
 'retweeted': False,
 'source': 'web',
 'text': 'First week of school is over :P',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun May 03 19:51:04 +0000 2009',
  'description': '',
  'favourites_count': 1,
  'follow_request_sent': None,
  'followers_count': 169,
  'following': None,
  'friends_count': 145,
  'geo_enabled': False,
  'id': 37486277,
  'lang': 'en',
  'listed_count': 77,
  'location': 'Ireland :)',
  'name': 'Catherine Mullane',
  'notifications': None,
  'profile_background_color': 'FF6699',
  'profile_background_image_url': 'http://a3.twimg.com/profile_bac

#notes for quiz below: how I would solve it in mysql
select avg(pop) from (select avg(population) from table t 
                      where country = 'India'
group by region)

In [310]:
#notes for quiz below
for doc in db.cities.aggregate([
        { "$match" : { "country" : "United States" } },
        { "$unwind" : "$isPartOf" },
        { "$group" : {
                "_id" : {"region": "$isPartOf"},
                    "average_population" : { "$avg" : "$population" },
                    "rset" : { "$addToSet" : "$isPartOf" } } },
        { "$unwind" : "$rset" },
        { "$group" : { "_id" : "$_id.country",
          "average_population" : {"$avg" : "$average_population" } } }
        ]):
    print(doc)

{'average_population': 155937.0, '_id': None}


In [309]:
#Quiz: same operator

#!/usr/bin/env python
"""
In an earlier exercise we looked at the cities dataset and asked which region in India contains 
the most cities. In this exercise, we'd like you to answer a related question regarding regions in 
India. What is the average city population for a region in India? Calculate your answer by first 
finding the average population of cities in each region and then by calculating the average of the 
regional averages.

Hint: If you want to accumulate using values from all input documents to a group stage, you may use 
a constant as the value of the "_id" field. For example, 
    { "$group" : {"_id" : "India Regional City Population Average",
      ... }

Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 
pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 
the aggregation pipeline should be a list of one or more dictionary objects. 
Please review the lesson examples if you are unsure of the syntax.

Your code will be run against a MongoDB instance that we have provided. If you want to run this code 
locally on your machine, you have to install MongoDB, download and insert the dataset.
For instructions related to MongoDB setup and datasets please see Course Materials.

Please note that the dataset you are using here is a smaller version of the twitter dataset used 
in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 
examples, your results will be different.
"""

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [ 
        { "$match" : { "country" : "India" } },
        { "$unwind" : "$isPartOf" },
        { "$group" : {
                "_id" : {"region": "$isPartOf"},
                    "average_population" : { "$avg" : "$population" },
                    "rset" : { "$addToSet" : "$isPartOf" } } },
        { "$unwind" : "$rset" },
        { "$group" : { "_id" : "$_id.country",
          "average_population" : {"$avg" : "$average_population" } } } 
    ]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.cities.aggregate(pipeline)]


if __name__ == '__main__':
    db = get_db('examples')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    assert len(result) == 1
    # Your result should be close to the value after the minus sign.
    assert abs(result[0]["avg"] - 201128.0241546919) < 10 ** -8
    import pprint
    pprint.pprint(result)


AssertionError: 

The above works, but is failing due to different dataset from what the test is based on. It is however not 100% optimal, see https://discussions.udacity.com/t/lesson-5-same-operator-p/7291/10 for more details.