# PyMongo Notes

## Connecting to a Database/Collection

In [1]:
# make a connection with MongoClient

# default connection
# client = MongoClient()

# connection with specified host and port
# client = MongoClient('localhost', 27017)

import pymongo
from pymongo import MongoClient

# connection using a URI string
URI = ''
client = MongoClient(URI)

In [2]:
# get a database from the connected client
# this db name is database_name
# can also use client['database_name']
db = client.TrainingDB

In [3]:
# list all collections in the db
db.list_collection_names()

['TrainingDB']

In [4]:
# get a collection
# can use .collection_name or ['collection_name']
# must use [''] if collection name has a '.'


# does not throw an exception if the collection does not exist
# simply returns an empty collection object (see below)
collection = db['col.name']

In [5]:
from pprint import pprint

In [6]:
# print the first record from a collection
pprint(collection.find_one())

None


In [7]:
collection is None

False

In [8]:
# collection is an empty object
type(collection)

pymongo.collection.Collection

#### Creating a New Collection

In [10]:
# adding collation of en_US to help with sorting strings
from pymongo.collation import Collation

# good practice to check for a collection before creating a new one
if 'PyMongo_Test' not in db.list_collection_names():
    collection = db.create_collection('PyMongo_Test', collation=Collation(locale='en_US'))
else:
    print('Collection already exists.')

Collection already exists.


## Queries

```python
import pprint

# get a single document with find_one()
# find_one() returns a single document matching a query (or None if no matches)
# will return only the first match if multiple matches
doc1 = collection.find_one()

# pprint.pprint works better for viewing the JSON structure
pprint.pprint(doc1)
print(doc1)
```

```python
# find_one() returns None if no matches found
doc_notexists = collection.find_one({'_id': '123456'})
pprint.pprint(doc_notexists)
doc_notexists is None
```

```python
# if the items exists, it is found
doc1 = collection.find_one({'_id': ''})
pprint.pprint(doc1)
```

```python
# an ObjectId is not the same as its string representation
# if using ObjectIds (it is common to get an ObjectId as a string from a request URL) use this
from bson.objectid import ObjectId

# get post_id from web framework and feed into this function
def get(post_id):
    '''Look up a single document by the string form of its ObjectId.

    post_id: the 24-character hex string of the document's ``_id``
             (e.g. as received from a web framework's request URL).

    Returns the matching document, or None if no document has that ``_id``.
    ObjectId() raises bson.errors.InvalidId if post_id is not a valid id string.
    '''
    # convert string to ObjectId
    # use appropriate db/collection names here or add as arguments to the function
    # return the result so the caller can actually use the document
    return client.db.collection.find_one({'_id': ObjectId(post_id)})
```

```python
# return multiple results with find()
# find() returns a Cursor instance, which allows iteration over all matching docs
# if only one record exists/matches, returns that record
for doc in collection.find():
    pprint.pprint(doc)
```

```python
# can filter results using find, just like with find_one
for doc in collection.find({'_id': ''}):
    pprint.pprint(doc)
```

```python
# count the number of total documents
collection.count_documents({})
```

```python
# count the number that meet a certain criteria
collection.count_documents({'_id': '1234'})
```

```python
from datetime import datetime as dt

# Range queries
# these work best when the stored values are real datetimes rather than strings
# pay attention to UTC format (see Datetime section below)
start_date = dt(year=2009, month=11, day=12)

# show the date portion as an ISO-style string (YYYY-MM-DD)
start_date.date().strftime('%Y-%m-%d')
```

- get all records before a certain date, then sort by another field
- `$lt` is an operator that matches values less than the specified value
- see below for advanced queries and documentation
```python
for doc in collection.find({'date': {'$lt': start_date}}).sort('_id'):
    pprint.pprint(doc)
```

## Advanced Queries

#### Note on Python Dictionaries
- These are not guaranteed to maintain insertion order before Python 3.7
    - Use `collections.OrderedDict` where explicit ordering is required (e.g. `$sort`)
    - Another option is `bson.son.SON` (illustrated here)

#### Aggregations
- See full docs for [Aggregation Framework](http://docs.mongodb.org/manual/applications/aggregation)

```python
# example to count the number of occurrences for all values under 'key_name'
# example data could be {'key_name': []}, {'key_name': ['value1', 'value3']}, {'key_name': ['value1', 'value2', 'value3']}
# example collection is 'col'

from bson.son import SON
pipeline = [
    {'$unwind': '$key_name'},
    {'$group': {'_id': '$key_name', 'count': {'$sum': 1}}},
    {'$sort': SON([('count', -1), ('_id', -1)])}
]

pprint.pprint(list(db.col.aggregate(pipeline)))
```

```python
# run an explain plan for this aggregation pipeline above
# 'col' is the collection name; the call goes through the database object `db`
db.command('aggregate', 'col', pipeline=pipeline, explain=True)
```

#### Map/Reduce
- See full docs for [Map Reduce Engine](http://www.mongodb.org/display/DOCS/MapReduce)

```python
# this example uses map/reduce to count the number of occurrences for each value under 'key_name'
# same problem as aggregation above, but different strategy
# code for these functions is coded in JavaScript

from bson.code import Code

# define the mapper
mapper = Code('''
              function () {
                this.key_name.forEach(function(z) {
                  emit(z, 1);
                });
              }
              ''')

# define the reducer
# can't just return values.length, as reduce might be called iteratively on results of other reduce steps
reducer = Code('''
               function(key, values) {
                 var total = 0;
                 for (var i = 0; i < values.length; i++) {
                   total += values[i];
                 }
                 return total;
               }
               ''')

# call map_reduce() and iterate over the collection (col)
result = db.col.map_reduce(mapper, reducer, 'myresults')
for doc in result.find():
    pprint.pprint(doc)
    
# output is something like this
# first key/value pair shows all the unique values
# second key/value pair returns the counts for each unique value
# {u'_id': u'value1', u'value': 3.0},
# {u'_id': u'value2', u'value': 1.0}
```

#### Advanced Map Reduce
- PyMongo supports all features of MongoDB's map/reduce engine
- See some examples below

```python
# full_response=True will return the full response rather than just the collection
pprint.pprint(db.col.map_reduce(mapper, reducer, 'myresults', full_response=True))
```

```python
# can pass any/all of Mongo's map/reduce parameters as keyword args

# returns only results that meet the query condition of having counts less than 2
results = db.col.map_reduce(mapper, reducer, 'myresults', query={'x': {'$lt': 2}})
```

```python
# use SON or OrderedDict to specify a diff database to store the result collection

pprint.pprint(db.col.map_reduce(mapper, reducer, out=SON([('replace', 'results'), ('db', 'outdb')]), full_response=True))
```

#### MongoDB Operators
- See [Operators](https://docs.mongodb.com/manual/reference/operator/) for more details
- Basic syntax
    - `db.col_name.find({'field_name': {$eq: value}})`
    - `db.col_name.find({'field_name.inner_field_name': {$lt: 'value'}})`

Comparison
- `$eq` (will return results with a single array element equal to the value supplied, more array options below)
    - `equal`
- `$gt`
    - `greater than`
- `$gte`
    - `greater than or equal to`
- `$in`
    - `matches any values in an array`: like SQL `IN`
- `$lt`
    - `less than`
- `$lte`
    - `less than or equal to`
- `$ne`
    - `not equal`
- `$nin`
    - `not in`: like SQL `NOT IN`

Logical
- `$and`
- `$not`
- `$nor`
- `$or`

Element
- `$exists`
- `$type` (looks for fields of a certain type)

Evaluation
- `$expr`
    - allows the use of aggregation expressions
- `$jsonSchema`
    - validate against the given JSON schema
- `$mod`
    - modulo operation on the value of a field and selects docs with a specified result
- `$regex`
    - select docs where values match a specified regular expression
- `$text`
    - performs a text search
- `$where`
    - matches docs that satisfy a JavaScript expression

Geospatial
- `$geoIntersects`
    - geometries with a [GeoJSON](https://docs.mongodb.com/manual/reference/glossary/#term-geojson) geometry
- `$geoWithin`
    - select within a bounding GeoJSON geometry
- `$near`
    - returns objects within proximity to a point
- `$nearSphere`
    - returns objects within proximity on a sphere

Array Options
- `$all`
    - must contain all elements specified
- `$elemMatch`
    - array field must contain at least one element that matches all specified conditions
- `$size`
    - selects with array field of a certain size

Bitwise
- `$bitsAllClear`
    - numeric or binary values where all bit positions are 0
- `$bitsAllSet`
    - numeric or binary values where all bit positions are 1
- `$bitsAnyClear`
    - any bit is 0
- `$bitsAnySet`
    - any bit is 1

Comments
- `$comment`
    - adds a comment to a query predicate

Projection Operators
- `$`
    - projects the first element in an array that matches the query condition
- `$elemMatch`
    - projects the first element that matches the specified 'elemMatch' condition
- `$meta`
    - projects the document's score assigned during `$text` operation
- `$slice`
    - limits the number of elements projected from an array

## Replica Sets
- Start with [High Availability](https://pymongo.readthedocs.io/en/stable/examples/high_availability.html)
    - Also see [Replica Sets](http://dochub.mongodb.org/core/rs) and [Large Sharded Cluster](http://www.mongodb.org/display/DOCS/Sharding+Introduction)

#### 3 Node Replica Set (Bare Minimum Example)
- Replica sets should always use multiple nodes in production
    - Putting all set members on the same physical node is only recommended for testing and development

- Start three mongo processes, each on a diff port with a diff dbpath, but same replica set name
- Sets will be up and running, but no primary set until the set is initialized
```
$ mkdir -p /data/db0 /data/db1 /data/db2
$ mongod --port 27017 --dbpath /data/db0 --replSet foo
$ mongod --port 27018 --dbpath /data/db1 --replSet foo
$ mongod --port 27019 --dbpath /data/db2 --replSet foo
```

- Creating a set for the first time

```python
# initialize the set
# connect to a single node and initiate
# until you do this, the sets are essentially 'offline'

# start a client, only this node is allowed to have data
c = MongoClient('localhost', 27017)

# only the node that is initiated is allowed to contain any data
# in this case, that is the node on port 27017, since that's where the client started

# assign the set to config
config = {'_id': 'foo', 'members': [
    {'_id': 0, 'host': 'localhost:27017'},
    {'_id': 1, 'host': 'localhost:27018'},
    {'_id': 2, 'host': 'localhost:27019'}]}

# initiate the set
c.admin.command('replSetInitiate', config)
```

- Connecting to a set that is already initialized
    - The addresses passed to `MongoClient` are the seeds
    - As long as at least one of the seeds is online, Mongo discovers all members of the replica set
    - Also determines which one is primary and which are secondaries or arbiters
    - Each seed must be the address of a single `mongod`
    
```python
# any of the following commands would connect to the set above after initialization

# method 1
MongoClient('localhost', replicaset='foo')

# method 2
MongoClient('localhost:27018', replicaset='foo')

# method 3
MongoClient('localhost', 27019, replicaset='foo')

# method 4
# a single URI string listing several seeds; the replica set name goes in the query string
MongoClient('mongodb://localhost:27017,localhost:27018/?replicaSet=foo')
```

- Printing the nodes
    - May initially appear empty since `MongoClient` returns before the connection is complete
        - Wait a few seconds and try again if this happens
    - In an app, operations such as `find()` or `insert_one()` will auto-wait to discover before attempting

```python
c = MongoClient(replicaset='foo')
print(c.nodes)
```

- Secondary Reads
    - Queries are sent to the primary node by default
    - Can change the read preference to send to secondary nodes
    - If there are no secondary members, the primary will be used as fallback
    - If there are queries you would prefer never send to the primary, use `secondary`
- Defaults
    - Read pref for a `database` is inherited from the `client`
    - Read pref for a `collection` is inherited from the `database`
    
    
```python
# initialize the client specifying the read preference
client = MongoClient(
    'localhost:27017',
    replicaSet='foo',
    readPreference='secondaryPreferred')

# check the read preference attribute
client.read_preference


# overriding the default read preference
# use get_database() or get_collection() methods
from pymongo import ReadPreference

# get a new db or a new coll and specify a different read preference

# for a db
db = client.get_database('test', read_preference=ReadPreference.SECONDARY)

# for a collection
coll = db.get_collection('test', read_preference=ReadPreference.PRIMARY)

# change the read preference of an existing collection using with_options()
coll2 = coll.with_options(read_preference=ReadPreference.NEAREST)

# most commands can only be sent to the primary
# can pass an explicit read preference to some methods
db.command('dbstats', read_preference=ReadPreference.NEAREST)
```

#### Read Configuration (three components)
- Read Preference
    - Configured using one of the classes from `read_preferences`
        - `Primary`: read from the primary (default)
            - strongest consistency
            - if no primary available, raise AutoReconnect error
        - `PrimaryPreferred`: read from primary if available
            - if no primary, read from a secondary
        - `Secondary`: read from a secondary
            - if no matching secondary, raise AutoReconnect
        - `SecondaryPreferred`: read from secondary if available
            - if no secondary, read from primary
        - `Nearest`: read from any available member
- Tag Sets
    - Can specify tag sets to look for
    - Not really relevant to us at the moment
    - See [Docs](https://pymongo.readthedocs.io/en/stable/examples/high_availability.html) or [Tags](http://www.mongodb.org/display/DOCS/Data+Center+Awareness)
- Local Threshold
    - Not really relevant to us at the moment
    - See [Docs](https://pymongo.readthedocs.io/en/stable/examples/high_availability.html)

#### Load Balancing
- See the [Docs](https://pymongo.readthedocs.io/en/stable/examples/high_availability.html)

## mod_wsgi
- PyMongo supports `mod_wsgi` to implement an Apache module to host Python apps
    - See [The Docs](https://pymongo.readthedocs.io/en/stable/examples/mod_wsgi.html)

## Server Selection
- You can exert control over server selection
    - See [The Docs](https://pymongo.readthedocs.io/en/stable/examples/server_selection.html)

## Cursor Control 
- By default, cursors are closed after all results are exhausted
- For [Capped Collections](https://docs.mongodb.org/manual/core/capped-collections/) you can use a [Tailable Cursor](https://docs.mongodb.org/manual/reference/glossary/#term-tailable-cursor) that remains open
- Primary [Docs](https://pymongo.readthedocs.io/en/stable/examples/tailable.html)

## TLS/SSL
- These connections are supported
- See [Server Docs](http://docs.mongodb.org/manual/tutorial/configure-ssl/) for configuring MongoDB
- See [Primary Docs](https://pymongo.readthedocs.io/en/stable/examples/tls.html) for connecting

```python
# if configured for TLS/SSL, connecting can be as simple as passing ssl=True to MongoClient() or adding ?ssl=true to the connection string
```

## Encryption
- Client side
- Server side
- See [The Docs](https://pymongo.readthedocs.io/en/stable/examples/encryption.html)

## Authentication
- See more info at [PyMongo Authentication](https://pymongo.readthedocs.io/en/stable/examples/authentication.html)

```python
# %-escaping un & pw
# useful if controlling access through a webform
# get un/pw, convert to quote_plus, supply to URI

from urllib.parse import quote_plus

username = quote_plus('username')
password = quote_plus('password')

# create a connection to mongo client replacing un/pw with %s, and supplying those args as a tuple
# this is a sample URI
sample_client = MongoClient('mongodb://%s:%s@127.0.0.1' % (username, password))
```

```python
# SCRAM-SHA-256 (RFC 7677)
# new to version 3.7
# this is the default mech for clusters configured for authentication for MongoDB 4.0 or higher

# can supply values as keyword args
# default authSource='admin'
sample_client = MongoClient('example.com',
                           username='user',
                           password='password',
                           authSource='sample_database',
                           authMechanism='SCRAM-SHA-256')

# can supply values through the URI (HTML GET request)
sample_uri = 'mongodb://user:password@example.com/?authSource=sample_database&authMechanism=SCRAM-SHA-256'
sample_client = MongoClient(sample_uri)
```

## Collation
- See the [Collation API Docs](https://pymongo.readthedocs.io/en/stable/api/pymongo/collation.html#module-pymongo.collation)
- Usage
    - Collations are used to specify a sort order for different languages (i.e. French, German)
    - Helps to sort words with accents

```python
# assign default collation to a collection
# CollationStrength is needed for the update_many example at the bottom
from pymongo.collation import Collation, CollationStrength
collection = db.create_collection('col_name', collation=Collation(locale='fr_CA'))

# assign default collation to an index
# the first argument is the key (field) to index; call it on the collection object created above
collection.create_index('index_name', unique=True, collation=Collation(locale='fr_CA'))

# specify a collation for a query
# note the keyword is `locale`, not `local`
docs = collection.find({'city': 'New York'}).sort('name').collation(Collation(locale='fr_CA'))

# other query types
contacts = MongoClient().test.contacts   # get a collection

# update all records with name 'jürgen'
# strength=CollationStrength.SECONDARY makes this case insensitive: 'jürgen' and 'Jürgen' updated
result = contacts.update_many(
    {'first_name': 'jürgen'},
    {'$set': {'verified': 1}},
    collation=Collation(locale='de', strength=CollationStrength.SECONDARY))
```

## Copying a Database

```python
# copying dbs on the same server
client.admin.command('copydb', fromdb='source_db_name', todb='target_db_name')

# copying from a different mongo server that is not password protected
# will copy to the host specified in your current mongo client object
client.admin.command('copydb', fromdb='source_db_name', todb='target_db_name', fromhost='source.example.com')

# if copying from a different server that is password protected
# authenticate to the 'admin' database
client = MongoClient('target.example.com', username='administrator', password='pwd')   # this is the key step
client.admin.command('copydb', fromdb='source_db_name', todb='target_db_name', fromhost='source.example.com')
```

## Data Types
- [PyMongo Documentation](https://pymongo.readthedocs.io/en/stable/)
    - Documents can contain native python types (like datetime.datetime instances)
        - **Note:** datetime.date objects are not supported
    - These datatypes will be automatically converted to and from appropriate [BSON](http://www.mongodb.org/display/DOCS/BSON) datatypes
- Supported directions for datatypes viewable at the [BSON Encoding and Decoding](https://pymongo.readthedocs.io/en/stable/api/bson/index.html)

#### Unicode Strings
- `u'Mike'` vs. `'Mike'`
- Mongo stores data in BSON
    - BSON strings are UTF-8 encoded
    - Regular strings are stored unaltered
    - Unicode strings are encoded UTF-8 first
    - Python decodes each BSON string to a Python unicode string, not a regular string
    - `find_one()` and `find()` will match whether or not there is a `u` in front in the doc
        - Do NOT need to worry about this not returning query results
        - `col.find({'author': 'Mike'})` will find `{u'author': u'Mike'}`
- See more about [Python Unicode Strings](http://docs.python.org/howto/unicode.html)

## Datetimes and Timezones
- PyMongo uses the `datetime.datetime` object for representing dates/times
    - **Mongo assumes dates/times are in UTC**
        - [bson.codec_options.CodecOptions](https://pymongo.readthedocs.io/en/stable/api/bson/codec_options.html#bson.codec_options.CodecOptions) has a tz_aware option that enables 'aware' datetime.datetime objects `CodecOptions(tz_aware=True)`
        - Default option is 'naive' datetimes

#### Writing Datetimes
```python
# when using now, use utcnow()
# always use this
from datetime import datetime as dt
result = db.col_name.insert_one({'last_modified': dt.utcnow()})


# saving datetimes with timezones
import pytz
PST = pytz.timezone('US/Pacific')
aware_datetime = PST.localize(dt(2002, 10, 27, 6, 0, 0))
result = db.col_name.insert_one({'date': aware_datetime})
db.col_name.find_one()['date'] # will return UTC -> datetime.datetime(2002, 10, 27, 14, 0)
# find_one above assumes that the first record is the one specified here
```

#### Reading Datetimes
- Default action is that `datetime.datetime` objects returned by MongoDB will be naive and reflect UTC
- Can set `CodecOptions(tz_aware=True)` to write/read tz aware datetimes

```python
from bson.codec_options import CodecOptions
import pytz
db.col_name.find_one()['date']  # retrieves the 'date' value only from the first record
# assume the above returns UTC datetime.datetime(2002, 10, 27, 14, 0): same as the write example above

# reading in pacific time by setting tz_aware using with_options
aware_times = db.col_name.with_options(codec_options=CodecOptions(
    tz_aware=True,
    tzinfo=pytz.timezone('US/Pacific')))
result = aware_times.find_one()  # returns datetime.datetime(2002, 10, 27, 6, 0): a tz aware time
```

## Custom Data Types
- Mongo handles most data types well with some exceptions
- These will throw a `bson.errors.InvalidDocument` exception
    - Python's `Decimal` datatype
- Can only define these for types bson doesn't understand
    - Will throw a `TypeError` if you try to write these for types bson can interpret like `int` and `str`
- Defining custom data types
    - First define a type codec for that type
        - This describes how an instance of a custom type can be transformed to and/or from a bson type
        - Choices for type codecs
            - `TypeEncoder`: encodes a custom Python type to a known BSON type
                - Must implement the `python_type` property/attribute and the `transform_python` method
            - `TypeDecoder`: decodes a specified BSON type into a custom Python type
                - Must implement the `bson_type` property/attribute and the `transform_bson` method
            - `TypeCodec`: can both encode/decode a custom type
                - Must implement the `python_type`/`bson_type` attributes, and the `transform_python`/`transform_bson` methods
- Must register the codec using `TypeRegistry` class to use the codec
- Must define a `CodecOptions` instance with the `TypeRegistry` instance and use it to get a `Collection` object
    - When retrieved this way, the `Collection` object will 'understand' the custom Python datatype

```python
# example using TypeCodec to encode/decode the Python Decimal datatype

# import necessary packages
from decimal import *
from bson.decimal128 import Decimal128
from bson.codec_options import TypeCodec, TypeRegistry, CodecOptions

# define DecimalCodec class
class DecimalCodec(TypeCodec):
    '''Type codec that stores Python Decimal values as BSON Decimal128
    and converts them back to Decimal when documents are read.'''

    # a TypeCodec must define both attributes below; the names are fixed
    python_type = Decimal       # the python type this codec encodes
    bson_type = Decimal128      # the bson type this codec decodes

    # define a method within the class to transform python into bson
    def transform_python(self, value):
        '''Function that transforms a custom type value into a type that
        bson can understand.'''
        return Decimal128(value)

    # define a method that transforms bson to custom python type
    def transform_bson(self, value):
        '''Function that transforms bson type into a python custom type.'''
        return value.to_decimal()
    
# create an instance of the codec class
decimal_codec = DecimalCodec()

# register the codec with PyMongo (can supply a list of several objects here)
type_registry = TypeRegistry([decimal_codec])

# define a CodecOptions instance using the registry
codec_options = CodecOptions(type_registry=type_registry)

# create a Collection object using the codec options instance
collection = db.get_collection('col_name', codec_options=codec_options)

# if you connect to a saved collection without using a TypeDecoder or TypeCodec
# in the codec_options,
# you will retrieve the bson datatype when querying
```

#### Fallback Encoder
- A fallback encoder will take datatypes bson doesn't understand and convert to a type it does understand
- Fallback encoders are only called when explicitly defined encoders or standard bson encoders fail
    - If something else can encode the value first, that will be used

```python
# example that checks the python datatype before attempting to convert
# will return Decimal128 if python type is decimal only
def fallback_encoder(value):
    '''Convert a Decimal to Decimal128; hand any other value back unchanged.

    Intended to be passed as TypeRegistry(fallback_encoder=...).
    '''
    # anything that is not a Decimal is outside this encoder's scope
    if not isinstance(value, Decimal):
        return value
    return Decimal128(value)

type_registry = TypeRegistry(fallback_encoder=fallback_encoder)
codec_options = CodecOptions(type_registry=type_registry)
```

## Geospatial Indexing
- See also MongoDB docs on [geo](http://dochub.mongodb.org/core/geo)
- See additional docs on lat/long GEOSPHERE and 2dsphere [2dsphere](https://docs.mongodb.com/manual/core/2dsphere/)
    - long is specified first!

```python
# create a geospatial index
from pymongo import GEO2D
db.col_name.create_index([('loc', GEO2D)])
```

```python
# insert places
result = db.col_name.insert_many([{'loc': [2, 5]},
                                  {'loc': [30, 5]},
                                  {'loc': [1, 2]},
                                  {'loc': [4, 4]}])
```

```python
# querying location info

# finds 3 docs near a given location
for doc in db.col_name.find({'loc': {'$near': [3, 6]}}).limit(3):
    pprint.pprint(doc)
    
# if using pymongo.GEOSPHERE, `$nearSphere` is recommended instead of `$near`

# the `$maxDistance` operator requires the use of SON
# SON keeps the keys in the order given, so '$near' stays ahead of '$maxDistance'
from bson.son import SON
query = {'loc': SON([('$near', [3, 6]), ('$maxDistance', 100)])}
for doc in db.col_name.find(query).limit(3):
    pprint.pprint(doc)
```

```python
# query for all items within a given rectangle (specified by lower-left and upper-right coordinates)
query = {'loc': {'$within': {'$box': [[2, 2], [5, 6]]}}}
for doc in db.places.find(query).sort('_id'):
    pprint.pprint(doc)

                     
# or circle (specified by center point and radius):
query = {'loc': {'$within': {'$center': [[0, 0], 6]}}}
for doc in db.places.find(query).sort('_id'):
    pprint.pprint(doc)
```

## Gevent
- PyMongo supports Gevent
    - See the [docs](https://pymongo.readthedocs.io/en/stable/examples/gevent.html)
        - [Gevent](http://www.gevent.org/)

## GridFS
- [gridfs](https://pymongo.readthedocs.io/en/stable/api/gridfs/index.html#module-gridfs) can be used to store large binary objects (e.g. files) in Mongo
- See also the [API Docs](https://pymongo.readthedocs.io/en/stable/api/gridfs/index.html#module-gridfs)
- See [binary](https://pymongo.readthedocs.io/en/stable/api/bson/binary.html) for tools for representing binary data in MongoDB

- Every GridFS instance created will operate only on the specified database

```python
# creating a gridfs instance on a db object
import gridfs
fs = gridfs.GridFS(db)
```

- Saving and Retrieving Data

```python
# put() creates a new file in GridFS and returns the value of the file document's '_id' key
# that '_id' key can be used to get() back the contents of the file

a = fs.put(b'Hello World')
fs.get(a).read()  # returns 'Hello World'


# can put() any object with a read() method
# GridFS handles reading in chunk-sized args automatically
# can include additional keyword args to add attributes
b = fs.put(fs.get(a), filename='foo', bar='baz')
out = fs.get(b)
out.read()  # outputs 'Hello World' based on our put above
out.filename  # outputs u'foo'
out.bar       # outputs u'baz'
out.upload_date   # outputs the datetime.datetime object for the upload date
```

## Inserting Records
- Can insert single records or many records at a time

```python
# generic example

# insert_many
new_records = [{'key1': 'value1',
               'key2': 'value2',
               'key3': ['value3', 'value4'],
               'key4': {'key5': 'value5', 'key6': 'value6'},
               'key7': 'value7'},
               {'key1': 'value8',
               'key2': 'value9',
               'key3': ['value10', 'value11'],
               'key4': {'key5': 'value12', 'key6': 'value13'},
               'key7': 'value14'}]

# the new records do NOT have to contain the same keys
# MongoDB will allow them to have different fields/keys

result = col.insert_many(new_records)
result.inserted_ids  # will display the ObjectId instances for each inserted document
```

## Bulk Write Operations

- `insert_many()`
    - PyMongo will automatically split the batch into smaller sub-batches based on the maximum message size accepted by MongoDB
        - This leads to support of very large bulk insert operations
    - Returns an instance of `InsertManyResult`

```python
# insert_many is a method of a collection object, no extra import needed
# where documents is an iterable of documents to insert (usually a list)
# ordered is an optional keyword argument; ordered=True is the default
# ordered=True: docs are inserted in the order supplied; if exception, will abort all records that follow
# ordered=False: will try to insert every doc even if one fails
result = db.col_name.insert_many(documents, ordered=True)
```

- `bulk_write()`
    - Allows for mixed bulk write operations
        - e.g. `InsertOne`, `UpdateOne`, `ReplaceOne`, `DeleteMany` etc.
    - Returns a `BulkWriteResult` object

```python
# ordered operations (carried out in order provided for serial execution)
# if an exception occurs, all remaining items are aborted
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError   # needed for the except clauses below

result = db.col_name.bulk_write([
    DeleteMany({}),   # written this way, would delete all records
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    UpdateOne({'_id': 1}, {'$set': {'foo': 'bar'}}),
    ReplaceOne({'j': 1}, {'j': 2})])   # the replacement must be a document (dict), not a set

# print the result including the number of records affected by each operation
pprint.pprint(result.bulk_api_result)

# ordered operations using try/except to see errors
requests = [
    DeleteMany({}),   # written this way, would delete all records
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    UpdateOne({'_id': 1}, {'$set': {'foo': 'bar'}}),
    ReplaceOne({'j': 1}, {'j': 2})]
try:
    db.col_name.bulk_write(requests)
except BulkWriteError as bwe:
    # details holds the per-operation error information
    pprint.pprint(bwe.details)


# unordered bulk_write operations
# write in parallel, will try every operation even if exceptions occur
try:
    db.col_name.bulk_write(requests, ordered=False)
except BulkWriteError as bwe:
    pprint.pprint(bwe.details)
```

## Indexing
- Adding indexes helps accelerate queries and adds functionality for querying/storing documents
- MongoDB automatically creates a unique index on `_id`
- View Mongo [Index Documentation](http://www.mongodb.org/display/DOCS/Indexes)

#### Viewing Indexes on a Collection
- `sorted(list(db.col.index_information()))`
    - `col` is the collection name

#### Creating Unique Indexes
- Unique indexes will reject documents whose value for that key already exists

```python
# creating a unique index
# generic example (won't work) where 'col' is the collection name and key_name is the field/key to index
# unique=True
result = db.col.create_index([('key_name', pymongo.ASCENDING)],
                                   unique=True)
```

## MongoDB FAQ's
- [FAQ's](https://pymongo.readthedocs.io/en/stable/faq.html)

## API Documentation
- [API Docs](https://pymongo.readthedocs.io/en/stable/api/index.html)