In [1]:
###
### Libraries
###

import pymongo
import time

In [2]:
###
### Create a client object connected locally
###

client = pymongo.MongoClient('localhost', 27017)

In [3]:
###
### Create a database for our client to connect to, if not exists, create it
###

###
### Drop a database
###
# client.drop_database('chapter3')

db = client['UCI-Database']

In [23]:
collection = db['income']

In [9]:
###
### Pro-tip: Documents have a JSON structure, i.e,.. a python {} structure of key-value pairs
### This means we can leverage the zip command for each line, zipping values by location with the right
###   `column_names` value, then calling that a dict. This is the `document` that MongoDB is expecting
###

column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
               'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain',
               'capital-loss', 'hours-per-week', 'native-country', 'label']

###
### Dataset location: https://archive.ics.uci.edu/ml/datasets/adult
### Just download it and save it, the column names are on the homepage also.
###

collection.delete_many({}) # If it already exists

with open("income.txt") as f_in:
    for line in f_in:
        row = [val.strip(" ").strip("\n") for val in line.split(',')]
        row_dict = dict(zip(column_names, row))
        try:
            row_dict['age'] = int(row_dict['age'])
        except: 
            pass;
        collection.insert_one(row_dict)

In [19]:
###
### Verify things were added successfully
###

collection.count()

32562

In [13]:
###
### Grab an item from our collection to verify it worked
###

age39 = collection.find_one({'age':39})

In [14]:
###
### View the age 39, not there? Look at the first object to see why
###
print age39
collection.find_one({})

{u'sex': u'Male', u'education': u'Bachelors', u'workclass': u'State-gov', u'relationship': u'Not-in-family', u'age': 39, u'capital-gain': u'2174', u'label': u'<=50K', u'hours-per-week': u'40', u'race': u'White', u'native-country': u'United-States', u'education-num': u'13', u'capital-loss': u'0', u'_id': ObjectId('591b4bd3375e7d0b10cb38ee'), u'fnlwgt': u'77516', u'marital-status': u'Never-married', u'occupation': u'Adm-clerical'}


{u'_id': ObjectId('591b4bd3375e7d0b10cb38ee'),
 u'age': 39,
 u'capital-gain': u'2174',
 u'capital-loss': u'0',
 u'education': u'Bachelors',
 u'education-num': u'13',
 u'fnlwgt': u'77516',
 u'hours-per-week': u'40',
 u'label': u'<=50K',
 u'marital-status': u'Never-married',
 u'native-country': u'United-States',
 u'occupation': u'Adm-clerical',
 u'race': u'White',
 u'relationship': u'Not-in-family',
 u'sex': u'Male',
 u'workclass': u'State-gov'}

NameError: name 'collection' is not defined

In [28]:
###
### Query for everyone over the age of 35
###
### $gt is the filter for greater than, we can use this with special dictionary syntax
###

over_35 = collection.find({'age': {"$gt" : 35}})
over_35.next()

{u'_id': ObjectId('591b4ce5375e7d0b10ccb684'),
 u'age': 39,
 u'capital-gain': u'2174',
 u'capital-loss': u'0',
 u'education': u'Bachelors',
 u'education-num': u'13',
 u'fnlwgt': u'77516',
 u'hours-per-week': u'40',
 u'label': u'<=50K',
 u'marital-status': u'Never-married',
 u'native-country': u'United-States',
 u'occupation': u'Adm-clerical',
 u'race': u'White',
 u'relationship': u'Not-in-family',
 u'sex': u'Male',
 u'workclass': u'State-gov'}

In [26]:
###
### A Cursor data structure which contains a pointer to the current row in a result set, iterator-like object
###

type(over_35)
over_35.next()
over_35.count()

17636

In [36]:
###
### Performance is a big deal when dealing with large datasets, so creating an index is a good idea
###


###
### Time non-indexed value for a query
###
start = time.time()
age45 = collection.find({'age':{"$eq":45}})
end = time.time()
print (end - start)

###
### Create the index
###
index_result = db.profiles.create_index([('age',pymongo.ASCENDING)], unique=False) # there may exist same ages

###
### Re-time after the index
###
start = time.time()
age30 = collection.find({'age':{"$eq":30}})
end = time.time()
print (end - start)

0.000200033187866
0.000179052352905


In [30]:
db.save()

TypeError: 'Collection' object is not callable. If you meant to call the 'save' method on a 'Database' object it is failing because no such method exists.