In [29]:
###
### Libraries
###

import pymongo
import time

In [4]:
###
### Create a client object connected locally
###

client = pymongo.MongoClient('localhost', 27017)

In [5]:
###
### Create a database for our client to connect to, if not exists, create it
###

###
### Drop a database
###
# client.drop_database('chapter3')

db = client['UCI-Database']

In [6]:
collection = db['income']

In [7]:
###
### Pro-tip: Documents have a JSON structure, i.e,.. a python {} structure of key-value pairs
### This means we can leverage the zip command for each line, zipping values by location with the right
###   `column_names` value, then calling that a dict. This is the `document` that MongoDB is expecting
###

column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
               'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain',
               'capital-loss', 'hours-per-week', 'native-country', 'label']

###
### Dataset location: https://archive.ics.uci.edu/ml/datasets/adult
### Just download it and save it, the column names are on the homepage also.
###

row_container=list()
with open("income.txt") as f_in:
    for line in f_in:
        row = [val.strip(" ").strip("\n") for val in line.split(',')]
        row_dict = dict(zip(column_names, row))
        collection.insert_one(row_dict)
        row_container.append(row_dict) # store it locally for debugging

In [8]:
###
### Verify things were added successfully
###

collection.count()

32562

In [17]:
###
### Grab an item from our collection to verify it worked
###

age39 = collection.find_one({'age':39})

In [19]:
###
### View the age 39, not there? Look at the first object to see why
###
print age39
collection.find_one({})

None


{u'_id': ObjectId('591b3ec6375e7d0ac651b535'),
 u'age': u'39',
 u'capital-gain': u'2174',
 u'capital-loss': u'0',
 u'education': u'Bachelors',
 u'education-num': u'13',
 u'fnlwgt': u'77516',
 u'hours-per-week': u'40',
 u'label': u'<=50K',
 u'marital-status': u'Never-married',
 u'native-country': u'United-States',
 u'occupation': u'Adm-clerical',
 u'race': u'White',
 u'relationship': u'Not-in-family',
 u'sex': u'Male',
 u'workclass': u'State-gov'}

In [20]:
###
### 1. Delete the old collection, Re-input our data with the correct types
###
collection.delete_many({})

with open("income.txt") as f_in:
    for line in f_in:
        row = [val.strip(" ").strip("\n") for val in line.split(',')]
        row_dict = dict(zip(column_names, row))
        try:
            row_dict['age'] = int(row_dict['age'])
        except: 
            pass;
        collection.insert_one(row_dict)

In [23]:
###
### Query for everyone over the age of 35
###
### $gt is the filter for greater than, we can use this with special dictionary syntax
###

over_35 = collection.find({'age': {"$gt" : 35}})


<pymongo.cursor.Cursor at 0x114c54c90>

In [26]:
###
### A Cursor data structure which contains a pointer to the current row in a result set, iterator-like object
###

type(over_35)
over_35.next()
over_35.count()

17636

In [32]:
###
### Performance is a big deal when dealing with large datasets, so creating an index is a good idea
###


###
### Time non-indexed value for a query
###
start = time.time()
age_over50 = collection.find({'age':{"$eq":50}})
end = time.time()
print (end - start)

###
### Create the index
###
index_result = db.profiles.create_index([('age':pymongo.ASCENDING)], unique=False) # there may exist ma

###
### Re-time after the index
###

0.000157117843628
