# PyMongo Testing Notebook

> This notebook is a playground for using the MongoDB training database `TrainingDB`.

In [1]:
# import primary packages needed for exploration
import pymongo
import os
from pymongo import MongoClient

# import pprint, which does a better job of displaying nested JSON objects in python
from pprint import pprint

In [2]:
# create a connection to MongoClient using a URI string read from the environment
URI = os.getenv('MONGO_AD_URI')
if not URI:
    # os.getenv returns None when the variable is unset, and MongoClient(None)
    # would quietly connect to the default local server instead of the
    # intended deployment, so stop here with a clear message
    raise RuntimeError('Environment variable MONGO_AD_URI is not set.')
client = MongoClient(URI)

In [3]:
# get a handle on the 'TrainingDB' database from the connected client
# item access is used here; attribute access (client.TrainingDB) is equivalent
db = client['TrainingDB']

In [4]:
# list the names of all collections in the db
# (the bare expression is displayed as the cell output)
db.list_collection_names()

['PyMongo_Test', 'TrainingDB']

In [5]:
# create the 'PyMongo_Test' collection only when the db does not have it yet

# a collation of en_US is attached at creation to help with sorting strings
from pymongo.collation import Collation

# look at the existing collection names first so an existing collection is left alone
if 'PyMongo_Test' in db.list_collection_names():
    print('Collection already exists.')
else:
    collection = db.create_collection('PyMongo_Test', collation=Collation(locale='en_US'))

Collection already exists.


In [7]:
# bind `collection` to 'PyMongo_Test' when the db reports that it exists
if 'PyMongo_Test' not in db.list_collection_names():
    print('Collection does not exist')
else:
    collection = db.PyMongo_Test

In [8]:
# pretty-print a single document from the collection
# find_one() returns None when the collection holds no documents
pprint(collection.find_one())

None


## Preprocessing Data for Inserting Documents to a Collection

#### Working with XML Data
- This is sample data generated for proof-of-concept testing

In [10]:
# import xml/json packages
import xmltodict
import json

In [11]:
# location of the sample XML export: directory prefix + file name
path = './'
file = 'Batch_Req_Report_Sample.xml'
file_path = f'{path}{file}'

In [12]:
# read the XML export and serialize it to a JSON string
# - the file is opened in binary mode so the XML parser receives the raw bytes
#   instead of text decoded with the platform's default encoding
# - the with-block closes the file on exit, so no explicit close() is needed
with open(file_path, 'rb') as xml_file:
    my_dict = xmltodict.parse(xml_file.read())
json_data = json.dumps(my_dict)
json_data

'{"VisionDataExchange": {"@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "Credentials": {"Client": "client_rtg_base", "Version": "4.3"}, "ExportData": {"Requisition": {"@clientReqId": "21", "Title": "ReqExportTest", "SalaryLow": "888", "SalaryHigh": "999", "RecruitStartDate": "2008-12-26 00:00:00", "RecruitEndDate": null, "TargetStartDate": "2009-03-03 00:00:00", "CandFillCount": "50", "WorkHours": "40", "WorkHourUnit": "Week", "TravelRequired": {"@code": "1", "#text": "Yes"}, "TravelPercent": "65", "EssentialFunction": "position requirements", "Description": "position description", "WorkEnv": null, "Active": {"@code": "1", "#text": "Yes"}, "FolderActivityDate": "2008-12-22 17:15:46", "Posted": {"@code": "0", "#text": "No"}, "Status": {"@code": "OPEN", "#text": "Open"}, "Workflow": {"@code": "RENE_WK", "#text": "Rene workflow"}, "Priority": {"@code": "MEDIUM", "#text": "Medium Priority"}, "Location": {"@code": "RALEIGH_NC", "#text": "Raleigh, NC"}, "Experience": {"@code": "L

In [14]:
# swap the JSON literal null for Python's None so the text reads as a Python literal
# NOTE(review): str.replace rewrites every occurrence of 'null', including any
# inside string values, and JSON true/false are not handled here
jd = json_data.replace('null', 'None')

In [16]:
# parse the JSON string into a python dictionary
# json.loads is used instead of eval on the null->None text: it maps null to None
# (and true/false to True/False) itself, leaves string values untouched, and
# never executes the file's contents as python code
jd = json.loads(json_data)
jd

{'VisionDataExchange': {'@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  'Credentials': {'Client': 'client_rtg_base', 'Version': '4.3'},
  'ExportData': {'Requisition': {'@clientReqId': '21',
    'Title': 'ReqExportTest',
    'SalaryLow': '888',
    'SalaryHigh': '999',
    'RecruitStartDate': '2008-12-26 00:00:00',
    'RecruitEndDate': None,
    'TargetStartDate': '2009-03-03 00:00:00',
    'CandFillCount': '50',
    'WorkHours': '40',
    'WorkHourUnit': 'Week',
    'TravelRequired': {'@code': '1', '#text': 'Yes'},
    'TravelPercent': '65',
    'EssentialFunction': 'position requirements',
    'Description': 'position description',
    'WorkEnv': None,
    'Active': {'@code': '1', '#text': 'Yes'},
    'FolderActivityDate': '2008-12-22 17:15:46',
    'Posted': {'@code': '0', '#text': 'No'},
    'Status': {'@code': 'OPEN', '#text': 'Open'},
    'Workflow': {'@code': 'RENE_WK', '#text': 'Rene workflow'},
    'Priority': {'@code': 'MEDIUM', '#text': 'Medium Priority'},
    

In [17]:
# confirm that jd is a python dictionary
type(jd)

dict

In [19]:
# duplicate the data to simulate a collection of six records
# each record is an independent deep copy: repeating the same dict object would
# make all six entries aliases of one another, so any in-place change (such as
# the _id PyMongo adds to a document on insert) would show up in every record
from copy import deepcopy

data = [deepcopy(jd) for _ in range(6)]

In [20]:
# verify that data is a python list
type(data)

list

In [21]:
# spot-check one element to verify that 'data' is a list of dictionaries,
# each representing a record
data[4]

{'VisionDataExchange': {'@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  'Credentials': {'Client': 'client_rtg_base', 'Version': '4.3'},
  'ExportData': {'Requisition': {'@clientReqId': '21',
    'Title': 'ReqExportTest',
    'SalaryLow': '888',
    'SalaryHigh': '999',
    'RecruitStartDate': '2008-12-26 00:00:00',
    'RecruitEndDate': None,
    'TargetStartDate': '2009-03-03 00:00:00',
    'CandFillCount': '50',
    'WorkHours': '40',
    'WorkHourUnit': 'Week',
    'TravelRequired': {'@code': '1', '#text': 'Yes'},
    'TravelPercent': '65',
    'EssentialFunction': 'position requirements',
    'Description': 'position description',
    'WorkEnv': None,
    'Active': {'@code': '1', '#text': 'Yes'},
    'FolderActivityDate': '2008-12-22 17:15:46',
    'Posted': {'@code': '0', '#text': 'No'},
    'Status': {'@code': 'OPEN', '#text': 'Open'},
    'Workflow': {'@code': 'RENE_WK', '#text': 'Rene workflow'},
    'Priority': {'@code': 'MEDIUM', '#text': 'Medium Priority'},
    

In [24]:
# remove the unnecessary first few levels
def get_data(data):
    '''Return the 'Requisition' dictionary nested inside each export record.

    data: iterable of dicts shaped like
        {'VisionDataExchange': {'ExportData': {'Requisition': {...}}}}
    returns: list with each record's 'Requisition' value, in input order
    raises: KeyError if a record lacks one of the three wrapper keys
    '''
    requisitions = []
    for record in data:
        export = record['VisionDataExchange']['ExportData']
        requisitions.append(export['Requisition'])
    return requisitions

In [27]:
# run function, check results
# NOTE(review): this rebinds `data` to the unwrapped records, so running the cell
# a second time looks up 'VisionDataExchange' on already-unwrapped dicts and
# raises KeyError; re-run the cell that builds `data` first
data = get_data(data)
data[0]

{'@clientReqId': '21',
 'Title': 'ReqExportTest',
 'SalaryLow': '888',
 'SalaryHigh': '999',
 'RecruitStartDate': '2008-12-26 00:00:00',
 'RecruitEndDate': None,
 'TargetStartDate': '2009-03-03 00:00:00',
 'CandFillCount': '50',
 'WorkHours': '40',
 'WorkHourUnit': 'Week',
 'TravelRequired': {'@code': '1', '#text': 'Yes'},
 'TravelPercent': '65',
 'EssentialFunction': 'position requirements',
 'Description': 'position description',
 'WorkEnv': None,
 'Active': {'@code': '1', '#text': 'Yes'},
 'FolderActivityDate': '2008-12-22 17:15:46',
 'Posted': {'@code': '0', '#text': 'No'},
 'Status': {'@code': 'OPEN', '#text': 'Open'},
 'Workflow': {'@code': 'RENE_WK', '#text': 'Rene workflow'},
 'Priority': {'@code': 'MEDIUM', '#text': 'Medium Priority'},
 'Location': {'@code': 'RALEIGH_NC', '#text': 'Raleigh, NC'},
 'Experience': {'@code': 'LESS_THAN_ONE_YEAR', '#text': 'Less than 1 year'},
 'Duration': {'@code': 'FULLTIMEREG', '#text': 'Full-Time Regular'},
 'Education': {'@code': 'ASSOCIATES',

## Modifying Datatypes on a Set of Records
- The list `data` (one dictionary per record) represents a realistic XML data export
- The goal of this section is to convert datatypes from all strings to appropriate types prior to inserting in MongoDB
    - This particular example really only includes `int` and `datetime` datatypes
    - Also creating Month, Year, and YYYYMM fields

In [28]:
# import relevant packages
from datetime import datetime as dt

In [29]:
# inspect the raw 'Created' date of the first record (a string at this point)
data[0]['Created']['Date']

'2008-12-22 17:15:46'

In [None]:
def convert_add_dates(data):
    '''This function accepts '''