### NOTE: Missing Values in Date fields with MongoDB
When you try to save a DataFrame that includes a column of datetime values and
some of those values are missing, pymongo raises "ValueError: NaTType does not support utcoffset".  This is because, for datetime values, it first calls utcoffset on each value, but NaT (Not a Time -- the designator for missing values in datetime columns) does not support having utcoffset called on it.

To solve this, I am going to modify the "dataframe_to_mongo()" function to test for any columns of datetime type with missing values.  If it finds any of those, it will save those missing value rows separately and save them without the datetime column.

The key to understanding this solution is that MongoDB is a NoSQL database, so all the rows do __not__ have to have the same fields -- it is open structure and there is no table schema like in SQL databases.  So I will save the rows that have the datetime value so that they _include_ the datetime column, but save the rows that are _missing_ the datetime value so that they _exclude_ the datetime column.  That way, I will avoid the ValueError.

In [146]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication.

    Parameters:
        db_name: name of the database to return; when credentials are given
            it is also the database named in the connection URI.
        host: server hostname or address.
        port: server port; used in both the authenticated and the
            unauthenticated case.
        username, password: credentials; authentication is only attempted
            when both are non-empty.

    Returns:
        The pymongo database object ``conn[db_name]``.
    '''
    if username and password:
        # Imported locally so this function does not depend on the
        # file-level import block.
        from urllib.parse import quote_plus

        # Credentials must be percent-escaped, otherwise characters such as
        # '@', ':' or '/' in a password corrupt the URI.
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), host, db_name)
        # Pass the port too: MongoClient uses it as the default port for
        # URI hosts that do not carry an explicit one, so a non-default
        # port is honoured for authenticated connections as well.
        conn = MongoClient(mongo_uri, port)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query=None, host='localhost',
                       port=27017, username=None, password=None, no_id=True, datecols=None):
    '''Create a Pandas DataFrame from MongoDB collection.

    Parameters:
        db_name, collection_name: where to read from.
        query: filter document passed to ``find``; ``None`` (the default)
            selects every document in the collection.
        host, port, username, password: forwarded to get_mongo_database().
        no_id: when True, drop MongoDB's '_id' column from the result.
        datecols: accepted for symmetry with dataframe_to_mongo() but not
            used by this function.

    Returns:
        A DataFrame with one row per matching document (empty when nothing
        matches).
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # A fresh dict per call avoids sharing one mutable default between calls.
    cursor = db[collection_name].find({} if query is None else query)
    df = pd.DataFrame(list(cursor))
    # An empty result has no '_id' column, so only delete it when present.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None, datecols=None):
    '''Save a dataframe to a mongodb collection, one document per row.

    Parameters:
        dframe: the DataFrame to save.
        db_name, collection_name: where to write.
        host, port, username, password: forwarded to get_mongo_database().
        datecols: name (or list of names) of datetime columns that may hold
            missing values (NaT).  Rows where every listed column is present
            are inserted first, unchanged.  Rows with a missing value in any
            listed column are inserted afterwards, with the missing date
            fields left out of the document so pymongo never sees a NaT.

    Returns:
        None.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    collection = db[collection_name]
    if datecols:
        if isinstance(datecols, str):
            datecols = [datecols]
        date_names = set(datecols)
        # One boolean per row: True only when *all* date columns are present.
        # (A 2-D mask would select rows once per column.)
        complete = dframe.loc[:, list(datecols)].notnull().all(axis=1)
        # 1. rows with valid date entries in datecols:
        valids = dframe[complete].to_dict('records')  # list-of-dicts format
        # 2. rows *without* valid date entries in datecols: drop only the
        #    date fields that are actually missing from each document.
        invalids = [
            {key: value for key, value in record.items()
             if not (key in date_names and pd.isnull(value))}
            for record in dframe[~complete].to_dict('records')
        ]
        batches = [valids, invalids]
    else:
        batches = [dframe.to_dict('records')]  # 'records' -> list-of-dicts format
    for batch in batches:
        # insert_many rejects an empty list, so skip batches with no rows.
        if batch:
            collection.insert_many(batch)
        
def delete_collection(db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''Remove every document from the named collection (the collection itself is kept).'''
    target = get_mongo_database(db_name, host, port, username, password)[collection_name]
    # An empty filter matches all documents.
    target.delete_many({})



# Test DB

In [147]:
# Sample data: five people, one of whom has a non-date value ('living').
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
death_date = ['2007-11-01', 'living', '2001-01-02', '2005-10-06', '2003-04-25']
# Pair each name with its date string, one (name, death_date) tuple per row.
DeathDataSet = [(person, died) for person, died in zip(names, death_date)]
df = pd.DataFrame.from_records(DeathDataSet, columns=['name', 'death_date'])
df

Unnamed: 0,name,death_date
0,Bob,2007-11-01
1,Jessica,living
2,Mary,2001-01-02
3,John,2005-10-06
4,Mel,2003-04-25


In [148]:
# Parse the date strings; values that cannot be parsed become NaT.
df['death_date'] = pd.to_datetime(df['death_date'], errors='coerce')
df

Unnamed: 0,name,death_date
0,Bob,2007-11-01
1,Jessica,NaT
2,Mary,2001-01-02
3,John,2005-10-06
4,Mel,2003-04-25


In [149]:
datecols = ['death_date']
# Reduce the per-column null checks to a single boolean per row, so the mask
# stays a valid row selector even when datecols lists more than one column.
i = df.loc[:, datecols].notnull().all(axis=1)
df[i]

Unnamed: 0,name,death_date
0,Bob,2007-11-01
2,Mary,2001-01-02
3,John,2005-10-06
4,Mel,2003-04-25


# Test the mongodb functions

In [152]:
# Round-trip check: write df to MongoDB, read it back, and compare by eye.
db = get_mongo_database('test_db')  # create or open the DB

# Clear DB so earlier runs do not leave extra documents in the collection
db['test_collection'].delete_many({}) # deletes everything (no filter)

# death_date holds a NaT, so it is named in datecols
dataframe_to_mongo(df, 'test_db', 'test_collection', 
                   datecols=['death_date']) # save to specified db/collection

return_df = mongo_to_dataframe('test_db', 'test_collection') # read back out

# Compare the returned with the original
print("original\n{}\n".format(df))
print("return\n{}\n".format(return_df))

# Clear DB for next test
db['test_collection'].delete_many({}) # deletes everything (no filter)

# Show the column dtypes and non-null counts of the frame read back
return_df.info()


original
      name death_date
0      Bob 2007-11-01
1  Jessica        NaT
2     Mary 2001-01-02
3     John 2005-10-06
4      Mel 2003-04-25

return
  death_date     name
0 2007-11-01      Bob
1 2001-01-02     Mary
2 2005-10-06     John
3 2003-04-25      Mel
4        NaT  Jessica

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
death_date    4 non-null datetime64[ns]
name          5 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 152.0+ bytes


# Postscript: Converting to datetime
The pandas "to_datetime()" function can treat errors in one of three ways:
1. errors='raise' -- Raise a ValueError for values that cannot be converted to datetime values
2. errors='coerce' -- replace values that cannot be converted into "NaT" (not a time)
3. errors='ignore' -- just leave the unconvertible values as they are.

In [153]:
# Rebuild df from the raw strings so death_date is un-parsed text again.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
death_date = ['2007-11-01', 'living', '2001-01-02', '2005-10-06', '2003-04-25']
DeathDataSet = [(person, died) for person, died in zip(names, death_date)]
df = pd.DataFrame.from_records(DeathDataSet, columns=['name', 'death_date'])

## 1. 'Raise'

In [154]:
# With errors='raise' the first unparseable value aborts the conversion.
pd.to_datetime(df['death_date'], errors='raise')

ValueError: Unknown string format

In [155]:
# If an error was raised, one way to find the offending rows is to try
# converting each value on its own and report the ones that fail:
for i, row in df.iterrows():
    try:
        pd.to_datetime(row.death_date, errors='raise')
    except (ValueError, TypeError):
        # Catch only conversion failures so that unrelated errors (and
        # Ctrl-C) still surface.  str() keeps the report working when the
        # bad value is not a string.
        print('{}({}, {})'.format(str(row.death_date).ljust(30), row['name'], i))

#
# Alternate, better way (since it doesn't iterate over rows)
# (Anytime you iterate over rows, you probably aren't doing it right)

# Keep only rows that have some value, then flag those that fail to parse.
with_death_dates = df[df.death_date.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.death_date, errors='coerce'))
with_death_dates[bad_dates][['name', 'death_date']]

living                        (Jessica, 1)


Unnamed: 0,name,death_date
1,Jessica,living


## 2. Coerce

In [156]:
# Conversion succeeds, but the NaT that replaces the bad value will make
# pymongo raise an error when the column is saved.
pd.to_datetime(df['death_date'], errors='coerce')

0   2007-11-01
1          NaT
2   2001-01-02
3   2005-10-06
4   2003-04-25
Name: death_date, dtype: datetime64[ns]

## 3. Ignore

In [157]:
# The 'living' string is left in place; note that the resulting Series
# therefore has dtype 'object', not 'datetime64[ns]'.
pd.to_datetime(df['death_date'], errors='ignore')

0    2007-11-01
1        living
2    2001-01-02
3    2005-10-06
4    2003-04-25
Name: death_date, dtype: object