## Load a single bird's data from Mongo

We're assuming that `mongod` is running and pointing to a database with the BIRT data. In my case, that's `mongod --dbpath /Volumes/Transcend/data/db`.

In [18]:
# ! mongod --dbpath /Volumes/Transcend/data/db --fork --logpath ~/Library/Logs/mongodb.log

In [19]:
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import pandas as pd
import os
from pymongo import MongoClient

In [20]:
# Connect to mongod (no arguments, so the driver's defaults are used) and
# select the BIRT database by name.
client = MongoClient()
db = client['birt']

In [21]:
# Sanity check: list the collections to confirm we are connected to the right
# database. `Database.collection_names()` is deprecated and was removed in
# PyMongo 4.0; `list_collection_names()` (PyMongo >= 3.6) is its replacement.
db.list_collection_names()

['migrations', 'birds', 'halunka:i18n']

In [22]:
# Handle to the `birds` collection.
birds = db['birds']

So, now we have `birds`, a collection of eBird species (taxonomy) records — as the output below shows, each document describes one species rather than an individual sighting. The documentation for the `Collection()` class is [here](https://api.mongodb.org/python/current/api/pymongo/collection.html#pymongo.collection.Collection). `birds.find_one()` will get us the first record so we can take a look at the structure.

In [23]:
# Fetch a single document from `birds` to inspect which fields it carries.
birds.find_one()

{'_id': 'abeillia_abeillei',
 'category': 'species',
 'family_name': 'Trochilidae (Hummingbirds)',
 'genus_name': 'Abeillia',
 'order_name': 'Apodiformes',
 'primary_com_name': 'Emerald-chinned_Hummingbird',
 'species_name': 'abeillei',
 'subfamily_name': None,
 'taxon_order': 9016.0}

In [24]:
# Handle to the `migrations` collection.
migrations = db['migrations']

Below, we examine a single `migrations` document. The layout is awkward to index: at first glance it doesn't appear to have a "sightings" array. Instead, it stores all of the variables (covariates) as flat top-level fields, alongside one field per species whose value is the number of birds counted at that location.

So, for each bird, I should:

1. Find all documents with that species name.
2. Extract all covariates for those locations; the "target", in `scikit-learn` terminology, is the number of birds.
3. Load all background covariates.

In [25]:
# Fetch a single document from `migrations` to inspect its fields.
migrations.find_one()

{'_id': 'S10000010',
 'agelaius_phoeniceus': 2,
 'baeolophus_bicolor': 4,
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'bucephala_clangula': 9,
 'cardinalis_cardinalis': 1,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'corvus_brachyrhynchos': 2,
 'count_type': 'P22',
 'country': 'United_States',
 'cyanocitta_cristata': 7,
 'date': datetime.datetime(2012, 2, 23, 0, 0),
 'day': 54,
 'effort_area_ha': 0.0,
 'effort_distance_km': 1.931,
 'effort_hrs': 0.83,
 'elev_gt': 182,
 'elev_ned': None,
 'group_id': None,
 'haemorhous_mexicanus': 2,
 'haliaeetus_leucocephalus': 3,
 'housing_density': None,
 'housing_percent_vacant': None,
 'larus_argentatus': 3,
 'larus_delawarensis': 81,
 'loc': {'coordinates': [-83.911171, 43.6727141], 'type': 'Point'},
 'lophodytes_cucullatus': 4,
 'mergus_merganser': 87,
 'month': 2,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22

In [26]:
# Match a document whose `sightings` array has an entry for this species; the
# positional `sightings.$` projection keeps only the matching array element.
# Next step: add the core covariates to `projection` as well.
migrations.find_one(
    {'sightings.bird_id': 'zenaida_macroura'},
    projection=['sightings.$'],
)

{'_id': 'S10000010',
 'sightings': [{'bird_id': 'zenaida_macroura', 'count': 1}]}

In [27]:
# Read the raw lines of the core-covariates names file. The context manager
# closes the file handle deterministically instead of leaking it.
with open('../data/core-covariates.names') as names_file:
    core_covariates = names_file.readlines()

In [28]:
# Keep only the text before the first ":" on each line, lower-cased, as the
# covariate name (presumably lines look like "<NAME>: <type>" -- confirm
# against the file). Names are stripped of surrounding whitespace and blank
# lines are skipped so no empty or newline-only field names reach the Mongo
# projection. The context manager closes the file handle.
with open('../data/core-covariates.names') as names_file:
    core_covariates = [line.split(":")[0].strip().lower()
                       for line in names_file
                       if line.strip()]

## Planning

We need to decide which bird to use as a demo. The `Species_Analysis_Matrix_V1` document lists species and various properties.

In [29]:
# Display the parsed list of covariate names.
core_covariates

['sampling_event_id',
 'pop00_sqmi',
 'housing_density',
 'housing_percent_vacant',
 'elev_gt',
 'elev_ned',
 'bcr',
 'bailey_ecoregion',
 'omernik_l3_ecoregion',
 'caus_temp_avg',
 'caus_temp_min',
 'caus_temp_max',
 'caus_prec',
 'caus_snow',
 'nlcd2001_fs_c11_7500_pland',
 'nlcd2001_fs_c12_7500_pland',
 'nlcd2001_fs_c21_7500_pland',
 'nlcd2001_fs_c22_7500_pland',
 'nlcd2001_fs_c23_7500_pland',
 'nlcd2001_fs_c24_7500_pland',
 'nlcd2001_fs_c31_7500_pland',
 'nlcd2001_fs_c41_7500_pland',
 'nlcd2001_fs_c42_7500_pland',
 'nlcd2001_fs_c43_7500_pland',
 'nlcd2001_fs_c52_7500_pland',
 'nlcd2001_fs_c71_7500_pland',
 'nlcd2001_fs_c81_7500_pland',
 'nlcd2001_fs_c82_7500_pland',
 'nlcd2001_fs_c90_7500_pland',
 'nlcd2001_fs_c95_7500_pland',
 'nlcd2006_fs_c11_7500_pland',
 'nlcd2006_fs_c12_7500_pland',
 'nlcd2006_fs_c21_7500_pland',
 'nlcd2006_fs_c22_7500_pland',
 'nlcd2006_fs_c23_7500_pland',
 'nlcd2006_fs_c24_7500_pland',
 'nlcd2006_fs_c31_7500_pland',
 'nlcd2006_fs_c41_7500_pland',
 'nlcd2006_

In [30]:
# Same species query as before, but the projection now also includes every
# core covariate alongside the matching `sightings` element.
migrations.find_one(
    {'sightings.bird_id': 'zenaida_macroura'},
    projection=['sightings.$'] + core_covariates,
)

{'_id': 'S10000010',
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'elev_gt': 182,
 'elev_ned': None,
 'housing_density': None,
 'housing_percent_vacant': None,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22_7500_pland': 9.4952,
 'nlcd2001_fs_c23_7500_pland': 4.6392,
 'nlcd2001_fs_c24_7500_pland': 1.5504,
 'nlcd2001_fs_c31_7500_pland': 1.4828,
 'nlcd2001_fs_c41_7500_pland': 4.1796,
 'nlcd2001_fs_c42_7500_pland': 0.1372,
 'nlcd2001_fs_c43_7500_pland': 0.2728,
 'nlcd2001_fs_c52_7500_pland': 0.0636,
 'nlcd2001_fs_c71_7500_pland': 0.8444,
 'nlcd2001_fs_c81_7500_pland': 2.272,
 'nlcd2001_fs_c82_7500_pland': 22.8724,
 'nlcd2001_fs_c90_7500_pland': 5.65,
 'nlcd2001_fs_c95_7500_pland': 1.5636,
 'nlcd2006_fs_c11_7500_pland': 38.3464,
 'nlcd2006_fs_c12_7500_pland': 0.0,
 'nlcd2006_fs_c21_7500_pland': 7.15,
 'nlcd2006_fs

In [31]:
# Alternative: query on the flat per-species count field rather than the
# `sightings` array. Only the second call's result is displayed, because a
# notebook cell shows just its last expression.
migrations.find_one(
    {'zenaida_macroura': {'$gt': 0}},
    projection=['zenaida_macroura'] + core_covariates,
)
# Presumably equivalent: `$exists` matches the same documents as `$gt: 0` only
# if the field is absent (not stored as 0 or null) when the species was not
# observed -- TODO confirm against the data.
migrations.find_one(
    {'zenaida_macroura': {'$exists': True}},
    projection=['zenaida_macroura'] + core_covariates,
)

{'_id': 'S10000010',
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'elev_gt': 182,
 'elev_ned': None,
 'housing_density': None,
 'housing_percent_vacant': None,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22_7500_pland': 9.4952,
 'nlcd2001_fs_c23_7500_pland': 4.6392,
 'nlcd2001_fs_c24_7500_pland': 1.5504,
 'nlcd2001_fs_c31_7500_pland': 1.4828,
 'nlcd2001_fs_c41_7500_pland': 4.1796,
 'nlcd2001_fs_c42_7500_pland': 0.1372,
 'nlcd2001_fs_c43_7500_pland': 0.2728,
 'nlcd2001_fs_c52_7500_pland': 0.0636,
 'nlcd2001_fs_c71_7500_pland': 0.8444,
 'nlcd2001_fs_c81_7500_pland': 2.272,
 'nlcd2001_fs_c82_7500_pland': 22.8724,
 'nlcd2001_fs_c90_7500_pland': 5.65,
 'nlcd2001_fs_c95_7500_pland': 1.5636,
 'nlcd2006_fs_c11_7500_pland': 38.3464,
 'nlcd2006_fs_c12_7500_pland': 0.0,
 'nlcd2006_fs_c21_7500_pland': 7.15,
 'nlcd2006_fs

In [32]:
# Inclusion-style projection document: map the species field and every core
# covariate to 1 so that `$project` keeps exactly those fields.
projection = {field: 1 for field in ['zenaida_macroura'] + core_covariates}
projection

{'bailey_ecoregion': 1,
 'bcr': 1,
 'caus_prec': 1,
 'caus_snow': 1,
 'caus_temp_avg': 1,
 'caus_temp_max': 1,
 'caus_temp_min': 1,
 'elev_gt': 1,
 'elev_ned': 1,
 'housing_density': 1,
 'housing_percent_vacant': 1,
 'nlcd2001_fs_c11_7500_pland': 1,
 'nlcd2001_fs_c12_7500_pland': 1,
 'nlcd2001_fs_c21_7500_pland': 1,
 'nlcd2001_fs_c22_7500_pland': 1,
 'nlcd2001_fs_c23_7500_pland': 1,
 'nlcd2001_fs_c24_7500_pland': 1,
 'nlcd2001_fs_c31_7500_pland': 1,
 'nlcd2001_fs_c41_7500_pland': 1,
 'nlcd2001_fs_c42_7500_pland': 1,
 'nlcd2001_fs_c43_7500_pland': 1,
 'nlcd2001_fs_c52_7500_pland': 1,
 'nlcd2001_fs_c71_7500_pland': 1,
 'nlcd2001_fs_c81_7500_pland': 1,
 'nlcd2001_fs_c82_7500_pland': 1,
 'nlcd2001_fs_c90_7500_pland': 1,
 'nlcd2001_fs_c95_7500_pland': 1,
 'nlcd2006_fs_c11_7500_pland': 1,
 'nlcd2006_fs_c12_7500_pland': 1,
 'nlcd2006_fs_c21_7500_pland': 1,
 'nlcd2006_fs_c22_7500_pland': 1,
 'nlcd2006_fs_c23_7500_pland': 1,
 'nlcd2006_fs_c24_7500_pland': 1,
 'nlcd2006_fs_c31_7500_pland': 1,
 '

## Actually, we want to subsample ALL checklists, and then select our birds.



In [33]:
# Randomly sample 1000 documents from `migrations`, then keep only the fields
# named in `projection` (the species of choice plus the core covariates).
zen = migrations.aggregate(
    [
        {'$sample': {'size': 1000}},
        {'$project': projection},  # '$project' selects our bird of choice.
    ]
)

# Build the frame in one chain instead of mutating it in place:
#   * fillna(0) replaces every missing value with 0 (for the species column,
#     a missing value becomes a count of zero);
#   * drop(..., axis=1) removes the Mongo `_id` column. `axis` is passed by
#     keyword because pandas >= 2.0 no longer accepts it positionally.
zen_df = (
    pd.DataFrame(list(zen))
    .fillna(0)
    .drop('_id', axis=1)
)

In [34]:
# Preview the first rows only, rather than dumping the whole sampled frame.
zen_df.head(10)

Unnamed: 0,bailey_ecoregion,bcr,caus_prec,caus_snow,caus_temp_avg,caus_temp_max,caus_temp_min,elev_gt,elev_ned,housing_density,...,nlcd2006_fs_c43_7500_pland,nlcd2006_fs_c52_7500_pland,nlcd2006_fs_c71_7500_pland,nlcd2006_fs_c81_7500_pland,nlcd2006_fs_c82_7500_pland,nlcd2006_fs_c90_7500_pland,nlcd2006_fs_c95_7500_pland,omernik_l3_ecoregion,pop00_sqmi,zenaida_macroura
0,-221D,29.0,6.0,0.0,6.0,6.0,7.0,244.0,268.48,195.266147,...,2.0092,3.7992,0.1576,13.5100,16.3952,1.0496,0.0720,64.0,509.4,2.0
1,M221D,28.0,6.0,0.0,6.0,6.0,7.0,693.0,815.65,16.730378,...,4.2512,0.3008,0.8592,28.8712,0.2532,0.2240,0.0208,66.0,40.1,1.0
2,-212F,28.0,6.0,0.0,6.0,6.0,7.0,404.0,0.00,50.273016,...,13.6540,6.8792,0.6808,23.4296,9.0236,9.2432,0.4060,60.0,125.9,0.0
3,-222K,23.0,6.0,0.0,5.0,5.0,6.0,259.0,0.00,165.847206,...,0.0268,0.3684,0.2664,6.5316,22.2844,1.9944,4.9684,53.0,346.0,0.0
4,-232A,30.0,6.0,1.0,5.0,5.0,6.0,1.0,0.00,19.930028,...,0.1940,1.0204,0.0304,0.0232,0.7548,2.1176,23.9364,63.0,32.9,0.0
5,-232B,27.0,6.0,0.0,7.0,8.0,8.0,79.0,0.00,3.107606,...,0.7376,7.1512,2.9280,0.1256,2.8336,13.9644,0.8968,65.0,7.6,0.0
6,-261A,32.0,4.0,1.0,5.0,5.0,6.0,10.0,0.00,28.675641,...,3.6108,5.9828,53.2012,0.0000,4.9388,0.1188,5.9460,6.0,76.7,7.0
7,M261B,32.0,7.0,1.0,4.0,4.0,5.0,259.0,0.00,6.113849,...,10.7104,35.7880,9.4908,0.0000,10.3444,0.1840,0.0808,6.0,15.7,1.0
8,M221A,28.0,6.0,0.0,6.0,6.0,7.0,1000.0,1061.90,8.579772,...,1.5028,0.0000,0.0000,6.2232,0.4716,0.0000,0.0000,66.0,17.9,0.0
9,M221D,28.0,6.0,0.0,7.0,7.0,7.0,693.0,815.65,16.730378,...,4.2512,0.3008,0.8592,28.8712,0.2532,0.2240,0.0208,66.0,40.1,2.0


In [57]:
# Select features and target by column NAME rather than by position: the
# positional slice `1:45` / `-1` only works when the columns happen to be in
# alphabetical order, which depends on the pandas/Python version.
# Features: everything except the categorical predictor `bailey_ecoregion`
# and the target column. Target: the count for the species.
X = np.array(zen_df.drop(['bailey_ecoregion', 'zenaida_macroura'], axis=1))
y = np.array(zen_df['zenaida_macroura'])

In [58]:
# Display the feature matrix built from `zen_df`.
X

array([[  2.90000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          7.20000000e-02,   6.40000000e+01,   5.09400000e+02],
       [  2.80000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          2.08000000e-02,   6.60000000e+01,   4.01000000e+01],
       [  2.80000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          4.06000000e-01,   6.00000000e+01,   1.25900000e+02],
       ..., 
       [  1.30000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          4.80000000e-02,   8.30000000e+01,   0.00000000e+00],
       [  2.80000000e+01,   7.00000000e+00,   1.00000000e+00, ...,
          6.76000000e-02,   6.60000000e+01,   1.18900000e+02],
       [  2.30000000e+01,   5.00000000e+00,   1.00000000e+00, ...,
          6.15200000e-01,   5.60000000e+01,   5.17630000e+03]])

In [59]:
# Example AdaBoost regression: boost 300 depth-4 regression trees on the
# sampled covariates (X) against the species count (y). `fit` returns the
# estimator itself, so construction and fitting are chained.
zen_regr = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=300,
).fit(X, y)

# Show the fitted estimator's parameters.
zen_regr

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=300,
         random_state=None)

In [63]:
import pickle

# Persist the fitted model to disk. The context manager closes (and therefore
# flushes) the file handle; the original left it open, so the pickle could be
# left incomplete on disk until the kernel exits.
with open('/Volumes/Transcend/birt data/zen_regr.p', 'wb') as model_file:
    pickle.dump(zen_regr, model_file)

Okay! We've now gotten to the point where we're fitting a model for a bird species! Next time: I will load the core covariates CSV and plot the data.