## Load a single bird's data from Mongo

We're assuming that `mongod` is running and pointing to a database with the BIRT data. In my case, that's `mongod --dbpath /Volumes/Transcend/data/db`.

In [2]:
# ! mongod --dbpath /Volumes/Transcend/data/db --fork --logpath ~/Library/Logs/mongodb.log

In [2]:
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import pandas as pd
import os
from pymongo import MongoClient
import pickle

In [3]:
client = MongoClient()
db = client.birt

In [4]:
# Make sure we've got the thing hooked up right.
db.collection_names()

['migrations', 'birds', 'halunka:i18n']

In [5]:
birds = db.birds

So, now we have `birds`, which is a collection of the eBird sightings. The documentation for the `Collection()` class is [here](https://api.mongodb.org/python/current/api/pymongo/collection.html#pymongo.collection.Collection). `birds.find_one()` will get us the first record to take a look at the structure.

In [6]:
birds.find_one()

{'_id': 'abeillia_abeillei',
 'category': 'species',
 'family_name': 'Trochilidae (Hummingbirds)',
 'genus_name': 'Abeillia',
 'order_name': 'Apodiformes',
 'primary_com_name': 'Emerald-chinned_Hummingbird',
 'species_name': 'abeillei',
 'subfamily_name': None,
 'taxon_order': 9016.0}

In [7]:
migrations = db.migrations

Below, we examine a single `migrations` document. It includes... gosh, I think this data is pretty shittily organized. How can I index this? It doesn't have like, a "sightings" array. It has just, all the variables, and then the number of birds at that location.

So, for each bird, I should:

1. Find all documents with that species name.
2. Extract all covariates for those locations, and the "target", in `scikit-learn` terminology, is the number of birds.
3. Load all background covariates

In [8]:
migrations.find_one()

{'_id': 'S10000010',
 'agelaius_phoeniceus': 2,
 'baeolophus_bicolor': 4,
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'bucephala_clangula': 9,
 'cardinalis_cardinalis': 1,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'corvus_brachyrhynchos': 2,
 'count_type': 'P22',
 'country': 'United_States',
 'cyanocitta_cristata': 7,
 'date': datetime.datetime(2012, 2, 23, 0, 0),
 'day': 54,
 'effort_area_ha': 0.0,
 'effort_distance_km': 1.931,
 'effort_hrs': 0.83,
 'elev_gt': 182,
 'elev_ned': None,
 'group_id': None,
 'haemorhous_mexicanus': 2,
 'haliaeetus_leucocephalus': 3,
 'housing_density': None,
 'housing_percent_vacant': None,
 'larus_argentatus': 3,
 'larus_delawarensis': 81,
 'loc': {'coordinates': [-83.911171, 43.6727141], 'type': 'Point'},
 'lophodytes_cucullatus': 4,
 'mergus_merganser': 87,
 'month': 2,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22

In [10]:
migrations.find_one(filter={'sightings.bird_id': 'zenaida_macroura'},
                    projection=['sightings.$'])

# And then I'll just add the core_covariates to `projection`!!!

{'_id': 'S10000010',
 'sightings': [{'bird_id': 'zenaida_macroura', 'count': 1}]}

In [11]:
# So, I need to read in the list of core covariates.
core_covariates = open('../data/core-covariates.names').readlines()

In [12]:
core_covariates = [cv.split(":")[0].lower() for cv in\
                   open('../data/core-covariates.names').readlines()]

## Planning

We need to decide which bird to use as a demo. The `Species_Analysis_Matrix_V1` document lists species and various properties.

In [13]:
core_covariates

['sampling_event_id',
 'pop00_sqmi',
 'housing_density',
 'housing_percent_vacant',
 'elev_gt',
 'elev_ned',
 'bcr',
 'bailey_ecoregion',
 'omernik_l3_ecoregion',
 'caus_temp_avg',
 'caus_temp_min',
 'caus_temp_max',
 'caus_prec',
 'caus_snow',
 'nlcd2001_fs_c11_7500_pland',
 'nlcd2001_fs_c12_7500_pland',
 'nlcd2001_fs_c21_7500_pland',
 'nlcd2001_fs_c22_7500_pland',
 'nlcd2001_fs_c23_7500_pland',
 'nlcd2001_fs_c24_7500_pland',
 'nlcd2001_fs_c31_7500_pland',
 'nlcd2001_fs_c41_7500_pland',
 'nlcd2001_fs_c42_7500_pland',
 'nlcd2001_fs_c43_7500_pland',
 'nlcd2001_fs_c52_7500_pland',
 'nlcd2001_fs_c71_7500_pland',
 'nlcd2001_fs_c81_7500_pland',
 'nlcd2001_fs_c82_7500_pland',
 'nlcd2001_fs_c90_7500_pland',
 'nlcd2001_fs_c95_7500_pland',
 'nlcd2006_fs_c11_7500_pland',
 'nlcd2006_fs_c12_7500_pland',
 'nlcd2006_fs_c21_7500_pland',
 'nlcd2006_fs_c22_7500_pland',
 'nlcd2006_fs_c23_7500_pland',
 'nlcd2006_fs_c24_7500_pland',
 'nlcd2006_fs_c31_7500_pland',
 'nlcd2006_fs_c41_7500_pland',
 'nlcd2006_

In [14]:
migrations.find_one(filter={'sightings.bird_id' : 'zenaida_macroura'},
                    projection=['sightings.$'] + core_covariates)

{'_id': 'S10000010',
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'elev_gt': 182,
 'elev_ned': None,
 'housing_density': None,
 'housing_percent_vacant': None,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22_7500_pland': 9.4952,
 'nlcd2001_fs_c23_7500_pland': 4.6392,
 'nlcd2001_fs_c24_7500_pland': 1.5504,
 'nlcd2001_fs_c31_7500_pland': 1.4828,
 'nlcd2001_fs_c41_7500_pland': 4.1796,
 'nlcd2001_fs_c42_7500_pland': 0.1372,
 'nlcd2001_fs_c43_7500_pland': 0.2728,
 'nlcd2001_fs_c52_7500_pland': 0.0636,
 'nlcd2001_fs_c71_7500_pland': 0.8444,
 'nlcd2001_fs_c81_7500_pland': 2.272,
 'nlcd2001_fs_c82_7500_pland': 22.8724,
 'nlcd2001_fs_c90_7500_pland': 5.65,
 'nlcd2001_fs_c95_7500_pland': 1.5636,
 'nlcd2006_fs_c11_7500_pland': 38.3464,
 'nlcd2006_fs_c12_7500_pland': 0.0,
 'nlcd2006_fs_c21_7500_pland': 7.15,
 'nlcd2006_fs

In [15]:
# Alternately, using the flat bird name:
migrations.find_one(filter={'zenaida_macroura' : {'$gt' : 0}},
                   projection=['zenaida_macroura'] + core_covariates)
# Equivalent to
migrations.find_one(filter={'zenaida_macroura' : {'$exists':True}},
                   projection=['zenaida_macroura'] + core_covariates)

{'_id': 'S10000010',
 'bailey_ecoregion': '-222J',
 'bcr': 12,
 'caus_prec': 3,
 'caus_snow': 3,
 'caus_temp_avg': 2,
 'caus_temp_max': 1,
 'caus_temp_min': 2,
 'elev_gt': 182,
 'elev_ned': None,
 'housing_density': None,
 'housing_percent_vacant': None,
 'nlcd2001_fs_c11_7500_pland': 38.0488,
 'nlcd2001_fs_c12_7500_pland': 0.0,
 'nlcd2001_fs_c21_7500_pland': 6.928,
 'nlcd2001_fs_c22_7500_pland': 9.4952,
 'nlcd2001_fs_c23_7500_pland': 4.6392,
 'nlcd2001_fs_c24_7500_pland': 1.5504,
 'nlcd2001_fs_c31_7500_pland': 1.4828,
 'nlcd2001_fs_c41_7500_pland': 4.1796,
 'nlcd2001_fs_c42_7500_pland': 0.1372,
 'nlcd2001_fs_c43_7500_pland': 0.2728,
 'nlcd2001_fs_c52_7500_pland': 0.0636,
 'nlcd2001_fs_c71_7500_pland': 0.8444,
 'nlcd2001_fs_c81_7500_pland': 2.272,
 'nlcd2001_fs_c82_7500_pland': 22.8724,
 'nlcd2001_fs_c90_7500_pland': 5.65,
 'nlcd2001_fs_c95_7500_pland': 1.5636,
 'nlcd2006_fs_c11_7500_pland': 38.3464,
 'nlcd2006_fs_c12_7500_pland': 0.0,
 'nlcd2006_fs_c21_7500_pland': 7.15,
 'nlcd2006_fs

In [16]:
projection = dict.fromkeys(['zenaida_macroura'] + core_covariates, 1)
projection

{'bailey_ecoregion': 1,
 'bcr': 1,
 'caus_prec': 1,
 'caus_snow': 1,
 'caus_temp_avg': 1,
 'caus_temp_max': 1,
 'caus_temp_min': 1,
 'elev_gt': 1,
 'elev_ned': 1,
 'housing_density': 1,
 'housing_percent_vacant': 1,
 'nlcd2001_fs_c11_7500_pland': 1,
 'nlcd2001_fs_c12_7500_pland': 1,
 'nlcd2001_fs_c21_7500_pland': 1,
 'nlcd2001_fs_c22_7500_pland': 1,
 'nlcd2001_fs_c23_7500_pland': 1,
 'nlcd2001_fs_c24_7500_pland': 1,
 'nlcd2001_fs_c31_7500_pland': 1,
 'nlcd2001_fs_c41_7500_pland': 1,
 'nlcd2001_fs_c42_7500_pland': 1,
 'nlcd2001_fs_c43_7500_pland': 1,
 'nlcd2001_fs_c52_7500_pland': 1,
 'nlcd2001_fs_c71_7500_pland': 1,
 'nlcd2001_fs_c81_7500_pland': 1,
 'nlcd2001_fs_c82_7500_pland': 1,
 'nlcd2001_fs_c90_7500_pland': 1,
 'nlcd2001_fs_c95_7500_pland': 1,
 'nlcd2006_fs_c11_7500_pland': 1,
 'nlcd2006_fs_c12_7500_pland': 1,
 'nlcd2006_fs_c21_7500_pland': 1,
 'nlcd2006_fs_c22_7500_pland': 1,
 'nlcd2006_fs_c23_7500_pland': 1,
 'nlcd2006_fs_c24_7500_pland': 1,
 'nlcd2006_fs_c31_7500_pland': 1,
 '

## Actually, we want to subsample ALL checklists, and then select our birds.



In [17]:
zen = migrations.aggregate(
    [
        {'$sample' : {'size' : 1000}},
        {'$project' : projection} # We will use '$project' to select our bird of choice.
    ]
)
zen_df = pd.DataFrame(list(zen))
zen_df.fillna(0, inplace=True)
zen_df.drop('_id', 1, inplace=True)

In [26]:
zen_df

Unnamed: 0,bailey_ecoregion,bcr,caus_prec,caus_snow,caus_temp_avg,caus_temp_max,caus_temp_min,elev_gt,elev_ned,housing_density,...,nlcd2006_fs_c43_7500_pland,nlcd2006_fs_c52_7500_pland,nlcd2006_fs_c71_7500_pland,nlcd2006_fs_c81_7500_pland,nlcd2006_fs_c82_7500_pland,nlcd2006_fs_c90_7500_pland,nlcd2006_fs_c95_7500_pland,omernik_l3_ecoregion,pop00_sqmi,zenaida_macroura
0,-232B,31.0,5.0,1.0,6.0,6.0,6.0,18.0,0.00,37.177708,...,0.0000,0.1088,2.3200,26.6940,1.2656,16.7720,2.4944,75.0,72.9,0.0
1,M221A,28.0,6.0,0.0,7.0,7.0,7.0,213.0,221.44,23.885216,...,3.1640,0.0000,0.0000,16.1880,12.9556,0.0084,0.0056,67.0,57.9,0.0
2,-313E,16.0,3.0,0.0,6.0,7.0,6.0,2132.0,0.00,32.446319,...,0.0000,10.8456,16.1432,0.0000,0.0000,0.0192,0.0404,23.0,79.9,2.0
3,-332E,19.0,2.0,1.0,3.0,3.0,3.0,582.0,0.00,861.029792,...,0.0804,2.2052,69.9496,0.0000,10.0132,0.0000,0.0000,27.0,1604.4,0.0
4,-221A,14.0,6.0,3.0,2.0,2.0,3.0,44.0,0.00,0.000000,...,39.2272,1.4412,0.4644,9.6132,1.4708,7.5632,1.4952,82.0,0.0,0.0
5,-221D,29.0,6.0,1.0,2.0,2.0,3.0,16.0,0.00,242.742014,...,0.0664,0.5444,0.0860,0.0552,0.0000,1.2980,0.7804,64.0,503.4,0.0
6,-251D,22.0,6.0,1.0,5.0,5.0,5.0,187.0,189.88,3022.339028,...,0.0000,0.0000,1.6820,3.5316,73.7684,0.0372,0.0052,54.0,5576.5,5.0
7,-222K,23.0,3.0,3.0,2.0,1.0,2.0,260.0,0.00,92.166757,...,0.0084,0.7508,0.4080,10.5504,38.8648,2.6192,8.2384,53.0,178.7,0.0
8,-315F,37.0,3.0,1.0,5.0,5.0,6.0,1.0,0.00,32.585829,...,0.0000,5.2016,2.3376,0.7032,27.0144,0.4964,13.8240,34.0,91.3,0.0
9,-261A,32.0,5.0,1.0,5.0,5.0,6.0,34.0,0.00,2708.821233,...,0.0244,0.0168,0.2780,0.0000,0.0240,0.1044,0.0132,6.0,6432.5,2.0


In [19]:
X = np.array(zen_df.iloc[:, 1:45]) # Removing categorical predictor.
y = np.array(zen_df.iloc[:, -1])

In [20]:
X

array([[  3.10000000e+01,   5.00000000e+00,   1.00000000e+00, ...,
          2.49440000e+00,   7.50000000e+01,   7.29000000e+01],
       [  2.80000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          5.60000000e-03,   6.70000000e+01,   5.79000000e+01],
       [  1.60000000e+01,   3.00000000e+00,   0.00000000e+00, ...,
          4.04000000e-02,   2.30000000e+01,   7.99000000e+01],
       ..., 
       [  2.70000000e+01,   6.00000000e+00,   1.00000000e+00, ...,
          5.93200000e-01,   6.50000000e+01,   1.39000000e+02],
       [  3.00000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          3.28000000e-02,   5.90000000e+01,   2.30580000e+03],
       [  1.40000000e+01,   6.00000000e+00,   0.00000000e+00, ...,
          1.90400000e-01,   5.80000000e+01,   5.71000000e+01]])

In [21]:
# This is an example AdaBoost regression

# Create the dataset

zen_regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                             n_estimators=300)

zen_regr.fit(X, y)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=300,
         random_state=None)

In [24]:
with open('/Volumes/Transcend/birt data/zen_regr.p', 'wb') as f:
    pickle.dump(zen_regr, f)

with open('/Volumes/Transcend/birt data/zen_df.p', 'wb') as f:
    pickle.dump(zen_df, f)

Okay! We've now gotten to the point where we're fitting a model for a bird species! Next time: I will load the core covariates CSV and plot the data.