## Load a single bird's data from Mongo

This version doesn't use Mongo. Instead, it works directly from the eBird CSVs.

In [101]:
import os
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import OneHotEncoder
import pickle

In [102]:
datapath = "/Volumes/Transcend/birt data/eBird raw data"

In the following cells, we read in the 2012 `checklists.csv` and `core-covariates.csv`. We're only reading the first ten thousand rows, because they're *massive* (`checklists.csv` is about 4.6 GB!) but to simulate how we might work with a larger file, we're randomly subsampling 1000 rows from that. The CSVs share an index, so I'm sampling from the index and then using that to select stuff.

In [107]:
checklists = pd.read_csv(datapath + "/erd_us48_data_grouped_by_year_v5.0/2012/checklists.csv",
                         index_col = 0,
                         nrows = 10000,
                         na_values = ['?', 'X'])

In [163]:
core_covariates = pd.read_csv(datapath + "/erd_us48_data_grouped_by_year_v5.0/2012/core-covariates.csv",
                              index_col = 0,
                              nrows = 10000,
                              na_values = ['?', 'X'])

In [164]:
dummies = pd.get_dummies(core_covariates.BAILEY_ECOREGION)
core_covariates = pd.concat([core_covariates, dummies], axis = 1)
core_covariates.drop("BAILEY_ECOREGION", axis=1, inplace=True)

In [165]:
core_covariates

Unnamed: 0_level_0,POP00_SQMI,HOUSING_DENSITY,HOUSING_PERCENT_VACANT,ELEV_GT,ELEV_NED,BCR,OMERNIK_L3_ECOREGION,CAUS_TEMP_AVG,CAUS_TEMP_MIN,CAUS_TEMP_MAX,...,NLCD2006_FS_C43_7500_PLAND,NLCD2006_FS_C52_7500_PLAND,NLCD2006_FS_C71_7500_PLAND,NLCD2006_FS_C81_7500_PLAND,NLCD2006_FS_C82_7500_PLAND,NLCD2006_FS_C90_7500_PLAND,NLCD2006_FS_C95_7500_PLAND,-315E,-315F,-411A
SAMPLING_EVENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S10797396,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S10803596,,,,0.0,,,,,,,...,0.0000,0.0000,0.0058,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S10653317,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,1.1052,9.5852,0.0,0.0,0.0
S10398084,,,,0.0,,,,,,,...,0.0000,0.0000,0.0058,0.0000,0.0000,0.0599,2.1970,0.0,0.0,0.0
S10281610,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S10281612,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S10653280,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S11552362,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0
S11967824,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,1.0795,22.8446,0.0,0.0,0.0
S11974138,,,,0.0,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,1.0795,22.8446,0.0,0.0,0.0


In [166]:
core_covariates.dropna(inplace=True)

In [167]:
core_covariates

Unnamed: 0_level_0,POP00_SQMI,HOUSING_DENSITY,HOUSING_PERCENT_VACANT,ELEV_GT,ELEV_NED,BCR,OMERNIK_L3_ECOREGION,CAUS_TEMP_AVG,CAUS_TEMP_MIN,CAUS_TEMP_MAX,...,NLCD2006_FS_C43_7500_PLAND,NLCD2006_FS_C52_7500_PLAND,NLCD2006_FS_C71_7500_PLAND,NLCD2006_FS_C81_7500_PLAND,NLCD2006_FS_C82_7500_PLAND,NLCD2006_FS_C90_7500_PLAND,NLCD2006_FS_C95_7500_PLAND,-315E,-315F,-411A
SAMPLING_EVENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S10077017,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,8.0,6.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10281377,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,8.0,6.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10281387,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,8.0,6.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10398044,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,9.0,7.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10555296,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,9.0,7.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10574475,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,9.0,7.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10574614,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,9.0,7.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10648932,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,7.0,9.0,7.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S9632186,9061.9,5227.117097,0.131004,2.0,1.41,31.0,76.0,6.0,8.0,6.0,...,0.0000,0.0000,0.0109,0.0000,0.0000,1.0226,16.1209,0.0,0.0,1.0
S10159982,8086.3,3177.282066,0.071594,1.0,1.11,31.0,76.0,7.0,8.0,6.0,...,0.0000,0.0000,0.0592,0.0000,0.0000,3.5788,21.9076,0.0,0.0,1.0


In [168]:
# We'll use the index of `checklists` to draw a random sample of rows
sample = np.random.choice(core_covariates.index, size=1000)

In [185]:
# We'll sample checklists based on a bird's name, plus the sample index.
y = np.array(checklists.loc[sample, 'Zenaida_macroura'])

# We're replacing NaNs with zeroes, because I'm assuming not marking a
# bird down means you didn't see it.
y[np.isnan(y)] = 0

In [174]:
# We'll take all core covariate columns, using the same sample index.
X = np.array(core_covariates.loc[sample, :])

In [186]:
regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                             n_estimators=300)

regressor.fit(X, y)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=300,
         random_state=None)

In [188]:
with open('/Volumes/Transcend/birt data/regressor.p', 'wb') as f:
    pickle.dump(regressor, f)

with open('/Volumes/Transcend/birt data/regressor.p', 'wb') as f:
    pickle.dump(regressor, f)

Okay! We've now gotten to the point where we're fitting a model for a bird species! Next time: I will load the core covariates CSV and plot the data.

Now to read in core covariates and plot stuff.

In [190]:
srd30km = pd.read_csv(datapath + "/srd_point_data_1.5km_v1.0.csv", na_values = '?')

In [203]:
srd30km.columns.format()

['LATITUDE',
 'LONGITUDE',
 'ASTER2011_DEM',
 'UMD2011_LANDCOVER',
 'UMD2011_FS_L_1500_LPI',
 'UMD2011_FS_L_1500_PD',
 'UMD2011_FS_L_1500_ED',
 'UMD2011_FS_C0_1500_PLAND',
 'UMD2011_FS_C0_1500_LPI',
 'UMD2011_FS_C0_1500_PD',
 'UMD2011_FS_C0_1500_ED',
 'UMD2011_FS_C1_1500_PLAND',
 'UMD2011_FS_C1_1500_LPI',
 'UMD2011_FS_C1_1500_PD',
 'UMD2011_FS_C1_1500_ED',
 'UMD2011_FS_C2_1500_PLAND',
 'UMD2011_FS_C2_1500_LPI',
 'UMD2011_FS_C2_1500_PD',
 'UMD2011_FS_C2_1500_ED',
 'UMD2011_FS_C3_1500_PLAND',
 'UMD2011_FS_C3_1500_LPI',
 'UMD2011_FS_C3_1500_PD',
 'UMD2011_FS_C3_1500_ED',
 'UMD2011_FS_C4_1500_PLAND',
 'UMD2011_FS_C4_1500_LPI',
 'UMD2011_FS_C4_1500_PD',
 'UMD2011_FS_C4_1500_ED',
 'UMD2011_FS_C5_1500_PLAND',
 'UMD2011_FS_C5_1500_LPI',
 'UMD2011_FS_C5_1500_PD',
 'UMD2011_FS_C5_1500_ED',
 'UMD2011_FS_C6_1500_PLAND',
 'UMD2011_FS_C6_1500_LPI',
 'UMD2011_FS_C6_1500_PD',
 'UMD2011_FS_C6_1500_ED',
 'UMD2011_FS_C7_1500_PLAND',
 'UMD2011_FS_C7_1500_LPI',
 'UMD2011_FS_C7_1500_PD',
 'UMD2011_FS_C7_150

In [202]:
core_covariates.columns.format()

['POP00_SQMI',
 'HOUSING_DENSITY',
 'HOUSING_PERCENT_VACANT',
 'ELEV_GT',
 'ELEV_NED',
 'BCR',
 'OMERNIK_L3_ECOREGION',
 'CAUS_TEMP_AVG',
 'CAUS_TEMP_MIN',
 'CAUS_TEMP_MAX',
 'CAUS_PREC',
 'CAUS_SNOW',
 'NLCD2001_FS_C11_7500_PLAND',
 'NLCD2001_FS_C12_7500_PLAND',
 'NLCD2001_FS_C21_7500_PLAND',
 'NLCD2001_FS_C22_7500_PLAND',
 'NLCD2001_FS_C23_7500_PLAND',
 'NLCD2001_FS_C24_7500_PLAND',
 'NLCD2001_FS_C31_7500_PLAND',
 'NLCD2001_FS_C41_7500_PLAND',
 'NLCD2001_FS_C42_7500_PLAND',
 'NLCD2001_FS_C43_7500_PLAND',
 'NLCD2001_FS_C52_7500_PLAND',
 'NLCD2001_FS_C71_7500_PLAND',
 'NLCD2001_FS_C81_7500_PLAND',
 'NLCD2001_FS_C82_7500_PLAND',
 'NLCD2001_FS_C90_7500_PLAND',
 'NLCD2001_FS_C95_7500_PLAND',
 'NLCD2006_FS_C11_7500_PLAND',
 'NLCD2006_FS_C12_7500_PLAND',
 'NLCD2006_FS_C21_7500_PLAND',
 'NLCD2006_FS_C22_7500_PLAND',
 'NLCD2006_FS_C23_7500_PLAND',
 'NLCD2006_FS_C24_7500_PLAND',
 'NLCD2006_FS_C31_7500_PLAND',
 'NLCD2006_FS_C41_7500_PLAND',
 'NLCD2006_FS_C42_7500_PLAND',
 'NLCD2006_FS_C43_7500_

In [191]:
prediction = regressor.predict(srd30km)

ValueError: Number of features of the model must  match the input. Model n_features is 47 and  input n_features is 63 