## imports

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [27]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

# import useful functions from the source code
%aimport features.build_features
from features.build_features import preprocess, create_features

%aimport models.train_model
from models.train_model import prep_x_and_y, rf_feature_importances_single, rf_feature_importances_by_city

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Optional bootstrap: the previous notebook already performed this pre-processing,
# but running this cell lets the present notebook be executed on its own.
features_csv = '../data/raw/dengue_features_train.csv'
labels_csv = '../data/raw/dengue_labels_train.csv'

df = preprocess(features_csv, labels_csv)
# NOTE(review): the return value of create_features is not captured here —
# presumably it modifies df in place and/or writes the processed pickle; confirm.
create_features(df)

## load processed data

In [30]:
# Read the processed dataframe from its pickle; this rebinds `df`,
# replacing whatever an earlier cell may have assigned to that name.
processed_pickle = '../data/processed/df.pkl'
df = pd.read_pickle(processed_pickle)

In [31]:
# preview the first rows of the loaded frame (bare last expression -> rich display)
df.head()

Unnamed: 0,total_cases,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,sj,month,day,yr
0,4,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,...,2.628571,25.442857,6.9,29.4,20.0,16.0,1,4,30,1990
1,5,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,...,2.371429,26.714286,6.371429,31.7,22.2,8.6,1,5,7,1990
2,4,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,...,2.3,26.714286,6.485714,32.2,22.8,41.4,1,5,14,1990
3,3,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,...,2.428571,27.471429,6.771429,33.3,23.3,4.0,1,5,21,1990
4,6,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,...,3.014286,28.942857,9.371429,35.0,23.9,5.8,1,5,28,1990


## out of the box random forest

In [38]:
# helper imported from models.train_model, applied to the whole dataframe;
# per the recorded output it reports a mean absolute error followed by
# (feature, importance) pairs in descending order of importance
rf_feature_importances_single(df)

Mean absolute error: 12.451029748283753

('yr', 0.3019195672457625)
('month', 0.16864541345805933)
('ndvi_sw', 0.1539054367061644)
('ndvi_se', 0.06517980644958862)
('reanalysis_dew_point_temp_k', 0.03336748904108496)
('reanalysis_max_air_temp_k', 0.029330379376337867)
('ndvi_ne', 0.02678261148437624)
('station_precip_mm', 0.02576430592437673)
('reanalysis_specific_humidity_g_per_kg', 0.0229279979594759)
('station_max_temp_c', 0.02264811892197686)
('reanalysis_tdtr_k', 0.018484502272580826)
('reanalysis_precip_amt_kg_per_m2', 0.015182510986956768)
('ndvi_nw', 0.014896431702641566)
('reanalysis_avg_temp_k', 0.013745539634023507)
('station_diur_temp_rng_c', 0.013385514169657133)
('station_avg_temp_c', 0.011442705334758883)
('reanalysis_relative_humidity_percent', 0.011241568008444882)
('day', 0.010539679708989844)
('precipitation_amt_mm', 0.009611029781385923)
('reanalysis_sat_precip_amt_mm', 0.008937337264711308)
('reanalysis_min_air_temp_k', 0.008385082174884859)
('reanalysis_air_temp_k

In [39]:
# this looks very promising

In [40]:
# Do we need a separate random forest for each city, or will a single random forest suffice?
# The feature visualizations above suggest per-city models could help, but city is the least helpful feature in the random forest above.

In [41]:
# helper imported from models.train_model; per the recorded output it prints
# (feature, importance) pairs in descending order of importance
# NOTE(review): name suggests one model per city — confirm in models/train_model.py
rf_feature_importances_by_city(df)

('yr', 0.4422239741268165)
('month', 0.1510678452401952)
('reanalysis_specific_humidity_g_per_kg', 0.05310457468735887)
('station_max_temp_c', 0.039969404651466735)
('reanalysis_precip_amt_kg_per_m2', 0.039031952061423174)
('reanalysis_dew_point_temp_k', 0.03830532244637597)
('ndvi_sw', 0.03817334833638768)
('ndvi_se', 0.028179741148277048)
('reanalysis_air_temp_k', 0.025425651462712483)
('station_precip_mm', 0.02449236832660711)
('ndvi_nw', 0.014915401433835218)
('ndvi_ne', 0.014005563602289214)
('reanalysis_avg_temp_k', 0.013500013336015632)
('reanalysis_relative_humidity_percent', 0.013245067251172662)
('station_diur_temp_rng_c', 0.012631285092022859)
('reanalysis_max_air_temp_k', 0.012200428202558022)
('reanalysis_tdtr_k', 0.011898737386048864)
('reanalysis_min_air_temp_k', 0.0068945089327205195)
('station_avg_temp_c', 0.005715218999119949)
('station_min_temp_c', 0.004917278057227903)
('reanalysis_sat_precip_amt_mm', 0.003925162494520797)
('precipitation_amt_mm', 0.0034804297194798

In [42]:
# mean absolute error is worse with the individual per-city models; will stick with one random forest for the entire df

## dictionary of models + grid search

## time series

In [264]:
## month, day, and year continue to show up as promising variables in the random forest feature importances; let's try a time-series analysis