In [None]:
from __future__ import absolute_import, division, print_function

# Feature engineering

Here I will extract and engineer the features that will be the input for a subsequent model.

The main data are of the following format:

* The data report weather measurements for 1000 unique locations in ~150 counties across 5 states. 
* Most locations have one entry per day reporting the current weather conditions.
* At the end of the season, the harvest generates a certain yield. This yield is propagated to *all* entries in the data set, even though it is only a final value.
* The reported yield number refers to the yield in the county and is not specific to a location. I assume it's an average for the county, even though that is not explicitly specified.



My goal here is to create a profile for each location from the available or additional data. This will leave me with one set of feature values for each location, which is connected to a final yield. And that will be the input to my model.

## Imports

In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap


%matplotlib inline

## Functions

In [None]:
def find_30day_extreme(in_df, in_column, in_agg):
    """
    Finds the extreme value of a 30-day rolling mean.

    A trailing 30-day window is moved over the data in date order and the
    mean of `in_column` is calculated in every window. Windows at the start
    of the record that cover fewer than 30 calendar days are discarded.
    `in_agg` then reduces the remaining window means to a single number.

    Window size: 30 days.
    in_df: input dataframe; needs a 'Date' column and `in_column`.
           It is not modified. Rows sharing a date are averaged first, so
           pass the rows of a single location to get a per-location value.
    in_column: column to be aggregated
    in_agg: aggregation function applied to the window means (e.g., min/max)

    Returns: in_agg(window means) as a float, or NaN when there are no
             valid values or the data span fewer than 30 days.
    """
    window_days = 30

    # Keep only usable rows; a missing date or value cannot enter a window.
    valid = in_df[['Date', in_column]].dropna()
    if valid.empty:
        return float('nan')

    # One value per calendar date, indexed and sorted by date. Time-based
    # rolling requires a monotonic DatetimeIndex.
    daily = pd.Series(valid[in_column].values,
                      index=pd.DatetimeIndex(valid['Date']))
    daily = daily.groupby(level=0).mean().sort_index()

    # Trailing window (t - 30 days, t]; for daily data this is 30 days.
    window_means = daily.rolling('{}D'.format(window_days)).mean()

    # Drop the partial windows at the beginning of the record.
    first_full = daily.index[0] + pd.Timedelta(window_days - 1, unit='d')
    window_means = window_means[window_means.index >= first_full]
    if window_means.empty:
        return float('nan')

    return float(in_agg(window_means.values))
    
    
    
    

In [18]:
# Earliest date present in the 2013 data.
start_date = df_2013.Date.min()

In [21]:
# End of the first window: 30 days after start_date.
stop_date = start_date + pd.Timedelta(days=30)

In [22]:
# Show both window bounds as a quick sanity check.
print(start_date,stop_date)

(Timestamp('2013-11-30 00:00:00'), Timestamp('2013-12-30 00:00:00'))


In [24]:
# Distinct location tuples found in the 2013 data.
df_2013.Location.unique()

array([(-118.69523719999999, 46.811685799999992),
       (-118.35210929999998, 46.929839100000002),
       (-118.5101603, 47.006888099999998), ...,
       (-95.798478900000006, 33.670839899999997),
       (-95.817206299999995, 33.611278900000002),
       (-95.749667200000005, 33.5842752)], dtype=object)

In [31]:
# All 2013 rows recorded for one example location.
tmp1 = df_2013.loc[
    df_2013['Location'] == (-118.69523719999999, 46.811685799999992)
]

In [30]:
# Daily average temperature of one example location between start_date and
# stop_date (both bounds inclusive). A single .loc call replaces the chained
# indexing, and an explicit lower bound replaces the former
# `(Date == start_date) | (Date <= stop_date)`, whose equality term was
# redundant.
df_2013.loc[(df_2013['Location'] == (-118.69523719999999, 46.811685799999992)) &
            (df_2013['Date'] >= start_date) &
            (df_2013['Date'] <= stop_date),
            'temperatureAverage']

0        31.590
944      43.220
1887     40.480
2838     24.595
3787     18.635
4732     13.760
5682     14.645
6631     10.725
7579      9.360
8523     15.245
9465     21.035
10413    21.790
11360    24.015
12293    28.700
13241    32.565
14169    39.340
15096    36.875
16024    29.950
16976    35.400
17925    27.545
18875    28.735
19815    28.510
20758    31.020
21702    41.200
22653    33.500
23603    26.235
24550    27.935
25493    26.455
26438    26.485
27387    27.845
28334    29.155
Name: temperatureAverage, dtype: float64

In [27]:
# NOTE(review): because of the `== start_date` term this selects only the rows
# dated exactly start_date; the `<= stop_date` bound has no effect as long as
# stop_date is not earlier than start_date. A `>=` lower bound was presumably
# intended -- confirm before reusing this filter.
df_2013.loc[(df_2013['Date'] <= stop_date) & (df_2013['Date'] == start_date),
            'temperatureAverage']

0      31.590
1      31.010
2      30.165
3      30.180
4      30.460
5      30.990
6      45.740
7      45.755
8      46.340
9      46.985
10     46.920
11     47.020
12     46.875
13     46.750
14     46.890
15     46.725
16     46.610
17     46.565
18     43.160
19     45.920
20     46.250
21     43.815
22     43.475
23     45.300
24     52.785
25     53.105
26     54.405
27     54.535
28     53.655
29     53.615
        ...  
910    54.400
911    54.780
912    54.365
913    54.345
914    54.060
915    54.035
916    53.605
917    53.765
918    53.585
919    53.635
920    44.010
921    44.015
922    43.480
923    43.995
924    43.800
925    43.795
926    61.670
927    61.310
928    61.125
929    61.180
930    60.785
931    61.700
932    43.065
933    42.765
934    43.220
935    43.010
936    43.070
937    43.110
938    56.140
939    55.535
Name: temperatureAverage, dtype: float64

In [12]:
# List the columns available in the 2013 data.
df_2013.columns

Index([u'CountyName', u'State', u'Latitude', u'Longitude', u'Date',
       u'cloudCover', u'dewPoint', u'humidity', u'precipIntensity',
       u'precipProbability', u'precipAccumulation', u'precipTypeIsRain',
       u'precipTypeIsSnow', u'pressure', u'temperatureMax', u'temperatureMin',
       u'visibility', u'windBearing', u'windSpeed', u'NDVI', u'DayInSeason',
       u'Yield', u'Location', u'precipTotal', u'temperatureDiff',
       u'temperatureRatio', u'temperatureAverage'],
      dtype='object')

## Data

In [3]:
# Read the cleaned per-year dataframes from the 'data' folder inside the
# current working directory.
cwd = os.getcwd()
df_2013, df_2014 = (
    pd.read_pickle(os.path.join(cwd, 'data', file_name))
    for file_name in ('df_2013_clean.df', 'df_2014_clean.df')
)


## Additional data

I have also obtained information on the elevation and length_of_day for each location (see 03_elevation_and_length_of_day.ipynb).

In [6]:
# Load the additional elevation and length-of-day data
# (see 03_elevation_and_length_of_day.ipynb). Paths are relative to the
# current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a trusted source.
with open(os.path.join('data','elevation.pickle'), 'rb') as handle:
    elevation = pickle.load(handle)
with open(os.path.join('data','length_of_day.pickle'), 'rb') as handle:
    length_of_day = pickle.load(handle)




## Features

Each location comes with:

* longitude
* latitude

The features I am going to engineer for each location are:

* the total amount of precipitation
* the minimum average temperature in a consecutive 30-day period
* the maximum average temperature in a consecutive 30-day period
* the ratio of the maximum average NDVI in a consecutive 30-day period to its respective minimum
* the mean temperature difference between daily min/max temperatures
* the standard deviation of the above mean
* the total average wind speed

I will add to those features external values of:

* the hours of daylight
* the elevation

During the engineering I will keep both years (2013 and 2014) separate. Even though these data cover mostly the same locations (more than ~80% overlap), weather conditions and yield are likely to be different. Keeping them separate provides additional data points.

## 2013