# DengueAI - Time-lagged features

*Anders Poirel - 14-05-2020*

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

In [12]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.model_selection import (cross_validate, TimeSeriesSplit)

Figure prettyfying:

In [3]:
mpl.rcParams['figure.dpi']= 150
sns.set()

## Reading data

In [13]:
RAW_DATA_PATH = '../data/raw'
PROCESSED_DATA_PATH = '../data/processed'

In [5]:
train_features = pd.read_csv(os.path.join(RAW_DATA_PATH, "dengue_features_train.csv"))
train_labels = pd.read_csv(os.path.join(RAW_DATA_PATH, "dengue_labels_train.csv"))
test_features = pd.read_csv(os.path.join(RAW_DATA_PATH, "dengue_features_test.csv"))

## Feature engineering

Backfill missing features, combing ndvi indicators and drop unused/correlated columns

In [6]:
train = (train_features
    .drop( # correlated features
        ['reanalysis_sat_precip_amt_mm', 'reanalysis_dew_point_temp_k', 
         'reanalysis_air_temp_k', 'reanalysis_tdtr_k'],
        axis = 1
    )
    .fillna(method = 'backfill')
    .assign(
        ndvi_n = lambda x : x['ndvi_ne'] + x['ndvi_nw'] / 2,
        ndvi_s = lambda x : x['ndvi_se'] + x['ndvi_sw'] / 2,
        monthofyear = lambda x: pd.to_datetime(x['week_start_date']).dt.month
    )
    .drop( # unused features
        ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'year', 'weekofyear',
         'week_start_date'], 
        axis = 1
    )
)

Select features that we want to shift

In [7]:
ts_features = list(
    train.loc[:, 'precipitation_amt_mm' : 'ndvi_s'].columns.values
)

Split data between San Juan and Iquitos

In [8]:
train_sj = train[train['city'] == 'sj'].drop('city', axis = 1).reset_index()
train_iq = train[train['city'] == 'iq'].drop('city', axis = 1).reset_index()

Function for adding time-lagged features to dataframe

In [9]:
def add_lagged_features(df, max_lag, features):
    lag_df = [
        df[features].shift(k).add_prefix('lag' + str(k) + '_')
        for k in range(1, max_lag+1)
    ]
    return pd.concat([df] + lag_df, axis = 1)

Add the lagged features to each dataframe. As before, we fill crudely
using `method = 'backfill'`, though a more sophisticated approach would be 
preferred

In [10]:
train_sj = add_lagged_features(train_sj, 7, ts_features).fillna(method = 'backfill')
train_iq = add_lagged_features(train_iq, 7, ts_features).fillna(method = 'backfill')

In [11]:
train_iq.head()

Unnamed: 0,index,precipitation_amt_mm,reanalysis_avg_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,station_avg_temp_c,station_diur_temp_rng_c,...,lag7_reanalysis_precip_amt_kg_per_m2,lag7_reanalysis_relative_humidity_percent,lag7_reanalysis_specific_humidity_g_per_kg,lag7_station_avg_temp_c,lag7_station_diur_temp_rng_c,lag7_station_max_temp_c,lag7_station_min_temp_c,lag7_station_precip_mm,lag7_ndvi_n,lag7_ndvi_s
0,936,25.41,298.45,307.3,293.1,43.19,92.418571,16.651429,26.4,10.775,...,43.19,92.418571,16.651429,26.4,10.775,32.5,20.7,3.0,0.259014,0.464486
1,937,60.61,298.428571,306.6,291.1,46.0,93.581429,16.862857,26.9,11.566667,...,43.19,92.418571,16.651429,26.4,10.775,32.5,20.7,3.0,0.259014,0.464486
2,938,55.52,297.392857,304.5,292.6,64.77,95.848571,17.12,26.8,11.466667,...,43.19,92.418571,16.651429,26.4,10.775,32.5,20.7,3.0,0.259014,0.464486
3,939,5.6,296.228571,303.6,288.6,23.96,87.234286,14.431429,25.766667,10.533333,...,43.19,92.418571,16.651429,26.4,10.775,32.5,20.7,3.0,0.259014,0.464486
4,940,62.76,297.635714,307.0,291.5,31.8,88.161429,15.444286,26.6,11.48,...,43.19,92.418571,16.651429,26.4,10.775,32.5,20.7,3.0,0.259014,0.464486


## Writing the files

In [15]:
train_sj.to_csv(os.path.join(PROCESSED_DATA_PATH, 'train_sj.csv'), index = False)
train_iq.to_csv(os.path.join(PROCESSED_DATA_PATH, 'train_iq.csv'), index = False)