# Lagged Features Exploration

Papers I've read seem to indicate that feature values from far in the past still have an impact on the target.
What if we built lagged versions of each feature for many lags and selected those with the highest correlation with our target?

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Work from the project root so the relative 'data/raw' path used by the
# loading cells resolves. The guard keeps this cell idempotent: a bare
# `cd ..` climbs one more directory level every time the cell is re-run.
if not os.path.isdir(os.path.join('data', 'raw')):
    os.chdir('..')

C:\Users\Anders\Documents\data-science-projects\dss-diseasespread


In [14]:
# Figure defaults for every plot in this notebook.
mpl.rcParams['figure.autolayout'] = True
mpl.rcParams['figure.dpi'] = 150

# Apply seaborn's default theme after the matplotlib settings above.
sns.set()

## Reading the data

In [13]:
# Directory holding the raw competition CSVs, relative to the current
# working directory (the notebook changes directory before this cell).
RAW_PATH = 'data/raw'

In [15]:
# Load the three raw CSVs from RAW_PATH in one pass: training features,
# training labels and test features (in that order).
train_features, train_labels, test_features = (
    pd.read_csv(os.path.join(RAW_PATH, csv_name))
    for csv_name in (
        'dengue_features_train.csv',
        'dengue_labels_train.csv',
        'dengue_features_test.csv',
    )
)

## Lagged Features

In [22]:
def make_dataset(features, n_lags = 52):
    """Clean a raw feature frame and add lagged features for each city.

    Processing steps, in order:

    1. Drop four columns flagged as correlated with other features.
    2. Back-fill missing values over the whole frame.
    3. Drop the `year`, `weekofyear` and `week_start_date` columns.
    4. For the rows of each city ('sj', then 'iq'), call
       `add_lagged_features` on every column from `ndvi_ne` onwards and
       back-fill the result.
    5. Concatenate the two per-city frames (San Juan first).

    `add_lagged_features` is not defined in this cell; it must already
    exist in the kernel namespace when this function is called.

    Parameters
    ----------
    features : pandas.DataFrame
        Raw features. Must contain a `city` column, the columns dropped
        below, and an `ndvi_ne` column marking the start of the
        time-series features.
    n_lags : int, default 52
        Forwarded unchanged as the second argument of
        `add_lagged_features` (presumably the number of lags to build;
        confirm against that helper).

    Returns
    -------
    pandas.DataFrame
        The rows for 'sj' followed by the rows for 'iq', with whatever
        columns `add_lagged_features` returns.
    """
    # NOTE(review): this back-fill runs before the per-city split, so a gap
    # at the end of one city's rows is filled from the first row of the next
    # city in the frame. Gaps at the very end of the frame stay NaN.
    cleaned = (features
        .drop( # correlated features
            columns = ['reanalysis_sat_precip_amt_mm', 'reanalysis_dew_point_temp_k',
                       'reanalysis_air_temp_k', 'reanalysis_tdtr_k']
        )
        # `.bfill()` replaces the deprecated `fillna(method = 'backfill')`.
        .bfill()
        .drop( # unused features
            columns = ['year', 'weekofyear', 'week_start_date']
        )
    )

    # Every column from 'ndvi_ne' to the end of the frame gets lagged.
    ts_features = cleaned.loc[:, 'ndvi_ne' :].columns.tolist()

    # Lag each city separately so one city's history never feeds the other's
    # lagged columns; the trailing back-fill handles the gaps lagging leaves.
    per_city = [
        add_lagged_features(
            cleaned[cleaned['city'] == city], n_lags, ts_features
        ).bfill()
        for city in ('sj', 'iq')
    ]

    return pd.concat(per_city, axis = 0)

In [23]:
# Run both raw frames through the same cleaning + lagging pipeline.
# NOTE: this overwrites the raw frames, so the cell cannot be re-run without
# re-loading the CSVs first (make_dataset drops columns that would then
# already be gone).
train_features = train_features.pipe(make_dataset)
test_features = test_features.pipe(make_dataset)

In [26]:
def _city_features(frame, city):
    """Return the rows of `frame` for one city, without the 'city' column."""
    return frame.loc[frame['city'] == city].drop(columns = 'city')


def _city_cases(labels, city):
    """Return one city's 'total_cases' from `labels` as a float series."""
    return labels.loc[labels['city'] == city, 'total_cases'].astype('float')


# Training features per city.
train_features_sj = _city_features(train_features, 'sj')
train_features_iq = _city_features(train_features, 'iq')

# Training targets per city.
y_train_sj = _city_cases(train_labels, 'sj')
y_train_iq = _city_cases(train_labels, 'iq')

# Test features per city.
test_features_sj = _city_features(test_features, 'sj')
test_features_iq = _city_features(test_features, 'iq')

## Correlations

### San Juan

In [50]:
# Absolute correlation of every San Juan feature column with the San Juan
# case counts, sorted ascending so the strongest features come last.
sj_corrs = (
    train_features_sj
    .corrwith(y_train_sj)
    .abs()
    .sort_values()
)

In [64]:
# Positions -100 to -51 of the ascending-sorted series: the 50 features
# ranked just below the top 50 by absolute correlation.
sj_corrs.iloc[-100:-50]

lag30_reanalysis_max_air_temp_k                0.278465
lag5_reanalysis_avg_temp_k                     0.278470
lag30_reanalysis_specific_humidity_g_per_kg    0.278698
lag11_reanalysis_max_air_temp_k                0.279157
lag38_reanalysis_avg_temp_k                    0.280345
lag11_reanalysis_specific_humidity_g_per_kg    0.280538
lag3_station_avg_temp_c                        0.281353
lag36_station_min_temp_c                       0.282095
lag36_reanalysis_min_air_temp_k                0.282417
lag37_station_avg_temp_c                       0.282919
lag5_reanalysis_max_air_temp_k                 0.283155
lag11_reanalysis_min_air_temp_k                0.283252
lag4_reanalysis_specific_humidity_g_per_kg     0.283265
lag32_reanalysis_min_air_temp_k                0.284112
lag5_station_max_temp_c                        0.284696
lag36_station_avg_temp_c                       0.284853
lag35_reanalysis_min_air_temp_k                0.285153
lag34_reanalysis_min_air_temp_k                0

In [54]:
# Last 50 entries of the ascending-sorted series: the 50 features with the
# highest absolute correlation.
sj_corrs.tail(50)

lag9_reanalysis_specific_humidity_g_per_kg     0.303438
lag31_reanalysis_avg_temp_k                    0.303769
lag37_reanalysis_max_air_temp_k                0.303802
lag9_reanalysis_min_air_temp_k                 0.303842
lag7_station_max_temp_c                        0.304182
lag35_reanalysis_specific_humidity_g_per_kg    0.304631
lag32_reanalysis_specific_humidity_g_per_kg    0.306225
lag7_reanalysis_min_air_temp_k                 0.306642
lag8_reanalysis_specific_humidity_g_per_kg     0.306698
lag36_reanalysis_avg_temp_k                    0.306987
lag4_station_avg_temp_c                        0.307308
lag8_reanalysis_min_air_temp_k                 0.307353
lag35_reanalysis_avg_temp_k                    0.308285
lag13_station_max_temp_c                       0.309248
lag36_reanalysis_max_air_temp_k                0.310781
lag34_reanalysis_avg_temp_k                    0.311027
lag34_reanalysis_specific_humidity_g_per_kg    0.311068
lag14_station_min_temp_c                       0

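These are uniformly higher than the correlations for the variables in
the original exploration notebook (highest was ~0.28). We could thus probably build a much
better model by using these features and experimenting with how many to include.
One caveat: many of the top entries are neighbouring lags of the same variable
(e.g. lags 34–36 of `reanalysis_avg_temp_k`), so they are likely highly correlated with each
other and may carry less independent information than their number suggests.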

### Iquitos

In [55]:
# Absolute correlation of every Iquitos feature column with the Iquitos
# case counts, sorted ascending so the strongest features come last.
iq_corrs = (
    train_features_iq
    .corrwith(y_train_iq)
    .abs()
    .sort_values()
)

In [62]:
# Positions -100 to -51 of the ascending-sorted series: the 50 features
# ranked just below the top 50 by absolute correlation.
iq_corrs.iloc[-100:-50]

lag45_reanalysis_specific_humidity_g_per_kg    0.183175
lag40_ndvi_se                                  0.184816
lag45_ndvi_se                                  0.184969
lag49_station_precip_mm                        0.185129
lag29_station_diur_temp_rng_c                  0.185376
lag5_reanalysis_specific_humidity_g_per_kg     0.185544
lag24_reanalysis_min_air_temp_k                0.185998
lag30_station_diur_temp_rng_c                  0.187065
lag41_station_diur_temp_rng_c                  0.187127
lag48_station_precip_mm                        0.187663
lag45_precipitation_amt_mm                     0.187777
lag39_reanalysis_max_air_temp_k                0.188046
lag1_reanalysis_min_air_temp_k                 0.188151
lag49_reanalysis_precip_amt_kg_per_m2          0.188209
lag31_station_max_temp_c                       0.188656
lag3_reanalysis_min_air_temp_k                 0.189832
lag5_reanalysis_min_air_temp_k                 0.190152
lag44_reanalysis_specific_humidity_g_per_kg    0

In [63]:
# Last 50 entries of the ascending-sorted series: the 50 features with the
# highest absolute correlation (same form as the San Juan cell).
iq_corrs.tail(50)

lag35_station_diur_temp_rng_c                  0.209225
lag48_station_avg_temp_c                       0.209651
lag1_station_min_temp_c                        0.210040
reanalysis_min_air_temp_k                      0.212263
lag36_station_diur_temp_rng_c                  0.214904
lag37_station_diur_temp_rng_c                  0.216953
lag47_station_avg_temp_c                       0.218238
lag51_reanalysis_min_air_temp_k                0.218854
lag47_reanalysis_min_air_temp_k                0.219681
lag35_reanalysis_max_air_temp_k                0.219899
lag2_reanalysis_specific_humidity_g_per_kg     0.221102
lag38_station_diur_temp_rng_c                  0.222043
lag38_reanalysis_max_air_temp_k                0.222481
lag52_station_min_temp_c                       0.223038
lag52_reanalysis_precip_amt_kg_per_m2          0.223965
lag46_station_avg_temp_c                       0.224625
lag50_precipitation_amt_mm                     0.224631
lag51_station_min_temp_c                       0