# Simple iterations on the baseline models

*Anders Poirel - 13-02-20120*

Goals:
- keep results of `models-03
- use some features at time lags to see if we get improvements. The idea is 
 using the lifecycle of the mosquito: new mostiquos become adults 1-3 weeks after eggs are laid in water. Therefore, we could expect a lot of cases if the previous 2-3week/~month was humid

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from os.path import join

In [29]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import (cross_validate, TimeSeriesSplit, 
                                     RandomizedSearchCV)

In [30]:
DATA_PATH = '../data/raw/'

## Acquiring the data

In [31]:
X_test_o = pd.read_csv(join(DATA_PATH, 'dengue_features_test.csv'))
X_train_o = pd.read_csv(join(DATA_PATH, 'dengue_features_train.csv'))
y_train_o = pd.read_csv(join(DATA_PATH, 'dengue_labels_train.csv'))

### Encodings

In [32]:
X_train = pd.get_dummies(X_train_o, columns = ['city'], drop_first = True)
X_test = pd.get_dummies(X_test_o, columns = ['city'], drop_first = True)

### Dropping unecessary / correlated columns

In [33]:
X_train.drop('week_start_date', axis = 1, inplace = True)
X_test.drop('week_start_date', axis = 1, inplace = True)
y_train = y_train_o['total_cases']

In [34]:
X_train.drop(
    ['reanalysis_sat_precip_amt_mm',
     'reanalysis_tdtr_k'],
    axis = 1, 
    inplace = True
)

In [36]:
X_test.drop(
    ['reanalysis_sat_precip_amt_mm',
     'reanalysis_tdtr_k'],
    axis = 1, 
    inplace = True
)

### City seperation

In [37]:
X_train_iq = X_train[X_train['city_sj'] == 0]
X_test_iq = X_test[X_test['city_sj'] == 0]

X_train_sj = X_train[X_train['city_sj'] == 1]
X_test_sj = X_test[X_test['city_sj'] == 1]

y_train_sj = y_train_o[y_train_o['city'] == 'sj']['total_cases']
y_train_iq =  y_train_o[y_train_o['city'] == 'iq']['total_cases']

Ratio of San Jose instances to total

In [38]:
sj_ratio = len(y_train_sj) / len(y_train)

#### Features at several time lags

In [65]:
def humid_n_weeks(k, n, humids):
    if k - n < 0:
        return .0
    else:
        return humids[k - n]  

In [72]:
train_humid_sj = X_train_sj['reanalysis_relative_humidity_percent']
train_humid_iq = X_train_iq['reanalysis_relative_humidity_percent']

test_humid_sj = X_test_sj['reanalysis_relative_humidity_percent']
test_humid_iq = X_test_iq['reanalysis_relative_humidity_percent']

In [73]:
X_train.head()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,city_sj
0,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,299.8,295.9,32.0,73.365714,14.012857,25.442857,6.9,29.4,20.0,16.0,1
1,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,300.9,296.4,17.94,77.368571,15.372857,26.714286,6.371429,31.7,22.2,8.6,1
2,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,300.5,297.3,26.1,82.052857,16.848571,26.714286,6.485714,32.2,22.8,41.4,1
3,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,301.4,297.0,13.9,80.337143,16.672857,27.471429,6.771429,33.3,23.3,4.0,1
4,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,301.9,297.5,12.2,80.46,17.21,28.942857,9.371429,35.0,23.9,5.8,1


We re-index the series for Iquitos so that they start from 0 and our code can run properly

In [74]:
iq_train_index = list(range(len(train_humid_iq)))
iq_test_index = list(range(len(test_humid_iq)))
train_humid_iq.index = iq_train_index
test_humid_iq.index = iq_test_index

In [75]:
X_train_sj['humid_2'] = [humid_n_weeks(k, 2, train_humid_sj)
                          for k in range(len(train_humid_sj))]
X_train_sj['humid_3'] = [humid_n_weeks(k, 3, train_humid_sj) 
                          for k in range(len(train_humid_sj))]
X_train_sj['humid_4'] = [humid_n_weeks(k, 4, train_humid_sj)
                          for k in range(len(train_humid_sj))]

X_test_sj['humid_2'] = [humid_n_weeks(k, 2, test_humid_sj)
                          for k in range(len(test_humid_sj))]
X_test_sj['humid_3'] = [humid_n_weeks(k, 3, test_humid_sj) 
                          for k in range(len(test_humid_sj))]
X_test_sj['humid_4'] = [humid_n_weeks(k, 4, test_humid_sj)
                          for k in range(len(test_humid_sj))]

X_train_iq['humid_2'] = [humid_n_weeks(k, 2, train_humid_iq)
                          for k in range(len(train_humid_iq))]
X_train_iq['humid_3'] = [humid_n_weeks(k, 3, train_humid_iq) 
                          for k in range(len(train_humid_iq))]
X_train_iq['humid_4'] = [humid_n_weeks(k, 4, train_humid_iq)
                          for k in range(len(train_humid_iq))]

X_test_iq['humid_2'] = [humid_n_weeks(k, 2, test_humid_iq)
                          for k in range(len(test_humid_iq))]
X_test_iq['humid_3'] = [humid_n_weeks(k, 3, test_humid_iq) 
                          for k in range(len(test_humid_iq))]
X_test_iq['humid_4'] = [humid_n_weeks(k, 4, test_humid_iq)
                          for k in range(len(test_humid_iq))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http:/

Let's check that this f.e. worked as intended:

In [76]:
X_test_sj.head(30)

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_max_air_temp_k,...,station_max_temp_c,station_min_temp_c,station_precip_mm,city_sj,precip_2,precip_3,precip_4,humid_2,humid_3,humid_4
0,2008,18,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,301.1,...,33.3,21.7,75.2,1,0.0,0.0,0.0,0.0,0.0,0.0
1,2008,19,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,298.557143,300.8,...,30.0,22.2,34.3,1,0.0,0.0,0.0,0.0,0.0,0.0
2,2008,20,-0.0015,,0.151083,0.091529,3.66,299.455714,299.357143,302.2,...,32.8,22.8,3.0,1,78.6,0.0,0.0,78.781429,0.0,0.0
3,2008,21,,-0.019867,0.124329,0.125686,0.0,299.69,299.728571,303.0,...,33.3,24.4,0.3,1,12.56,78.6,0.0,78.23,78.781429,0.0
4,2008,22,0.0568,0.039833,0.062267,0.075914,0.76,299.78,299.671429,302.3,...,33.3,23.3,84.1,1,3.66,12.56,78.6,78.27,78.23,78.781429
5,2008,23,-0.044,-0.030467,0.132,0.083529,71.17,299.768571,299.728571,301.9,...,32.8,25.0,27.7,1,0.0,3.66,12.56,73.015714,78.27,78.23
6,2008,24,-0.0443,-0.024925,0.132271,0.159157,48.99,300.062857,300.007143,302.4,...,31.1,23.3,91.7,1,0.76,0.0,3.66,74.084286,73.015714,78.27
7,2008,25,,0.08215,0.144371,0.116729,30.81,300.484286,300.578571,303.5,...,34.4,24.4,0.3,1,71.17,0.76,0.0,76.557143,74.084286,73.015714
8,2008,26,0.0108,0.0499,0.100571,0.117329,8.02,300.601429,300.621429,302.5,...,32.8,23.9,28.7,1,48.99,71.17,0.76,76.844286,76.557143,74.084286
9,2008,27,0.072667,0.10666,0.155429,0.1649,17.52,300.497143,300.528571,302.3,...,31.1,25.0,2.9,1,30.81,48.99,71.17,76.87,76.844286,76.557143


## Model with city separation

In [79]:
en_sj = Pipeline([
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('en', ElasticNet(14))
])
en_sj_res = cross_validate(
    estimator = en_sj,
    X = X_train_sj,
    y = y_train_sj,
    cv = TimeSeriesSplit(n_splits = 10),
    scoring = 'neg_mean_absolute_error',
    n_jobs = -1
)
en_score_sj = np.mean(en_sj_res['test_score'])
en_score_sj

-33.03588533477845

In [82]:
en_iq = Pipeline([
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('en', ElasticNet(5.5))
])
en_iq_res = cross_validate(
    estimator = en_iq,
    X = X_train_iq,
    y = y_train_iq,
    cv = TimeSeriesSplit(n_splits = 10),
    scoring = 'neg_mean_absolute_error',
    n_jobs = -1
)
en_score_iq = np.mean(en_iq_res['test_score'])
en_score_iq

-6.430553285279375

In [81]:
en_tot_score = sj_ratio * en_score_sj + (1 - sj_ratio) * en_score_iq
en_tot_score

-23.533981031385924

Let's see if a polynomial kernel improves this

In [53]:
poly_sj = Pipeline([
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(2)),
    ('en', ElasticNet(14))
])
poly_sj_res = cross_validate(
    estimator = poly_sj,
    X = X_train_sj,
    y = y_train_sj,
    cv = TimeSeriesSplit(n_splits = 10),
    scoring = 'neg_mean_absolute_error',
    n_jobs = -1
)
poly_score_sj = np.mean(poly_sj_res['test_score'])
poly_score_sj

-32.805269599526284

In [55]:
poly_iq = Pipeline([
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(2)),
    ('en', ElasticNet(3))
])
poly_iq_res = cross_validate(
    estimator = poly_iq,
    X = X_train_iq,
    y = y_train_iq,
    cv = TimeSeriesSplit(n_splits = 10),
    scoring = 'neg_mean_absolute_error',
    n_jobs = -1
)
poly_score_iq = np.mean(poly_iq_res['test_score'])
poly_score_iq

-6.845977615497036

In [56]:
poly_tot_score = sj_ratio * poly_score_sj + (1 - sj_ratio) * poly_score_iq
poly_tot_score

-23.53409389094441

This is a slight improvement, so let's build a submission form that:

In [57]:
poly_sj.fit(X_train_sj, y_train_sj)
y_pred_sj = poly_sj.predict(X_test_sj)

In [58]:
poly_iq.fit(X_train_iq, y_train_iq)
y_pred_iq = poly_iq.predict(X_test_iq)

In [59]:
y_pred = np.concatenate((y_pred_sj, y_pred_iq))

In [60]:
split_en_sub = pd.read_csv(join(DATA_PATH, 'submission_format.csv'))

In [61]:
split_en_sub['total_cases'] = np.round(y_pred).astype(int)

In [63]:
split_en_sub.to_csv('../models/split_poly_en_fe.csv', index = None)

## Take aways

27.27 on leaderboard, this is barely an improvement.
We'll attempt to use models explicitely designed for time series next