# Simple iterations on the baseline models

*Anders Poirel - 13-02-2020*

Goals:
- build separate models by city
- drop correlated features
- observe the results from there

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from os.path import join

In [2]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import (cross_validate, TimeSeriesSplit, 
                                     RandomizedSearchCV)

In [3]:
# Relative path of the directory the raw CSV files are read from
DATA_PATH = '../data/raw/'

## Acquiring the data

In [4]:
# Read the raw test features, training features and training labels.
# The `_o` suffix marks the frames exactly as loaded from disk.
X_test_o, X_train_o, y_train_o = (
    pd.read_csv(join(DATA_PATH, csv_name))
    for csv_name in (
        'dengue_features_test.csv',
        'dengue_features_train.csv',
        'dengue_labels_train.csv',
    )
)

### Encodings

In [10]:
# One-hot encode `city`, dropping the first level's indicator column.
# Later cells select rows through the remaining `city_sj` column.
encode_city = dict(columns=['city'], drop_first=True)
X_train = pd.get_dummies(X_train_o, **encode_city)
X_test = pd.get_dummies(X_test_o, **encode_city)

### Dropping unnecessary / correlated columns

In [11]:
# Remove `week_start_date` from both feature frames.
X_train = X_train.drop(columns=['week_start_date'])
X_test = X_test.drop(columns=['week_start_date'])

# Regression target used by the models trained on all cities together.
y_train = y_train_o.loc[:, 'total_cases']

In [12]:
# Columns removed from the training features in this "correlated columns" step.
correlated_cols = [
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_dew_point_temp_k',
    'reanalysis_tdtr_k',
]
X_train = X_train.drop(columns=correlated_cols)

In [13]:
# Remove the same three columns from the test features so both frames keep
# an identical set of feature columns.
correlated_cols = [
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_dew_point_temp_k',
    'reanalysis_tdtr_k',
]
X_test = X_test.drop(columns=correlated_cols)

### City separation

In [14]:
# Row masks built from the one-hot `city_sj` indicator (1 -> sj, 0 -> iq).
train_is_sj = X_train['city_sj'] == 1
train_is_iq = X_train['city_sj'] == 0
test_is_sj = X_test['city_sj'] == 1
test_is_iq = X_test['city_sj'] == 0

X_train_sj, X_train_iq = X_train[train_is_sj], X_train[train_is_iq]
X_test_sj, X_test_iq = X_test[test_is_sj], X_test[test_is_iq]

# Targets are split on the raw `city` label carried by the labels frame.
y_train_sj = y_train_o.loc[y_train_o['city'] == 'sj', 'total_cases']
y_train_iq = y_train_o.loc[y_train_o['city'] == 'iq', 'total_cases']

Ratio of San Juan (`sj`) instances to the total number of training instances

In [15]:
# Share of the training rows that belong to `sj`; used below to weight
# the per-city cross-validation scores.
n_sj, n_total = len(y_train_sj), len(y_train)
sj_ratio = n_sj / n_total

## Model w/o city separation

In [77]:
# Standardise -> impute (SimpleImputer defaults) -> Ridge with alpha = 1400.
ridge = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('ridge', Ridge(alpha=1400)),
])

# 10 time-ordered splits, scored with negated mean absolute error.
ridge_res = cross_validate(
    ridge,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average over the folds; the value is negative, closer to 0 is better.
ridge_score = ridge_res['test_score'].mean()
ridge_score

-25.7426637592478

Ridge seems to like high penalties. Cranking up $\alpha$ to 1400 decreased the MAE from 35 to ~25.7

Let's try LASSO to see if feature selection is more important

In [93]:
# Same preprocessing as the ridge model, with an L1-penalised regressor (alpha = 6.65).
lasso = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('lasso', Lasso(alpha=6.65)),
])

# 10 time-ordered splits, scored with negated mean absolute error.
lasso_res = cross_validate(
    lasso,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE).
lasso_score = lasso_res['test_score'].mean()
lasso_score

-28.9878546196094

We seem to be having difficulties getting under 28 MAE with the LASSO. Try ElasticNet now

In [104]:
# ElasticNet (alpha = 5, default l1_ratio) on scaled, imputed features.
en = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('en', ElasticNet(alpha=5)),
])

# 10 time-ordered splits, scored with negated mean absolute error.
en_res = cross_validate(
    en,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE).
en_score = en_res['test_score'].mean()
en_score

-27.582635929294337

We can't seem to improve on ElasticNet, so we try degree-2 polynomial features with Ridge

In [118]:
# Ridge (alpha = 5600) on degree-2 polynomial expansions of the scaled,
# imputed features.
ridgep = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(degree=2)),
    ('ridge', Ridge(alpha=5600)),
])

# 10 time-ordered splits, scored with negated mean absolute error.
ridgep_res = cross_validate(
    ridgep,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE).
ridgep_score = ridgep_res['test_score'].mean()
ridgep_score

-24.94206254674812

This still doesn't beat elasticnet with polynomial features in `models-02`

In [132]:
# ElasticNet (alpha = 5) on degree-2 polynomial expansions of the scaled,
# imputed features.
enp = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(degree=2)),
    ('en', ElasticNet(alpha=5)),
])

# 10 time-ordered splits, scored with negated mean absolute error.
enp_res = cross_validate(
    enp,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE).
enp_score = enp_res['test_score'].mean()
enp_score

-24.58044699431419

STILL no improvement. Let's move on to separating by city

## Model with city separation

In [163]:
# City-specific ElasticNet for the `sj` rows only (alpha = 7).
en_sj = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('en', ElasticNet(alpha=7)),
])

# 10 time-ordered splits over the sj subset, scored with negated MAE.
en_sj_res = cross_validate(
    en_sj,
    X_train_sj,
    y_train_sj,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE) for sj.
en_score_sj = en_sj_res['test_score'].mean()
en_score_sj

-32.890977114750974

In [185]:
# City-specific ElasticNet for the `iq` rows only (alpha = 5.5).
en_iq = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('en', ElasticNet(alpha=5.5)),
])

# 10 time-ordered splits over the iq subset, scored with negated MAE.
en_iq_res = cross_validate(
    en_iq,
    X_train_iq,
    y_train_iq,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE) for iq.
en_score_iq = en_iq_res['test_score'].mean()
en_score_iq

-6.430553285279375

In [186]:
# Combine the two per-city scores, weighting each by its share of the
# training rows (sj_ratio for sj, the remainder for iq).
en_tot_score = (
    en_score_sj * sj_ratio
    + en_score_iq * (1 - sj_ratio)
)
en_tot_score

-23.44082574708255

Let's see if polynomial features improve this

In [16]:
# sj-only ElasticNet (alpha = 14) on degree-2 polynomial features.
poly_sj = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(degree=2)),
    ('en', ElasticNet(alpha=14)),
])

# 10 time-ordered splits over the sj subset, scored with negated MAE.
poly_sj_res = cross_validate(
    poly_sj,
    X_train_sj,
    y_train_sj,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE) for sj.
poly_score_sj = poly_sj_res['test_score'].mean()
poly_score_sj

-32.80471491375509

In [45]:
# iq-only ElasticNet (alpha = 2) on degree-2 polynomial features.
poly_iq = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('impute_m', SimpleImputer()),
    ('poly_f', PolynomialFeatures(degree=2)),
    ('en', ElasticNet(alpha=2)),
])

# 10 time-ordered splits over the iq subset, scored with negated MAE.
poly_iq_res = cross_validate(
    poly_iq,
    X_train_iq,
    y_train_iq,
    scoring='neg_mean_absolute_error',
    cv=TimeSeriesSplit(n_splits=10),
    n_jobs=-1,
)

# Average fold score (negative MAE) for iq.
poly_score_iq = poly_iq_res['test_score'].mean()
poly_score_iq

-7.176574691549268

In [46]:
# Combine the two per-city polynomial scores, weighting each by its share
# of the training rows (sj_ratio for sj, the remainder for iq).
poly_tot_score = (
    poly_score_sj * sj_ratio
    + poly_score_iq * (1 - sj_ratio)
)
poly_tot_score

-23.65180769153873

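This is a slight improvement over the single-model polynomial ElasticNet (MAE ≈ 23.65 vs. ≈ 24.58), though marginally worse than the per-city ElasticNet without polynomial features (≈ 23.44). Let's build a submission from it: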

In [24]:
# Refit the sj polynomial pipeline on every sj training row, then predict
# the sj test rows (Pipeline.fit returns the fitted pipeline itself).
y_pred_sj = poly_sj.fit(X_train_sj, y_train_sj).predict(X_test_sj)

In [47]:
# Refit the iq polynomial pipeline on every iq training row, then predict
# the iq test rows (Pipeline.fit returns the fitted pipeline itself).
y_pred_iq = poly_iq.fit(X_train_iq, y_train_iq).predict(X_test_iq)

In [48]:
# Put each city's predictions back at the positions its rows occupy in
# X_test instead of assuming "all sj rows first, then all iq rows".
# X_test_sj / X_test_iq keep X_test's row labels, so their indexes say
# where every prediction belongs.
# NOTE(review): this assumes submission_format.csv lists rows in the same
# order as dengue_features_test.csv - confirm against the data files.
y_pred_by_row = pd.Series(np.nan, index=X_test.index, dtype=float)
y_pred_by_row.loc[X_test_sj.index] = y_pred_sj
y_pred_by_row.loc[X_test_iq.index] = y_pred_iq

# Fail loudly rather than submit a file with unfilled rows.
if y_pred_by_row.isna().any():
    raise ValueError('some test rows received no prediction')

# Downstream cells expect a plain NumPy array.
y_pred = y_pred_by_row.to_numpy()

In [33]:
# Load the submission template; its `total_cases` column is filled in below.
submission_format_path = join(DATA_PATH, 'submission_format.csv')
split_en_sub = pd.read_csv(submission_format_path)

In [49]:
# Case counts cannot be negative, but a linear model's output can be:
# round to the nearest integer, floor at 0, then cast for the submission.
split_en_sub['total_cases'] = np.clip(np.round(y_pred), 0, None).astype(int)

In [51]:
# Write the submission without the DataFrame's row index column.
submission_path = '../models/split_poly_en.csv'
split_en_sub.to_csv(submission_path, index=False)

## Takeaways

27.27 on leaderboard, not much of an improvement!