# 1.Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [90]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

In [91]:
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)

In [None]:
import xgboost as xgb

# 2.Functions

### Loading data and splitting X and y:

In [92]:
def get_train_data(features_dir, labels_dir):
    """
    Load the training features and labels and split them into X and y.

    Input:
        features_dir: path to the csv file with the training features.
        labels_dir: path to the csv file with the training labels.
                    After merging, a 'total_cases' column must be present.

    Output:
         X, y - the merged table without 'total_cases', and the
                'total_cases' column on its own.

    """
    # merge() without explicit keys joins on every column name the two
    # tables have in common.
    merged = pd.read_csv(features_dir).merge(pd.read_csv(labels_dir))

    y = merged['total_cases']
    X = merged.drop(columns='total_cases')
    print("Loading of training data finished.\n")

    return X, y

### Building features from 'week_start_date' (year, month, day of year, week of year):

In [116]:
def get_time_features(X, time_column='week_start_date', format='%Y-%m-%d'):
    """
    Add calendar features derived from a date column.

    Input:
        X (pd.DataFrame) - data containing `time_column`; it is not modified.
        time_column (str) - name of the column holding the dates.
        format (str) - date format passed to pd.to_datetime.

    Output:
        A copy of X in which `time_column` is converted to datetime and the
        columns 'year', 'month', 'dayofyear' and 'weekofyear' are set from it.
        Columns of X that already carry one of these names are overwritten.
    """
    enriched = X.copy()
    enriched[time_column] = pd.to_datetime(enriched[time_column], format=format)

    # Each new column is the timestamp attribute of the same name.
    for part in ('year', 'month', 'dayofyear', 'weekofyear'):
        enriched[part] = enriched[time_column].apply(
            lambda stamp, attr=part: getattr(stamp, attr)
        )

    return enriched

### Building lagged features based on all numerical features

In [94]:
def make_lags(X, columns, lags=8):
    """
    Append lagged copies of the given columns to X.

    Input:
        X (pd.DataFrame) - source data; rows are assumed to be in time order.
        columns (List(str)) - names of the columns to lag.
        lags (int) - number of lags; lags 1..`lags` are built for each column.

    Output:
        X with the extra columns '<column>_lag_<k>' appended on the right,
        grouped by column and ordered by increasing lag. The first k rows of
        a lag-k column are NaN.
    """
    pairs = [(column, lag) for column in columns for lag in range(1, lags + 1)]
    names = [column + '_lag_{}'.format(lag) for column, lag in pairs]

    shifted = {
        name: X[column].shift(lag)
        for name, (column, lag) in zip(names, pairs)
    }

    # Build all lag columns at once and attach them in a single concat.
    lag_frame = pd.DataFrame(shifted, columns=names, index=X.index)
    return pd.concat([X, lag_frame], axis=1)

### Building seasonality features (sin & cos based on timestamps):

In [120]:
def get_seasonality(X, time_column='week_start_date', format='%Y-%m-%d'):
    """
    Append calendar Fourier (sin/cos) terms computed from a date column.

    Input:
        X (pd.DataFrame) - data containing `time_column`; it is not modified.
        time_column (str) - name of the column holding the dates.
        format (str) - date format passed to pd.to_datetime.

    Output:
        A copy of X with `time_column` converted to datetime, followed by the
        CalendarFourier terms for the yearly, monthly, weekly and daily
        frequencies (in that order).
    """
    X_new = X.copy()
    X_new[time_column] = pd.to_datetime(X_new[time_column], format=format)

    # (frequency alias, Fourier order) for each seasonal block, in the
    # order the blocks appear in the output.
    fourier_specs = [('A', 5), ('M', 6), ('W', 10), ('D', 3)]

    blocks = [X_new]
    for freq, order in fourier_specs:
        terms = CalendarFourier(freq=freq, order=order).in_sample(X_new[time_column])
        # Re-index the terms with X's index so the column-wise concat lines up.
        blocks.append(terms.set_index(X_new.index))

    return pd.concat(blocks, axis=1)

### Building a SimpleImputer() preprocessor from sklearn:

In [97]:
def get_imputer(X):
    """
    Build an (unfitted) preprocessor that imputes the numeric columns of X.

    Input:
        X (pd.DataFrame) - data whose numeric column names are selected.

    Output:
        ColumnTransformer applying a default SimpleImputer to the numeric
        columns of X. No other transformer is registered, so with the
        ColumnTransformer defaults the remaining columns are not passed on.
    """
    numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()

    impute_step = ('imputer', SimpleImputer())
    numeric_pipeline = Pipeline(steps=[impute_step])

    return ColumnTransformer(
        transformers=[('num', numeric_pipeline, numeric_columns)]
    )

### Formatting output for submission:

In [98]:
def get_test_output(X_test, predictions, indexes = ['city', 'year', 'weekofyear'], exponentiate=False, file_dir = '../data/output.csv'):
    """
    Format predictions for submission and optionally save them as a csv file.

    Input:
        X_test (pd.DataFrame) - test data; must contain every column named
                        in `indexes`.
        predictions - one prediction per row of X_test, in the same row order.
        indexes (List(str)) - columns of X_test used as the index of the output,
                        by default ['city', 'year', 'weekofyear'].
        exponentiate (bool) - if True, np.expm1 (the inverse of np.log1p) is
                        applied to the predictions before rounding.
        file_dir (str) - path (including file name) of the output csv file.
                         Give an empty string if saving the output is not desired.

    Returns:
        output_df (pd.DataFrame) - predictions rounded to integers in a
                        'total_cases' column, indexed by `indexes`.
    """
    # Work on a copy so the (shared) default list is never handed to pandas.
    index_cols = list(indexes)

    # 1. Exponentiate the predictions if needed, then round to whole cases.
    values = np.expm1(predictions) if exponentiate else predictions
    result = np.round(values).astype(int)

    # 2. Make the dictionary for the output dataframe format.
    out_dict = {col: X_test[col] for col in index_cols}
    out_dict['total_cases'] = result
    output_df = pd.DataFrame.from_dict(out_dict).set_index(index_cols)

    # 3. Save predictions in a csv file ready for submission.
    if file_dir:
        output_df.to_csv(file_dir)
        # Report only after the file has actually been written.
        print("Saved predictions in competition file format in path {} . \n".format(file_dir))
        print('\n')

    return output_df
    

### Splitting train & test data by city:

In [99]:
def split_by_city(from_dir = '../data/01_raw/', to_dir = '../data/02_intermediate/'):
    """
    Split the train features, train labels and test features by city.

    Input:
        from_dir (str) - directory prefix of the input csv files; it is
                         concatenated with the file name as-is, so it must end
                         with a path separator.
        to_dir (str) - directory prefix of the output csv files (same rule).

    Output:
        None. For each input file '<name>.csv' the files '<name>_sj.csv' and
        '<name>_iq.csv' are written to `to_dir`, holding the rows whose 'city'
        column equals 'sj' and 'iq' respectively.
    """
    stems = ('dengue_features_train', 'dengue_labels_train', 'dengue_features_test')

    # Read every input first, so nothing is written if one of them is missing.
    tables = [(stem, pd.read_csv(from_dir + stem + '.csv')) for stem in stems]

    for stem, table in tables:
        for city in ('sj', 'iq'):
            table[table.city == city].to_csv(to_dir + stem + '_' + city + '.csv', index=False)

    return None

### Binding city outputs together:

In [100]:
def bind_city_outputs(raw_dir = '../data/01_raw/', output_dir = '../data/07_model_output/'):
    """
    Combine the per-city prediction files into one submission file.

    Input:
        raw_dir (str) - directory prefix holding 'dengue_features_test.csv';
                        concatenated with the file name as-is.
        output_dir (str) - directory prefix holding 'output_sj.csv' and
                        'output_iq.csv'; the combined file is written here too.

    Output:
        None. Writes 'output_two_cities.csv' to `output_dir`: the 'city',
        'year' and 'weekofyear' columns of the raw test file merged with the
        stacked per-city outputs on their shared column names.
    """
    raw_test = pd.read_csv(raw_dir + 'dengue_features_test.csv')

    per_city = [
        pd.read_csv(output_dir + file_name)
        for file_name in ('output_sj.csv', 'output_iq.csv')
    ]
    stacked = pd.concat(per_city)

    # The raw test file is the left side of the merge, so it fixes the row order.
    keys = raw_test[['city', 'year', 'weekofyear']]
    combined = keys.merge(stacked)
    combined.to_csv(output_dir + 'output_two_cities.csv', index=False)

    return None

## 3. Pipeline

In [101]:
# Feature columns for which lagged copies are built (passed to make_lags).
lag_cols = [
    'ndvi_ne',
    'ndvi_nw',
    'ndvi_se',
    'ndvi_sw',
    'precipitation_amt_mm',
    'reanalysis_air_temp_k',
    'reanalysis_avg_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_tdtr_k',
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
    'station_precip_mm',
]

In [102]:
# Write the per-city ('sj' / 'iq') train-feature, train-label and test-feature
# csv files, using the default input and output directories.
split_by_city()

In [None]:
# End-to-end run for city 'sj': load, build features, fit, predict, save.
# Paths of the per-city files written by split_by_city(); the predictions go
# to output_sj.csv, which bind_city_outputs() reads later.
features_dir = '../data/02_intermediate/dengue_features_train_sj.csv'
labels_dir = '../data/02_intermediate/dengue_labels_train_sj.csv'
test_dir = '../data/02_intermediate/dengue_features_test_sj.csv'
test_output_dir = '../data/07_model_output/output_sj.csv'

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
# Random forest with the absolute-error split criterion; no random_state is
# set, so results can differ between runs.
model = RandomForestRegressor(criterion='absolute_error', n_estimators=200)
#model = xgb.XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=500, n_jobs=-1)

# preprocessing & training
# Feature building order: calendar features -> lags -> Fourier seasonality.
X_train = get_time_features(X)
X_train = make_lags(X_train, lag_cols, lags=8)
X_train = get_seasonality(X_train, time_column='week_start_date', format='%Y-%m-%d')
my_imputer = get_imputer(X_train)
# The imputer is built on the numeric columns only, so num_cols is used to
# label its array output (assumes the imputer keeps every numeric column).
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
X_train = my_imputer.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=num_cols)

model.fit(X_train, y)

# test preprocessing & predict
# Same feature steps as for training.
# NOTE(review): the lags are shifted within the test frame only, so its first
# rows have NaN lag values that the imputer then fills in.
X_test = pd.read_csv(test_dir)
X_test_processed = get_time_features(X_test)
X_test_processed = make_lags(X_test_processed, lag_cols, lags=8)
X_test_processed = get_seasonality(X_test_processed, time_column='week_start_date', format='%Y-%m-%d')
num_cols = list(X_test_processed.select_dtypes(include=[np.number]).columns)
# Re-use the imputer fitted on the training data (transform only, no refit).
X_test_processed = my_imputer.transform(X_test_processed)
X_test_processed = pd.DataFrame(X_test_processed, columns=num_cols)

predictions = model.predict(X_test_processed)

# format output
# The raw X_test (not the processed frame) supplies the index columns.
get_test_output(X_test=X_test,
                predictions=predictions,
                file_dir=test_output_dir)

In [None]:
# End-to-end run for city 'iq': load, build features, fit, predict, save.
# Paths of the per-city files written by split_by_city(); the predictions go
# to output_iq.csv, which bind_city_outputs() reads later.
features_dir = '../data/02_intermediate/dengue_features_train_iq.csv'
labels_dir = '../data/02_intermediate/dengue_labels_train_iq.csv'
test_dir = '../data/02_intermediate/dengue_features_test_iq.csv'
test_output_dir = '../data/07_model_output/output_iq.csv'

X, y = get_train_data(features_dir=features_dir, labels_dir=labels_dir)
# Random forest with the absolute-error split criterion; no random_state is
# set, so results can differ between runs.
model = RandomForestRegressor(criterion='absolute_error', n_estimators=200)
#model = xgb.XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=500, n_jobs=-1)

# preprocessing & training
# Feature building order: calendar features -> lags -> Fourier seasonality.
X_train = get_time_features(X)
X_train = make_lags(X_train, lag_cols, lags=8)
X_train = get_seasonality(X_train, time_column='week_start_date', format='%Y-%m-%d')
my_imputer = get_imputer(X_train)
# The imputer is built on the numeric columns only, so num_cols is used to
# label its array output (assumes the imputer keeps every numeric column).
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
X_train = my_imputer.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=num_cols)

model.fit(X_train, y)

# test preprocessing & predict
# Same feature steps as for training.
# NOTE(review): the lags are shifted within the test frame only, so its first
# rows have NaN lag values that the imputer then fills in.
X_test = pd.read_csv(test_dir)
X_test_processed = get_time_features(X_test)
X_test_processed = make_lags(X_test_processed, lag_cols, lags=8)
X_test_processed = get_seasonality(X_test_processed, time_column='week_start_date', format='%Y-%m-%d')
num_cols = list(X_test_processed.select_dtypes(include=[np.number]).columns)
# Re-use the imputer fitted on the training data (transform only, no refit).
X_test_processed = my_imputer.transform(X_test_processed)
X_test_processed = pd.DataFrame(X_test_processed, columns=num_cols)

predictions = model.predict(X_test_processed)

# format output
# The raw X_test (not the processed frame) supplies the index columns.
get_test_output(X_test=X_test,
                predictions=predictions,
                file_dir=test_output_dir)

In [130]:
# Merge output_sj.csv and output_iq.csv into output_two_cities.csv, using the
# default raw-data and model-output directories.
bind_city_outputs()