# **Project Pipeline**

| **Steps**                                              | **Script files**                          |
|-----------------------------------------------------------|-------------------------------------------|
| 1) Read and pre-process data                              | pre_processing.py                         |
| 2) Feature engineering                                    | feature_engineering.py                    |
| 3) Train models                                           | model_training.py, <br>tree_model_training.py |
| 4) Predict on test_features <br>and write submission file | final_predict.py                          |

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scripts.pre_processing as pp
import scripts.feature_engineering as fe
import scripts.model_evaluation as me
from scripts.model_training import Model
import scripts.tree_model_training as tm
from scripts.model_evaluation import regression_evaluation
import scripts.final_predict as fp
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit


## Pre-process data

In [2]:
# Load the three raw CSV inputs: training features, training labels, test features.
train_features, train_target, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/dengue_features_train.csv',
        './data/dengue_labels_train.csv',
        './data/dengue_features_test.csv',
    )
)

In [3]:
# Merge the feature and label tables into one frame.
# inc_test=False is passed through to the helper (presumably leaves the test rows out
# -- see scripts/pre_processing.py).
data = pp.merge_data(train_features, train_target, test_features, inc_test=False)

# Pre-process and produce one frame per city code ('iq' and 'sj').
train_iq = pp.pre_process(data, 'iq', remove_anomalies=True, inc_test=False)
train_sj = pp.pre_process(data, 'sj', remove_anomalies=True, inc_test=False)

# Fail early, with a readable message, if pre-processing left any missing values.
assert not train_iq.isnull().any().any(), 'train_iq still contains missing values'
assert not train_sj.isnull().any().any(), 'train_sj still contains missing values'
print(f'train_iq shape: {train_iq.shape}')
print(f'train_sj shape: {train_sj.shape}')

train_iq shape: (518, 24)
train_sj shape: (928, 24)


In [4]:
# Separate the predictors (X) from the target column 'total_cases' (y) for each city.
X_iq = train_iq.drop(columns='total_cases')
y_iq = train_iq['total_cases']
X_sj = train_sj.drop(columns='total_cases')
y_sj = train_sj['total_cases']

# Features and target must have the same number of rows for both cities.
assert len(X_iq) == len(y_iq)
assert len(X_sj) == len(y_sj)

In [5]:
# Feature engineering, applied in the same order to both cities:
# fe.cyclical_encode_date -> fe.shift_features(periods=1) -> fe.drop_date
# (what each step does is defined in scripts/feature_engineering.py).
# NOTE: X_iq / X_sj are rebound to their own transformed versions, so re-running
# this cell without first re-running the X/y split applies the steps a second time.
X_iq = (
    X_iq
    .pipe(fe.cyclical_encode_date)
    .pipe(fe.shift_features, periods=1)
    .pipe(fe.drop_date)
)
X_sj = (
    X_sj
    .pipe(fe.cyclical_encode_date)
    .pipe(fe.shift_features, periods=1)
    .pipe(fe.drop_date)
)

## Train model

In [6]:
# Baseline: predict the mean observed case count for every row.
# No hold-out split is made here, so the "test" arguments reuse the training target
# and the train and test metrics are identical by construction.

# Baseline model predictions for Iquitos (iq)
bl_pred_train = np.full(len(y_iq), np.mean(y_iq))
bl_pred_test = np.full(len(y_iq), np.mean(y_iq))
print('For Iquitos: ')
regression_evaluation(y_iq, y_iq, bl_pred_train, bl_pred_test)

# Baseline model predictions for San Juan (sj) -- the 'sj' city code is San Juan
bl_pred_train = np.full(len(y_sj), np.mean(y_sj))
bl_pred_test = np.full(len(y_sj), np.mean(y_sj))
print('For San Juan: ')
# Left as the last expression so the returned metrics are also displayed by the cell.
regression_evaluation(y_sj, y_sj, bl_pred_train, bl_pred_test)

For Iquitos: 

    Evaluation metrics:
        RMSE train: 9.072430947354988
        RMSE test: 9.072430947354988
        MAE train: 6.220546801627883
        MAE test: 6.220546801627883 
    
For San Jose: 

    Evaluation metrics:
        RMSE train: 39.06484683794444
        RMSE test: 39.06484683794444
        MAE train: 24.198600995838284
        MAE test: 24.198600995838284 
    


(39.06484683794444, 39.06484683794444, 24.198600995838284, 24.198600995838284)

In [7]:
# XGBoost run for the 'iq' city data.
# X and y are bound as notebook-level names first; the value returned by
# tm.xg_model is the cell's last expression, so it is displayed as the output.
X = X_iq
y = y_iq
tm.xg_model(X, y)

Unnamed: 0,TSS iteration,rmse_test,rmse_train,mae_train,mae_test,learning_rate,n_estimators,max_depth,subsample,colsample_bytree,reg_lambda
0,1,8.968666,0.185971,0.125537,6.386428,0.1,100,5,1.0,1.0,2
1,1,8.967792,0.008658,0.005863,6.387295,0.1,200,5,1.0,1.0,2
2,2,11.877454,0.507538,0.356672,7.035071,0.1,100,5,1.0,1.0,2
3,2,11.875514,0.108966,0.076362,7.050161,0.1,200,5,1.0,1.0,2


In [8]:
# XGBoost run for the 'sj' city data.
# X and y are rebound to the sj features/target; the value returned by
# tm.xg_model is the cell's last expression, so it is displayed as the output.
X = X_sj
y = y_sj
tm.xg_model(X, y)

Unnamed: 0,TSS iteration,rmse_test,rmse_train,mae_train,mae_test,learning_rate,n_estimators,max_depth,subsample,colsample_bytree,reg_lambda
0,1,43.618572,1.682272,1.185423,26.427041,0.1,100,5,1.0,1.0,2
1,1,43.67337,0.256781,0.174029,26.518406,0.1,200,5,1.0,1.0,2
2,2,37.038379,5.631207,3.957482,27.439259,0.1,100,5,1.0,1.0,2
3,2,37.290659,1.717175,1.203274,27.637586,0.1,200,5,1.0,1.0,2


## Optimise models

## Predict on test data with chosen model and write file

In [9]:
# Run the same pipeline on the data set that includes test_features (inc_test=True),
# then split the engineered features back into a training part and a test part.
# Separate names (data_comb, comb_iq, comb_sj) are used so the train-only `data`,
# `train_iq` and `train_sj` defined by the earlier cells are not overwritten.
data_comb = pp.merge_data(train_features, train_target, test_features, inc_test=True)

# Run processing and split by city
comb_iq = pp.pre_process(data_comb, 'iq', remove_anomalies=True, inc_test=True)
comb_sj = pp.pre_process(data_comb, 'sj', remove_anomalies=True, inc_test=True)
assert not comb_iq.isnull().any().any(), 'comb_iq still contains missing values'
assert not comb_sj.isnull().any().any(), 'comb_sj still contains missing values'

# Split data into X and y.
# Rows with total_cases >= 0 are taken as training rows and rows with total_cases < 0
# as test rows (presumably the unlabelled test rows get a negative placeholder
# upstream -- confirm in scripts/pre_processing.py).
# Selecting with a one-element list keeps each y as a single-column DataFrame.
X_iq_comb = comb_iq.drop(columns='total_cases')
y_train_iq = comb_iq.loc[comb_iq['total_cases'] >= 0, ['total_cases']]
y_test_iq = comb_iq.loc[comb_iq['total_cases'] < 0, ['total_cases']]
X_sj_comb = comb_sj.drop(columns='total_cases')
y_train_sj = comb_sj.loc[comb_sj['total_cases'] >= 0, ['total_cases']]
y_test_sj = comb_sj.loc[comb_sj['total_cases'] < 0, ['total_cases']]

# Run feature engineering
X_iq_comb = fe.cyclical_encode_date(X_iq_comb)
X_sj_comb = fe.cyclical_encode_date(X_sj_comb)
X_iq_comb = fe.shift_features(X_iq_comb, periods=1)
X_sj_comb = fe.shift_features(X_sj_comb, periods=1)
X_iq_comb = fe.drop_date(X_iq_comb)
X_sj_comb = fe.drop_date(X_sj_comb)

# Split the engineered features by position: leading rows -> train, trailing rows -> test.
n_test_iq = y_test_iq.shape[0]
n_test_sj = y_test_sj.shape[0]
# NOTE(review): the training slice stops at (rows - n_test + 1) while the test slice
# starts at (rows - n_test), so the two slices share one row. The length assertions
# below can only hold if feature engineering returned one row fewer than it received
# (possibly fe.shift_features dropping a row -- confirm), in which case X_train_* and
# y_train_* may be offset by one row. The slicing is kept exactly as it was; verify
# against scripts/feature_engineering.py before changing it.
X_train_iq = X_iq_comb.iloc[:(X_iq_comb.shape[0] - n_test_iq + 1), :]
X_train_sj = X_sj_comb.iloc[:(X_sj_comb.shape[0] - n_test_sj + 1), :]
X_test_iq = X_iq_comb.iloc[(X_iq_comb.shape[0] - n_test_iq):, :]
X_test_sj = X_sj_comb.iloc[(X_sj_comb.shape[0] - n_test_sj):, :]
assert X_train_iq.shape[0] == y_train_iq.shape[0]
assert X_train_sj.shape[0] == y_train_sj.shape[0]


In [10]:
# Select the model and hyperparameters used for the final prediction.
# The data splits (X_test_*, X_train_*, y_train_*) are used exactly as produced by the
# previous cell, so they are not re-bound here.

# Hyperparameters used as the starting point for both cities.
params = dict(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=5,
    subsample=1.0,
    colsample_bytree=1.0,
    reg_lambda=2,
)

# Iquitos, iq
model_iq = 'XGBRegressor'
# Independent copy: changing one city's settings must not silently change the other's.
params_iq = dict(params)

# San Juan, sj
model_sj = 'XGBRegressor'
params_sj = dict(params)

In [11]:
# Run the final prediction step once per city.
# fp.final_predict receives, in order: test features, training features, training
# target, followed by the city code, the model name and its hyperparameters.
final_iq = fp.final_predict(
    X_test_iq, X_train_iq, y_train_iq,
    city='iq', model=model_iq, params=params_iq,
)
final_sj = fp.final_predict(
    X_test_sj, X_train_sj, y_train_sj,
    city='sj', model=model_sj, params=params_sj,
)

# fp.write_submission(final_iq, final_sj) is not called here; the submission file is
# assembled by hand in the following cell.

In [12]:
# Stack the per-city predictions into one frame: sj rows first, then iq rows.
final = pd.concat([final_sj, final_iq], axis=0)

# Discard the weekofyear/year columns carried through the pipeline and take them
# from the raw test file instead.
# NOTE(review): this assignment aligns on index labels, so it relies on `final`
# carrying the same index labels as `test_features` -- confirm in fp.final_predict.
final = final.drop(['weekofyear', 'year'], axis=1)
for date_col in ('weekofyear', 'year'):
    final[date_col] = test_features.loc[:, [date_col]]

# Keep only the submission columns, in this order.
submission_cols = ['city', 'year', 'weekofyear', 'total_cases']
final = final.loc[:, submission_cols]
# astype(int) truncates any fractional part of the predictions (it does not round).
final['total_cases'] = final['total_cases'].astype(int)
print('Writing submission file to folder: ')
final.to_csv('for_submission.csv', index=False)


Writing submission file to folder: 


# 