# **Project Pipeline**

| **Steps**                                              | **Script files**                          |
|-----------------------------------------------------------|-------------------------------------------|
| 1) Read and pre-process data                              | pre_processing.py                         |
| 2) Feature engineering                                    | feature_engineering.py                    |
| 3) Train models                                           | model_training.py, <br>tree_model_training.py |
| 4) Predict on test_features <br>and write submission file | final_predict.py                          |

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scripts.pre_processing as pp
import scripts.feature_engineering as fe
import scripts.model_evaluation as me
from scripts.model_training import Model
import scripts.tree_model_training as tm
from scripts.model_evaluation import regression_evaluation
import scripts.final_predict as fp
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit


## Pre-process data

In [2]:
# Read in data files 
train_features = pd.read_csv('./data/dengue_features_train.csv')
train_target = pd.read_csv('./data/dengue_labels_train.csv')
test_features = pd.read_csv('./data/dengue_features_test.csv')

In [3]:
# Merge features and target data
data = pp.merge_data(train_features, train_target, test_features, inc_test=False)

# Run processing and split by city
train_iq = pp.pre_process(data, 'iq', remove_anomalies=True, inc_test=False)
train_sj = pp.pre_process(data, 'sj', remove_anomalies=True, inc_test=False)

# Run checks for missing values
assert train_iq.isnull().any().any() == False
assert train_sj.isnull().any().any() == False
print(f'train_iq shape: {train_iq.shape}') 
print(f'train_sj shape: {train_sj.shape}') 

train_iq shape: (518, 24)
train_sj shape: (928, 24)


In [4]:
# Run feature engineering 
#train_iq = fe.cyclical_encode_date(train_iq)
#train_sj = fe.cyclical_encode_date(train_sj)

train_iq = fe.drop_date(train_iq)
train_sj = fe.drop_date(train_sj)

In [5]:
# Split data into X and y
X_iq = train_iq.drop(labels=['total_cases'], axis=1)
y_iq = train_iq['total_cases']
X_sj = train_sj.drop(labels=['total_cases'], axis=1)
y_sj = train_sj['total_cases']

In [6]:
# Split into training and cross-validation sets
X_train_sj, y_train_sj, X_test_sj, y_test_sj = pp.train_cv_split(train_sj, city='sj')
X_train_iq, y_train_iq, X_test_iq, y_test_iq = pp.train_cv_split(train_iq, city='iq')

# Check compatible sizes for models:
assert len(X_train_sj) == len(y_train_sj)
assert len(X_test_sj) == len(y_test_sj)
assert len(X_train_iq) == len(y_train_iq)
assert len(X_test_iq) == len(y_test_iq)

## Train model

In [7]:
# Baseline model predictions for San Jose (sj)
bl_pred_train = np.tile(np.mean(y_train_sj), len(y_train_sj))
bl_pred_test = np.tile(np.mean(y_test_sj), len(y_test_sj))
regression_evaluation(y_train_sj, y_test_sj, bl_pred_train, bl_pred_test)


    Evaluation metrics:
        RMSE train: 42.77395132680239
        RMSE test: 26.424758032157015
        MAE train: 26.39839012792388
        MAE test: 16.795393417771038 
    


(42.77395132680239, 26.424758032157015, 26.39839012792388, 16.795393417771038)

In [8]:
# Baseline model predictions for Iquitos (Iq)
bl_pred_train = np.tile(np.mean(y_train_iq), len(y_train_iq))
bl_pred_test = np.tile(np.mean(y_test_iq), len(y_test_iq))
regression_evaluation(y_train_iq, y_test_iq, bl_pred_train, bl_pred_test)


    Evaluation metrics:
        RMSE train: 9.072430947354988
        RMSE test: 9.072430947354988
        MAE train: 6.220546801627883
        MAE test: 6.220546801627883 
    


(9.072430947354988, 9.072430947354988, 6.220546801627883, 6.220546801627883)

In [9]:
# Tree model for IQ
tm.rf_model(X_train_iq, y_train_iq, X_test_iq, y_test_iq)


    RandomForestRegressor with params: {}
    Evaluation metrics:
        RMSE train: 2.711479244087937
        RMSE test: 2.711479244087937
        MAE train: 1.6680694980694981
        MAE test: 1.6680694980694981 
    


(2.711479244087937, 2.711479244087937, 1.6680694980694981, 1.6680694980694981)

In [10]:
# Tree model for SJ 
tm.rf_model(X_train_sj, y_train_sj, X_test_sj, y_test_sj)


    RandomForestRegressor with params: {}
    Evaluation metrics:
        RMSE train: 9.986098805958408
        RMSE test: 28.485665945097097
        MAE train: 5.469720062208398
        MAE test: 16.687202797202797 
    


(9.986098805958408, 28.485665945097097, 5.469720062208398, 16.687202797202797)

In [12]:
# XG boost model for iq
X, y = X_iq, y_iq
tm.xg_model(X, y)

Unnamed: 0,TSS iteration,rmse_test,rmse_train,mae_train,mae_test,learning_rate,n_estimators,max_depth,subsample,colsample_bytree,reg_lambda
0,1,8.40499,0.17938,0.111742,5.465261,0.1,100,5,1.0,1.0,2
1,1,8.396832,0.00964,0.00631,5.456236,0.1,200,5,1.0,1.0,2
2,2,11.305093,0.740652,0.509354,6.922478,0.1,100,5,1.0,1.0,2
3,2,11.288674,0.171332,0.113673,6.928529,0.1,200,5,1.0,1.0,2


In [None]:
# XG boost model for sj
X, y = X_sj, y_sj
tm.xg_model(X, y)

## Optimise models

## Predict on test data with chosen model and write file

In [None]:
# Code to run pipeline on dataset including test_features, and then take only test_features to run the final model
# Merge features and target data
data = pp.merge_data(train_features, train_target, test_features, inc_test=True)

# Run processing and split by city
train_iq = pp.pre_process(data, 'iq', remove_anomalies=True, inc_test=True)
train_sj = pp.pre_process(data, 'sj', remove_anomalies=True, inc_test=True)
assert train_iq.isnull().any().any() == False
assert train_sj.isnull().any().any() == False

# Run feature engineering 
train_iq = fe.cyclical_encode_date(train_iq)
train_sj = fe.cyclical_encode_date(train_sj)
train_iq = fe.drop_date(train_iq)
train_sj = fe.drop_date(train_sj)

# Split into to final test_features DataFrames
test_iq = train_iq.loc[train_iq['total_cases'] < 0, train_iq.columns != 'total_cases']
X_iq = train_iq.loc[train_iq['total_cases'] >= 0, train_iq.columns != 'total_cases']
y_iq = train_iq.loc[train_iq['total_cases'] >= 0, train_iq.columns == 'total_cases']
test_sj = train_sj.loc[train_sj['total_cases'] < 0, train_sj.columns != 'total_cases']
X_sj = train_sj.loc[train_sj['total_cases'] >= 0, train_sj.columns != 'total_cases']
y_sj = train_sj.loc[train_sj['total_cases'] >= 0, train_sj.columns == 'total_cases']
assert len(X_iq) == len(y_iq)
assert len(X_sj) == len(y_sj)
assert test_iq.shape[1] == X_iq.shape[1]
assert test_sj.shape[1] == X_sj.shape[1]


In [None]:
# Select data and chosen model and hyperparameters for final prediction

# Iquitos, iq
final_test_iq = test_iq
X_train_iq = X_iq
y_train_iq = y_iq
model_iq = 'RandomForestRegressor'
params_iq = {}

# San Jose, sj
final_test_sj = test_sj
X_train_sj = X_sj
y_train_sj = y_sj
model_sj = 'RandomForestRegressor'
params_sj = {}

In [None]:
# Perform final predictions and reformat for submission
final_iq = fp.final_predict(final_test_iq, X_train_iq, y_train_iq, 
              city='iq', model=model_iq, params=params_iq)
final_sj = fp.final_predict(final_test_sj, X_train_sj, y_train_sj, 
              city='sj', model=model_sj, params=params_sj)

# Merge the two cities into one DataFrame and write to new csv file 
final_comb = fp.write_submission(final_iq, final_sj) 

# 