# **Project Pipeline**

| **Steps**                                              | **Script files**                          |
|-----------------------------------------------------------|-------------------------------------------|
| 1) Read and pre-process data                              | pre_processing.py                         |
| 2) Feature engineering                                    | feature_engineering.py                    |
| 3) Train models                                           | model_training.py, <br>tree_model_training.py |
| 4) Predict on test_features <br>and write submission file | final_predict.py                          |

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scripts.pre_processing as pp
import scripts.feature_engineering as fe
import scripts.model_evaluation as me
from scripts.model_training import Model
from scripts.tree_model_training import rf_model
from scripts.model_evaluation import regression_evaluation
import scripts.final_predict as fp
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit


## Pre-process data

In [5]:
# Load the three competition CSVs: training features, training labels
# and the (unlabelled) test features.
train_features, train_target, test_features = map(
    pd.read_csv,
    (
        './data/dengue_features_train.csv',
        './data/dengue_labels_train.csv',
        './data/dengue_features_test.csv',
    ),
)

In [6]:
# Merge features and target data
# (inc_test=False: presumably leaves test_features out of the merged frame —
# confirm in scripts/pre_processing.py)
data = pp.merge_data(train_features, train_target, test_features, inc_test=False)

# Run processing and split by city
train_iq = pp.pre_process(data, 'iq', remove_anomalies=True)
train_sj = pp.pre_process(data, 'sj', remove_anomalies=True)

# Run checks for missing values: fail loudly, naming the frame, if any NaN
# survived pre-processing.
assert not train_iq.isnull().any().any(), 'train_iq contains missing values'
assert not train_sj.isnull().any().any(), 'train_sj contains missing values'
print(f'train_iq shape: {train_iq.shape}')
print(f'train_sj shape: {train_sj.shape}')

train_iq shape: (520, 24)
train_sj shape: (928, 24)


In [None]:
# Box plot of total_cases for Iquitos, keeping only rows with fewer than 80
# cases. The San Juan variant is left disabled below.
# plt.boxplot(train_sj['total_cases'])
d = train_iq[train_iq['total_cases'] < 80]
plt.boxplot(d['total_cases'])


In [None]:
# Apply the first feature-engineering pass to each city's frame (iq, then sj)
train_iq, train_sj = (
    fe.feature_engineer_1(train_iq),
    fe.feature_engineer_1(train_sj),
)

In [None]:
# Per-city split into a training part and a cross-validation part
(X_train_sj, y_train_sj,
 X_test_sj, y_test_sj) = pp.train_cv_split(train_sj, city='sj')
(X_train_iq, y_train_iq,
 X_test_iq, y_test_iq) = pp.train_cv_split(train_iq, city='iq')

# Every feature frame must have exactly as many rows as its target
for features, target in (
    (X_train_sj, y_train_sj),
    (X_test_sj, y_test_sj),
    (X_train_iq, y_train_iq),
    (X_test_iq, y_test_iq),
):
    assert len(features) == len(target)

In [16]:
# Alternative split: sklearn's TimeSeriesSplit with three splits
tss = TimeSeriesSplit(n_splits=3)

# Separate the target column from the feature columns for each city
y_iq = train_iq['total_cases']
X_iq = train_iq.drop('total_cases', axis=1)
y_sj = train_sj['total_cases']
X_sj = train_sj.drop('total_cases', axis=1)

# Walk the Iquitos folds and report the shape of each train/test pair
for train_index, test_index in tss.split(X_iq):
    X_train = X_iq.iloc[train_index]
    X_test = X_iq.iloc[test_index]
    y_train = y_iq.iloc[train_index]
    y_test = y_iq.iloc[test_index]
    print(X_train.shape, X_test.shape)

(130, 23) (130, 23)
(260, 23) (130, 23)
(390, 23) (130, 23)


## Train model

In [None]:
# Baseline model predictions for San Juan (sj)
# The baseline always predicts the mean of the *training* target. The same
# constant is used for the hold-out set so that none of the hold-out labels
# leak into the predictions they are scored against.
bl_pred_train = np.full(len(y_train_sj), np.mean(y_train_sj))
bl_pred_test = np.full(len(y_test_sj), np.mean(y_train_sj))
regression_evaluation(y_train_sj, y_test_sj, bl_pred_train, bl_pred_test)

In [None]:
# Baseline model predictions for Iquitos (iq)
# The baseline always predicts the mean of the *training* target. The same
# constant is used for the hold-out set so that none of the hold-out labels
# leak into the predictions they are scored against.
bl_pred_train = np.full(len(y_train_iq), np.mean(y_train_iq))
bl_pred_test = np.full(len(y_test_iq), np.mean(y_train_iq))
regression_evaluation(y_train_iq, y_test_iq, bl_pred_train, bl_pred_test)

In [None]:
# Tree model for IQ
# Calls rf_model (imported from scripts.tree_model_training) on the Iquitos
# training and cross-validation splits; presumably fits a random forest and
# reports its scores — confirm in tree_model_training.py.
rf_model(X_train_iq, y_train_iq, X_test_iq, y_test_iq)

In [None]:
# Tree model for SJ 
# Calls rf_model (imported from scripts.tree_model_training) on the sj
# training and cross-validation splits; presumably fits a random forest and
# reports its scores — confirm in tree_model_training.py.
rf_model(X_train_sj, y_train_sj, X_test_sj, y_test_sj)

## Optimise models

## Predict on test data with chosen model and write file

In [None]:
# TODO: run the pipeline on the dataset including test_features, then keep only the test_features rows for the final model

In [None]:
# Select data and chosen model and hyperparameters for final prediction
#
# The training splits (X_train_*, y_train_*) are reused exactly as produced by
# pp.train_cv_split earlier in the notebook, so they are not re-assigned here.
#
# NOTE(review): final_test_* is built from the *training* frames rather than
# from test_features, so the final predictions appear to be made on training
# rows. TODO: confirm, and swap in the processed test_features once the
# pipeline supports them.

# Iquitos, iq
final_test_iq = train_iq.drop(['total_cases'], axis=1)
model_iq = 'RandomForestRegressor'
params_iq = {}  # empty dict: no hyperparameters chosen here

# San Juan, sj
final_test_sj = train_sj.drop(['total_cases'], axis=1)
model_sj = 'RandomForestRegressor'
params_sj = {}  # empty dict: no hyperparameters chosen here

In [None]:
# Produce the final predictions for each city with its selected model
final_iq = fp.final_predict(
    final_test_iq,
    X_train_iq,
    y_train_iq,
    city='iq',
    model=model_iq,
    params=params_iq,
)
final_sj = fp.final_predict(
    final_test_sj,
    X_train_sj,
    y_train_sj,
    city='sj',
    model=model_sj,
    params=params_sj,
)

# Hand both cities' predictions to write_submission, which writes the
# submission file
fp.write_submission(final_iq, final_sj)

# 