# **Project Pipeline**

| **Steps**                                              | **Script files**                          |
|-----------------------------------------------------------|-------------------------------------------|
| 1) Read in data                                           | pre_processing.py                         |
| 2) Process data                                           | feature_engineering.py                    |
| 3) Train models                                           | model_training.py, <br>tree_model_training.py |
| 4) Predict on test_features <br>and write submission file | final_predict.py                          |

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scripts.pre_processing as pp
import scripts.feature_engineering as fe
import scripts.model_evaluation as me
from scripts.model_training import Model
from scripts.tree_model_training import rf_model
from scripts.model_evaluation import regression_evaluation
import scripts.final_predict as fp
from sklearn.ensemble import RandomForestRegressor

## Pre-process data

In [2]:
# Read in data files 
train_features = pd.read_csv('./data/dengue_features_train.csv')
train_target = pd.read_csv('./data/dengue_labels_train.csv')
test_features = pd.read_csv('./data/dengue_features_test.csv')

In [3]:
# Merge features and target data
data = pp.merge_data(train_features, train_target, test_features, inc_test=False)

# Run processing and split by city
train_iq = pp.pre_process(data, 'iq')
train_sj = pp.pre_process(data, 'sj')

# Run checks for missing values
assert train_iq.isnull().any().any() == False
assert train_sj.isnull().any().any() == False
print(f'train_iq shape: {train_iq.shape}') 
print(f'train_sj shape: {train_sj.shape}') 

train_iq shape: (520, 24)
train_sj shape: (936, 24)


In [4]:
# Run feature engineering 
train_iq = fe.feature_engineer_1(train_iq)
train_sj = fe.feature_engineer_1(train_sj)

In [5]:
# Split into training and cross-validation sets
X_train_sj, y_train_sj, X_test_sj, y_test_sj = pp.train_cv_split(train_sj, city='sj')
X_train_iq, y_train_iq, X_test_iq, y_test_iq = pp.train_cv_split(train_iq, city='iq')

# Check compatible sizes for models:
assert len(X_train_sj) == len(y_train_sj)
assert len(X_test_sj) == len(y_test_sj)
assert len(X_train_iq) == len(y_train_iq)
assert len(X_test_iq) == len(y_test_iq)

## Train model

In [6]:
# Baseline model predictions for San Jose (sj)
bl_pred_train = np.tile(np.mean(y_train_sj), len(y_train_sj))
bl_pred_test = np.tile(np.mean(y_test_sj), len(y_test_sj))
regression_evaluation(y_train_sj, y_test_sj, bl_pred_train, bl_pred_test)


    Evaluation metrics:
        RMSE train: 58.13035344509564
        RMSE test: 26.424758032157015
        MAE train: 32.27655432620499
        MAE test: 16.795393417771038 
    


(58.13035344509564, 26.424758032157015, 32.27655432620499, 16.795393417771038)

In [7]:
# Baseline model predictions for Iquitos (Iq)
bl_pred_train = np.tile(np.mean(y_train_iq), len(y_train_iq))
bl_pred_test = np.tile(np.mean(y_test_iq), len(y_test_iq))
regression_evaluation(y_train_iq, y_test_iq, bl_pred_train, bl_pred_test)


    Evaluation metrics:
        RMSE train: 10.755121939289861
        RMSE test: 10.755121939289861
        MAE train: 6.684008875739645
        MAE test: 6.684008875739645 
    


(10.755121939289861, 10.755121939289861, 6.684008875739645, 6.684008875739645)

In [8]:
# Tree model for IQ
rf_model(X_train_iq, y_train_iq, X_test_iq, y_test_iq)


    RandomForestRegressor with params: {}
    Evaluation metrics:
        RMSE train: 3.5985061804544998
        RMSE test: 3.5985061804544998
        MAE train: 1.994442307692308
        MAE test: 1.994442307692308 
    


(3.5985061804544998, 3.5985061804544998, 1.994442307692308, 1.994442307692308)

In [9]:
# Tree model for SJ 
rf_model(X_train_sj, y_train_sj, X_test_sj, y_test_sj)


    RandomForestRegressor with params: {}
    Evaluation metrics:
        RMSE train: 10.20866373270142
        RMSE test: 30.94373533194896
        MAE train: 5.716082949308756
        MAE test: 18.368356643356645 
    


(10.20866373270142, 30.94373533194896, 5.716082949308756, 18.368356643356645)

## Optimise models

## Predict on test data with chosen model and write file

In [None]:
# Code to run pipeline on dataset including test_features, and then take only test_features to run the final model

In [10]:
# Select data and chosen model and hyperparameters for final prediction

# Iquitos, iq
final_test_iq = train_iq.drop(['total_cases'], axis=1)
X_train_iq = X_train_iq
y_train_iq = y_train_iq
model_iq = 'RandomForestRegressor'
params_iq = {}

# San Jose, sj
final_test_sj = train_sj.drop(['total_cases'], axis=1)
X_train_sj = X_train_sj
y_train_sj = y_train_sj
model_sj = 'RandomForestRegressor'
params_sj = {}

In [11]:
# Perform final predictions and reformat for submission
final_iq = fp.final_predict(final_test_iq, X_train_iq, y_train_iq, 
              city='iq', model=model_iq, params=params_iq)
final_sj = fp.final_predict(final_test_sj, X_train_sj, y_train_sj, 
              city='sj', model=model_sj, params=params_sj)

# Merge the two cities into one DataFrame and write to new csv file 
fp.write_submission(final_iq, final_sj) 

Writing submission file to folder: 


Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,6.13
1,sj,1990,19,5.84
2,sj,1990,20,4.24
3,sj,1990,21,3.47
4,sj,1990,22,22.36
...,...,...,...,...
1451,iq,2010,21,4.91
1452,iq,2010,22,8.10
1453,iq,2010,23,2.39
1454,iq,2010,24,2.54


# 