# **Project Pipeline**

- Read in data
- Process data
- Train model
- Evaluate model
- Predict on test data
- Write file for submission

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import scripts.pre_processing as pp
import scripts.model_evaluation as me
from scripts.model_training import Model
from scripts.tree_model_training import rf_model
from scripts.model_evaluation import regression_evaluation
from sklearn.ensemble import RandomForestRegressor

## Pre-process data

In [None]:
# Load the three raw CSV files (training features, training targets and the
# test features). The tuple unpacking consumes the generator, so the files
# are read in the order they are listed.
train_features, train_target, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/dengue_features_train.csv',
        './data/dengue_labels_train.csv',
        './data/dengue_features_test.csv',
    )
)

In [None]:
# Merge features and target data. inc_test=False is passed here, so the test
# features are presumably left out of the merged frame -- confirm against
# pp.merge_data.
data = pp.merge_data(train_features, train_target, test_features, inc_test=False)

# Run processing and split by city
train_iq = pp.pre_process(data, 'iq')
train_sj = pp.pre_process(data, 'sj')

# Sanity check: no missing values may remain after pre-processing.
# `not <mask>` replaces the `== False` comparison, and each assert carries a
# message so a failure names the offending frame.
assert not train_iq.isnull().any().any(), 'train_iq still contains missing values'
assert not train_sj.isnull().any().any(), 'train_sj still contains missing values'
print(f'train_iq shape: {train_iq.shape}')
print(f'train_sj shape: {train_sj.shape}')

In [None]:
# Apply the feature-engineering step to each city's frame (iq first, then
# sj) and rebind both names to the results.
train_iq, train_sj = map(pp.feature_engineer, (train_iq, train_sj))

In [None]:
# Per-city split into a training part and a cross-validation part; the
# cross-validation part is bound to the *_test_* names. sj is split first.
(X_train_sj, y_train_sj,
 X_test_sj, y_test_sj) = pp.train_cv_split(train_sj, city='sj')
(X_train_iq, y_train_iq,
 X_test_iq, y_test_iq) = pp.train_cv_split(train_iq, city='iq')

# Every feature set must have exactly as many rows as its target, otherwise
# the models cannot be fitted or scored on the pair.
assert all(
    len(features) == len(target)
    for features, target in (
        (X_train_sj, y_train_sj),
        (X_test_sj, y_test_sj),
        (X_train_iq, y_train_iq),
        (X_test_iq, y_test_iq),
    )
)

## Train model

In [None]:
# Mean baseline for city 'sj': predict one constant value for every row.
# NOTE(review): the previous comment called this city "San Jose"; in the
# DengAI data the 'sj' code appears to denote San Juan -- confirm.
#
# The constant is the mean of the *training* targets for both prediction
# vectors. Using the mean of y_test_sj for the held-out rows would leak the
# held-out targets into the baseline and make it look better than it is.
bl_pred_train = np.tile(np.mean(y_train_sj), len(y_train_sj))
bl_pred_test = np.tile(np.mean(y_train_sj), len(y_test_sj))
regression_evaluation(y_train_sj, y_test_sj, bl_pred_train, bl_pred_test)

In [None]:
# Mean baseline for Iquitos (iq): predict one constant value for every row.
#
# The constant is the mean of the *training* targets for both prediction
# vectors. Using the mean of y_test_iq for the held-out rows would leak the
# held-out targets into the baseline and make it look better than it is.
bl_pred_train = np.tile(np.mean(y_train_iq), len(y_train_iq))
bl_pred_test = np.tile(np.mean(y_train_iq), len(y_test_iq))
regression_evaluation(y_train_iq, y_test_iq, bl_pred_train, bl_pred_test)

In [None]:
# Run the tree-model helper on the iq split: training features and targets
# first, then the held-out features and targets.
rf_model(
    X_train_iq,
    y_train_iq,
    X_test_iq,
    y_test_iq,
)

In [None]:
# Run the tree-model helper on the sj split: training features and targets
# first, then the held-out features and targets.
rf_model(
    X_train_sj,
    y_train_sj,
    X_test_sj,
    y_test_sj,
)

## Optimise models

## Predict on test data with chosen model and write file

In [None]:
### NOTE this cell works, but input data is currently not correct:
### the rows scored below come from train_iq / train_sj (with the target
### column dropped), not from test_features, so the written file does not
### yet hold predictions for the test data.
### NOTE(review): predictions are written exactly as the regressor returns
### them (no rounding) -- confirm the submission format accepts that.


def _city_submission(city, city_frame, X_train, y_train):
    """Fit a fresh random forest for one city and build its submission rows.

    Args:
        city: City code written to the ``city`` column (e.g. ``'iq'``).
        city_frame: Frame to score. It must contain the ``total_cases``,
            ``year`` and ``weekofyear`` columns; ``total_cases`` is dropped
            before predicting.
        X_train: Features the model is fitted on.
        y_train: Targets the model is fitted on.

    Returns:
        A ``(submission, predictions, fitted_model)`` tuple. ``submission``
        has the columns ``city``, ``year``, ``weekofyear`` and
        ``total_cases``, in that order.
    """
    features = city_frame.drop(['total_cases'], axis=1)
    forest = RandomForestRegressor()
    forest.fit(X_train, y_train)
    predictions = forest.predict(features)

    # Copy the key columns so the assignments below write to an independent
    # frame instead of to a slice of `features`.
    submission = features.loc[:, ['year', 'weekofyear']].copy()
    submission['city'] = city
    submission['total_cases'] = predictions.tolist()
    submission = submission.loc[:, ['city', 'year', 'weekofyear', 'total_cases']]
    return submission, predictions, forest


# For Iquitos (iq)
final_test_iq, final_preds_iq, model = _city_submission(
    'iq', train_iq, X_train_iq, y_train_iq
)

# For city 'sj'
# NOTE(review): the previous comment called this city "San Jose"; in the
# DengAI data the 'sj' code appears to denote San Juan -- confirm.
# `model` is rebound here, so it ends up holding the sj model.
final_test_sj, final_preds_sj, model = _city_submission(
    'sj', train_sj, X_train_sj, y_train_sj
)

# Concatenate the two (sj rows first, then iq) for submission
final = pd.concat([final_test_sj, final_test_iq], axis=0)

# Write to csv file
final.to_csv('for_submission.csv', index=False)

# 