In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [2]:
train_filename_iq = ( './datasets/dengue_test_iq.pkl' )
dengue_test_iq = pd.read_pickle( train_filename_iq )

test_filename_sj = ( './datasets/dengue_test_sj.pkl' )
dengue_test_sj = pd.read_pickle( test_filename_sj )

submission_sj = pd.DataFrame( dengue_test_sj, columns = ['city'])
submission_iq = pd.DataFrame( dengue_test_iq, columns = ['city'])
submission_sj.reset_index( inplace = True)
submission_sj['year'] = submission_sj['week_start_date'].dt.year
submission_sj['weekofyear'] = submission_sj['week_start_date'].dt.weekofyear
submission_iq.reset_index( inplace = True)
submission_iq['year'] = submission_iq['week_start_date'].dt.year
submission_iq['weekofyear'] = submission_iq['week_start_date'].dt.weekofyear
submission = submission_sj.append( submission_iq )
submission.drop( columns = ['week_start_date'], inplace = True )
sub_sj = pd.DataFrame()
sub_iq = pd.DataFrame()

In [3]:
for month in range( 1, 13 ):
    train_filename = ( './datasets/train_sj_month_' + str( month ) + '.pkl' )
    test_filename  = ( './datasets/test_sj_month_' + str( month ) + '.pkl' )
    dengue_train_sj_month = pd.read_pickle( train_filename )
    dengue_test_sj_month  = pd.read_pickle( test_filename )
    X = pd.DataFrame( dengue_train_sj_month )
    X.drop( columns = ['city','year','total_cases','month'], inplace = True )
    y = dengue_train_sj_month.total_cases
    X_sj_Full_train, X_sj_Full_test, Y_sj_Full_train, Y_sj_Full_test = train_test_split( X, y, shuffle = False)
    param_grid = { 
        "n_estimators"      : [160,220,150],
        "max_features"      : ["sqrt"],
        "min_samples_split" : [10,12,18],
        "bootstrap"         : [True],
        "max_depth"         : [2,4,5,8]
        }
    estimator = RandomForestRegressor( criterion='mae', oob_score=True)
    rf_est = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5, verbose=0).fit( X_sj_Full_train, Y_sj_Full_train)
    Y_sj_test_pred = rf_est.best_estimator_.predict(X_sj_Full_test).astype(int)
    print ("San Juan Test  MAE error :", mean_absolute_error(Y_sj_test_pred, Y_sj_Full_test))
    Y_sj_train_pred = rf_est.best_estimator_.predict(X_sj_Full_train).astype(int)
    print ("San Juan Train MAE error :", mean_absolute_error(Y_sj_train_pred, Y_sj_Full_train))
    X_test = pd.DataFrame( dengue_test_sj_month, columns = X.columns )
    submit_pred_sj = pd.DataFrame( dengue_test_sj_month, columns = ['city','week_start_date'])
    submit_pred_sj['total_cases'] = rf_est.best_estimator_.predict(X_test).astype( 'int')
    sub_sj = sub_sj.append( submit_pred_sj )
sub_sj = sub_sj.drop( columns=['week_start_date'])
sub_sj.reset_index( inplace = True )
sub_sj['year'] = sub_sj['week_start_date'].dt.year
sub_sj['weekofyear'] = sub_sj['week_start_date'].dt.weekofyear
sub_sj.drop( columns = ['week_start_date'], inplace = True )



San Juan Test  MAE error : 17.0
San Juan Train MAE error : 12.73134328358209




San Juan Test  MAE error : 13.666666666666666
San Juan Train MAE error : 9.037037037037036




San Juan Test  MAE error : 8.11111111111111
San Juan Train MAE error : 6.351851851851852




San Juan Test  MAE error : 7.391304347826087
San Juan Train MAE error : 4.924242424242424




San Juan Test  MAE error : 8.0
San Juan Train MAE error : 5.153846153846154




San Juan Test  MAE error : 12.222222222222221
San Juan Train MAE error : 7.407407407407407




San Juan Test  MAE error : 30.26086956521739
San Juan Train MAE error : 14.925373134328359




San Juan Test  MAE error : 33.72222222222222
San Juan Train MAE error : 23.22222222222222




San Juan Test  MAE error : 38.421052631578945
San Juan Train MAE error : 22.350877192982455




San Juan Test  MAE error : 29.818181818181817
San Juan Train MAE error : 26.734375




San Juan Test  MAE error : 36.22222222222222
San Juan Train MAE error : 27.22222222222222
San Juan Test  MAE error : 24.38888888888889
San Juan Train MAE error : 18.462962962962962




In [4]:
for month in range( 1, 13 ):
    train_filename = ( './datasets/train_iq_month_' + str( month ) + '.pkl' )
    test_filename  = ( './datasets/test_iq_month_' + str( month ) + '.pkl' )
    dengue_train_iq_month = pd.read_pickle( train_filename )
    dengue_test_iq_month  = pd.read_pickle( test_filename )
    X = pd.DataFrame( dengue_train_iq_month )
    X.drop( columns = ['city','year','total_cases','month'], inplace = True )
    y = dengue_train_iq_month.total_cases
    X_iq_train, X_iq_test, Y_iq_train, Y_iq_test = train_test_split( X, y, shuffle = False)
    param_grid = { 
        "n_estimators"      : [160,220,100],
        "max_features"      : ["sqrt"],
        "min_samples_split" : [10,16,20],
        "bootstrap"         : [True],
        "max_depth"         : [2,4,5,8]
        }
    estimator = RandomForestRegressor( criterion='mae', oob_score=True)
    rf_est = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5, verbose=0).fit( X_iq_train, Y_iq_train )
    Y_iq_pred = rf_est.best_estimator_.predict(X_iq_test).astype(int)
    print ("Iquitos Test  MAE error :", mean_absolute_error(Y_iq_pred, Y_iq_test))
    pred_train_iq = rf_est.best_estimator_.predict(X_iq_train).astype(int)
    print ("Iquitos Train MAE error :", mean_absolute_error(pred_train_iq, Y_iq_train))
    X_test = pd.DataFrame( dengue_test_iq_month, columns = X.columns )
    submit_pred_iq = pd.DataFrame( dengue_test_iq_month, columns = ['city', 'week_start_date'])
    submit_pred_iq['total_cases'] = rf_est.best_estimator_.predict(X_test).astype( 'int')
    sub_iq = sub_iq.append( submit_pred_iq )
sub_iq = sub_iq.drop( columns=['week_start_date'])
sub_iq.reset_index( inplace = True )
sub_iq['year'] = sub_iq['week_start_date'].dt.year
sub_iq['weekofyear'] = sub_iq['week_start_date'].dt.weekofyear
sub_iq.drop( columns = ['week_start_date'], inplace = True )



Iquitos Test  MAE error : 11.461538461538462
Iquitos Train MAE error : 4.972972972972973
Iquitos Test  MAE error : 11.8
Iquitos Train MAE error : 6.633333333333334
Iquitos Test  MAE error : 4.0
Iquitos Train MAE error : 3.7333333333333334




Iquitos Test  MAE error : 2.1538461538461537
Iquitos Train MAE error : 2.4054054054054053
Iquitos Test  MAE error : 2.3
Iquitos Train MAE error : 2.433333333333333
Iquitos Test  MAE error : 1.8
Iquitos Train MAE error : 2.2333333333333334




Iquitos Test  MAE error : 1.9166666666666667
Iquitos Train MAE error : 1.4411764705882353
Iquitos Test  MAE error : 2.5
Iquitos Train MAE error : 1.3




Iquitos Test  MAE error : 11.0
Iquitos Train MAE error : 3.1875
Iquitos Test  MAE error : 17.916666666666668
Iquitos Train MAE error : 4.285714285714286
Iquitos Test  MAE error : 7.0
Iquitos Train MAE error : 3.2666666666666666
Iquitos Test  MAE error : 5.5
Iquitos Train MAE error : 11.633333333333333


In [5]:
submit = sub_sj.append( sub_iq, sort=True )

In [6]:
submission_RF = pd.merge( submission, submit, on=['city','year','weekofyear'], how='left' )

In [7]:
submission_RF.columns = ['city','year','weekofyear','total_cases']
submission_RF.to_excel("data/submission_month_RF.xlsx", index = False)

In [8]:
submission_RF.to_csv("data/submission_month_RF.csv", index = False)