In [63]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
import datetime
import math
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [64]:
train_features = pd.read_csv('dengue_features_train.csv',
                             index_col=[0,1,2])
train_labels = pd.read_csv('dengue_labels_train.csv',
                          index_col=[0,1,2])

In [65]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [66]:
def init_weights(shape):

    """ Weight initialization """

    weights = tf.random_normal(shape, stddev=0.1)

    return tf.Variable(weights)

In [67]:
def forwardprop(X, w_1, w_2, w_3):

    """

    Forward-propagation.

    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.

    """

    h    = tf.nn.sigmoid(tf.matmul(X, w_1))  # The \sigma function
    h2    = tf.nn.sigmoid(tf.matmul(h, w_2))

    yhat = tf.matmul(h2, w_3)  # The \varphi function

    return yhat

In [102]:
def preprocess_data(data, labels):
    
    df = data
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c', 
                 'station_min_temp_c']
    df = df[features]
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)


    dfl = labels
    
    
    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']
    
    sjlabs = dfl.loc['sj']
    iqlabs = dfl.loc['iq']

    # Prepend the column of 1s for bias
    sjN, sjM  = sjfeats.shape
    sjall_X = np.ones((sjN, sjM + 1))
    sjall_X[:, 1:] = sjfeats
    
    iqN, iqM  = iqfeats.shape
    iqall_X = np.ones((iqN, iqM + 1))
    iqall_X[:, 1:] = iqfeats
    
    sjlabs = sjlabs.as_matrix()
    
    return sjall_X, iqall_X, sjlabs, iqlabs

In [103]:
sj_train, iq_train, sj_target, iq_target = preprocess_data(train_features, train_labels)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [104]:
sjx_train, sjx_test, sjy_train, sjy_test = train_test_split(sj_train, sj_target, 
                                                                    test_size=0.2, random_state=42)

iqx_train, iqx_test, iqy_train, iqy_test = train_test_split(iq_train, iq_target, 
                                                                    test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [260, 936]

In [71]:
sj_x_size = sjx_train.shape[1]

iq_x_size = iqx_train.shape[1]             

y_size = 1
h_size = 256               

In [72]:
sj_X = tf.placeholder("float", shape=[None, sj_x_size])
iq_X = tf.placeholder("float", shape=[None, iq_x_size])
y = tf.placeholder("float", shape=[None, y_size])

In [73]:
sj_w_1 = init_weights((sj_x_size, h_size))
iq_w_1 = init_weights((iq_x_size, h_size))
w_2 = init_weights((h_size, h_size))
w_3 = init_weights((h_size, y_size))

In [74]:
sj_yhat = forwardprop(sj_X, sj_w_1, w_2, w_3)
sj_predict = tf.to_int64(sj_yhat)

In [75]:
iq_yhat = forwardprop(iq_X, iq_w_1, w_2, w_3)
iq_predict = tf.to_int64(iq_yhat)

In [76]:
sj_cost    = tf.losses.mean_squared_error(labels=y, predictions=sj_yhat)
sj_updates = tf.train.GradientDescentOptimizer(0.0000001).minimize(sj_cost)

In [77]:
iq_cost    = tf.losses.mean_squared_error(labels=y, predictions=iq_yhat)
iq_updates = tf.train.GradientDescentOptimizer(0.0000001).minimize(iq_cost)

In [78]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [82]:
epochs = 100
for epoch in range(epochs):
        # Train with each example
        for i in range(len(sjx_train)):
            sess.run(sj_updates, feed_dict={sj_X: sjx_train[i: i + 1], y: sjy_train[i: i + 1]})

        sj_train_accuracy = metrics.mean_absolute_error(sjy_train,
                                 sess.run(sj_predict, feed_dict={sj_X: sjx_train}))
        sj_test_accuracy  = metrics.mean_absolute_error(sjy_test,
                                 sess.run(sj_predict, feed_dict={sj_X: sjx_test}))

        if epoch % 10 == 0 or epoch == epochs-1:
            print("Epoch = %d, train loss = %.2f, test loss = %.2f"
                  % (epoch + 1, sj_train_accuracy, sj_test_accuracy))

Epoch = 1, train loss = 31.45, test loss = 30.74
Epoch = 11, train loss = 28.41, test loss = 27.85
Epoch = 21, train loss = 26.36, test loss = 25.95
Epoch = 31, train loss = 25.35, test loss = 25.10
Epoch = 41, train loss = 24.78, test loss = 24.78
Epoch = 51, train loss = 24.68, test loss = 24.95
Epoch = 61, train loss = 24.80, test loss = 25.24
Epoch = 71, train loss = 25.05, test loss = 25.67
Epoch = 81, train loss = 25.42, test loss = 26.18
Epoch = 91, train loss = 25.91, test loss = 26.78
Epoch = 100, train loss = 26.20, test loss = 27.12


In [84]:
for epoch in range(epochs):
        # Train with each example
        for i in range(len(iqx_train)):
            sess.run(iq_updates, feed_dict={iq_X: iqx_train[i: i + 1], y: iqy_train[i: i + 1]})

        iq_train_accuracy = metrics.mean_absolute_error(iqy_train,
                                 sess.run(iq_predict, feed_dict={iq_X: iqx_train}))
        iq_test_accuracy  = metrics.mean_absolute_error(iqy_test,
                                 sess.run(iq_predict, feed_dict={iq_X: iqx_test}))

        if epoch % 10 == 0 or epoch == epochs-1:
            print("Epoch = %d, train loss = %.2f, test loss = %.2f"
                  % (epoch + 1, iq_train_accuracy, iq_test_accuracy))

Epoch = 1, train loss = 17.40, test loss = 17.62
Epoch = 11, train loss = 15.74, test loss = 15.89
Epoch = 21, train loss = 14.94, test loss = 15.05
Epoch = 31, train loss = 13.36, test loss = 13.39
Epoch = 41, train loss = 12.60, test loss = 12.62
Epoch = 51, train loss = 11.85, test loss = 11.86
Epoch = 61, train loss = 11.14, test loss = 11.09
Epoch = 71, train loss = 11.14, test loss = 11.09
Epoch = 81, train loss = 10.44, test loss = 10.32
Epoch = 91, train loss = 9.75, test loss = 9.55
Epoch = 100, train loss = 9.75, test loss = 9.55


In [86]:
test_features = pd.read_csv('dengue_features_test.csv',
                             index_col=[0,1,2])

def preprocess_data_test(data):
    
    df = data
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 'reanalysis_dew_point_temp_k',
                'station_avg_temp_c', 'station_min_temp_c',
                'reanalysis_precip_amt_kg_per_m2', 'reanalysis_relative_humidity_percent', 
                'reanalysis_sat_precip_amt_mm']
    df = df[features]
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)

    
    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']


    # Prepend the column of 1s for bias
    sjN, sjM  = sjfeats.shape
    sjall_X = np.ones((sjN, sjM + 1))
    sjall_X[:, 1:] = sjfeats
    
    iqN, iqM  = iqfeats.shape
    iqall_X = np.ones((iqN, iqM + 1))
    iqall_X[:, 1:] = iqfeats
    
    return sjall_X, iqall_X

In [87]:
sj_test, iq_test = preprocess_data_test(test_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [88]:
final_predict_sj = sess.run(sj_predict, feed_dict={sj_X: sj_test})

In [89]:
final_predict_iq = sess.run(iq_predict, feed_dict={iq_X: iq_test})


In [90]:
submission = pd.read_csv("submission_format.csv",
                         index_col=[0, 1, 2])

submission.total_cases = np.concatenate([final_predict_sj, final_predict_iq])
submission.to_csv("submission_MLP.csv")

In [91]:
submission.head

<bound method NDFrame.head of                       total_cases
city year weekofyear             
sj   2008 18                   21
          19                   21
          20                   21
          21                   21
          22                   21
          23                   21
          24                   21
          25                   21
          26                   21
          27                   21
          28                   21
          29                   21
          30                   21
          31                   21
          32                   21
          33                   21
          34                   21
          35                   21
          36                   21
          37                   21
          38                   21
          39                   21
          40                   21
          41                   21
          42                   21
          43                   21
          44      