In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
import datetime
import math
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
train_features = pd.read_csv('dengue_features_train.csv',
                             index_col=[0,1,2])
train_labels = pd.read_csv('dengue_labels_train.csv',
                          index_col=[0,1,2])

In [3]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [4]:
def init_weights(shape):

    """ Weight initialization """

    weights = tf.random_normal(shape, stddev=0.1)

    return tf.Variable(weights)

In [11]:
def forwardprop(X, w_1, w_2, w_3):

    """

    Forward-propagation.

    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.

    """

    h    = tf.nn.relu(tf.matmul(X, w_1))  # The \sigma function
    h2    = tf.nn.relu(tf.matmul(h, w_2))

    yhat = tf.matmul(h2, w_3)  # The \varphi function

    return yhat

In [6]:
def preprocess_data(data, labels):
    
    df = data
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c', 
                 'station_min_temp_c']
    df = df[features]
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)


    dfl = labels
    
    
    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']
    
    sjlabs = dfl.loc['sj']
    iqlabs = dfl.loc['iq']

    # Prepend the column of 1s for bias
    sjN, sjM  = sjfeats.shape
    sjall_X = np.ones((sjN, sjM + 1))
    sjall_X[:, 1:] = sjfeats
    
    iqN, iqM  = iqfeats.shape
    iqall_X = np.ones((iqN, iqM + 1))
    iqall_X[:, 1:] = iqfeats
    
    sjlabs = sjlabs.as_matrix()
    
    return sjall_X, iqall_X, sjlabs, iqlabs

In [7]:
sj_train, iq_train, sj_target, iq_target = preprocess_data(train_features, train_labels)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [8]:
sjx_train, sjx_test, sjy_train, sjy_test = train_test_split(sj_train, sj_target, 
                                                                    test_size=0.2, random_state=42)

iqx_train, iqx_test, iqy_train, iqy_test = train_test_split(iq_train, iq_target, 
                                                                    test_size=0.2, random_state=42)

In [9]:
sj_x_size = sjx_train.shape[1]

iq_x_size = iqx_train.shape[1]             

y_size = 1
h_size = 256               

In [10]:
sj_X = tf.placeholder("float", shape=[None, sj_x_size])
iq_X = tf.placeholder("float", shape=[None, iq_x_size])
y = tf.placeholder("float", shape=[None, y_size])

In [12]:
sj_w_1 = init_weights((sj_x_size, h_size))
iq_w_1 = init_weights((iq_x_size, h_size))
w_2 = init_weights((h_size, h_size))
w_3 = init_weights((h_size, y_size))

In [13]:
sj_yhat = forwardprop(sj_X, sj_w_1, w_2, w_3)
sj_predict = tf.to_int64(sj_yhat)

In [14]:
iq_yhat = forwardprop(iq_X, iq_w_1, w_2, w_3)
iq_predict = tf.to_int64(iq_yhat)

In [15]:
sj_cost    = tf.losses.mean_squared_error(labels=y, predictions=sj_yhat)
sj_updates = tf.train.GradientDescentOptimizer(0.0000001).minimize(sj_cost)

In [16]:
iq_cost    = tf.losses.mean_squared_error(labels=y, predictions=iq_yhat)
iq_updates = tf.train.GradientDescentOptimizer(0.0000001).minimize(iq_cost)

In [17]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [18]:
epochs = 100
for epoch in range(epochs):
        # Train with each example
        for i in range(len(sjx_train)):
            sess.run(sj_updates, feed_dict={sj_X: sjx_train[i: i + 1], y: sjy_train[i: i + 1]})

        sj_train_accuracy = metrics.mean_absolute_error(sjy_train,
                                 sess.run(sj_predict, feed_dict={sj_X: sjx_train}))
        sj_test_accuracy  = metrics.mean_absolute_error(sjy_test,
                                 sess.run(sj_predict, feed_dict={sj_X: sjx_test}))

        if epoch % 10 == 0 or epoch == epochs-1:
            print("Epoch = %d, train loss = %.2f, test loss = %.2f"
                  % (epoch + 1, sj_train_accuracy, sj_test_accuracy))

Epoch = 1, train loss = 27.54, test loss = 28.53
Epoch = 11, train loss = 26.79, test loss = 27.74
Epoch = 21, train loss = 26.43, test loss = 27.38
Epoch = 31, train loss = 26.20, test loss = 27.16
Epoch = 41, train loss = 26.14, test loss = 27.05
Epoch = 51, train loss = 26.08, test loss = 27.04
Epoch = 61, train loss = 26.05, test loss = 26.99
Epoch = 71, train loss = 26.04, test loss = 27.01
Epoch = 81, train loss = 26.03, test loss = 26.98
Epoch = 91, train loss = 26.00, test loss = 26.96
Epoch = 100, train loss = 26.03, test loss = 26.95


In [19]:
for epoch in range(epochs):
        # Train with each example
        for i in range(len(iqx_train)):
            sess.run(iq_updates, feed_dict={iq_X: iqx_train[i: i + 1], y: iqy_train[i: i + 1]})

        iq_train_accuracy = metrics.mean_absolute_error(iqy_train,
                                 sess.run(iq_predict, feed_dict={iq_X: iqx_train}))
        iq_test_accuracy  = metrics.mean_absolute_error(iqy_test,
                                 sess.run(iq_predict, feed_dict={iq_X: iqx_test}))

        if epoch % 10 == 0 or epoch == epochs-1:
            print("Epoch = %d, train loss = %.2f, test loss = %.2f"
                  % (epoch + 1, iq_train_accuracy, iq_test_accuracy))

Epoch = 1, train loss = 6.36, test loss = 5.75
Epoch = 11, train loss = 6.34, test loss = 5.77
Epoch = 21, train loss = 6.33, test loss = 5.78
Epoch = 31, train loss = 6.33, test loss = 5.79
Epoch = 41, train loss = 6.32, test loss = 5.78
Epoch = 51, train loss = 6.31, test loss = 5.79
Epoch = 61, train loss = 6.31, test loss = 5.81
Epoch = 71, train loss = 6.30, test loss = 5.79
Epoch = 81, train loss = 6.29, test loss = 5.77
Epoch = 91, train loss = 6.29, test loss = 5.77
Epoch = 100, train loss = 6.29, test loss = 5.77


In [23]:
test_features = pd.read_csv('dengue_features_test.csv',
                             index_col=[0,1,2])

def preprocess_data_test(data):
    
    df = data
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 
                'reanalysis_dew_point_temp_k',
                'station_avg_temp_c', 
                'station_min_temp_c']
    df = df[features]
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)

    
    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']


    # Prepend the column of 1s for bias
    sjN, sjM  = sjfeats.shape
    sjall_X = np.ones((sjN, sjM + 1))
    sjall_X[:, 1:] = sjfeats
    
    iqN, iqM  = iqfeats.shape
    iqall_X = np.ones((iqN, iqM + 1))
    iqall_X[:, 1:] = iqfeats
    
    return sjall_X, iqall_X

In [24]:
sj_test, iq_test = preprocess_data_test(test_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [25]:
final_predict_sj = sess.run(sj_predict, feed_dict={sj_X: sj_test})

In [26]:
final_predict_iq = sess.run(iq_predict, feed_dict={iq_X: iq_test})


In [27]:
submission = pd.read_csv("submission_format.csv",
                         index_col=[0, 1, 2])

submission.total_cases = np.concatenate([final_predict_sj, final_predict_iq])
submission.to_csv("submission_MLP.csv")

In [28]:
submission.head

<bound method NDFrame.head of                       total_cases
city year weekofyear             
sj   2008 18                   29
          19                   29
          20                   31
          21                   31
          22                   31
          23                   32
          24                   31
          25                   32
          26                   32
          27                   32
          28                   31
          29                   33
          30                   32
          31                   33
          32                   32
          33                   33
          34                   33
          35                   33
          36                   33
          37                   32
          38                   32
          39                   31
          40                   31
          41                   32
          42                   31
          43                   31
          44      