# Import Libraries

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import argparse
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import set_random_seed

from keras import regularizers
from keras import backend as K
from keras.models import Model
from keras.utils import plot_model
from keras.losses import mse, binary_crossentropy
from keras.layers import Lambda, Input, Dense, Dropout

import pandas as pd
import seaborn as sns
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from keras.objectives import mse
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.regularizers import l1, l2
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from collections import defaultdict

GLOBAL_SEED = 1
LOCAL_SEED = 42

set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
%matplotlib inline
%pylab inline
rcParams['figure.figsize'] = [10, 8]

# Read Data

In [0]:
# Location of the raw input file (read with pandas.read_csv in the next cell).
# NOTE(review): absolute path; adjust to the local data directory before running.
path = '/DataSets/Selected/breast-cancer-wisconsin/wdbc.data'
# Prefix for the generated dataset files.
dest = '/Dataset/wdbc'

In [0]:
from sklearn.utils import shuffle
import pandas as pd

# Reset all three RNGs so this cell is reproducible when run in isolation.
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# Load the header-less CSV. Only '?' and NaN count as missing markers,
# because pandas' default NA strings are switched off.
na_values = {'?', np.nan}
df = pd.read_csv(
    path,
    sep=',',
    header=None,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    na_values=na_values,
    keep_default_na=False,
)
print('Origin dataset:')
print(df.head())

# Treat 'U' as missing, drop rows that are entirely empty, shuffle the rows
# and discard the column labelled 0.
df = df.replace('U', np.nan).dropna(axis='rows', how='all')
df = shuffle(df, random_state=GLOBAL_SEED).drop([0], axis=1)
print(df.head())

# Rename the remaining columns to X0, X1, ... (in file order), then sort the
# columns by their new names.
col_names = list(df)
new_names = {name: 'X' + str(i) for i, name in enumerate(col_names)}
df = df.rename(columns=new_names)
df = df.reindex(sorted(df.columns), axis=1)
print(df.head())

## Handling categorical columns

In [0]:
# Columns that must be handled as categorical. X0 is the first column left
# after the column labelled 0 was dropped (presumably the class label of the
# WDBC file -- confirm against the data description).
cat_cols = ['X0']
df[cat_cols] = df[cat_cols].astype('category')

colnums = len(df.columns)
# Normalise dtypes: text-like columns become 'category', everything else
# becomes float32.
for i in df.columns:
    try:
        if df[i].dtype.name in ('object', 'category'):
            df[i] = df[i].astype('category')
        else:
            df[i] = df[i].astype('float32')
    except (ValueError, TypeError) as err:
        # Best effort: keep the column as-is, but report it instead of
        # hiding the failure behind a bare `except`.
        print('Skipping dtype conversion for column', i, '-', err)
        continue

# From here on only fully observed rows are kept.
df.dropna(axis='rows', how='any', inplace=True)
print(df.head())
print(df.describe())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()

In [0]:
# Snapshot of the complete (no missing values) data used as ground truth.
# Copy instead of aliasing so later in-place edits of `df` cannot alter it.
df_non_null = df.copy()

# Make data become missing

In [0]:
from sklearn.utils import resample

# Fraction of all (row, col) cells to blank out.
prob_missing = 0.5
df_incomplete = df_non_null.copy()
ix = [(row, col) for row in range(df_non_null.shape[0]) for col in range(df_non_null.shape[1])]
# replace=False: draw distinct cells. With the default replace=True the same
# cell can be drawn several times, so fewer than `prob_missing` of the cells
# would actually end up missing.
L = resample(ix, replace=False,
             n_samples=int(prob_missing*len(ix)),
             random_state=LOCAL_SEED)
for row, col in L:
    df_incomplete.iat[row, col] = np.nan

In [0]:
# Per-column dtypes and non-null counts of the artificially masked frame.
df_incomplete.info()

In [0]:
# One-hot encode the categorical columns. Float dummies are requested so the
# NaN markers written below fit the column dtype without an implicit upcast.
missing_encoded = pd.get_dummies(df_incomplete, dtype=float)

# get_dummies turns a missing category into an all-zero row. Put NaN back
# into every dummy column of a source column wherever that source value is
# missing, so the mask survives the encoding.
for col in df_incomplete.columns:
    # Dummy columns are named "<source column>_<category>".
    missing_cols = missing_encoded.columns.str.startswith(str(col) + "_")
    missing_encoded.loc[df_incomplete[col].isnull(), missing_cols] = np.nan

In [0]:
# Preview of the one-hot encoded frame with NaN marking the missing cells.
missing_encoded.head()

In [0]:
# Default width of the widest hidden layer; used as the default `hidden_size`
# constructor argument of the model classes defined below.
hidden_size = 1000
# Number of train/impute epochs passed to each model's train().
n_epochs = 100
# Mini-batch size passed to each model's train().
n_batch_size=1024

In [0]:
def masked_mae(X_true, X_pred, mask):
    """Mean absolute error restricted to the entries selected by ``mask``.

    Parameters
    ----------
    X_true, X_pred : array-like
        Reference and predicted values; both are indexed with ``mask``.
    mask : boolean array
        Selects the entries that contribute to the error.

    Returns
    -------
    float
        Mean of ``|X_true - X_pred|`` over the selected entries.
    """
    selected_true = X_true[mask]
    selected_pred = X_pred[mask]
    abs_errors = np.abs(selected_true - selected_pred)
    return np.mean(abs_errors)

In [0]:
def reverse_encoding(df_test_dummies):
    """Collapse one-hot dummy columns back into one label column per group.

    Every column whose name contains an underscore is treated as a dummy
    column named ``"<group>_<label>"`` (split at the first underscore). For
    each group a new column ``<group>`` is appended that holds, per row, the
    label of the dummy with the largest value; the dummy columns themselves
    are removed.

    Parameters
    ----------
    df_test_dummies : pandas.DataFrame
        Frame containing dummy columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame. Its head is also printed.
    """
    # Group dummy columns by their exact prefix. Matching on the prefix, not
    # on a substring, keeps a column such as "YX0_Z" out of group "X0".
    groups = {}
    for name in list(df_test_dummies):
        if '_' in name:
            prefix = name[:name.index('_')]
            groups.setdefault(prefix, []).append(name)

    for prefix, dummy_names in groups.items():
        # Map "<group>_<label>" -> "<label>" so idxmax returns the bare label.
        labels = {name: name[name.index('_') + 1:] for name in dummy_names}
        # rename() without inplace works on a fresh frame, not on a slice.
        dummies = df_test_dummies[dummy_names].rename(columns=labels)
        df_test_dummies[prefix] = dummies.idxmax(axis=1)
        df_test_dummies.drop(dummy_names, axis=1, inplace=True)
    print(df_test_dummies.head())
    return df_test_dummies

# AutoEncoder with Dropout

In [0]:
class AutoEncoderDropout:
    """Autoencoder imputer whose decoder is sampled with dropout switched on.

    The network input is the data matrix concatenated with its missing-value
    mask (width ``2 * n_dims``); the output has width ``n_dims``. ``train``
    alternates between one training pass and blending the network's
    predictions into the missing cells.

    Parameters
    ----------
    n_dims : int
        Number of feature columns (without the mask half).
    recurrent_weight : float
        Weight of the new prediction when missing cells are updated after
        each epoch; the previous value keeps ``1 - recurrent_weight``.
    optimizer : str or Keras optimizer
        Passed to ``Model.compile``.
    dropout_probability : float
        Rate of every ``Dropout`` layer.
    hidden_activation, output_activation : str
        Activations of the hidden and output ``Dense`` layers.
    init : str
        Kernel initializer of the regularised ``Dense`` layers.
    l1_penalty : float
        Stored on the instance but not used by the current architecture.
    l2_penalty : float
        L2 factor applied to kernels and biases.
    hidden_size : int
        Width of the outer hidden layers; the inner ones use
        ``hidden_size // 4``.
    """

    def __init__(self, 
                 n_dims,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.5,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l1_penalty=1e-3,
                 l2_penalty=1e-3,
                 hidden_size=hidden_size):
        self.n_dims = n_dims
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l1_penalty = l1_penalty
        self.l2_penalty = l2_penalty
        self.hidden_size = hidden_size

    def make_reconstruction_loss(self, n_features):
        """Return a Keras loss that only scores the observed entries.

        The target tensor handed to the loss is ``[X | missing_mask]``: the
        first ``n_features`` columns are values, the rest is the mask
        (1 = missing). Targets and predictions are both multiplied by the
        observed mask before binary cross-entropy is applied.
        """
        def reconstruction_loss(input_and_mask, y_pred):
            X_values = input_and_mask[:, :n_features]
            missing_mask = input_and_mask[:, n_features:]
            observed_mask = 1 - missing_mask
            return binary_crossentropy(y_true=X_values * observed_mask,
                                       y_pred=y_pred * observed_mask)
        return reconstruction_loss

    def _dense(self, units, activation):
        """Build the L2-regularised Dense layer used by encoder and decoder."""
        # `kernel_initializer` is the Keras 2 name of the Keras 1 `init`
        # argument; the rest of this class already uses the Keras 2 API.
        return Dense(units, activation=activation,
                     kernel_initializer=self.init,
                     kernel_regularizer=l2(self.l2_penalty),
                     bias_regularizer=l2(self.l2_penalty))

    def _create_model(self):
        """Build encoder, decoder and the compiled end-to-end model."""
        latent_dim = int(np.ceil(self.n_dims*0.5))

        # Encoder: [X | mask] -> hidden_size -> hidden_size//4 -> latent_dim.
        inputs = Input(shape=(2*self.n_dims, ), name='encoder_input')
        x = inputs
        for units in (self.hidden_size, self.hidden_size//4):
            x = self._dense(units, self.hidden_activation)(x)
            x = Dropout(self.dropout_probability)(x)
        encoded = Dense(latent_dim, name='encoding')(x)
        self.encoder = Model(inputs, encoded, name='encoder')

        # Decoder: latent_dim -> hidden_size//4 -> hidden_size -> n_dims.
        latent_inputs = Input(shape=(latent_dim,), name='decoder_input')
        x = latent_inputs
        for units in (self.hidden_size//4, self.hidden_size):
            x = self._dense(units, self.hidden_activation)(x)
            x = Dropout(self.dropout_probability)(x)
        outputs = self._dense(self.n_dims, self.output_activation)(x)
        self.decoder = Model(latent_inputs, 
                             outputs, 
                             name='decoder')

        outputs = self.decoder(self.encoder(inputs))
        self.model = Model(inputs, outputs, name='ae_mlp')
        loss_function = self.make_reconstruction_loss(self.n_dims)
        self.model.compile(optimizer=self.optimizer, 
                           loss=loss_function)

    def fill(self, data, missing_mask):
        """Overwrite the masked entries of ``data`` with -1 (in place) and return it."""
        data[missing_mask] = -1
        return data

    def _create_missing_mask(self, data):
        """Return a boolean array that is True where ``data`` is NaN."""
        if data.dtype != "f" and data.dtype != "d":
            data = data.astype(float)
        return np.isnan(data)

    def _train_epoch(self, data, missing_mask, batch_size):
        """Run one shuffled pass over the data and return predictions for all rows.

        Input and target of every batch are the same ``[data | mask]`` rows;
        the loss separates values from mask again.
        """
        input_with_mask = np.hstack([data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            self.model.train_on_batch(batch_data, batch_data)
        return self.model.predict(input_with_mask)

    def predict(self, x_test_with_mask):
        """Average 50 decoder passes that are run with learning phase 1.

        The latent code comes from a regular ``encoder.predict`` call; only
        the decoder is evaluated through the learning-phase-1 backend
        function. The result has half the input width, i.e. the value
        columns without the mask.
        """
        predict_stochastic = K.function([self.decoder.layers[0].input,
                                         K.learning_phase()],
                                        [self.decoder.layers[-1].output])
        latent_input = self.encoder.predict(x_test_with_mask)
        out_shape = (x_test_with_mask.shape[0],
                     x_test_with_mask.shape[1]//2)
        outputs = np.array([
            np.array(predict_stochastic([latent_input, 1])).reshape(out_shape)
            for _ in range(50)
        ])
        return np.mean(outputs, axis=0)

    def train(self, x_train, x_test, batch_size=256, train_epochs=100):
        """Fit the model and iteratively impute missing cells of both splits.

        Both arrays are modified in place: NaNs are first replaced by -1 and
        then, after every epoch, moved towards the current prediction with
        weight ``recurrent_weight``. Observed cells are never changed.

        Returns
        -------
        (ndarray, ndarray)
            Copies of the imputed ``x_train`` and ``x_test``.
        """
        missing_mask = self._create_missing_mask(x_train)
        x_train = self.fill(x_train, missing_mask)
        x_test_missing_mask = self._create_missing_mask(x_test) 
        x_test = self.fill(x_test, x_test_missing_mask)

        self._create_model()

        observed_mask = ~missing_mask
        x_test_observed_mask = ~x_test_missing_mask
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(x_train, missing_mask, batch_size)
            # Rebuilt every epoch because x_test is updated at the end of
            # the loop body.
            x_test_with_mask = np.hstack([x_test, x_test_missing_mask])
            X_test_pred = self.predict(x_test_with_mask)
            observed_mae = masked_mae(X_true=x_train,
                                      X_pred=X_pred,
                                      mask=observed_mask)
            test_observed_mae = masked_mae(X_true=x_test,
                                           X_pred=X_test_pred,
                                           mask=x_test_observed_mask)
            if epoch % 50 == 0:
                print("observed mae:", observed_mae)
                print("Test mae:", test_observed_mae)
            # Blend: missing = (1 - w) * previous value + w * prediction.
            old_weight = (1.0 - self.recurrent_weight)
            x_train[missing_mask] *= old_weight
            x_test[x_test_missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            x_test_pred_missing = X_test_pred[x_test_missing_mask]
            x_train[missing_mask] += self.recurrent_weight * pred_missing
            x_test[x_test_missing_mask] += self.recurrent_weight * x_test_pred_missing
        return x_train.copy(), x_test.copy()

In [0]:
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Evaluate AutoEncoderDropout on five train/test splits, one per seed.
seeds = [LOCAL_SEED+2, LOCAL_SEED+1, LOCAL_SEED+4, LOCAL_SEED+6, LOCAL_SEED+8]
# One RMSE per split; summarised in the following cell.
rmses = []
cols = df_non_null.columns
non_null_values = df_non_null.values.copy()
for seed_number in seeds:
    values = missing_encoded.values.copy()
    # Split the masked, one-hot encoded data and the complete data in one
    # call so both receive the same row partition.
    train, test, comp_train, comp_test = train_test_split(values.copy(),
                                                        non_null_values.copy(),
                                                        test_size=0.1,
                                                        random_state=seed_number)
    df_test_complete = pd.DataFrame(columns=cols, 
                                    data=comp_test.copy())
    # The scaler is fitted on the training split only and reused for the test split.
    scaler = MinMaxScaler().fit(train)
    x_train = scaler.transform(train)
    x_test = scaler.transform(test)
    n_dims = x_train.shape[1]
    aedropout = AutoEncoderDropout(n_dims=n_dims)
    # train() returns the imputed (train, test) arrays in scaled space.
    complete_encoded = aedropout.train(x_train.copy(), 
                                    x_test.copy(), 
                                    train_epochs=n_epochs,
                                    batch_size=n_batch_size)
    train_encoded, test_encoded = complete_encoded
    missing_cols = list(missing_encoded)
    # Back to the original value range, then collapse the dummy columns.
    inverse_test_encoded = scaler.inverse_transform(test_encoded)
    df_test_dummies = pd.DataFrame(columns=missing_cols, 
                                   data=inverse_test_encoded)
    df_test_dummies = reverse_encoding(df_test_dummies.copy())
    # Categorical columns are excluded from the RMSE.
    df_test_dummies.drop(cat_cols, axis=1,
                         inplace=True)
    df_test_complete.drop(cat_cols, axis=1,
                          inplace=True)
    true_vals = df_test_complete.values.copy()
    test_vals = df_test_dummies.values.copy()
    # Both matrices are rescaled with the range of the ground-truth test
    # values; the RMSE is taken over every cell of the test split.
    scaler2 = MinMaxScaler().fit(true_vals)
    scaled_true_vales = scaler2.transform(true_vals)
    scaled_test_vales = scaler2.transform(test_vals)
    rmse = math.sqrt(mean_squared_error(scaled_true_vales, 
                                        scaled_test_vales))
    rmses.append(rmse)

In [0]:
# Mean and standard deviation of the per-seed RMSEs collected in the previous cell.
print(np.mean(rmses), np.std(rmses))

# VAE with Monte Carlo Dropout (MCD)

In [0]:
class VAEDropout:
    """VAE-style imputer whose decoder is sampled with dropout switched on.

    The encoder outputs ``z_mean`` and ``z_log_var``; the loss adds a KL term
    built from both to the masked reconstruction loss. The network input is
    the data matrix concatenated with its missing-value mask (width
    ``2 * n_dims``); the output has width ``n_dims``. The model is built in
    the constructor.

    NOTE(review): the decoder is fed ``z_mean`` directly; this class applies
    no sampling step between encoder and decoder -- confirm this is intended.

    Parameters
    ----------
    n_dims : int
        Number of feature columns (without the mask half).
    recurrent_weight : float
        Weight of the new prediction when missing cells are updated after
        each epoch; the previous value keeps ``1 - recurrent_weight``.
    optimizer : str or Keras optimizer
        Passed to ``Model.compile``.
    dropout_probability : float
        Rate of every ``Dropout`` layer.
    hidden_activation, output_activation : str
        Activations of the hidden and output ``Dense`` layers.
    init : str
        Kernel initializer of the regularised ``Dense`` layers.
    l2_penalty : float
        L2 factor applied to kernels and biases.
    hidden_size : int
        Width of the outer hidden layers; the inner ones use
        ``hidden_size // 4``.
    """

    def __init__(self, n_dims,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.5,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l2_penalty=1e-3,
                 hidden_size=hidden_size):
        self.n_dims = n_dims
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l2_penalty = l2_penalty
        self.hidden_size = hidden_size
        self._create_model()

    def _dense(self, units, activation):
        """Build the L2-regularised Dense layer used by encoder and decoder."""
        # `kernel_initializer` is the Keras 2 name of the Keras 1 `init`
        # argument; the rest of this class already uses the Keras 2 API.
        return Dense(units, activation=activation,
                     kernel_initializer=self.init,
                     kernel_regularizer=l2(self.l2_penalty),
                     bias_regularizer=l2(self.l2_penalty))

    def _create_model(self):
        """Build encoder, decoder and the compiled end-to-end model."""
        latent_dim = int(np.ceil(self.n_dims*0.5))

        # Encoder: [X | mask] -> hidden_size -> hidden_size//4 -> (z_mean, z_log_var).
        # The width comes from self.n_dims, not from a module-level `n_dims`,
        # so the class does not depend on notebook state.
        inputs = Input(shape=(2*self.n_dims, ), name='encoder_input')
        x = inputs
        for units in (self.hidden_size, self.hidden_size//4):
            x = self._dense(units, self.hidden_activation)(x)
            x = Dropout(self.dropout_probability)(x)
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)
        self.encoder = Model(inputs, [z_mean, 
                                      z_log_var], name='encoder')

        # Decoder: latent_dim -> hidden_size//4 -> hidden_size -> n_dims.
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = latent_inputs
        for units in (self.hidden_size//4, self.hidden_size):
            x = self._dense(units, self.hidden_activation)(x)
            x = Dropout(self.dropout_probability)(x)
        outputs = self._dense(self.n_dims, self.output_activation)(x)
        self.decoder = Model(latent_inputs, 
                             outputs, 
                             name='decoder')

        # Index 0 selects z_mean from the encoder's two outputs.
        outputs = self.decoder(self.encoder(inputs)[0])
        self.model = Model(inputs, outputs, 
                           name='vae_mlp')
        reconstruction_loss = self.make_vae_reconstruction_loss(self.n_dims, 
                                                                z_mean, 
                                                                z_log_var)
        self.model.compile(optimizer=self.optimizer, 
                           loss=reconstruction_loss)

    def make_vae_reconstruction_loss(self, n_features, z_mean, z_log_var):
        """Return a Keras loss: masked reconstruction term plus KL term.

        The target tensor handed to the loss is ``[X | missing_mask]``: the
        first ``n_features`` columns are values, the rest is the mask
        (1 = missing). The KL term is computed from the ``z_mean`` and
        ``z_log_var`` tensors captured here.
        """
        def reconstruction_loss(input_and_mask, y_pred):
            X_values = input_and_mask[:, :n_features]
            missing_mask = input_and_mask[:, n_features:]
            observed_mask = 1 - missing_mask
            masked_bce = binary_crossentropy(y_true=X_values * observed_mask,
                                             y_pred=y_pred * observed_mask)
            # binary_crossentropy averages over features; rescale by n_features.
            masked_bce *= n_features
            kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
            kl_loss = K.sum(kl_loss, axis=-1)
            kl_loss *= -0.5
            return K.mean(masked_bce + kl_loss)
        return reconstruction_loss

    def fill(self, data, missing_mask):
        """Overwrite the masked entries of ``data`` with -1 (in place) and return it."""
        data[missing_mask] = -1
        return data

    def _create_missing_mask(self, data):
        """Return a boolean array that is True where ``data`` is NaN."""
        if data.dtype != "f" and data.dtype != "d":
            data = data.astype(float)

        return np.isnan(data)

    def predict(self, x_test_with_mask):
        """Average 50 decoder passes that are run with learning phase 1.

        The result has half the input width, i.e. the value columns without
        the mask.
        """
        predict_stochastic = K.function([self.decoder.layers[0].input,
                                         K.learning_phase()],
                                        [self.decoder.layers[-1].output])
        # The encoder has two outputs, so predict() returns
        # [z_mean, z_log_var]. Only z_mean is fed to the decoder, matching
        # how the training model is wired in _create_model.
        latent_input = self.encoder.predict(x_test_with_mask)[0]
        out_shape = (x_test_with_mask.shape[0],
                     x_test_with_mask.shape[1]//2)
        outputs = np.array([
            np.array(predict_stochastic([latent_input, 1])).reshape(out_shape)
            for _ in range(50)
        ])
        return np.mean(outputs, axis=0)

    def _train_epoch(self, data, missing_mask, batch_size):
        """Run one shuffled pass over the data and return predictions for all rows.

        Input and target of every batch are the same ``[data | mask]`` rows;
        the loss separates values from mask again.
        """
        input_with_mask = np.hstack([data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            self.model.train_on_batch(batch_data, batch_data)
        return self.model.predict(input_with_mask)

    def train(self, x_train, x_test, batch_size=256, train_epochs=100):
        """Fit the model and iteratively impute missing cells of both splits.

        Both arrays are modified in place: NaNs are first replaced by -1 and
        then, after every epoch, moved towards the current prediction with
        weight ``recurrent_weight``. Observed cells are never changed.

        Returns
        -------
        (ndarray, ndarray)
            Copies of the imputed ``x_train`` and ``x_test``.
        """
        missing_mask = self._create_missing_mask(x_train)
        x_train = self.fill(x_train, missing_mask)
        x_test_missing_mask = self._create_missing_mask(x_test) 
        x_test = self.fill(x_test, x_test_missing_mask)
        observed_mask = ~missing_mask
        x_test_observed_mask = ~x_test_missing_mask
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(x_train, missing_mask, batch_size)
            # Rebuilt every epoch because x_test is updated at the end of
            # the loop body.
            x_test_with_mask = np.hstack([x_test, x_test_missing_mask])
            X_test_pred = self.predict(x_test_with_mask)
            observed_mae = masked_mae(X_true=x_train,
                                      X_pred=X_pred,
                                      mask=observed_mask)
            test_observed_mae = masked_mae(X_true=x_test,
                                           X_pred=X_test_pred,
                                           mask=x_test_observed_mask)
            if epoch % 50 == 0:
                print("Traing observed mae:", observed_mae)
                print("Test observed mae:", test_observed_mae)
            # Blend: missing = (1 - w) * previous value + w * prediction.
            old_weight = (1.0 - self.recurrent_weight)
            x_train[missing_mask] *= old_weight
            x_test[x_test_missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            x_test_pred_missing = X_test_pred[x_test_missing_mask]
            x_train[missing_mask] += self.recurrent_weight * pred_missing
            x_test[x_test_missing_mask] += self.recurrent_weight*x_test_pred_missing
        return x_train.copy(), x_test.copy()

In [0]:
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Evaluate VAEDropout on five train/test splits, one per seed.
seeds = [LOCAL_SEED+2, LOCAL_SEED+1, LOCAL_SEED+4, LOCAL_SEED+6, LOCAL_SEED+8]
# One RMSE per split; summarised in the following cell.
rmses = []
cols = df_non_null.columns
non_null_values = df_non_null.values.copy()
for seed_number in seeds:
    values = missing_encoded.values.copy()
    # Split the masked, one-hot encoded data and the complete data in one
    # call so both receive the same row partition.
    train, test, comp_train, comp_test = train_test_split(values.copy(),
                                                        non_null_values.copy(),
                                                        test_size=0.1,
                                                        random_state=seed_number)
    df_test_complete = pd.DataFrame(columns=cols, 
                                    data=comp_test.copy())
    # The scaler is fitted on the training split only and reused for the test split.
    scaler = MinMaxScaler().fit(train)
    x_train = scaler.transform(train)
    x_test = scaler.transform(test)
    n_dims = x_train.shape[1]
    # The variable keeps the name `aedropout` although it holds a VAEDropout.
    aedropout = VAEDropout(n_dims=n_dims)
    # train() returns the imputed (train, test) arrays in scaled space.
    complete_encoded = aedropout.train(x_train.copy(), 
                                    x_test.copy(), 
                                    train_epochs=n_epochs,
                                    batch_size=n_batch_size)
    train_encoded, test_encoded = complete_encoded
    missing_cols = list(missing_encoded)
    # Back to the original value range, then collapse the dummy columns.
    inverse_test_encoded = scaler.inverse_transform(test_encoded)
    df_test_dummies = pd.DataFrame(columns=missing_cols, 
                                   data=inverse_test_encoded)
    df_test_dummies = reverse_encoding(df_test_dummies.copy())
    # Categorical columns are excluded from the RMSE.
    df_test_dummies.drop(cat_cols, axis=1,
                         inplace=True)
    df_test_complete.drop(cat_cols, axis=1,
                          inplace=True)
    true_vals = df_test_complete.values.copy()
    test_vals = df_test_dummies.values.copy()
    # Both matrices are rescaled with the range of the ground-truth test
    # values; the RMSE is taken over every cell of the test split.
    scaler2 = MinMaxScaler().fit(true_vals)
    scaled_true_vales = scaler2.transform(true_vals)
    scaled_test_vales = scaler2.transform(test_vals)
    rmse = math.sqrt(mean_squared_error(scaled_true_vales, 
                                        scaled_test_vales))
    rmses.append(rmse)

In [0]:
# Mean and standard deviation of the per-seed RMSEs collected in the previous cell.
print(np.mean(rmses))
print(np.std(rmses))

# AutoEncoder

In [0]:
class Autoencoder:
    """Plain autoencoder imputer (deterministic prediction, no weight penalties).

    The network input is the data matrix concatenated with its missing-value
    mask (width ``2 * n_dims``); the output has width ``n_dims``. ``train``
    alternates between one training pass and blending the network's
    predictions into the missing cells.

    Parameters
    ----------
    n_dims : int
        Number of feature columns (without the mask half).
    recurrent_weight : float
        Weight of the new prediction when missing cells are updated after
        each epoch; the previous value keeps ``1 - recurrent_weight``.
    optimizer : str or Keras optimizer
        Passed to ``Model.compile``.
    dropout_probability : float
        Rate of every ``Dropout`` layer.
    hidden_activation, output_activation : str
        Activations of the hidden and output ``Dense`` layers.
    init : str
        Kernel initializer of the hidden ``Dense`` layers.
    l1_penalty, l2_penalty : float
        Stored on the instance but not used by the current architecture.
    hidden_size : int
        Width of the outer hidden layers; the inner ones use
        ``hidden_size // 4``.
    """

    def __init__(self, n_dims,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.1,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l1_penalty=0,
                 l2_penalty=1e-3,
                 hidden_size=hidden_size):
        self.n_dims = n_dims
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l1_penalty = l1_penalty
        self.l2_penalty = l2_penalty
        self.hidden_size = hidden_size

    def make_reconstruction_loss(self, n_features):
        """Return a Keras loss that only scores the observed entries.

        The target tensor handed to the loss is ``[X | missing_mask]``: the
        first ``n_features`` columns are values, the rest is the mask
        (1 = missing). Targets and predictions are both multiplied by the
        observed mask before binary cross-entropy is applied.
        """
        def reconstruction_loss(input_and_mask, y_pred):
            X_values = input_and_mask[:, :n_features]
            missing_mask = input_and_mask[:, n_features:]
            observed_mask = 1 - missing_mask
            return binary_crossentropy(y_true=X_values * observed_mask,
                                       y_pred=y_pred * observed_mask)
        return reconstruction_loss

    def _hidden(self, units):
        """Build a hidden Dense layer with the configured activation and initializer."""
        # `kernel_initializer` is the Keras 2 name of the Keras 1 `init` argument.
        return Dense(units, activation=self.hidden_activation,
                     kernel_initializer=self.init)

    def _create_model(self):
        """Build encoder, decoder and the compiled end-to-end model."""
        latent_dim = int(np.ceil(self.n_dims*0.5))

        # Encoder: [X | mask] -> hidden_size -> hidden_size//4 -> latent_dim.
        # All widths come from self.n_dims, not from a module-level `n_dims`,
        # so the class does not depend on notebook state.
        inputs = Input(shape=(2*self.n_dims, ), name='encoder_input')
        x = inputs
        for units in (self.hidden_size, self.hidden_size//4):
            x = self._hidden(units)(x)
            x = Dropout(self.dropout_probability)(x)
        encoded = Dense(latent_dim, name='encoding')(x)
        self.encoder = Model(inputs, encoded, name='encoder')

        # Decoder: latent_dim -> hidden_size//4 -> hidden_size -> n_dims.
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = latent_inputs
        for units in (self.hidden_size//4, self.hidden_size):
            x = self._hidden(units)(x)
            x = Dropout(self.dropout_probability)(x)
        outputs = Dense(self.n_dims, activation=self.output_activation)(x)
        self.decoder = Model(latent_inputs, 
                             outputs, 
                             name='decoder')

        outputs = self.decoder(self.encoder(inputs))
        self.model = Model(inputs, outputs, name='vae_mlp')
        loss_function = self.make_reconstruction_loss(self.n_dims)
        self.model.compile(optimizer=self.optimizer, 
                           loss=loss_function)

    def fill(self, data, missing_mask):
        """Overwrite the masked entries of ``data`` with -1 (in place) and return it."""
        data[missing_mask] = -1
        return data

    def _create_missing_mask(self, data):
        """Return a boolean array that is True where ``data`` is NaN."""
        if data.dtype != "f" and data.dtype != "d":
            data = data.astype(float)

        return np.isnan(data)

    def _train_epoch(self, data, missing_mask, batch_size):
        """Run one shuffled pass over the data and return predictions for all rows.

        Input and target of every batch are the same ``[data | mask]`` rows;
        the loss separates values from mask again.
        """
        input_with_mask = np.hstack([data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            self.model.train_on_batch(batch_data, batch_data)
        return self.model.predict(input_with_mask)

    def train(self, x_train, x_test, batch_size=256, train_epochs=100):
        """Fit the model and iteratively impute missing cells of both splits.

        Both arrays are modified in place: NaNs are first replaced by -1 and
        then, after every epoch, moved towards the current prediction with
        weight ``recurrent_weight``. Observed cells are never changed. The
        summaries of encoder, decoder and full model are printed once.

        Returns
        -------
        (ndarray, ndarray)
            Copies of the imputed ``x_train`` and ``x_test``.
        """
        missing_mask = self._create_missing_mask(x_train)
        x_train = self.fill(x_train, missing_mask)
        x_test_missing_mask = self._create_missing_mask(x_test) 
        x_test = self.fill(x_test, x_test_missing_mask)
        self._create_model()
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()
        observed_mask = ~missing_mask
        x_test_observed_mask = ~x_test_missing_mask
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(x_train, missing_mask, batch_size)
            # Rebuilt every epoch because x_test is updated at the end of
            # the loop body.
            x_test_with_mask = np.hstack([x_test, x_test_missing_mask])
            X_test_pred = self.model.predict(x_test_with_mask)
            observed_mae = masked_mae(X_true=x_train,
                                      X_pred=X_pred,
                                      mask=observed_mask)
            test_observed_mae = masked_mae(X_true=x_test,
                                           X_pred=X_test_pred,
                                           mask=x_test_observed_mask)
            if epoch % 50 == 0:
                print("observed mae:", observed_mae)
                print("Test mae:", test_observed_mae)

            # Blend: missing = (1 - w) * previous value + w * prediction.
            old_weight = (1.0 - self.recurrent_weight)
            x_train[missing_mask] *= old_weight
            x_test[x_test_missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            x_test_pred_missing = X_test_pred[x_test_missing_mask]
            x_train[missing_mask] += self.recurrent_weight * pred_missing
            x_test[x_test_missing_mask] += self.recurrent_weight * x_test_pred_missing
        return x_train.copy(), x_test.copy()

In [0]:
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Evaluate Autoencoder on five train/test splits, one per seed.
seeds = [LOCAL_SEED+2, LOCAL_SEED+1, LOCAL_SEED+4, LOCAL_SEED+6, LOCAL_SEED+8]
# One RMSE per split; summarised in the following cell.
rmses = []
cols = df_non_null.columns
non_null_values = df_non_null.values.copy()
for seed_number in seeds:
    values = missing_encoded.values.copy()
    # Split the masked, one-hot encoded data and the complete data in one
    # call so both receive the same row partition.
    train, test, comp_train, comp_test = train_test_split(values.copy(),
                                                        non_null_values.copy(),
                                                        test_size=0.1,
                                                        random_state=seed_number)
    df_test_complete = pd.DataFrame(columns=cols, 
                                    data=comp_test.copy())
    # The scaler is fitted on the training split only and reused for the test split.
    scaler = MinMaxScaler().fit(train)
    x_train = scaler.transform(train)
    x_test = scaler.transform(test)
    n_dims = x_train.shape[1]
    # The variable keeps the name `aedropout` although it holds an Autoencoder.
    aedropout = Autoencoder(n_dims=n_dims)
    # train() returns the imputed (train, test) arrays in scaled space.
    complete_encoded = aedropout.train(x_train.copy(), 
                                    x_test.copy(), 
                                    train_epochs=n_epochs,
                                    batch_size=n_batch_size)
    train_encoded, test_encoded = complete_encoded
    missing_cols = list(missing_encoded)
    # Back to the original value range, then collapse the dummy columns.
    inverse_test_encoded = scaler.inverse_transform(test_encoded)
    df_test_dummies = pd.DataFrame(columns=missing_cols, 
                                   data=inverse_test_encoded)
    df_test_dummies = reverse_encoding(df_test_dummies.copy())
    # Categorical columns are excluded from the RMSE.
    df_test_dummies.drop(cat_cols, axis=1,
                         inplace=True)
    df_test_complete.drop(cat_cols, axis=1,
                          inplace=True)
    true_vals = df_test_complete.values.copy()
    test_vals = df_test_dummies.values.copy()
    # Both matrices are rescaled with the range of the ground-truth test
    # values; the RMSE is taken over every cell of the test split.
    scaler2 = MinMaxScaler().fit(true_vals)
    scaled_true_vales = scaler2.transform(true_vals)
    scaled_test_vales = scaler2.transform(test_vals)
    rmse = math.sqrt(mean_squared_error(scaled_true_vales, 
                                        scaled_test_vales))
    rmses.append(rmse)

In [0]:
# Mean and standard deviation of the per-seed RMSEs collected in the previous cell.
print(np.mean(rmses))
print(np.std(rmses))

# VAE

In [0]:
class VAE:
    """Variational autoencoder used to impute missing entries of a matrix.

    The network input is a row of feature values concatenated with its
    missing-value mask (``2 * n_dims`` columns); the output is a
    reconstruction of the ``n_dims`` feature values. During training the
    missing entries are repeatedly blended with the network's predictions.

    # Arguments
        n_dims (int): number of feature columns.
        recurrent_weight (float): share of the new prediction used when a
            missing entry is updated after each epoch.
        optimizer: optimizer passed to ``Model.compile``.
        dropout_probability (float): rate of every ``Dropout`` layer.
        hidden_activation: activation of the hidden ``Dense`` layers.
        output_activation: activation of the decoder output layer.
        init: kernel initializer of the hidden ``Dense`` layers.
        l2_penalty (float): stored on the instance; no layer in this class
            uses it.
        hidden_size (int or None): width of the widest hidden layer (the
            narrower one has ``hidden_size // 4`` units). When ``None`` the
            module-level ``hidden_size`` variable is used, which is what
            earlier revisions of this class always did.

    # Raises
        ValueError: if ``hidden_size`` is neither passed nor defined at
            module level.
    """

    def __init__(self, n_dims,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.1,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l2_penalty=1e-3,
                 hidden_size=None):
        if hidden_size is None:
            # Backward compatibility: the class used to read a notebook-level
            # `hidden_size` global directly (and failed with NameError when it
            # was missing). Keep honouring that global, but fail clearly.
            hidden_size = globals().get("hidden_size")
            if hidden_size is None:
                raise ValueError(
                    "hidden_size must be passed to VAE(...) or defined as a "
                    "module-level variable")
        self.n_dims = n_dims
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l2_penalty = l2_penalty
        self.hidden_size = hidden_size
        self._create_model()

    def make_reconstruction_loss(self, n_features, z_mean, z_log_var):
        """Build the VAE loss: masked reconstruction error plus KL divergence.

        # Arguments
            n_features (int): number of feature columns; the target tensor
                holds the values in its first ``n_features`` columns and the
                missing mask in the remaining ones.
            z_mean (tensor): mean of Q(z|X).
            z_log_var (tensor): log of variance of Q(z|X).
        # Returns
            A Keras-style loss function ``f(y_true, y_pred)``.
        """
        def reconstruction_loss(input_and_mask, y_pred):
            # The "target" is the network input itself: values, then mask.
            X_values = input_and_mask[:, :n_features]
            missing_mask = input_and_mask[:, n_features:]
            # Zero out missing positions on both sides so that only observed
            # entries can differ between target and prediction.
            observed_mask = 1 - missing_mask
            X_values_observed = X_values * observed_mask
            pred_observed = y_pred * observed_mask
            # Keras' binary_crossentropy averages over the feature axis;
            # scale it back up by the number of features.
            bce = binary_crossentropy(y_true=X_values_observed,
                                      y_pred=pred_observed)
            bce = bce * n_features
            # KL divergence between Q(z|X) and the unit Gaussian prior.
            kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
            kl_loss = K.sum(kl_loss, axis=-1)
            kl_loss *= -0.5
            return K.mean(bce + kl_loss)
        return reconstruction_loss

    def sampling(self, args):
        """Reparameterization trick by sampling from an isotropic unit Gaussian.
        # Arguments
            args (tensor): mean and log of variance of Q(z|X)
        # Returns
            z (tensor): sampled latent vector
        """
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        # by default, random_normal has mean = 0 and std = 1.0
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

    def _create_model(self):
        """Build ``self.encoder``, ``self.decoder`` and the compiled ``self.model``."""
        # The latent space has half as many dimensions as features (rounded up).
        latent_dim = int(np.ceil(self.n_dims * 0.5))
        narrow_size = self.hidden_size // 4

        # Encoder: (values + mask) -> z_mean, z_log_var, sampled z.
        inputs = Input(shape=(2 * self.n_dims,), name='encoder_input')
        x = Dense(self.hidden_size, activation=self.hidden_activation,
                  kernel_initializer=self.init)(inputs)
        x = Dropout(self.dropout_probability)(x)
        x = Dense(narrow_size, activation=self.hidden_activation,
                  kernel_initializer=self.init)(x)
        x = Dropout(self.dropout_probability)(x)
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)
        z = Lambda(self.sampling, output_shape=(latent_dim,),
                   name='z')([z_mean, z_log_var])
        self.encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

        # Decoder: z -> reconstructed feature values (mirror of the encoder).
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(narrow_size, activation=self.hidden_activation,
                  kernel_initializer=self.init)(latent_inputs)
        x = Dropout(self.dropout_probability)(x)
        x = Dense(self.hidden_size, activation=self.hidden_activation,
                  kernel_initializer=self.init)(x)
        x = Dropout(self.dropout_probability)(x)
        outputs = Dense(self.n_dims, activation=self.output_activation)(x)
        self.decoder = Model(latent_inputs, outputs, name='decoder')

        # End-to-end model. The loss uses the z_mean / z_log_var produced by
        # the very encoder call that feeds the decoder, so the KL term and the
        # reconstruction term always describe the same forward pass.
        encoded_mean, encoded_log_var, encoded_z = self.encoder(inputs)
        outputs = self.decoder(encoded_z)
        self.model = Model(inputs, outputs, name='vae_mlp')
        # Use the instance's own feature count rather than a notebook global.
        loss = self.make_reconstruction_loss(self.n_dims,
                                             encoded_mean,
                                             encoded_log_var)
        self.model.compile(optimizer=self.optimizer, loss=loss)

    def fill(self, data, missing_mask):
        """Set the masked entries of ``data`` to -1 in place and return ``data``."""
        data[missing_mask] = -1
        return data

    def _create_missing_mask(self, data):
        """Return a boolean array that is True wherever ``data`` is NaN."""
        # np.isnan needs a float array; anything that is not already
        # float32 / float64 is converted first (the caller's array is untouched).
        if data.dtype != "f" and data.dtype != "d":
            data = data.astype(float)
        return np.isnan(data)

    def _train_epoch(self, data, missing_mask, batch_size):
        """Run one pass over the shuffled data and return predictions for it.

        # Arguments
            data (array): feature values with missing entries already filled.
            missing_mask (array): boolean mask of the missing entries.
            batch_size (int): number of rows per training batch.
        # Returns
            Predictions of ``self.model`` for the rows in their original order.
        """
        input_with_mask = np.hstack([data, missing_mask])
        n_samples = len(input_with_mask)
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_start in range(0, n_samples, batch_size):
            batch_data = X_shuffled[batch_start:batch_start + batch_size, :]
            # The target is the input itself; the loss splits values and mask.
            self.model.train_on_batch(batch_data, batch_data)
        return self.model.predict(input_with_mask)

    def train(self, x_train, x_test, batch_size=256, train_epochs=100):
        """Train the model and impute the missing entries of both matrices.

        NaN entries are first replaced by -1. After every epoch each missing
        entry becomes ``(1 - recurrent_weight) * old + recurrent_weight * pred``.
        Note that ``x_train`` and ``x_test`` are modified in place; pass
        copies if the originals are still needed.

        # Arguments
            x_train (array): training matrix with NaN for missing entries.
            x_test (array): test matrix with NaN for missing entries.
            batch_size (int): number of rows per training batch.
            train_epochs (int): number of epochs.
        # Returns
            Tuple ``(x_train, x_test)`` of copies of the imputed matrices.
        """
        missing_mask = self._create_missing_mask(x_train)
        x_train = self.fill(x_train, missing_mask)
        x_test_missing_mask = self._create_missing_mask(x_test)
        x_test = self.fill(x_test, x_test_missing_mask)
        observed_mask = ~missing_mask
        x_test_observed_mask = ~x_test_missing_mask
        old_weight = 1.0 - self.recurrent_weight
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(x_train, missing_mask, batch_size)
            x_test_with_mask = np.hstack([x_test, x_test_missing_mask])
            X_test_pred = self.model.predict(x_test_with_mask)
            observed_mae = masked_mae(X_true=x_train,
                                      X_pred=X_pred,
                                      mask=observed_mask)
            test_observed_mae = masked_mae(X_true=x_test,
                                           X_pred=X_test_pred,
                                           mask=x_test_observed_mask)
            if epoch % 50 == 0:
                print("Traing observed mae:", observed_mae)
                print("Test observed mae:", test_observed_mae)
            # Blend the previous imputations with this epoch's predictions.
            x_train[missing_mask] *= old_weight
            x_test[x_test_missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            x_test_pred_missing = X_test_pred[x_test_missing_mask]
            x_train[missing_mask] += self.recurrent_weight * pred_missing
            x_test[x_test_missing_mask] += self.recurrent_weight * x_test_pred_missing
        return x_train.copy(), x_test.copy()

In [0]:
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Repeat the VAE imputation experiment on five different train/test splits
# (one per seed) and collect the test-set RMSE of each run in `rmses`.
seeds = [LOCAL_SEED + 2,
         LOCAL_SEED + 1,
         LOCAL_SEED + 4,
         LOCAL_SEED + 6,
         LOCAL_SEED + 8]
rmses = []
cols = df_non_null.columns
non_null_values = df_non_null.values.copy()

for seed_number in seeds:
    # Split the matrix with missing entries and the complete matrix in a
    # single call so both are partitioned with the same seed.
    values = missing_encoded.values.copy()
    train, test, comp_train, comp_test = train_test_split(
        values.copy(),
        non_null_values.copy(),
        test_size=0.1,
        random_state=seed_number,
    )

    # The scaler is fitted on the training partition only and then applied
    # to both partitions.
    scaler = MinMaxScaler().fit(train)
    x_train = scaler.transform(train)
    x_test = scaler.transform(test)
    n_dims = x_train.shape[1]

    # VAE.train overwrites the arrays it receives, so it is given copies and
    # `x_train` / `x_test` keep their scaled values.
    aedropout = VAE(n_dims=n_dims)
    complete_encoded = aedropout.train(
        x_train.copy(),
        x_test.copy(),
        train_epochs=n_epochs,
        batch_size=n_batch_size,
    )
    train_encoded, test_encoded = complete_encoded

    # Undo the scaling of the imputed test rows, undo the encoding, and
    # discard the categorical columns before scoring.
    missing_cols = list(missing_encoded)
    inverse_test_encoded = scaler.inverse_transform(test_encoded)
    df_test_dummies = pd.DataFrame(data=inverse_test_encoded,
                                   columns=missing_cols)
    df_test_dummies = reverse_encoding(df_test_dummies.copy())
    df_test_dummies = df_test_dummies.drop(cat_cols, axis=1)

    # Ground truth for the same test rows, restricted to the same columns.
    df_test_complete = pd.DataFrame(data=comp_test.copy(), columns=cols)
    df_test_complete = df_test_complete.drop(cat_cols, axis=1)

    # Both matrices are rescaled with the min/max of the true test values
    # before the RMSE is computed.
    true_vals = df_test_complete.values.copy()
    test_vals = df_test_dummies.values.copy()
    scaler2 = MinMaxScaler().fit(true_vals)
    scaled_true_vales = scaler2.transform(true_vals)
    scaled_test_vales = scaler2.transform(test_vals)
    rmse = math.sqrt(
        mean_squared_error(scaled_true_vales, scaled_test_vales))
    rmses.append(rmse)

In [0]:
# Report the mean and then the standard deviation of the per-seed RMSEs.
for summarize in (np.mean, np.std):
    print(summarize(rmses))