In [1]:
import tensorflow as tf

import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [2]:
tf.__version__

'1.8.0'

In [3]:
# Creates a graph.
a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print(sess.run(c))

[[22. 28.]
 [49. 64.]]


In [4]:
import sys
sys.path.insert(0, '.')
import super_pool

In [5]:
import os
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K
from keras import metrics


import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from math import pi

import seaborn as sns
%matplotlib inline 

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import sklearn.preprocessing
from scipy.special import erfinv
from sklearn.model_selection import train_test_split
from tqdm import tqdm

PATH="../input"

def rank_gauss(x):
    """
    First step is to assign a linspace to the sorted features from 0..1, then apply 
    the inverse of error function ErfInv to shape them like gaussians, then I 
    substract the mean. Binary features are not touched with this trafo (eg. 1-hot ones). 
    This works usually much better than standard mean/std scaler or min/max.    
    """
    
    # x is numpy vector
    N = x.shape[0]
    temp = x.argsort()
    rank_x = temp.argsort() / N
    efi_x = erfinv(rank_x)
    efi_x -= efi_x.mean()
    return efi_x


def load():
    print('reading csvs')
    df_train = pd.read_csv(os.path.join(PATH, "train.csv"))
    df_test = pd.read_csv(os.path.join(PATH, "test.csv"))
    dtypes = df_train.dtypes.to_dict()
    print(f"train shape: {df_train.shape}, test shape: {df_test.shape}")

    train_target = df_train['target'].values

    ntrain = df_train.shape[0]
    ntest  = df_test.shape[0]

    ignored_columns = ['ID', 'target']
    feature_columns = [c for c in df_train.columns if c not in ignored_columns]    
    categorical_columns = []    
    
    # concatencate everything train + test...
    df_all = pd.concat([df_train[feature_columns], df_test[feature_columns]]).astype(np.float32)
    
    dtype_counter = Counter()
    unique_counter = Counter()
    for c in feature_columns:
        dtype_str = str(dtypes[c])
        dtype_counter[dtype_str] += 1
        unique = len(set(df_train[c].values))
        unique_counter[unique] += 1
        if unique == 1:
            ignored_columns.append(c)
            
    # 
    # use ohe for categoricals and 
    # rank_gauss for the rest.
    #
    # df_all = pd.get_dummies(df_all, prefix='ohe_', columns=categorical_columns, sparse=True)

    print(f"categoricals: {len(categorical_columns)}")
    print(f"train+test shape after ohe'ing {df_all.shape}")
            
    not_categorical = [x for x in feature_columns 
                       if x not in (categorical_columns) and (x not in ignored_columns)]
    
    print(f"ignored columns: {len(ignored_columns)}")
    print(f"not categorical: {len(not_categorical)}")
    
    p = super_pool.SuperPool()
    out = p.map(rank_gauss, [df_all[c] for c in feature_columns])
    df_all = pd.concat(out, axis=1)
    print(df_all.shape)
        
    p.exit()
    
    #for c in not_categorical:
    #for c in tqdm(feature_columns): # TODO
    #    df_all[c] = rank_gauss(df_all[c].values)
                    
    # print('label encoding...')
    # for c in categorical_columns:
    #    df_all[c] = LabelEncoder.fit_transform(df_all[c])
        
    df_all = df_all[[c for c in df_all.columns if c not in ignored_columns]]
    train_data = df_all.iloc[0:ntrain, :]
    test_data  = df_all.iloc[ntrain:, :]

    assert train_data.shape[0] == ntrain
    assert test_data.shape[0]  == ntest

    return train_target, categorical_columns, train_data, test_data



Using TensorFlow backend.


In [21]:
train_target, categorical_columns, df_train, df_test = load()

reading csvs
train shape: (4459, 4993), test shape: (49342, 4992)
categoricals: 0
train+test shape after ohe'ing (53801, 4991)
ignored columns: 258
not categorical: 4735


8 CPUs: 100%|██████████| 4991/4991 [00:27<00:00, 182.18it/s]


(53801, 4991)


In [78]:
train_target.shape, len(categorical_columns), df_train.shape, df_test.shape, len(df_train.columns), len(df_test.columns)

((4459,), 0, (4459, 4735), (49342, 4735), 4735, 4735)

In [79]:
df_test.values[0,:] #, np.sum(df_test.values)

array([0.24367585, 0.24434055, 0.24004841, ..., 0.24146532, 0.23124167,
       0.22580724])

In [24]:
import pickle
with open("xx.pkl", 'wb') as f:
    pickle.dump([train_target, df_train, df_test, categorical_columns], f)

In [6]:
import pickle
with open('xx.pkl', 'rb') as f:
    train_target, df_train, df_test, categorical_columns = pickle.load(f)

In [43]:
print(np.mean(df_all.values), np.sum(df_all.values))

-4.983484e-11 -0.0126953125


In [7]:
df_all = pd.concat([df_train, df_test]).astype(np.float32)
train_idx, val_idx = train_test_split(range(df_all.shape[0]), test_size=0.2, random_state=0, shuffle=True)


In [82]:
len(train_idx), len(val_idx), df_all.shape

(43040, 10761, (53801, 4735))

In [89]:
import keras
import random

class MyGenerator(keras.utils.Sequence):
    def __init__(self, categorical_columns, df, df_y=None, batch_size=128, shuffle=True):
        self.categorical_columns = categorical_columns
        self.X = df
        self.y = df_y
        self.batch_size = batch_size
        self.shuffle =shuffle
        self.idxs = None
        self.on_epoch_end()        
        
    def __len__(self):
        # number of batches per epoch
        return int(np.floor(self.X.shape[0] / self.batch_size))
    
    def _swap_noise(self, x):
        shift = np.random.choice(x.shape[0], size=1)
        out = np.copy(x)
        x2 = np.roll(x, shift, axis=0)
        # cols to randomizr        
        for r in range(x.shape[0]):
            cols = np.random.choice(x.shape[1], int(0.15 * x.shape[1]), replace=False)
            for i in range(3):
                out[r, cols] = x2[r, cols]
                if np.random.sample() < 0.5:
                    x2 = np.roll(x, shift, axis=0)
                    out[r, cols] = x2[r, cols]
        return out
    
    def __getitem__(self, idx):
        # generate one batch
        batch_x = self.X.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]                
        #return batch_x, batch_x
        return self._swap_noise(batch_x), batch_x
    
    def on_epoch_end(self):
        self.idxs = np.arange(self.X.shape[0])
        if self.shuffle:
            np.random.shuffle(self.idxs)
            

In [90]:
train_g = MyGenerator(categorical_columns, df_all.iloc[train_idx])
val_g = MyGenerator(categorical_columns, df_all.iloc[val_idx])

In [46]:
# todo -- ohe categoricals
# and modify original_dim


In [58]:
def vae1():
    batch_size = 128
    original_dim = 4735
    latent_dim = 32
    intermediate_dim = 128
    epochs = 50
    epsilon_std = 1.0

    x = Input(batch_shape=(batch_size, original_dim))
    h = Dense(intermediate_dim, activation='relu')(x)
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)

    def sampling(args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0., stddev=epsilon_std)
        return z_mean + K.exp(z_log_var / 2) * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

    # we instantiate these layers separately so as to reuse them later
    decoder_h = Dense(intermediate_dim, activation='relu')
    decoder_mean = Dense(original_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)


    def vae_loss(x, x_decoded_mean):
        xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return xent_loss + kl_loss


    vae = Model(x, x_decoded_mean)
    vae.compile(optimizer='rmsprop', loss=vae_loss)
    return vae


In [85]:
def dae1(original_dim):
    #original_dim = 4735
    latent_dim = 512
    #intermediate_dim = 128        
    
    inputs = Input(shape=(original_dim,))
    encoded = Dense(latent_dim, activation='relu')(inputs)
    #encoded = Dense(512, activation='relu')(encoded)
    #decoded = Dense(512, activation='relu')(encoded)
    x = encoded
    decoded = Dense(original_dim, activation='linear')(x)
    ae = Model(inputs, decoded)
    ae.compile(optimizer='Adam', loss='mse')
    return ae


In [86]:
df_all.shape

(53801, 4735)

In [91]:
dae = dae1(df_all.shape[1])
dae.fit_generator(generator=train_g,
                  validation_data=val_g,
                  epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2b8d782860>

In [96]:
for l in dae.layers:
    print(l.name, l.input_shape)

input_9 (None, 4735)
dense_37 (None, 4735)
dense_38 (None, 512)


In [99]:
def dae2(original_dim, l1):
    #original_dim = 4735
    latent_dim = 512
    #intermediate_dim = 128        
    
    inputs = Input(shape=(original_dim,))
    
    encoded = l1(inputs)
    encoded = Dense(512, activation='relu')(encoded)
    
    x = encoded
    decoded = Dense(original_dim, activation='linear')(x)
    ae = Model(inputs, decoded)
    ae.compile(optimizer='Adam', loss='mse')
    return ae


In [101]:
l1 = dae.get_layer('dense_37')
l1.trainable = False

d2 = dae2(df_all.shape[1], l1)

In [102]:

d2.fit_generator(generator=train_g,
                  validation_data=val_g,
                  epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2f29768da0>

In [103]:
for l in d2.layers:
    print(l.name, l.input_shape)
    
    

input_11 (None, 4735)
dense_37 (None, 4735)
dense_39 (None, 512)
dense_40 (None, 512)


In [105]:
l1.name, l1.input_shape

('dense_37', (None, 4735))

In [109]:
l2 = d2.get_layer('dense_39')
l2.trainable = False


In [111]:
def dae3(original_dim, l1, l2):
    #original_dim = 4735
    latent_dim = 512
    #intermediate_dim = 128        
    
    inputs = Input(shape=(original_dim,))
    
    encoded = l1(inputs)
    encoded = l2(encoded)
    encoded = Dense(512, activation='relu')(encoded)
    
    x = encoded
    decoded = Dense(original_dim, activation='linear')(x)
    ae = Model(inputs, decoded)
    ae.compile(optimizer='Adam', loss='mse')
    return ae


d3 = dae3(df_all.shape[1], l1, l2)

In [112]:
d3.fit_generator(generator=train_g,
                  validation_data=val_g,
                  epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2f293c1d68>

In [114]:
for l in d3.layers:
    print(l.name)

input_12
dense_37
dense_39
dense_41
dense_42


In [115]:
d3.save('d3.h5')

In [8]:
from keras.models import load_model

d3 = load_model('d3.h5')

In [10]:
for l in d3.layers:
    print(l, l.input_shape)

<keras.engine.topology.InputLayer object at 0x7f66ab368710> (None, 4735)
<keras.layers.core.Dense object at 0x7f66ab3688d0> (None, 4735)
<keras.layers.core.Dense object at 0x7f66ab368a20> (None, 512)
<keras.layers.core.Dense object at 0x7f66ab368898> (None, 512)
<keras.layers.core.Dense object at 0x7f66ab368b38> (None, 512)


In [116]:
l3 = d3.get_layer('dense_41')
l3.trainable = False

In [133]:
def dae4(original_dim, l1, l2, l3):
    #original_dim = 4735    
    #intermediate_dim = 128        
    
    inputs = Input(shape=(original_dim,))
    
    encoded = l1(inputs)
    encoded = l2(encoded)
    encoded = l3(encoded)
    
    x = encoded
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    ae = Model(inputs, x)
    ae.compile(optimizer='Adam', loss='mse')
    return ae


model = dae4(df_all.shape[1], l1, l2, l3)

In [149]:
df_train2 = df_all.iloc[:df_train.shape[0]]
df_target2 = np.log1p(train_target)
df_target2 /= np.mean(df_target2)

#train/test split.
train_idx2, val_idx2 = train_test_split(range(df_train2.shape[0]), test_size=0.05, random_state=0, shuffle=True)


In [150]:
df_train2.iloc[train_idx2].shape, df_target2[train_idx2].shape, df_train2.iloc[val_idx2].shape, df_target2[val_idx2].shape

((4236, 4735), (4236,), (223, 4735), (223,))

In [151]:
model.fit(df_train2.iloc[train_idx2], df_target2[train_idx2],
         validation_data=(df_train2.iloc[val_idx2], df_target2[val_idx2]),
         epochs=20)

Train on 4236 samples, validate on 223 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2f1d0344a8>

In [152]:
 #df_train2.iloc[val_idx2].shape, df_target2[val_idx2].shape
pred_val = model.predict(df_train2.iloc[val_idx2])

In [158]:
from sklearn.metrics import mean_squared_error

mean_squared_error(df_target2[val_idx2]*14.9, pred_val*14.9)

2.78563305762031

In [11]:
m2 = Model(input=d3.input, outputs=d3.layers[-2].output)
p = m2.predict(df_all, verbose=1)



In [12]:
with open('out.pkl', 'wb') as f:
    pickle.dump(p, f)

In [37]:
if False:
    # train the VAE on MNIST digits
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
    x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

    vae.fit(x_train, x_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_test, x_test))

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
  491520/11490434 [>.............................] - ETA: 1:51

KeyboardInterrupt: 

In [None]:
if False:
    # build a model to project inputs on the latent space
    encoder = Model(x, z_mean)

    # display a 2D plot of the digit classes in the latent space
    x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
    plt.figure(figsize=(6, 6))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
    plt.colorbar()
    plt.show()

    # build a digit generator that can sample from the learned distribution
    decoder_input = Input(shape=(latent_dim,))
    _h_decoded = decoder_h(decoder_input)
    _x_decoded_mean = decoder_mean(_h_decoded)
    generator = Model(decoder_input, _x_decoded_mean)

    # display a 2D manifold of the digits
    n = 15  # figure with 15x15 digits
    digit_size = 28
    figure = np.zeros((digit_size * n, digit_size * n))
    # linearly spaced coordinates on the unit square were transformed through the inverse CDF (ppf) of the Gaussian
    # to produce values of the latent variables z, since the prior of the latent space is Gaussian
    grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
    grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

    for i, yi in enumerate(grid_x):
        for j, xi in enumerate(grid_y):
            z_sample = np.array([[xi, yi]])
            x_decoded = generator.predict(z_sample)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[i * digit_size: (i + 1) * digit_size,
                   j * digit_size: (j + 1) * digit_size] = digit

    plt.figure(figsize=(10, 10))
    plt.imshow(figure, cmap='Greys_r')
    plt.show()