# AutoEncoder in Keras

## 1. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd

import urllib.request


import keras
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import Dense


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


## 2. Hyperparameters

In [2]:
# HYPER-PARAMETERS
# -- Model / optimisation --
NUM_HIDDEN_UNITS = [32, 4, 32]  # units of the hidden Dense layers FC_1, FC_2, FC_3 (in that order)
EPOCHS = 100                    # number of epochs passed to model.fit
# -- Run control --
MODE = 'train'       # 'train' fits the model and saves weights; 'test' loads the saved weights
SAVE_DIR = 'models'  # root directory that holds one sub-directory per run
RUN_NAME = 'run02'   # sub-directory of SAVE_DIR for this run (weights, TensorBoard logs, metadata)

In [3]:
# Create directory to save model
# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the directory creation.
model_dir = os.path.join(SAVE_DIR, RUN_NAME)
os.makedirs(model_dir, exist_ok=True)

## 3. Load the data

In [4]:
def download_data(url, filename):
    """
    Fetch a remote file and store it on disk.
    :param url: address of the file to download
    :param filename: local path the downloaded content is written to
    :return: None
    """
    urllib.request.urlretrieve(url=url, filename=filename)
    

def load_data(filename, class_col='class', rm_nan_by_axis=0):
    """
    Read an Excel sheet and split it into a feature matrix and integer labels.
    The first column of the sheet is used as the row index. Entries with
    missing values are dropped before the split.
    :param filename: name of xls file
    :param class_col: name of the column holding the class label
    :param rm_nan_by_axis: axis along which missing values are dropped, row=0, column=1
    :return: X: array built from every column except the last four,
             y: integer category codes of the class column
    """
    table = pd.read_excel(filename, index_col=0).dropna(axis=rm_nan_by_axis)

    # The last four columns are excluded from the feature matrix.
    feature_columns = table.columns[0:-4]
    features = table[feature_columns].values

    # Map each distinct class value to an integer code.
    class_as_category = table[class_col].astype('category')
    label_codes = class_as_category.cat.codes.values

    return features, label_codes

In [5]:
# LOAD DATA
# Mice Protein Expression dataset from the UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls'
file_name = 'Data_Cortex_Nuclear.xls'
# Download only when there is no local copy, so re-running the notebook does
# not hit the network again and still works offline. Delete the file to force
# a fresh download.
if not os.path.exists(file_name):
    download_data(url, file_name)

# Load the dataset
X, y = load_data(file_name)

# Dataset dimensions; y holds integer class codes starting at 0, so the
# number of classes is the largest code plus one.
num_samples, num_features = X.shape
num_classes = np.max(y) + 1

## 4. Preprocessing

In [6]:
# One-hot encode
# The encoder expects a 2-D input, hence the added trailing axis on y.
onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
y_onehot = onehot_encoder.fit_transform(np.expand_dims(y, axis=1))

# Train-Test split
# A fixed seed makes the split reproducible across kernel restarts. Without it,
# a later run with MODE = 'test' would draw a different split than the one used
# for training, so X_test could contain samples the saved weights were fit on.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=.3, random_state=SPLIT_SEED)

## 5. Create the model (i.e. Graph)

In [7]:
# BUILD MODEL
# Fully-connected auto-encoder: three ReLU hidden layers sized by
# NUM_HIDDEN_UNITS, followed by an output layer with one unit per input feature.
autoencoder_layers = [
    Dense(NUM_HIDDEN_UNITS[0], activation='relu', name='FC_1',
          input_shape=(num_features,)),
    Dense(NUM_HIDDEN_UNITS[1], activation='relu', name='FC_2'),
    Dense(NUM_HIDDEN_UNITS[2], activation='relu', name='FC_3'),
    Dense(num_features, name='output'),
]
model = Sequential(autoencoder_layers)

# Reconstruction is scored with mean squared error; mean absolute error is
# reported as an additional metric.
model.compile(
    optimizer=keras.optimizers.Adadelta(),
    loss=keras.losses.mse,
    metrics=['mae'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
FC_1 (Dense)                 (None, 32)                2496      
_________________________________________________________________
FC_2 (Dense)                 (None, 4)                 132       
_________________________________________________________________
FC_3 (Dense)                 (None, 32)                160       
_________________________________________________________________
output (Dense)               (None, 77)                2541      
Total params: 5,329
Trainable params: 5,329
Non-trainable params: 0
_________________________________________________________________


### Create metadata for TensorBoard 

In [8]:
def write_metadata(filename,indices, labels):
    """
    Write a tab-separated metadata file with one row per (index, label) pair.
    The file starts with an "Index<TAB>Label" header line. Pairs are formed
    with zip, so surplus entries of the longer input are ignored.
    :param filename: name of the file to save on disk
    :param indices: iterable of sample indices
    :param labels: iterable of labels, paired position-wise with indices
    """
    with open(filename, 'w') as f:
        f.write("Index\tLabel\n")
        f.writelines(
            "{}\t{}\n".format(sample_index, sample_label)
            for sample_index, sample_label in zip(indices, labels)
        )

In [9]:
# TensorBoard
# Save class labels to disk to color data points in TensorBoard accordingly.
# load_data() drops rows with missing values before building X and y, so the
# index re-read from the file must be filtered the same way. Otherwise the
# labels are paired with the wrong sample indices.
index = pd.read_excel(file_name, index_col=0).dropna(axis=0).index
assert len(index) == len(y), "metadata index and labels must describe the same rows"
write_metadata(os.path.join(model_dir, 'metadata.tsv'), index, y)
# Create tensorboard callback
# Embeddings are taken from the FC_2 layer for every sample in X; the metadata
# file written above is referenced by its name inside the log directory.
tensorboard = TensorBoard(log_dir=model_dir,
                          embeddings_freq=1,
                          embeddings_layer_names=['FC_2'],
                          embeddings_metadata='metadata.tsv',
                          embeddings_data=X)


Instructions for updating:
Use the retry module or similar alternatives.


## 6. Train

In [10]:
if MODE == 'train':
    # The file name (spelling included) must match the one loaded in the test cell.
    weights_path = os.path.join(model_dir, 'wieghts.h5')

    # Auto-encoder training: X_train is both the input and the target.
    model.fit(
        X_train,
        X_train,
        epochs=EPOCHS,
        batch_size=32,
        validation_split=0.2,
        callbacks=[tensorboard],
    )

    model.save_weights(weights_path)

## 7. Test

In [11]:
if MODE == 'test':
    # Restore the weights written by the training cell (same file name, spelling included).
    weights_path = os.path.join(model_dir, 'wieghts.h5')
    model.load_weights(weights_path)
    model.layers