In [1]:
import sklearn
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import decomposition
import scipy

import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import Input, Dense, Layer, InputSpec
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers, activations, initializers, constraints, Sequential
from keras import backend as K
from keras.constraints import UnitNorm, Constraint

Using TensorFlow backend.


# Generate random multi-dimensional correlated data

**Step 1**. Set the dimension of the data.

We set the dim small to clear understanding.

In [2]:
n_dim = 5

**Step 2.1.** Generate a positive definite symmetric matrix to be used as covariance to generate a random data.

This is a matrix of size n_dim x n_dim.

In [3]:
cov = sklearn.datasets.make_spd_matrix(n_dim, random_state=None)

**Step 2.2.** Generate a vector of mean for generating the random data.

This is an np array of size n_dim.

In [4]:
mu = np.random.normal(0, 0.1, n_dim)

**Step 3**. Generate the random data, `X`.

The number of samples for `X` is set as `n`.

In [5]:
n = 1000

X = np.random.multivariate_normal(mu, cov, n)

**Step 4.** Split the data into train and test.

We split the data into train and test. The test will be used to measure the improvement in Autoencoder after tuning.

In [6]:
X_train, X_test = train_test_split(X, test_size=0.5, random_state=123)

In [7]:
np.cov(X_train.T)

array([[ 1.52742251, -0.5274438 ,  0.23408997,  1.81206222, -0.35162448],
       [-0.5274438 ,  0.55208126, -0.01248374, -0.72039515,  0.10768684],
       [ 0.23408997, -0.01248374,  0.61008193,  0.04866543, -0.16359657],
       [ 1.81206222, -0.72039515,  0.04866543,  4.73651825, -0.53528873],
       [-0.35162448,  0.10768684, -0.16359657, -0.53528873,  0.75331836]])

# Data preprocessing

In [8]:
# scaler = StandardScaler().fit(X_train)
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.fit_transform(X_test)

In [9]:
X_train_scaled

array([[0.28802   , 0.72403075, 0.4411856 , 0.41454305, 0.54474074],
       [0.31318938, 0.70208478, 0.24933067, 0.48874311, 0.6244617 ],
       [0.5826303 , 0.42574537, 0.55908206, 0.61234947, 0.53454499],
       ...,
       [0.34993009, 0.50725329, 0.47272284, 0.36309813, 0.36297567],
       [0.25755915, 0.63645586, 0.26228602, 0.33809652, 0.58367054],
       [0.54259375, 0.6178697 , 0.37484038, 0.4000808 , 0.57805756]])

# PCA vs Single Layer Linear Autoencoder

### Fit Principal Component Analysis (PCA)

In [10]:
pca = decomposition.PCA(n_components=2)

pca.fit(X_train_scaled)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

### Fit Single Layer Linear Autoencoder

In [11]:
nb_epoch = 100
batch_size = 16
input_dim = X_train_scaled.shape[1] #num of predictor variables, 
encoding_dim = 2
learning_rate = 1e-3

encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True) 
decoder = Dense(input_dim, activation="linear", use_bias = True)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2)                 12        
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 15        
Total params: 27
Trainable params: 27
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.


<keras.callbacks.History at 0x1098c4668>

Compare and contrast the outputs.

### 1. Tied Weights

The weights on Encoder and Decoder are not the same.

In [12]:
w_encoder = np.round(autoencoder.layers[0].get_weights()[0], 2).T  # W in Figure 2.
w_decoder = np.round(autoencoder.layers[1].get_weights()[0], 2)  # W' in Figure 2.
print('Encoder weights \n', w_encoder)
print('Decoder weights \n', w_decoder)

Encoder weights 
 [[ 0.34 -0.68  0.5   0.06  0.74]
 [-0.28  0.54 -0.71  0.34  0.34]]
Decoder weights 
 [[-0.16 -0.55 -0.05  0.12  0.7 ]
 [-0.61  0.09 -0.68  0.44  0.78]]


### 2. Weight Orthogonality
Unlike PCA weights, the weights on Encoder and Decoder are not orthogonal.

In [13]:
w_pca = pca.components_
np.round(np.dot(w_pca, w_pca.T), 3)

array([[ 1., -0.],
       [-0.,  1.]])

In [14]:
np.round(np.dot(w_encoder, w_encoder.T), 3)

array([[ 1.379, -0.545],
       [-0.545,  1.105]], dtype=float32)

In [15]:
np.round(np.dot(w_decoder, w_decoder.T), 3)

array([[0.835, 0.681],
       [0.681, 1.645]], dtype=float32)

### 3. Uncorrelated Features
Unlike PCA features, i.e. Principal Scores, the Encoded features are correlated.

In [16]:
pca_features = pca.fit_transform(X_train_scaled)
np.round(np.cov(pca_features.T), 5)

array([[0.07356, 0.     ],
       [0.     , 0.02691]])

In [17]:
encoder_layer = Model(inputs=autoencoder.inputs, outputs=autoencoder.layers[0].output)
encoded_features = np.array(encoder_layer.predict(X_train_scaled))
print('Encoded feature covariance\n', np.cov(encoded_features.T))

Encoded feature covariance
 [[ 0.03769361 -0.02185375]
 [-0.02185375  0.03382182]]


### 4. Unit Norm

In [18]:
print('PCA weights norm, \n', np.sum(w_pca ** 2, axis = 1))
print('Encoder weights norm, \n', np.sum(w_encoder ** 2, axis = 1))
print('Decoder weights norm, \n', np.sum(w_decoder ** 2, axis = 1))

PCA weights norm, 
 [1. 1.]
Encoder weights norm, 
 [1.3792 1.1053]
Decoder weights norm, 
 [0.835     1.6445999]


In [21]:
np.round(np.dot(w_pca_noisy, w_pca_noisy.T), 30)

array([[1.01280309, 0.00732339],
       [0.00732339, 1.0018437 ]])

In [22]:
X_train_pca_features_noisy = np.dot(X_train_scaled, w_pca.T)
np.round(np.cov(X_train_pca_features_noisy.T), 20)

array([[0.07449593, 0.00049422],
       [0.00049422, 0.02696547]])

### The features are orthogonal

In [23]:
np.round(np.cov(X_train_pca_features.T), 20)

array([[ 7.35579672e-02, -9.79000000e-18],
       [-9.79000000e-18,  2.69149908e-02]])

### The PCA transform weights are orthogonal

In [24]:
np.round(np.dot(w_pca, w_pca.T), 30)

array([[1.01280309, 0.00732339],
       [0.00732339, 1.0018437 ]])

### The PCA transform weights have unit norm

In [25]:
np.sum(w_pca ** 2, axis = 1)

array([1.01280309, 1.0018437 ])

In [26]:
w_pca

array([[ 0.64515469, -0.51416254,  0.12819653,  0.51003097, -0.23590138],
       [ 0.09718476,  0.38589638,  0.79342538, -0.12496673, -0.44535635]])

In [27]:
eigenvalues = np.round(pca.explained_variance_, 2)
eigenvalues

array([0.07, 0.03])

## Plain vanilla PCA to mimic PCA

In [28]:
nb_epoch = 100
batch_size = 16
input_dim = X_train_scaled.shape[1] #num of predictor variables, 
encoding_dim = 2
learning_rate = 1e-3

# input_layer = Input(shape=(input_dim, ))
# encoder = Dense(encoding_dim, activation="linear", use_bias = False)(input_layer)
# decoder = Dense(input_dim, activation="linear", use_bias = False)(encoder)
# autoencoder = Model(inputs=input_layer, outputs=decoder)
# autoencoder.summary()

encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True) 
decoder = Dense(input_dim, activation="linear", use_bias = True)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 2)                 12        
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 15        
Total params: 27
Trainable params: 27
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x135326cc0>

In [29]:
w_encoder = np.round(autoencoder.layers[0].get_weights()[0], 2).T
w_encoder

array([[ 0.02, -0.26,  0.77, -0.71,  0.58],
       [ 0.22,  0.47, -0.88, -0.54, -0.23]], dtype=float32)

In [30]:
w_decoder = np.round(autoencoder.layers[1].get_weights()[0], 2)
w_decoder

array([[-0.51,  0.56,  0.4 , -0.69, -0.19],
       [-0.86,  0.51, -0.68, -0.81,  0.21]], dtype=float32)

In [31]:
train_predictions = autoencoder.predict(X_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_test_scaled, test_predictions))

Train reconstrunction error
 0.01691868241927651
Test reconstrunction error
 0.0186566660959673


### The weights are not orthogonal

In [32]:
print('Encoder weights\n', np.round(np.dot(w_encoder, w_encoder.T), 3))
print('Decoder weights\n', np.round(np.dot(w_decoder, w_decoder.T), 3))

Encoder weights
 [[ 1.501 -0.545]
 [-0.545  1.388]]
Decoder weights
 [[1.246 0.971]
 [0.971 2.162]]


### Encoded features uncorrelated

In [33]:
encoder_layer = Model(inputs=autoencoder.inputs, outputs=autoencoder.layers[0].output)
encoded_features = np.array(encoder_layer.predict(X_train_scaled))
print('Encoded feature covariance\n', np.cov(encoded_features.T))

Encoded feature covariance
 [[ 0.0319451  -0.00556114]
 [-0.00556114  0.0292503 ]]


### Weights are not unit norm

In [34]:
print('Encoder weights norm, \n', np.sum(w_encoder ** 2, axis = 1))
print('Decoder weights norm, \n', np.sum(w_decoder ** 2, axis = 1))

Encoder weights norm, 
 [1.5014    1.3881999]
Decoder weights norm, 
 [1.2459 2.1623]


# Optimizing Autoencoder using PCA principles

In [35]:
nb_epoch = 100
batch_size = 16
input_dim = X_train_scaled.shape[1] #num of predictor variables, 
encoding_dim = 2
learning_rate = 1e-3

### 1. Make decoder weights equal to encoder.

In [36]:
# Reference: https://stackoverflow.com/questions/53751024/tying-autoencoder-weights-in-a-dense-keras-layer
class DenseTied(Layer):
    def __init__(self, units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 tied_to=None,
                 **kwargs):
        self.tied_to = tied_to
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True
                
    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]

        if self.tied_to is not None:
            self.kernel = K.transpose(self.tied_to.kernel)
            self._non_trainable_weights.append(self.kernel)
        else:
            self.kernel = self.add_weight(shape=(input_dim, self.units),
                                          initializer=self.kernel_initializer,
                                          name='kernel',
                                          regularizer=self.kernel_regularizer,
                                          constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def call(self, inputs):
        output = K.dot(inputs, self.kernel)
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output

In [37]:
encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True) 
decoder = DenseTied(input_dim, activation="linear", tied_to=encoder, use_bias = False)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 2)                 12        
_________________________________________________________________
dense_tied_1 (DenseTied)     (None, 5)                 10        
Total params: 22
Trainable params: 12
Non-trainable params: 10
_________________________________________________________________


<keras.callbacks.History at 0x135515e80>

In [38]:
w_encoder = np.round(np.transpose(autoencoder.layers[0].get_weights()[0]), 3)
w_decoder = np.round(autoencoder.layers[1].get_weights()[0], 3)
print('Encoder weights\n', w_encoder)
print('Decoder weights\n', w_decoder)

Encoder weights
 [[-0.261  0.259 -0.916  0.225 -0.274]
 [ 0.408  0.676  0.022  0.706  0.356]]
Decoder weights
 [[-0.261  0.259 -0.916  0.225 -0.274]
 [ 0.408  0.676  0.022  0.706  0.356]]


In [39]:
train_predictions = autoencoder.predict(X_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_test_scaled, test_predictions))

Train reconstrunction error
 0.020553788540722952
Test reconstrunction error
 0.02011294850382963


### 2. Add weights orthogonality constraint.

In [40]:
class WeightsOrthogonalityConstraint (Constraint):
    def __init__(self, encoding_dim, weightage = 1):
        self.encoding_dim = encoding_dim
        self.weightage = weightage
        
    def fro_norm(m):
        return self.weightage * K.sqrt(K.sum(K.square(K.abs(m))))

    def weights_orthogonality(self, w):
        if(self.encoding_dim > 1):
            print(w)
            m = K.dot(K.transpose(w), w) - K.eye(self.encoding_dim)
            print(m)
#             return self.fro_norm(m)
            return self.weightage * K.sqrt(K.sum(K.square(K.abs(m))))
        else:
            m = K.sum(w ** 2) - 1.
            return m

    def __call__(self, w):
        return self.weights_orthogonality(w)

In [41]:
encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True, kernel_regularizer=WeightsOrthogonalityConstraint(encoding_dim)) 
decoder = DenseTied(input_dim, activation="linear", tied_to=encoder, use_bias = False)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

<tf.Variable 'dense_6/kernel:0' shape=(5, 2) dtype=float32_ref>
Tensor("dense_6/weight_regularizer/sub:0", shape=(2, 2), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 2)                 12        
_________________________________________________________________
dense_tied_2 (DenseTied)     (None, 5)                 10        
Total params: 22
Trainable params: 12
Non-trainable params: 10
_________________________________________________________________


<keras.callbacks.History at 0x1098c4390>

In [42]:
w_encoder = autoencoder.layers[0].get_weights()[0]
print(w_encoder)
print('Encoder weights dot product\n', np.round(np.dot(w_encoder.T, w_encoder), 20))

[[-0.64585453 -0.15490086]
 [-0.27592906 -0.3519888 ]
 [ 0.27592278 -0.6416658 ]
 [-0.6119648  -0.2081838 ]
 [ 0.25840914 -0.6213636 ]]
Encoder weights dot product
 [[ 1.0106746  -0.01304772]
 [-0.01304772  0.9890586 ]]


In [43]:
a = np.dot(X_train_scaled, w_encoder)
np.cov(a.T)

array([[ 0.03803921, -0.00182756],
       [-0.00182756,  0.01676502]])

In [44]:
train_predictions = autoencoder.predict(X_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_test_scaled, test_predictions))

Train reconstrunction error
 0.01767556744993414
Test reconstrunction error
 0.016522204126475946


### 3. Encoded features are uncorrelated.

In [244]:
from keras.regularizers import l1, Regularizer
np.linalg.eigvals
class UncorrelatedFeaturesConstraint (Constraint):
    def __init__(self, encoding_dim, correlation_weight = 1.0, eigen_decay_weight = 0.0):
        self.encoding_dim = encoding_dim
        self.correlation_weight = correlation_weight
        self.eigen_decay_weight = eigen_decay_weight
        
    def fro_norm(m):
        return self.weightage * K.sqrt(K.sum(K.square(K.abs(m))))
    
    def get_covariance(self, x):
        x_centered_list = []
        for i in range(self.encoding_dim):
            x_centered_list.append(x[:, i] - K.mean(x[:, i]))
        x_centered = tf.stack(x_centered_list)
        covariance = K.dot(x_centered, K.transpose(x_centered)) / tf.cast(x_centered.get_shape()[0], tf.float32)
        return covariance
            
        
    # Constraint penalties
    def uncorrelated_feature(self, x):
        if(self.encoding_dim <= 1):
            return 0.0
        else:
            output = K.sum(K.square(self.covariance - K.dot(self.covariance, K.eye(self.encoding_dim)))) ** 0.5
            return output

    def eigenvalue_decay(self, x):
        power = 10  # number of iterations of the power method
#         o = tf.cast(np.ones(shape=(self.encoding_dim,1)), tf.float32)  # initial values for the dominant eigenvector
        o = K.ones(shape=(self.encoding_dim,1))
        o = K.constant(np.ones(shape=(self.encoding_dim,1)))
        print('oo', o)
        print('cov', self.covariance)
        # power method for approximating the dominant eigenvector:
#         domin_eigenvect = K.dot(self.covariance, K.transpose(K.ones(shape=(self.encoding_dim,1))))
        domin_eigenvect = K.sum(self.covariance, axis = 1)
        for n in range(power):
            1
#             domin_eigenvect = K.dot(self.covariance, domin_eigenvect)    
        
        WWd = K.dot(self.covariance, domin_eigenvect)
#         domin_eigenval = K.dot(WWd, domin_eigenvect) / K.dot(domin_eigenvect, domin_eigenvect)  # the corresponding dominant eigenvalue
        domin_eigenval = 1
        return domin_eigenval ** 0.5

    def __call__(self, x):
        self.covariance = self.get_covariance(x)
        self.eigenvalue_decay(x)
        return self.correlation_weight * self.uncorrelated_feature(x) #+ self.eigen_decay_weight * self.eigenvalue_decay(x)
    


In [245]:
encoding_dim = 3
encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True, activity_regularizer=UncorrelatedFeaturesConstraint(encoding_dim, correlation_weight = 1.)) 
# encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True, activity_regularizer=l1(0.1)) 
# encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True) 
decoder = DenseTied(input_dim, activation="linear", tied_to=encoder, use_bias = False)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

oo Tensor("dense_92/activity_regularizer/Const_3:0", shape=(3, 1), dtype=float32)
cov Tensor("dense_92/activity_regularizer/truediv:0", shape=(3, 3), dtype=float32)


ValueError: Shape must be rank 2 but is rank 1 for 'dense_92/activity_regularizer/MatMul_1' (op: 'MatMul') with input shapes: [3,3], [3].

In [None]:
class EigenvalueRegularizer(Regularizer):
    """This class implements the Eigenvalue Decay regularizer.
    
    Args:
        The constant that controls the regularization on the current layer
        ( see Section 3 of https://arxiv.org/abs/1604.06985 )
    Returns:
        The regularized loss (for the training data) and
        the original loss (for the validation data).
        
    """
    def __init__(self, k):
        self.k = k
        self.uses_learning_phase = True


    def set_param(self, p):
        self.p = p


    def __call__(self, loss):
        power = 9  # number of iterations of the power method
        W = self.p
        WW = K.dot(K.transpose(W), W)
        dim1, dim2 = K.eval(K.shape(WW))
        k = self.k
        o = np.ones(dim1)  # initial values for the dominant eigenvector

        # power method for approximating the dominant eigenvector:
        o = np.array([1., 1., 1.])
        domin_eigenvect = K.dot(WW, o)
        for n in range(power - 1):
            domin_eigenvect = K.dot(WW, domin_eigenvect)    
        
        WWd = K.dot(WW, domin_eigenvect)
        domin_eigenval = K.dot(WWd, domin_eigenvect) / K.dot(domin_eigenvect, domin_eigenvect)  # the corresponding dominant eigenvalue
        regularized_loss = loss + (domin_eigenval ** 0.5) * self.k  # multiplied by the given regularization gain
        return K.in_train_phase(regularized_loss, loss)
    

    def get_config(self):
        return {"name": self.__class__.__name__,
                "k": self.k}

In [175]:
w_encoder = autoencoder.layers[0].get_weights()[0]
print('Encoder weights dot product\n', np.round(np.dot(w_encoder.T, w_encoder), 20))

Encoder weights dot product
 [[ 1.0126897  -0.01281005  0.04509516]
 [-0.01281005  1.000313    0.00253354]
 [ 0.04509516  0.00253354  0.96785223]]


In [176]:
encoder_layer = Model(inputs=autoencoder.inputs, outputs=autoencoder.layers[0].output)
encoded_features = np.array(encoder_layer.predict(X_train_scaled))
# encoded_features
print(encoded_features.shape)
print('Encoded feature covariance\n', np.cov(encoded_features.T))

(500, 3)
Encoded feature covariance
 [[0.01719019 0.01223717 0.00116263]
 [0.01223717 0.06834599 0.01161249]
 [0.00116263 0.01161249 0.01674539]]


In [None]:
xx = encoded_features
xx_diff = xx
mu = np.mean(xx, axis=0)
for i in range(xx.shape[1]):
    for j in range(xx.shape[1]):
        a = xx[:, i] - np.mean(xx[:, i])
        b = xx[:, j] - np.mean(xx[:, j])
        print(np.sum(a * b) / (xx.shape[0] - 1))
        



In [None]:
np.cov(xx.T)

In [None]:
aa = np.array(np.sum(xx_diff, axis = 0))
(aa.T + np.zeros((2,1)))[:,0]


In [None]:
np.dot(encoded_features.T, encoded_features)

In [None]:
a = np.dot(X_train_scaled, w_encoder)
np.cov(a.T)

In [None]:
train_predictions = autoencoder.predict(X_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_test_scaled, test_predictions))

### 4. Unit norm

In [None]:
encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(encoding_dim), kernel_constraint=UnitNorm(axis=0)) 
decoder = DenseTied(input_dim, activation="linear", tied_to=encoder, use_bias = False)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

In [None]:
w_encoder = autoencoder.layers[0].get_weights()[0]
print('Encoder weights dot product\n', np.round(np.dot(w_encoder.T, w_encoder), 20))

In [None]:
train_predictions = autoencoder.predict(X_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_test_scaled, test_predictions))

# All of them together with a nonlinear Activation

In [None]:
n_dim = 100
cov = sklearn.datasets.make_spd_matrix(n_dim, random_state=None)
mu = np.random.normal(0, 0.1, n_dim)
n = 5000

X_large = np.random.multivariate_normal(mu, cov, n)
X_large_train, X_large_test = train_test_split(X_large, test_size=0.5, random_state=123)

# scaler_large = StandardScaler().fit(X_large_train)
scaler_large = MinMaxScaler().fit(X_large_train)

X_large_train_scaled = scaler_large.fit_transform(X_large_train)

X_large_test_scaled = scaler_large.fit_transform(X_large_test)

In [None]:
nb_epoch = 100
batch_size = 16
input_dim = X_large_train_scaled.shape[1] #num of predictor variables, 
encoding_dim = 8

# Plain vanilla
encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True) 
encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True) 
decoder2 = Dense(encoding_dim, activation="relu", use_bias = True)
decoder1 = Dense(input_dim, activation="relu", use_bias = True)

# 1. Weight orthogonal
encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(encoding_dim), kernel_constraint=UnitNorm(axis=0)) 
encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(4), kernel_constraint=UnitNorm(axis=0)) 
decoder2 = DenseTied(encoding_dim, activation="relu", tied_to=encoder2, use_bias = False)
decoder1 = DenseTied(input_dim, activation="relu", tied_to=encoder1, use_bias = False)

# encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(encoding_dim), kernel_constraint=UnitNorm(axis=0)) 
# encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(4), kernel_constraint=UnitNorm(axis=0)) 
# decoder2 = DenseTied(encoding_dim, activation="relu", tied_to=encoder2, use_bias = False)
# decoder1 = DenseTied(input_dim, activation="relu", tied_to=encoder1, use_bias = False)


# encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(encoding_dim), kernel_constraint=UnitNorm(axis=0)) 
# encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(4), kernel_constraint=UnitNorm(axis=0)) 
# decoder2 = DenseTied(encoding_dim, activation="relu", tied_to=encoder2, use_bias = False)
# decoder1 = DenseTied(input_dim, activation="relu", tied_to=encoder1, use_bias = False)

# encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(encoding_dim), kernel_constraint=UnitNorm(axis=0)) 
# encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True, kernel_regularizer=OrthogonalityConstraint(4), kernel_constraint=UnitNorm(axis=0)) 
# decoder2 = Dense(encoding_dim, activation="relu", use_bias = True)
# decoder1 = Dense(input_dim, activation="relu", use_bias = True)


# encoder1 = Dense(encoding_dim, activation="relu", input_shape=(input_dim,), use_bias = True) 
# encoder2 = Dense(4, activation="relu", input_shape=(encoding_dim,), use_bias = True) 
# decoder2 = DenseTied(encoding_dim, activation="relu", use_bias = True)
# decoder1 = DenseTied(input_dim, activation="relu", use_bias = True)


autoencoder = Sequential()
autoencoder.add(encoder1)
autoencoder.add(encoder2)
autoencoder.add(decoder2)
autoencoder.add(decoder1)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')
autoencoder.summary()

autoencoder.fit(X_large_train_scaled, X_large_train_scaled,
                epochs=nb_epoch,
                batch_size=batch_size,
                shuffle=True,
                verbose=0)

In [None]:
train_predictions = autoencoder.predict(X_large_train_scaled)
print('Train reconstrunction error\n', sklearn.metrics.mean_squared_error(X_large_train_scaled, train_predictions))
test_predictions = autoencoder.predict(X_large_test_scaled)
print('Test reconstrunction error\n', sklearn.metrics.mean_squared_error(X_large_test_scaled, test_predictions))

In [None]:
w_encoder = autoencoder.layers[0].get_weights()[0]
print(w_encoder)
print('Encoder weights dot product\n', np.round(np.dot(w_encoder.T, w_encoder), 20))

### 3. Encoded features are uncorrelated.

In [None]:
encoder_layer = Model(inputs=autoencoder.inputs, outputs=autoencoder.layers[0].output)
encoded_features = np.array(encoder_layer.predict(X_train_scaled))
# encoded_features
print(encoded_features.shape)
print('Encoded feature covariance\n', np.cov(encoded_features.T))

In [None]:
# encoded1 = Dense(4, activation="sigmoid", input_shape=(4,), use_bias=True)
# decoded1 = DenseTied(4, activation="sigmoid", tied_to=encoded1, use_bias=False)

# # autoencoder
# #
# autoencoder = Sequential()
# # autoencoder.add(input_)
# autoencoder.add(encoded1)
# autoencoder.add(decoded1)

# autoencoder.compile(optimizer="adam", loss="binary_crossentropy")

# print(autoencoder.summary())

# autoencoder.fit(x=np.random.rand(100, 4), y=np.random.randint(0, 1, size=(100, 4)))

# print(autoencoder.layers[0].get_weights()[0])
# print(autoencoder.layers[1].get_weights()[0])

# input_ = Input(shape=(16,), dtype=np.float32)
# encoder
#

def fro_norm(w):
    return K.sqrt(K.sum(K.square(K.abs(w))))

def cust_reg(w):
    if(encoding_dim > 1):
        m = K.dot(K.transpose(w), w) - K.eye(encoding_dim)
        return fro_norm(m)
    else:
        m = K.sum(w ** 2) - 1.
        return m
    
nb_epoch = 100
batch_size = 16
input_dim = X_scaled.shape[1] #num of predictor variables, 
encoding_dim = 1
learning_rate = 1e-3
encoder = Dense(encoding_dim, activation="linear", input_shape=(input_dim,), use_bias = True, kernel_regularizer=cust_reg, kernel_constraint=UnitNorm(axis=0)) 
decoder = DenseTied(input_dim, activation="linear", tied_to=encoder, use_bias = False)

autoencoder = Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()

In [None]:
autoencoder.fit(X_scaled, X_scaled,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=1)

In [None]:
print(np.round(np.transpose(autoencoder.layers[0].get_weights()[0]), 3))
print(np.round(autoencoder.layers[1].get_weights()[0], 3))



In [None]:
# scipy.linalg.norm(np.transpose(autoencoder.layers[0].get_weights()[0]), 2)
np.sum(np.transpose(autoencoder.layers[0].get_weights()[0]) ** 2, axis = 1)

In [None]:
np.round(pca.components_, 3)

In [None]:
np.round(np.dot(pca.components_, np.transpose(pca.components_)), 2)

In [None]:
np.round(np.dot(autoencoder.layers[0].get_weights()[0], np.transpose(autoencoder.layers[0].get_weights()[0])), 2)


-------

In [None]:
import numpy as np
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation

a_dim = 16
from keras import backend as K
def fro_norm(w):
    return K.sqrt(K.sum(K.square(K.abs(w))))

def cust_reg(w):
    print(w.shape[1])
    m = K.dot(K.transpose(w), w) - np.eye(a_dim)
    return fro_norm(m)

X = np.random.randn(100, 100)
y = np.random.randint(2, size=(100, 1))

model = Sequential()


# apply regularization here. applies regularization to the 
# output (activation) of the layer
model.add(Dense(a_dim, input_shape=(100,), 
                kernel_regularizer=cust_reg))
model.add(Dense(1))
model.add(Activation('softmax'))

model.compile(loss="binary_crossentropy",
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(X, y, epochs=100, batch_size=32)

In [None]:
model.summary()

In [None]:
intermediate_layer = Model(inputs=model.inputs, outputs=model.layers[0].output)
intermediate_output = intermediate_layer.predict(X)

In [None]:
intermediate_output.shape

In [None]:
np.round(np.dot(np.transpose(model.layers[0].get_weights()[0]), model.layers[0].get_weights()[0]), 2)

In [None]:
model.layers[0].get_weights()[0].shape

-------

# Solve PCA by reconstruction loss

In [None]:
import scipy
from scipy.optimize import minimize

In [None]:
scipy.linalg.norm(X_scaled, ord=2, axis=None, keepdims=False)

In [None]:
np.diag(np.cov(X_scaled.T))

In [None]:
np.linalg.eig(np.cov(X_scaled.T))[0]

In [None]:
Vp = np.linalg.eig(np.cov(X_scaled.T))[1]

In [None]:
Z = np.dot(X_scaled, Vp)

In [None]:
np.round(np.cov(Z.T), 3)

In [None]:
np.dot(Vp, np.dot(np.dot(X_scaled.T, X_scaled), Vp.T)) /

In [None]:
def reconstruction_error(x):
    V = x.reshape(-1, 5)
    loss = 0.1 * scipy.linalg.norm((X_train_scaled - np.dot(X_train_scaled, np.dot(np.transpose(V), V))), 2) / V.shape[0] + orthogonality_constraint(V) + norm_constraint(V) + max_variance(V)
    return loss

In [None]:
pca.components_

In [None]:
reconstruction_error(pca.components_.flatten())

In [None]:
def orthogonality_constraint(V):
    return scipy.linalg.norm(np.dot(V, np.transpose(V)) - np.eye(V.shape[0]), 2)

In [None]:
def norm_constraint(V):
    return scipy.linalg.norm(np.sum(V ** 2, axis = 1) - np.ones(V.shape[0]), 2)

In [None]:
def max_variance(V):
    eigenvalues = np.linalg.eig(np.cov(X_train_scaled.T))[0][0:V.shape[0]]
#     print(eigenvalues)
    Z_scores = np.dot(X_train_scaled, V.T)
    if(V.shape[0] > 1):
        Z_cov = np.diag(np.cov(Z_scores.T))
    else:
        Z_cov = np.cov(Z_scores.T)
#     print(Z_cov)
    return scipy.linalg.norm(Z_cov - eigenvalues)

In [None]:
orthogonality_constraint(pca.components_)

In [None]:
pca_reduced = decomposition.PCA(n_components=2)

In [None]:
pca_reduced.fit(X_train_scaled)

In [None]:
pca_reduced.explained_variance_

In [None]:
pca_reduced.components_

In [None]:
reconstruction_error(pca_reduced.components_)

In [None]:
pca_reduced.components_.shape

In [None]:
scipy.linalg.norm(X_train_scaled)

In [None]:
orthogonality_constraint(pca_reduced.components_)

In [None]:
# minimize(reconstruction_error, x0 = pca_reduced.components_.flatten())
# result = minimize(reconstruction_error, x0 = np.random.normal(0, 1, len(pca_reduced.components_.flatten())))
# result = minimize(reconstruction_error, x0 = np.random.normal(0, 1, 5))
# result = minimize(reconstruction_error, x0 = np.zeros(5), method='Nelder-Mead')
result = minimize(reconstruction_error, x0 = np.random.normal(0, 1, 10), method='Nelder-Mead')

In [None]:
result.x

In [None]:
orthogonality_constraint(result.x.reshape(-1, 5))

In [None]:
np.sum(result.x.reshape(-1, 5) ** 2, axis = 1)

In [None]:
result.x.reshape(-1, 5)

In [None]:
max_variance(result.x.reshape(-1, 5))

In [None]:
result

In [None]:
x = pca_reduced.components_.flatten()

In [None]:
print(x)

In [None]:
pca_reduced.components_

In [None]:
V1 = x.reshape(-1, 5)

In [None]:
np.sum(V1 ** 2, axis=1)

In [None]:
V2 = result.x.reshape(-1, 5)

In [None]:
np.sum(V2 ** 2, axis = 1) - np.ones(V2.shape[0])

In [None]:
scipy.linalg.norm(np.sum(V2 ** 2, axis = 1) - np.ones(V2.shape[0]), 2)

In [None]:
np.sqrt(0.01719766 ** 2 + (0.01719765 ** 2))

In [None]:
a = np.array([1, 2, 3])

In [None]:
scipy.linalg.norm(a, 2)

In [None]:
np.sqrt((1 ** 2 + 2 ** 2 + 3 ** 2))

In [None]:
0.01719766 ** 2

In [None]:
(-0.01719765 ** 2)

In [None]:
np.linalg.eig(cc)[0]

In [None]:
import keras
print(keras.__version__)

In [None]:
np.linalg.eig(np.cov(X_train_scaled.T))[0][0:V.shape[0]]