# Autoencoder

Source: https://www.youtube.com/watch?v=LjRvMUk59PI
https://blog.keras.io/building-autoencoders-in-keras.html

In [1]:
import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import mnist
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Deezer Dataset

In [87]:
data = pd.read_csv('../Data/train.csv').iloc[:, [0, 2, 3, 4, 6, 7, 9, 10, 11, 13, 14]]

d1 = data.groupby('user_id')['is_listened'].sum()
d2 = d1[d1 > 10].index
d3 = data[data.user_id.isin(d2)].iloc[:, [0, 3, 4, 5, 6, 7, 9, 10]]  #[0, 3, 4, 5, 6, 7, 9]

#df_r = pd.pivot_table(d3, index=['user_id'], columns=['media_id'], values=['is_listened'], fill_value=0).astype(int)

print(d3.info())
cutoff = 5000000
x_train_NN = d3.iloc[:cutoff, :].to_numpy()
x_test_NN = d3.iloc[cutoff:,:].to_numpy()
print(x_train_NN.shape)
print(x_test_NN.shape)
print(x_test_NN)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7528398 entries, 0 to 7558833
Data columns (total 8 columns):
 #   Column           Dtype
---  ------           -----
 0   genre_id         int64
 1   context_type     int64
 2   platform_name    int64
 3   platform_family  int64
 4   listen_type      int64
 5   user_gender      int64
 6   user_age         int64
 7   is_listened      int64
dtypes: int64(8)
memory usage: 516.9 MB
None
(5000000, 8)
(2528398, 8)
[[10  1  2 ...  0 25  1]
 [10  4  0 ...  0 23  1]
 [10 23  1 ...  0 28  0]
 ...
 [ 0  4  0 ...  0 26  0]
 [ 0  4  0 ...  1 29  1]
 [ 0  4  0 ...  1 30  1]]


## Creating the Architecture of an Autoencoder

In [88]:
# This is the size of our encoded representations
encoding_dim = int(round(x_train_NN.shape[1] / 3, 0))   # 32 floats -> compression of factor 24.5, assuming the input is 784 floats

# This is our input image
input_img = keras.Input(shape=(x_train_NN.shape[1],))
# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_img) # ,activity_regularizer=regularizers.l1(10e-5)
# "decoded" is the lossy reconstruction of the input
decoded = Dense(x_train_NN.shape[1], activation='relu')(encoded)#,kernel_regularizer=l2(0.0001)

# This model maps an input to its reconstruction
autoencoder = Model(input_img, decoded)
autoencoder.summary()
# This model maps an input to its encoded representation
encoder = Model(input_img, encoded)
# This is our encoded (32-dimensional) input
encoded_input = Input(shape=(encoding_dim,))
# Retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# Create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input))
autoencoder.compile(optimizer=SGD(learning_rate=0.25, momentum=0.9), loss='MSE', metrics=['accuracy', 'mse'])

Model: "model_62"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_43 (InputLayer)       [(None, 8)]               0         
                                                                 
 dense_42 (Dense)            (None, 3)                 27        
                                                                 
 dense_43 (Dense)            (None, 8)                 32        
                                                                 
Total params: 59
Trainable params: 59
Non-trainable params: 0
_________________________________________________________________


In [89]:
batch_size = 64
autoencoder.fit(x_train_NN, x_train_NN,
                epochs=10,
                batch_size= batch_size,
                shuffle=True,
                validation_data=(x_test_NN, x_test_NN),
                steps_per_epoch=x_train_NN.shape[0] // batch_size + 1,
                validation_steps=x_test_NN.shape[0] // batch_size + 1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18bb490aee0>

In [90]:
# Encode and decode some digits
# Note that we take them from the *test* set
encoded_imgs = encoder.predict(x_test_NN)
decoded_imgs = decoder.predict(encoded_imgs)

In [91]:
np.set_printoptions(suppress=True)
print(np.around(decoded_imgs, 0))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [92]:
df = pd.DataFrame(decoded_imgs)
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7
2528393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2528394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2528395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2528396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2528397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
