In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_addons as tfa 
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib as mplt
import matplotlib.pyplot as plt
import seaborn as sns

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# Load dataset

In [2]:
# Item responses: ';'-separated file without a header row.
data = pd.read_csv('output/masked_10_missing_180x21.csv', sep=';', header=None)

# Q-matrix: loaded the same way and transposed right away.
Q = pd.read_csv('data/Qmatrix.csv', sep=';', header=None).T

# Sanity check: show the shape of both tables.
for _frame in (data, Q):
    print(_frame.shape)

(8000, 180)
(21, 180)


# Helper Functions


In [3]:
# Restrict connection in decoder
def q_constraint(w):
    """Kernel constraint that masks decoder weights with the Q-matrix.

    A weight is kept only where multiplying by the module-level ``Q`` leaves
    it unchanged (``w * Q == w``) and where it is non-negative; every other
    entry is set to zero.

    Args:
        w: weight tensor; assumed to be broadcast-compatible with ``Q``
            (TODO confirm against the layer this constraint is attached to).

    Returns:
        The masked weight tensor.
    """
    float_dtype = keras.backend.floatx()

    # Step 1: drop every entry that the Q-matrix would alter.
    untouched_by_q = tf.math.equal(w - w * Q, 0)
    masked = w * tf.cast(untouched_by_q, float_dtype)

    # Step 2: drop the negative entries that survived step 1.
    non_negative = tf.math.greater_equal(masked, 0)
    return masked * tf.cast(non_negative, float_dtype)

# Remove zeros function
def remove_zeros(arr, q_matrix=None):
    """Collect the entries of ``arr`` whose Q-matrix entry is non-zero.

    The grid that is scanned is taken from the Q-matrix itself, so the
    function does not depend on separately maintained size constants.

    Args:
        arr: 2-D container indexable as ``arr[row][col]`` and laid out like
            the Q-matrix (same number of rows and columns).
        q_matrix: optional 2-D DataFrame/array used as the mask. Defaults to
            the module-level ``Q``.

    Returns:
        A list with the selected elements of ``arr`` in row-major order
        (all columns of row 0 first, then row 1, ...).
    """
    if q_matrix is None:
        q_matrix = Q

    # One vectorised comparison instead of a scalar .iloc lookup per cell.
    keep = np.asarray(q_matrix) != 0

    # np.nonzero yields the indices in row-major order, which matches the
    # original "rows outer, columns inner" loop nesting.
    rows, cols = np.nonzero(keep)
    return [arr[r][c] for r, c in zip(rows, cols)]


# Variables Initialization

In [4]:
# Set stats and skills
NUM_STATS = 180   # passed to the VAE as the input/output width
NUM_SKILLS = 21   # passed to the VAE as the latent dimension

AUTO = tf.data.AUTOTUNE
BUFFER_SIZE = 1024

INTERMEDIATE_DIM = 40  # width of the encoder's hidden Dense layer
N_DECODERS = 1         # number of latent draws decoded per input

# Encoder and Decoder
# NOTE(review): the transformer-style settings below are not referenced by the
# model cells visible in this notebook; kept in case other cells rely on them.
LAYER_NORM_EPS = 1e-6
ENC_PROJECTION_DIM = 1
DEC_PROJECTION_DIM = 18
ENC_NUM_HEADS = 4
ENC_LAYERS = 6
DEC_NUM_HEADS = 4
DEC_LAYERS = 2  # The decoder is lightweight but should be reasonably deep for reconstruction.
ENC_TRANSFORMER_UNITS = [
    ENC_PROJECTION_DIM * 2,
    ENC_PROJECTION_DIM,
]  # Size of the transformer layers.
DEC_TRANSFORMER_UNITS = [
    DEC_PROJECTION_DIM * 2,
    DEC_PROJECTION_DIM,
]

# Optimizer (defined once; the original cell assigned these two values twice)
LEARNING_RATE = 5e-3
WEIGHT_DECAY = 1e-4

# Number of persons (rows of the response table)
NUM_PERSONS = data.shape[0]

# Training schedule
BATCH_SIZE = 50
NUM_EPOCHS = 100

MASK_PROPORTION = 0.10

## Model

In [5]:
class Sampling(layers.Layer):
    """Reparameterisation step: draws z from (z_mean, z_log_var)."""

    def call(self, inputs):
        """Return ``z_mean + exp(0.5 * z_log_var) * epsilon``.

        Args:
            inputs: pair ``(z_mean, z_log_var)`` of 2-D tensors; the first
                two axes of ``z_mean`` define the shape of the noise.

        Returns:
            A tensor holding one random draw per row of ``z_mean``.
        """
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise

class Encoder(keras.Model):
    """Maps item responses to a triplet (z_mean, z_log_var, z)."""

    def __init__(self, latent_dim, intermediate_dim, n_decoders, name="encoder", **kwargs):
        """Build the hidden projection, the two latent heads and the sampler.

        Args:
            latent_dim: width of the ``z_mean`` / ``z_log_var`` heads.
            intermediate_dim: width of the tanh hidden layer.
            n_decoders: number of latent draws returned by ``call`` when
                greater than 1.
            name: Keras model name.
            **kwargs: forwarded to ``keras.Model``.
        """
        super().__init__(name=name, **kwargs)
        self.n_decoders = n_decoders
        self.dense_proj = layers.Dense(intermediate_dim, activation='tanh')
        self.dense_mean = layers.Dense(latent_dim)
        self.dense_log_var = layers.Dense(latent_dim)
        self.sampling = Sampling()

    def call(self, inputs):
        """Encode ``inputs`` and sample from the resulting distribution.

        Returns:
            ``(z_mean, z_log_var, z)``. ``z`` is a single draw, or a list of
            ``n_decoders`` independent draws when ``n_decoders > 1``.
        """
        hidden = self.dense_proj(inputs)
        z_mean = self.dense_mean(hidden)
        z_log_var = self.dense_log_var(hidden)
        stats = (z_mean, z_log_var)

        # Single-draw case first; the multi-draw case follows.
        if not self.n_decoders > 1:
            return z_mean, z_log_var, self.sampling(stats)

        draws = [self.sampling(stats) for _ in range(self.n_decoders)]
        return z_mean, z_log_var, draws


class Decoder(keras.Model):
    """Maps a latent vector z to one sigmoid output per item."""

    def __init__(self, original_dim, latent_dim, name="decoder", **kwargs):
        """Create the single output layer.

        Args:
            original_dim: number of output units.
            latent_dim: accepted for interface symmetry; not used here.
            name: Keras model name.
            **kwargs: forwarded to ``keras.Model``.
        """
        super().__init__(name=name, **kwargs)
        # The kernel is passed through q_constraint after each update.
        self.dense_output = layers.Dense(
            original_dim,
            activation="sigmoid",
            kernel_constraint=q_constraint,
        )

    def call(self, inputs):
        """Return the sigmoid outputs for ``inputs``."""
        return self.dense_output(inputs)



class VariationalAutoEncoder(keras.Model):
    """Combines the encoder and decoder into an end-to-end model for training."""

    def __init__(
        self,
        original_dim,
        intermediate_dim,
        num_skills,
        n_decoders,
        name="autoencoder"
    ):
        """Build the encoder/decoder pair.

        Args:
            original_dim: number of input/output units (items).
            intermediate_dim: width of the encoder's hidden layer.
            num_skills: latent dimension.
            n_decoders: number of latent draws decoded per input.
            name: Keras model name.
        """
        super(VariationalAutoEncoder, self).__init__(name=name)
        # Kept so the loss can scale by the model's own width instead of
        # reading a module-level constant.
        self.original_dim = original_dim
        self.n_decoders = n_decoders
        self.encoder = Encoder(latent_dim=num_skills, intermediate_dim=intermediate_dim, n_decoders=n_decoders)
        self.decoder = Decoder(original_dim, latent_dim=num_skills)

    def call(self, inputs):
        """Encode, sample and decode ``inputs``.

        The latent statistics are stored on the instance because
        ``vae_loss`` reads them when computing the KL term.

        Returns:
            The reconstruction, or a list of ``n_decoders`` reconstructions
            when ``n_decoders > 1``.
        """
        self.z_mean, self.z_log_var, self.z = self.encoder(inputs)

        if self.n_decoders > 1:
            return [self.decoder(self.z[i]) for i in range(self.n_decoders)]

        return self.decoder(self.z)

    # Loss function
    def vae_loss(self, input, output):
        """Masked cross-entropy plus KL term.

        With responses coded as 1 / 0 / -1, the weight
        ``0.5 * x**2 + 0.5 * x`` is 1 only for ``x == 1`` and the weight
        ``1 - x**2`` is 1 only for ``x == 0``, so entries equal to -1
        contribute nothing to the reconstruction term.

        Args:
            input: target responses.
            output: predicted probabilities from the decoder.

        Returns:
            Reconstruction loss (scalar, scaled by ``original_dim``) plus the
            KL term, which is averaged over the latent axis.
        """
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid
        # would otherwise produce log(0) = -inf and 0 * inf = NaN.
        eps = keras.backend.epsilon()
        output = tf.clip_by_value(output, eps, 1.0 - eps)

        cross_entropy_loss = float(self.original_dim) * (tf.reduce_mean((0.5 * tf.math.square(input) + 0.5 * input) * (-1) * tf.math.log(output) +
        (1 - tf.math.square(input)) * (-1) * tf.math.log(1 - output)))
        # Uses the statistics cached by the most recent call().
        kl_loss = -0.5 * tf.reduce_mean(self.z_log_var - tf.square(self.z_mean) - tf.exp(self.z_log_var) + 1, axis=-1)
        return cross_entropy_loss + kl_loss

    # Get weights
    def _get_weights(self):
        """Return the decoder's trainable weights."""
        return self.decoder.trainable_weights

    def get_encoder(self):
        """Return the encoder sub-model."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder sub-model."""
        return self.decoder

In [6]:
# Number of train -> impute rounds. Round 1 trains on the masked responses;
# every later round trains on the responses imputed by the previous round, so
# a value above 1 is required for the imputed y to be used for training.
NUM_IMPUTATION_ROUNDS = 2

# Item response values, flattened row by row (-1 marks a missing response).
y = pd.DataFrame(data.values.flatten())

for i in range(NUM_IMPUTATION_ROUNDS):
    print("\n %d Iteration #################################################################################### \n" % (i+1))

    # Start every round from a fresh, untrained model. Building it here
    # (instead of at the end of the previous round) leaves the trained model
    # in `vae_q` once the loop is done.
    #opt = tf.keras.optimizers.Adam(learning_rate=0.005, amsgrad=True)
    opt = tf.keras.optimizers.legacy.SGD()
    vae_q = VariationalAutoEncoder(NUM_STATS, INTERMEDIATE_DIM, NUM_SKILLS, N_DECODERS)
    vae_q.compile(optimizer=opt, loss=vae_q.vae_loss, metrics=['binary_accuracy'])

    # Train on the current responses: identical to `data` in round 1, the
    # imputed matrix afterwards. (The original always re-used `data`, so
    # later rounds merely repeated round 1.)
    dtrain = tf.cast(y.values.reshape(NUM_PERSONS, NUM_STATS), tf.float32)

    history = vae_q.fit(dtrain,
                        dtrain,
                        epochs=NUM_EPOCHS,
                        batch_size=BATCH_SIZE,
                        shuffle=True)

    encoder = vae_q.get_encoder()
    decoder = vae_q.get_decoder()

    weights = vae_q._get_weights()

    # Decoder kernel and negated decoder bias.
    discr = weights[0].numpy()
    negative_diff_20 = pd.DataFrame(np.negative(weights[1].numpy()))

    # Get latent trait predictions for the matrix the model was trained on.
    thetas_hat20, log_var_thetas_hat20, z_pred20 = encoder.predict(dtrain)

    # Get mean over the decoded draws.
    if N_DECODERS > 1:
        # One decoder pass per latent draw (was hard-coded to 5 draws).
        dec_pred = [decoder.predict(z_pred20[k]) for k in range(N_DECODERS)]
        main = np.mean(np.stack(dec_pred), axis=0)
    else:
        dec_pred = decoder.predict(z_pred20)
        main = dec_pred

    # 1 if mean equal or greater than 0.5 and 0 otherwise
    main = (np.asarray(main) >= 0.5).astype(np.float32)

    # Vectorize in Y_means (row by row, same layout as y)
    Y_means = main.ravel()

    # Imputation: only responses still marked as missing are replaced.
    observed = y.values.ravel()
    Y_imputated = np.where(observed == -1.0, Y_means, observed)

    y = pd.DataFrame(Y_imputated)

    # Total score on the test -------
    score = np.sum(dtrain.numpy(), axis=1, keepdims=True)

    #### Vectorizing the matrices thetas_hat and discr ####
    theta_hat = np.transpose(thetas_hat20).flatten()

    log_var_theta_hat = np.transpose(log_var_thetas_hat20).flatten()

    discr_hat_20 = remove_zeros(discr)
    
                                                                                                


 1 Iteration #################################################################################### 

Epoch 1/100


2023-06-07 07:44:38.808315: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [7]:
# Imputed responses: written through pandas, ';'-separated, without a header row.
y.to_csv('output/y_claudia_10_missing_180x21.csv', sep=';', header=None)

# Remaining estimates: one ';'-delimited text file each, written with NumPy.
for _path, _values in (
    ('output/thetas_claudia_10_missing_180x21.csv', theta_hat),
    ('output/discr_claudia_10_missing_180x21.csv', discr_hat_20),
    ('output/diff_claudia_10_missing_180x21.csv', negative_diff_20),
):
    np.savetxt(_path, _values, delimiter=';')