In [1]:
import numpy as np
from scipy import sparse
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, _check_feature_names_in
from sklearn.preprocessing import OneHotEncoder
import sklearn
import sklearn.impute

import pandas as pd
from pandas.api.types import is_numeric_dtype
import sklearn.compose
import torch
from torch.utils.data import DataLoader, TensorDataset
from scipy import optimize
import numpy as np
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, _check_feature_names_in
from sklearn.preprocessing import OneHotEncoder
import sklearn
import sklearn.impute
from torch.autograd.variable import Variable
import pandas as pd
from pandas.api.types import is_numeric_dtype
import sklearn.compose
import math
import torch
import sklearn.preprocessing
import openml
import tpot2
import sklearn.metrics
import sklearn
from sklearn.metrics import (roc_auc_score, roc_curve, precision_score, auc, recall_score, precision_recall_curve, \
                             roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score, log_loss,
                             f1_score, root_mean_squared_error)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def add_missing(X, add_missing = 0.05, missing_type = 'MAR'):
    if isinstance(X,np.ndarray):
        X = pd.DataFrame(X)
    missing_mask = X
    missing_mask = missing_mask.mask(missing_mask.isna(), True)
    missing_mask = missing_mask.mask(missing_mask.notna(), False)
    X = X.mask(X.isna(), 0)
    T = torch.tensor(X.to_numpy())

    match missing_type:
        case 'MAR':
            out = MAR(T, [add_missing])
        case 'MCAR':
            out = MCAR(T, [add_missing])
        case 'MNAR':
            out = MNAR_mask_logistic(T, [add_missing])
    
    masked_set = pd.DataFrame(out['Mask'].numpy())
    missing_combo = (missing_mask | masked_set.isna())
    masked_set = masked_set.mask(missing_combo, True)
    masked_set.columns = X.columns.values
    #masked_set = masked_set.to_numpy()

    missing_set = pd.DataFrame(out['Missing'].numpy())
    missing_set.columns = X.columns.values
    #missing_set = missing_set.to_numpy()

    return missing_set, masked_set

"""BEYOND THIS POINT WRITTEN BY Aude Sportisse, Marine Le Morvan and Boris Muzellec - https://rmisstastic.netlify.app/how-to/python/generate_html/how%20to%20generate%20missing%20values"""

def MCAR(X, p_miss):
    out = {'X': X.double()}
    for p in p_miss: 
        mask = (torch.rand(X.shape) < p).double()
        X_nas = X.clone()
        X_nas[mask.bool()] = np.nan
        model_name = 'Missing'
        mask_name = 'Mask'
        out[model_name] = X_nas
        out[mask_name] = mask
    return out

def MAR(X,p_miss,p_obs=0.5):
    out = {'X': X.double()}
    for p in p_miss:
        n, d = X.shape
        mask = torch.zeros(n, d).bool()
        num_no_missing = max(int(p_obs * d), 1)
        num_missing = d - num_no_missing
        obs_samples = np.random.choice(d, num_no_missing, replace=False)
        copy_samples = np.array([i for i in range(d) if i not in obs_samples])
        len_obs = len(obs_samples)
        len_na = len(copy_samples)
        coeffs = torch.randn(len_obs, len_na).double()
        Wx = X[:, obs_samples].mm(coeffs)
        coeffs /= torch.std(Wx, 0, keepdim=True)
        coeffs.double()
        len_obs, len_na = coeffs.shape
        intercepts = torch.zeros(len_na)
        for j in range(len_na):
            def f(x):
                return torch.sigmoid(X[:, obs_samples].mv(coeffs[:, j]) + x).mean().item() - p
            intercepts[j] = optimize.bisect(f, -50, 50)
        ps = torch.sigmoid(X[:, obs_samples].mm(coeffs) + intercepts)
        ber = torch.rand(n, len_na)
        mask[:, copy_samples] = ber < ps
        X_nas = X.clone()
        X_nas[mask.bool()] = np.nan
        model_name = 'Missing'
        mask_name = 'Mask'
        out[model_name] = X_nas
        out[mask_name] = mask
    return out

def MNAR_mask_logistic(X, p_miss, p_params =.5, exclude_inputs=True):
    """
    Missing not at random mechanism with a logistic masking model. It implements two mechanisms:
    (i) Missing probabilities are selected with a logistic model, taking all variables as inputs. Hence, values that are
    inputs can also be missing.
    (ii) Variables are split into a set of intputs for a logistic model, and a set whose missing probabilities are
    determined by the logistic model. Then inputs are then masked MCAR (hence, missing values from the second set will
    depend on masked values.
    In either case, weights are random and the intercept is selected to attain the desired proportion of missing values.
    Parameters
    ----------
    X : torch.DoubleTensor or np.ndarray, shape (n, d)
        Data for which missing values will be simulated.
        If a numpy array is provided, it will be converted to a pytorch tensor.
    p : float
        Proportion of missing values to generate for variables which will have missing values.
    p_params : float
        Proportion of variables that will be used for the logistic masking model (only if exclude_inputs).
    exclude_inputs : boolean, default=True
        True: mechanism (ii) is used, False: (i)
    Returns
    -------
    mask : torch.BoolTensor or np.ndarray (depending on type of X)
        Mask of generated missing values (True if the value is missing).
    """
    out = {'X_init_MNAR': X.double()}
    for p in p_miss: 
        n, d = X.shape
        to_torch = torch.is_tensor(X) ## output a pytorch tensor, or a numpy array
        if not to_torch:
            X = torch.from_numpy(X)
        mask = torch.zeros(n, d).bool() if to_torch else np.zeros((n, d)).astype(bool)
        d_params = max(int(p_params * d), 1) if exclude_inputs else d ## number of variables used as inputs (at least 1)
        d_na = d - d_params if exclude_inputs else d ## number of variables masked with the logistic model
        ### Sample variables that will be parameters for the logistic regression:
        idxs_params = np.random.choice(d, d_params, replace=False) if exclude_inputs else np.arange(d)
        idxs_nas = np.array([i for i in range(d) if i not in idxs_params]) if exclude_inputs else np.arange(d)
        ### Other variables will have NA proportions selected by a logistic model
        ### The parameters of this logistic model are random.
        ### Pick coefficients so that W^Tx has unit variance (avoids shrinking)
        len_obs = len(idxs_params)
        len_na = len(idxs_nas)
        coeffs = torch.randn(len_obs, len_na).double()
        Wx = X[:, idxs_params].mm(coeffs)
        coeffs /= torch.std(Wx, 0, keepdim=True)
        coeffs.double()
        ### Pick the intercepts to have a desired amount of missing values
        len_obs, len_na = coeffs.shape
        intercepts = torch.zeros(len_na)
        for j in range(len_na):
            def f(x):
                return torch.sigmoid(X[:, idxs_params].mv(coeffs[:, j]) + x).mean().item() - p
            intercepts[j] = optimize.bisect(f, -50, 50)
        ps = torch.sigmoid(X[:, idxs_params].mm(coeffs) + intercepts)
        ber = torch.rand(n, d_na)
        mask[:, idxs_nas] = ber < ps
        ## If the inputs of the logistic model are excluded from MNAR missingness,
        ## mask some values used in the logistic model at random.
        ## This makes the missingness of other variables potentially dependent on masked values
        if exclude_inputs:
            mask[:, idxs_params] = torch.rand(n, d_params) < p
        X_nas = X.clone()
        X_nas[mask.bool()] = np.nan
        model_name = 'Missing'
        mask_name = 'Mask'
        out[model_name] = X_nas
        out[mask_name] = mask
    return out


In [3]:
dataset_file = '/Users/gabrielketron/tpot2_addimputers/tpot2/ImputerExperiments/data/Spam.csv'
#%% System Parameters
# 1. Mini batch size
mb_size = 128
# 2. Missing rate
p_miss = 0.2
# 3. Hint rate
p_hint = 0.9
# 4. Loss Hyperparameters
alpha = 10
# 5. Train Rate
train_rate = 0.8

#%% Data

# Data generation
Data = np.loadtxt(dataset_file, delimiter=",",skiprows=1)
X = pd.DataFrame(Data)

X_M, mask = add_missing(X, missing_type='MNAR')
X_M = X_M.astype(float)
mask = mask.astype(float)
no, dim = X_M.shape
idx = np.random.permutation(no)

Train_No = int(no * 0.7)
Test_No = no - Train_No
    
# Train / Test Features
trainX = X_M.iloc[:Train_No]
testX = X_M.iloc[Train_No:]

# Train / Test Missing Indicators
trainM = mask.iloc[:Train_No]
testM = mask.iloc[Train_No:]



  missing_mask = missing_mask.mask(missing_mask.notna(), False)


In [4]:
class VAEImputer(BaseEstimator, TransformerMixin):

    def __init__(self, iterations=1000, batch_size=128, split_size=5, code_size=5, encoder_hidden_sizes=[128, 64], decoder_hidden_sizes=[128, 64],
                    temperature=None, p_miss = 0.2, learning_rate = 0.001, tolerance=0.001):
        
        self.batch_size = batch_size
        self.iterations = iterations
        self.split_size = split_size
        self.code_size = code_size
        self.encoder_hidden_sizes = encoder_hidden_sizes
        self.decoder_hidden_sizes = decoder_hidden_sizes
        self.test_loss_function = torch.nn.MSELoss()
        self.p_miss = p_miss
        self.temperature = temperature
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        torch.set_default_dtype(torch.float32)

    def fit(self, X, y=None):
        self.variable_sizes = [1]*X.shape[1] #list of 1s the same lenght as the features of X
        
        self.encoder_hidden_sizes = [int(math.floor(X.shape[1]/2)), int(math.floor(X.shape[1]*3/10))]
        self.decoder_hidden_sizes = [int(math.floor(X.shape[1]/2)), int(math.floor(X.shape[1]*3/10))]
        self.split_size =int(math.floor(X.shape[1]/5))
        self.code_size=int(math.floor(X.shape[1]/5))
        
        #print(self.encoder_hidden_sizes)

        features = torch.from_numpy(X.to_numpy()) #X features
        features = torch.nan_to_num(features)

        num_samples = len(features)
        variable_masks = []
        for variable_size in self.variable_sizes:
            variable_mask = (torch.zeros(num_samples, 1).uniform_(0.0, 1.0) > self.p_miss).float()
            if variable_size > 1:
                variable_mask = variable_mask.repeat(1, variable_size)
            variable_masks.append(variable_mask)
        mask = torch.cat(variable_masks, dim=1)

        temperature = self.temperature
        self.model = self.VAE(self,
                        features.shape[1],
                        self.split_size,
                        self.code_size,
                        encoder_hidden_sizes=self.encoder_hidden_sizes,
                        decoder_hidden_sizes=self.decoder_hidden_sizes,
                        variable_sizes=(None if temperature is None else self.variable_sizes),  # do not use multi-output without temperature
                        temperature=temperature
                        )
        
        self.model.train(mode=True)
        inverted_mask = 1 - mask
        observed = features * mask
        missing = torch.randn_like(features)
        noisy_features = observed + missing*inverted_mask

        if self.learning_rate is not None:
            missing = torch.autograd.Variable(missing, requires_grad=True)
            self.optim = torch.optim.Adam(self.model.parameters(), weight_decay=0, lr=self.learning_rate)

        self.model.train(mode=True)
        #pbar = tqdm(range(self.iterations))
        for iterations in range(self.iterations):
            train_ds = TensorDataset(features.float(), mask.float(), noisy_features.float())
            losses = []
            for f, m, n in DataLoader(train_ds, batch_size=self.batch_size, shuffle=True):
                loss = self.train_batch(f, m, n)
                losses.append(loss)
                if loss < self.tolerance:
                    break
            #pbar.set_postfix({'loss': min(losses)})
            '''
            if iterations % 100 == 0 :
                print(f'Epoch {iterations} loss: {loss:.4f}')
            '''

        self._VAE_params = self.model.state_dict()
        return self
    
    def train_batch(self, features, mask, noisy_features):
        self.optim.zero_grad()
        #print(features.shape)
        #print(noisy_features.shape)
        #noise = torch.autograd.Variable(torch.FloatTensor(len(noisy_features), self.p_miss).normal_())
        _, reconstructed, mu, log_var = self.model(noisy_features, training=True)
        #print(reconstructed.shape)
        #print(reconstructed)
        # reconstruction of the non-missing values
        reconstruction_loss = self.masked_reconstruction_loss_function(reconstructed,
                                                                  features,
                                                                  mask,
                                                                  self.variable_sizes)
        missing_loss = self.masked_reconstruction_loss_function(reconstructed, features, 1-mask, self.variable_sizes)
        #print(reconstruction_loss)
        loss = torch.sqrt(self.test_loss_function((mask * features + (1.0 - mask) * reconstructed), features))
        
        kld_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        #print(kld_loss)
        observed_loss = reconstruction_loss + kld_loss
        #loss = loss.type(torch.float32)
        #print(loss)
        observed_loss.backward()

        self.optim.step()

        return observed_loss.detach().numpy()

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X, y=None):
        self.model.load_state_dict(self._VAE_params)
        self.model.train(mode=False)
        self.variable_sizes = [1]*X.shape[1] #list of 1s the same lenght as the features of X

        features = torch.from_numpy(X.to_numpy()) #X features
        features = torch.nan_to_num(features)
        mask = torch.from_numpy(1-np.isnan(X.to_numpy()))
        inverted_mask = ~mask
        num_samples = len(features)
        observed = features * mask
        missing = torch.randn_like(features)
        noisy_features = observed + missing*inverted_mask
        f = features.float()
        m = mask.float()
        #print(m)
        n = noisy_features.float()
        #print(n)
        with torch.no_grad():
            _, reconstructed, _, _ = self.model.forward(n)
            #print(reconstructed)
            imputed = m*n + (1.0 - m)*reconstructed
        return imputed.cpu().numpy()

    def reconstruction_loss_function(self, reconstructed, original, variable_sizes, reduction="mean"):
        # by default use loss for binary variables
        if variable_sizes is None:
            return torch.nn.functional.binary_cross_entropy(reconstructed, original, reduction=reduction)
        # use the variable sizes when available
        else:
            loss = 0
            start = 0
            numerical_size = 0
            for variable_size in variable_sizes:
                # if it is a categorical variable
                if variable_size > 1:
                    # add loss from the accumulated continuous variables
                    if numerical_size > 0:
                        end = start + numerical_size
                        batch_reconstructed_variable = reconstructed[:, start:end]
                        batch_target = original[:, start:end]
                        loss += torch.nn.functional.mse_loss(batch_reconstructed_variable, batch_target, reduction=reduction)
                        start = end
                        numerical_size = 0
                    # add loss from categorical variable
                    end = start + variable_size
                    batch_reconstructed_variable = reconstructed[:, start:end]
                    batch_target = torch.argmax(original[:, start:end], dim=1)
                    loss += torch.nn.functional.cross_entropy(batch_reconstructed_variable, batch_target, reduction=reduction)
                    start = end
                # if not, accumulate numerical variables
                else:
                    numerical_size += 1

            # add loss from the remaining accumulated numerical variables
            if numerical_size > 0:
                end = start + numerical_size
                batch_reconstructed_variable = reconstructed[:, start:end]
                batch_target = original[:, start:end]
                loss += torch.nn.functional.mse_loss(batch_reconstructed_variable, batch_target, reduction=reduction)

            return loss

    def masked_reconstruction_loss_function(self, reconstructed, original, mask, variable_sizes):
        return self.reconstruction_loss_function(mask * reconstructed,
                                            mask * original,
                                            variable_sizes,
                                            reduction="sum") / torch.sum(mask)

    class Encoder(torch.nn.Module):

        def __init__(self, VAEImputer, input_size, code_size, hidden_sizes=[], variable_sizes=None):
            super(VAEImputer.Encoder, self).__init__()

            layers = []

            if variable_sizes is None:
                previous_layer_size = input_size
                #print(type(previous_layer_size))
            else:
                multi_input_layer = VAEImputer.MultiInput(VAEImputer, variable_sizes)
                layers.append(multi_input_layer)
                previous_layer_size = multi_input_layer.size
                #print(type(previous_layer_size))

            layer_sizes = list(hidden_sizes) + [code_size]
            hidden_activation = torch.nn.Tanh()

            for layer_size in layer_sizes:
                #print(layer_size)
                layers.append(torch.nn.Linear(previous_layer_size, layer_size))
                layers.append(hidden_activation)
                previous_layer_size = layer_size

            self.hidden_layers = torch.nn.Sequential(*layers)

        def forward(self, inputs):
            #print(inputs)
            return self.hidden_layers(inputs)
    
    class Decoder(torch.nn.Module):

        def __init__(self, VAEImputer, code_size, output_size, hidden_sizes=[], variable_sizes=None, temperature=None):
            super(VAEImputer.Decoder, self).__init__()

            hidden_activation = torch.nn.Tanh()

            previous_layer_size = code_size
            hidden_layers = []

            for layer_size in hidden_sizes:
                hidden_layers.append(torch.nn.Linear(previous_layer_size, layer_size))
                hidden_layers.append(hidden_activation)
                previous_layer_size = layer_size

            if len(hidden_layers) > 0:
                self.hidden_layers = torch.nn.Sequential(*hidden_layers)
            else:
                self.hidden_layers = None

            if variable_sizes is None:
                self.output_layer = VAEImputer.SingleOutput(VAEImputer, previous_layer_size, output_size, activation=torch.nn.Sigmoid())
            else:
                self.output_layer = VAEImputer.MultiOutput(VAEImputer, previous_layer_size, variable_sizes, temperature=temperature)

        def forward(self, code, training=False):
            if self.hidden_layers is None:
                hidden = code
            else:
                hidden = self.hidden_layers(code)

            return self.output_layer(hidden, training=training)

    class VAE(torch.nn.Module):

        def __init__(self, VAEImputer, input_size, split_size, code_size, encoder_hidden_sizes=[], decoder_hidden_sizes=[],
                    variable_sizes=None, temperature=None):

            super(VAEImputer.VAE, self).__init__()

            self.encoder = VAEImputer.Encoder(VAEImputer, input_size, split_size, hidden_sizes=encoder_hidden_sizes, variable_sizes=variable_sizes)
            self.decoder = VAEImputer.Decoder(VAEImputer, code_size, input_size, hidden_sizes=decoder_hidden_sizes, variable_sizes=variable_sizes,
                                temperature=temperature)

            self.mu_layer = torch.nn.Linear(split_size, code_size)
            self.log_var_layer = torch.nn.Linear(split_size, code_size)

        def forward(self, inputs, training=False):
            mu, log_var = self.encode(inputs)
            #print(mu)
            #print(log_var)
            code = self.reparameterize(mu, log_var)
            #print(code.shape)
            reconstructed = self.decode(code, training=training)
            return code, reconstructed, mu, log_var

        def encode(self, inputs):
            outputs = self.encoder(inputs)
            #print(outputs.shape)
            #print(outputs)
            return self.mu_layer(outputs), self.log_var_layer(outputs)

        def decode(self, code, training=False):
            return self.decoder(code, training=training)
        
        def reparameterize(self, mu, log_var):
            std = torch.exp(0.5 * log_var)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        
    '''
    class OutputLayer(torch.nn.Module):
        """
        This is just a simple abstract class for single and multi output layers.
        Both need to have the same interface.
        """

        def forward(self, hidden, training=None):
            raise NotImplementedError
    '''

    class SingleOutput(torch.nn.Module):

        def __init__(self, VAEImputer, previous_layer_size, output_size, activation=None):
            super(VAEImputer.SingleOutput, self).__init__()
            if activation is None:
                self.model = torch.nn.Linear(previous_layer_size, output_size)
            else:
                self.model = torch.nn.Sequential(torch.nn.Linear(previous_layer_size, output_size), activation)

        def forward(self, hidden, training=False):
            return self.model(hidden)
    
    class MultiOutput(torch.nn.Module):
        def __init__(self, VAEImputer, input_size, variable_sizes, temperature=None):
            super(VAEImputer.MultiOutput, self).__init__()

            self.output_layers = torch.nn.ModuleList()
            self.output_activations = torch.nn.ModuleList()

            numerical_size = 0
            for i, variable_size in enumerate(variable_sizes):
                # if it is a categorical variable
                if variable_size > 1:
                    # first create the accumulated numerical layer
                    if numerical_size > 0:
                        self.output_layers.append(torch.nn.Linear(input_size, numerical_size))
                        self.output_activations.append(VAEImputer.NumericalActivation())
                        numerical_size = 0
                    # create the categorical layer
                    self.output_layers.append(torch.nn.Linear(input_size, variable_size))
                    self.output_activations.append(VAEImputer.CategoricalActivation(temperature))
                # if not, accumulate numerical variables
                else:
                    numerical_size += 1

            # create the remaining accumulated numerical layer
            if numerical_size > 0:
                self.output_layers.append(torch.nn.Linear(input_size, numerical_size))
                self.output_activations.append(VAEImputer.NumericalActivation())

        def forward(self, inputs, training=True, concat=True):
            outputs = []
            for output_layer, output_activation in zip(self.output_layers, self.output_activations):
                logits = output_layer(inputs)
                output = output_activation(logits, training=training)
                outputs.append(output)

            if concat:
                return torch.cat(outputs, dim=1)
            else:
                return outputs


    class CategoricalActivation(torch.nn.Module):

        def __init__(self, VAEImputer, temperature):
            super(VAEImputer.CategoricalActivation, self).__init__()

            self.temperature = temperature

        def forward(self, logits, training=True):
            # gumbel-softmax (training and evaluation)
            if self.temperature is not None:
                return torch.nn.functional.gumbel_softmax(logits, hard=not training, tau=self.temperature)
            # softmax training
            elif training:
                return torch.nn.functional.softmax(logits, dim=1)
            # softmax evaluation
            else:
                return torch.distributions.OneHotCategorical(logits=logits).sample()


    class NumericalActivation(torch.nn.Module):

        def __init__(self, VAEImputer):
            super(VAEImputer.NumericalActivation, self).__init__()

        def forward(self, logits, training=True):
            return torch.sigmoid(logits)
        
    class MultiInput(torch.nn.Module):

        def __init__(self, VAEImputer, variable_sizes, min_embedding_size=2, max_embedding_size=50):
            super(VAEImputer.MultiInput, self).__init__()

            self.has_categorical = False
            self.size = 0

            embeddings = torch.nn.ParameterList()
            for i, variable_size in enumerate(variable_sizes):
                # if it is a numerical variable
                if variable_size == 1:
                    embeddings.append(None)
                    self.size += 1
                # if it is a categorical variable
                else:
                    # this is an arbitrary rule of thumb taken from several blog posts
                    embedding_size = max(min_embedding_size, min(max_embedding_size, int(variable_size / 2)))

                    # the embedding is implemented manually to be able to use one hot encoding
                    # PyTorch embedding only accepts as input label encoding
                    embedding = torch.nn.Parameter(data=torch.Tensor(variable_size, embedding_size).normal_(), requires_grad=True)

                    embeddings.append(embedding)
                    self.size += embedding_size
                    self.has_categorical = True

            if self.has_categorical:
                self.variable_sizes = variable_sizes
                self.embeddings = embeddings

        def forward(self, inputs):
            if self.has_categorical:
                outputs = []
                start = 0
                for variable_size, embedding in zip(self.variable_sizes, self.embeddings):
                    # extract the variable
                    end = start + variable_size
                    variable = inputs[:, start:end]

                    # numerical variable
                    if variable_size == 1:
                        # leave the input as it is
                        outputs.append(variable)
                    # categorical variable
                    else:
                        output = torch.matmul(variable, embedding).squeeze(1)
                        outputs.append(output)

                    # move the variable limits
                    start = end

                # concatenate all the variable outputs
                return torch.cat(outputs, dim=1)
            else:
                return inputs
        



In [5]:
import tpot2.builtin_modules


gain = tpot2.builtin_modules.imputer.VAEImputer(batch_size=64, iterations=1000)

imputed_train = gain.fit_transform(trainX)
imputed_test = gain.transform(testX)




In [6]:
print(type(imputed_train))
print(type(imputed_test))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [7]:
def rmse_loss(ori_data, imputed_data, data_m):
        '''Compute RMSE loss between ori_data and imputed_data
        Args:
            - ori_data: original data without missing values
            - imputed_data: imputed data
            - data_m: indicator matrix for missingness
        Returns:
            - rmse: Root Mean Squared Error
        '''
        #ori_data, norm_parameters = normalization(ori_data)
        #imputed_data, _ = normalization(imputed_data, norm_parameters)
        # Only for missing values
        nominator = np.sum(((1-data_m) * ori_data - (1-data_m) * imputed_data)**2)
        denominator = np.sum(1-data_m)
        rmse = np.sqrt(nominator/float(denominator))
        return rmse


print(f'Train RMSE: {rmse_loss(ori_data=X[:Train_No].to_numpy(), imputed_data=imputed_train, data_m=trainM.to_numpy())}')
print(f'Test RMSE: {rmse_loss(ori_data=X[Train_No:].to_numpy(), imputed_data=imputed_test, data_m=testM.to_numpy())}')

Train RMSE: 2.0007007615450614
Test RMSE: 1.9998040999184423
