In [1]:
import numpy as np
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, _check_feature_names_in
from sklearn.preprocessing import OneHotEncoder
import sklearn
import sklearn.impute

import pandas as pd
from pandas.api.types import is_numeric_dtype
import sklearn.compose

import torch



In [2]:
dataset_file = '/Users/gabrielketron/tpot2_addimputers/tpot2/ImputerExperiments/data/Spam.csv'
#%% System Parameters
# 1. Mini batch size
mb_size = 128
# 2. Missing rate
p_miss = 0.2
# 3. Hint rate
p_hint = 0.9
# 4. Loss Hyperparameters
alpha = 10
# 5. Train Rate
train_rate = 0.8

#%% Data

# Data generation
Data = np.loadtxt(dataset_file, delimiter=",",skiprows=1)
X = pd.DataFrame(Data)
no, dim = X.shape
idx = np.random.permutation(no)
Train_No = int(no * 0.7)
Test_No = no - Train_No
trainX = X.iloc[0:Train_No]
testX = X.iloc[Train_No+1:]






In [3]:
class GainImputer(BaseEstimator, TransformerMixin):
    """
    Base class for all imputers.
    It adds automatically support for `add_indicator`.
    """

    def __init__(self, 
                 batch_size=128, 
                 hint_rate=0.9, 
                 alpha=100, 
                 iterations=10000,
                 train_rate = 0.8,
                 learning_rate = 0.001,
                 p_miss = 0.2,
                 random_state=None):
        self.batch_size = batch_size
        self.hint_rate = hint_rate
        self.alpha = alpha
        self.iterations = iterations
        self.train_rate = train_rate
        self.learning_rate = learning_rate
        self.p_miss = p_miss
        self.random_state = random_state
        self.device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
            )
        torch.set_default_device(self.device)
        torch.set_default_dtype(torch.float32)
        torch.set_grad_enabled(True)
        if random_state is not None:
            torch.manual_seed(self.random_state)

    def fit(self, X, y=None):
        self.fit_transform(X)
        return self

    def transform(self, X, y = None):
        
        self.modelG.load_state_dict(self._Gen_params)

        if hasattr(X, 'dtypes'):
            X = X.to_numpy()
        #define mask matrix
        X_mask = 1 - np.isnan(X)
        #get dimensions
        no, self.dim = X.shape
        self.int_dim = int(self.dim)
        #normalize the original data, and save parameters for renormalization
        norm_data = X.copy()
        min_val = np.zeros(self.dim)
        max_val = np.zeros(self.dim)
        for i in range(self.dim):
            min_val[i] = np.nanmin(norm_data[i])
            norm_data[:, i] -= np.nanmin(norm_data[:, i])
            max_val[i] = np.nanmax(norm_data[i])
            norm_data[:, i] /= (np.nanmax(norm_data[:, i]) + 1e-06)
        norm_parameters = {'min_val': min_val, 'max_val': max_val}
        norm_data_filled = np.nan_to_num(norm_data, 0)
        p_miss_vec = self.p_miss * np.ones((self.dim,1)) 
        Missing = np.zeros((no,self.dim))
        for i in range(self.dim):
            A = np.random.uniform(0., 1., size = [len(norm_data_filled),])
            B = A > p_miss_vec[i]
            Missing[:,i] = 1.*B

        Z_mb = self._sample_Z(no, self.dim)
        M_mb = Missing
        X_mb = norm_data_filled

        New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

        X_mb = torch.tensor(X_mb, dtype=torch.float32)
        New_X_mb = torch.tensor(New_X_mb, dtype=torch.float32)
        M_mb = torch.tensor(M_mb, dtype=torch.float32)

        G_sample = self.modelG(X_mb, New_X_mb, M_mb)
        mse_loss = torch.nn.MSELoss(reduction='mean')
        mse_final = mse_loss((1-M_mb)*X_mb, (1-M_mb)*G_sample)/(1-M_mb).sum()
        print('Final Test RMSE: ' + str(np.sqrt(mse_final.item())))

        imputed_data = M_mb * X_mb + (1-M_mb) * G_sample
        imputed_data = imputed_data.cpu().detach().numpy()
        _, dim = imputed_data.shape
        renorm_data = imputed_data.copy()
        for i in range(dim):
            renorm_data[:,i] = renorm_data[:,i] * (max_val[i] + 1e-6)   
            renorm_data[:,i] = renorm_data[:,i] + min_val[i]
        for i in range(dim):
            temp = X[~np.isnan(X[:, i]), i]
            # Only for the categorical variable
            if len(np.unique(temp)) < 20:
                renorm_data[:, i] = np.round(renorm_data[:, i])
        return 
        
    def fit_transform(self, X, y=None):
        if hasattr(X, 'dtypes'):
            X = X.to_numpy()
        #define mask matrix
        X_mask = 1 - np.isnan(X)
        #get dimensions
        no, self.dim = X.shape
        self.int_dim = int(self.dim)
        #normalize the original data, and save parameters for renormalization
        norm_data = X.copy()
        min_val = np.zeros(self.dim)
        max_val = np.zeros(self.dim)
        for i in range(self.dim):
            min_val[i] = np.nanmin(norm_data[i])
            norm_data[:, i] -= np.nanmin(norm_data[:, i])
            max_val[i] = np.nanmax(norm_data[i])
            norm_data[:, i] /= (np.nanmax(norm_data[:, i]) + 1e-06)
        norm_parameters = {'min_val': min_val, 'max_val': max_val}
        norm_data_filled = np.nan_to_num(norm_data, 0)
        p_miss_vec = self.p_miss * np.ones((self.dim,1)) 
        Missing = np.zeros((no,self.dim))
        for i in range(self.dim):
            A = np.random.uniform(0., 1., size = [len(norm_data_filled),])
            B = A > p_miss_vec[i]
            Missing[:,i] = 1.*B
        #internal test-train split
        # Train / Test Missing Indicators
        #model training
        self.modelD = self.Discriminator(GainImputer=self)
        self.modelG = self.Generator(GainImputer=self)

        optimizer_D = torch.optim.Adam(self.modelD.parameters(), 
                                       lr = self.learning_rate)
        optimizer_G = torch.optim.Adam(self.modelG.parameters(), 
                                       lr = self.learning_rate)
        
        bce_loss = torch.nn.BCEWithLogitsLoss(reduction='mean')
        mse_loss = torch.nn.MSELoss(reduction='mean')

        for it in range(self.iterations):
            mb_idx = self._sample_index(no, self.batch_size)
            X_mb = norm_data_filled[mb_idx,:]
            Z_mb = self._sample_Z(self.batch_size, self.dim)

            M_mb = Missing[mb_idx, :]
            H_mb1 = self._sample_M(self.batch_size, self.dim, 1-self.hint_rate)
            H_mb = M_mb*H_mb1 + 0.5*(1-H_mb1)

            New_X_mb = M_mb * X_mb + (1-M_mb)*Z_mb #introduce missing data

            X_mb = torch.tensor(X_mb, dtype=torch.float32)
            New_X_mb = torch.tensor(New_X_mb, dtype=torch.float32)
            Z_mb = torch.tensor(Z_mb, dtype=torch.float32)
            M_mb = torch.tensor(M_mb, dtype=torch.float32)
            H_mb = torch.tensor(H_mb, dtype=torch.float32)

            #Train Discriminator
            G_sample = self.modelG(X_mb, New_X_mb, M_mb)
            D_prob = self.modelD(X_mb, M_mb, G_sample, H_mb)
            D_loss = bce_loss(D_prob, M_mb)

            D_loss.backward()
            optimizer_D.step()
            optimizer_D.zero_grad()

            #Train Generator
            G_sample = self.modelG(X_mb, New_X_mb, M_mb)
            D_prob = self.modelD(X_mb, M_mb, G_sample, H_mb)
            D_prob.cpu().detach()
            G_loss1 = ((1-M_mb)*(torch.sigmoid(D_prob)+1e-8).log()).mean()/(1-M_mb).sum()
            G_mse_loss = mse_loss(M_mb*X_mb, M_mb*G_sample)/M_mb.sum()
            G_loss = G_loss1 + self.alpha*G_mse_loss

            G_loss.backward()
            optimizer_G.step()
            optimizer_G.zero_grad()

            G_mse_test = mse_loss((1-M_mb)*X_mb, (1-M_mb)*G_sample)/(1-M_mb).sum()

            '''if it % 100 == 0:
                print('Iter: {}'.format(it))
                print('D_loss: {:.4}'.format(D_loss))
                print('Train_loss: {:.4}'.format(G_mse_loss))
                print('Test_loss: {:.4}'.format(G_mse_test))
                print()'''
        self._Gen_params = self.modelG.state_dict()

        Z_mb = self._sample_Z(no, self.dim) 
        M_mb = Missing
        X_mb = norm_data_filled
   
        New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

        X_mb = torch.tensor(X_mb, dtype=torch.float32)
        New_X_mb = torch.tensor(New_X_mb, dtype=torch.float32)
        M_mb = torch.tensor(M_mb, dtype=torch.float32)

        G_sample = self.modelG(X_mb, New_X_mb, M_mb)
        mse_final = mse_loss((1-M_mb)*X_mb, (1-M_mb)*G_sample)/(1-M_mb).sum()
        #print('Final Train RMSE: ' + str(np.sqrt(mse_final.item())))

        imputed_data = M_mb * X_mb + (1-M_mb) * G_sample
        imputed_data = imputed_data.cpu().detach().numpy()
        _, dim = imputed_data.shape
        renorm_data = imputed_data.copy()
        for i in range(dim):
            renorm_data[:,i] = renorm_data[:,i] * (max_val[i] + 1e-6)   
            renorm_data[:,i] = renorm_data[:,i] + min_val[i]
        for i in range(dim):
            temp = X[~np.isnan(X[:, i]), i]
            # Only for the categorical variable
            if len(np.unique(temp)) < 20:
                renorm_data[:, i] = np.round(renorm_data[:, i])
        return renorm_data
    
    def _sample_M(self, rows, cols, p):
        '''Sample binary random variables.
        Args:
            - p: probability of 1
            - rows: the number of rows
            - cols: the number of columns
        Returns:
            - binary_random_matrix: generated binary random matrix.
        '''
        unif_random_matrix = np.random.uniform(0., 1., size = [rows, cols])
        binary_random_matrix = unif_random_matrix > p
        return 1.*binary_random_matrix

    def _sample_Z(self, rows, cols):
        '''Sample uniform random variables.
        Args:
            - rows: the number of rows
            - cols: the number of columns
        Returns:
            - uniform_random_matrix: generated uniform random matrix.
        '''
        return np.random.uniform(0., 1., size = [rows, cols])       

    def _sample_index(self, rows, batch_size):
        '''Sample index of the mini-batch.
        Args:
            - total: total number of samples (rows)
            - batch_size: batch size
        Returns:
            - batch_idx: batch index
        '''
        total_idx = np.random.permutation(rows)
        batch_idx = total_idx[:batch_size]
        return batch_idx
    
    class Generator(torch.nn.Module):
        def __init__(self, GainImputer):
            super(GainImputer.Generator, self).__init__()
            self.G1 = torch.nn.Linear(GainImputer.dim*2,GainImputer.int_dim)
            self.G2 = torch.nn.Linear(GainImputer.int_dim,GainImputer.int_dim)
            self.G3 = torch.nn.Linear(GainImputer.int_dim,GainImputer.dim)
            self.relu = torch.nn.ReLU()
            self.sigmoid = torch.nn.Sigmoid()
            self.init_weight()

        def init_weight(self):
            layers = [self.G1, self.G2, self.G3]
            [torch.nn.init.xavier_normal_(layer.weight) for layer in layers]

        def forward(self, X: torch.float32, Z: torch.float32, M: torch.float32):
            input = M * X + (1-M)*Z
            input = torch.cat([input, M], dim=1)
            out = self.relu(self.G1(input))
            out = self.relu(self.G2(out))
            out = self.sigmoid(self.G3(out))
            return out
        
    class Discriminator(torch.nn.Module):
        def __init__(self, GainImputer):
            super(GainImputer.Discriminator, self).__init__()
            self.D1 = torch.nn.Linear(GainImputer.dim*2,GainImputer.int_dim)
            self.D2 = torch.nn.Linear(GainImputer.int_dim,GainImputer.int_dim)
            self.D3 = torch.nn.Linear(GainImputer.int_dim,GainImputer.dim)
            self.relu = torch.nn.ReLU()
            self.sigmoid = torch.nn.Sigmoid()
            self.init_weight()
        
        def init_weight(self):
            layers = [self.D1, self.D2, self.D3]
            [torch.nn.init.xavier_normal_(layer.weight) for layer in layers]
        
        def forward(self, X, M, G, H):
            input = M * X + (1-M)*G
            input = torch.cat([input, H], dim=1)
            out = self.relu(self.D1(input))
            out = self.relu(self.D2(out))
            out = self.D3(out)
            return out


In [4]:
gain = GainImputer(batch_size=128, hint_rate=0.9, alpha=10, iterations=10000)

gain.fit(trainX)

out = gain.transform(testX)


Final Test RMSE: 0.0034147700228270507
