National Technical University of Athens  
School of Electrical and Computer Engineering   
Data Science and Machine Learning   
  
"Deep Learning Project"  
   
Creators:  
Zerkelidis Dimitris   
Kaiktzoglou Maria  
Trivyza Marilia

### GAIN: Missing Data Imputation using Generative Adversarial Nets

Reference: Jinsung Yoon, James Jordon, and Mihaela van der Schaar, "GAIN: Missing Data Imputation using Generative Adversarial Nets", In International Conference on Machine Learning (ICML), 2018.  

Adapted from the original implementation in tensorflow: https://github.com/jsyoon0823/GAIN  


#### breast, spam, letter, credit, news Datasets

#### GAIN-TD

In [None]:
# Packages
import torch
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from tqdm.notebook import tqdm_notebook as tqdm

import torch.nn.functional as F
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from numpy import linalg as LA

import pandas as pd
from sklearn import preprocessing

import seaborn as sns
sns.set()

In [None]:
# Device switch read by the training/evaluation code below:
# True pins CUDA device 0, False keeps everything on the CPU.
use_gpu = False # set it to True to use GPU and False to use CPU
if use_gpu:
    torch.cuda.set_device(0)

#### System Parameters

Mini batch size: 64   
Missing rate: 0.2  
Hint rate: 0.9  
Loss hyperparameter (alpha): 10  
Train Rate: 0.8  
Learning Rate: 0.001  
Epochs: 10000   

In [None]:
# %% Initialize System Parameters
def init_params(mb_size=64, p_miss=0.2, p_hint=0.9, alpha=10, train_rate=0.8, learning_rate=0.001, epochs=10000):
    """Return the experiment hyperparameters as a tuple.

    Calling it without arguments yields the notebook defaults; any value can
    be overridden by keyword.

    Args:
        mb_size: Mini-batch size.
        p_miss: Missing rate.
        p_hint: Hint rate.
        alpha: Weight of the reconstruction (MSE) term in the generator loss.
        train_rate: Fraction of rows used for training.
        learning_rate: Optimizer learning rate.
        epochs: Number of training iterations.

    Returns:
        The tuple ``(mb_size, p_miss, p_hint, alpha, train_rate,
        learning_rate, epochs)``, in that order.
    """
    return mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs

In [None]:
# Bind the default hyperparameters to module-level names. Several functions
# defined below read some of these as globals (e.g. p_miss, train_rate,
# mb_size, p_hint) or capture them as default argument values.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

#### GAIN-TD Architecture

In [None]:
# Discriminator
class NetD(torch.nn.Module):
    """Discriminator MLP: Dim (or 2*Dim with hint) -> H_Dim1 -> H_Dim2 -> H_Dim1 -> Dim.

    Two alternative input layers are created: ``fc0`` takes the data alone and
    ``fc1`` takes the data concatenated with the hint. ``forward`` picks one
    of them per call; the remaining layers are shared. Hidden layers use tanh
    and the output layer uses a sigmoid, so every output lies in (0, 1).
    """

    def __init__(self, Dim, H_Dim1, H_Dim2):
        super().__init__()

        # Input layers (only one of the two is used in a given forward pass).
        self.fc0 = torch.nn.Linear(Dim * 1, H_Dim1)  # data only
        self.fc1 = torch.nn.Linear(Dim * 2, H_Dim1)  # data + hint
        # Shared trunk.
        self.fc2 = torch.nn.Linear(H_Dim1, H_Dim2)
        self.fc3 = torch.nn.Linear(H_Dim2, H_Dim1)
        self.fc4 = torch.nn.Linear(H_Dim1, Dim)
        self.tanh = torch.nn.Tanh()
        self.sigmoid = torch.nn.Sigmoid()
        self.init_weight()

    def init_weight(self):
        """Re-initialise every linear layer's weight matrix with Xavier-normal values."""
        for linear in (self.fc0, self.fc1, self.fc2, self.fc3, self.fc4):
            torch.nn.init.xavier_normal_(linear.weight)

    def forward(self, x, h, hint):
        """Return the per-feature sigmoid output for batch ``x``.

        Args:
            x: Data batch.
            h: Hint batch; ignored when ``hint == False``.
            hint: When it compares equal to ``False`` the hint-free input
                layer ``fc0`` is used, otherwise ``x`` and ``h`` are
                concatenated along dim 1 and fed to ``fc1``.
        """
        if hint == False:
            hidden = self.tanh(self.fc0(x))
        else:
            hidden = self.tanh(self.fc1(torch.cat((x, h), dim=1)))

        for linear in (self.fc2, self.fc3):
            hidden = self.tanh(linear(hidden))
        return self.sigmoid(self.fc4(hidden))  # values in (0, 1)

    
# Generator
class NetG(torch.nn.Module):
    """Generator MLP: 2*Dim -> H_Dim1 -> H_Dim2 -> H_Dim1 -> Dim.

    The input is the data batch concatenated with a second tensor of the same
    width (the callers in this file pass the mask). Hidden layers use tanh
    and the output layer uses a sigmoid, so every output lies in (0, 1).
    """

    def __init__(self, Dim, H_Dim1, H_Dim2):
        super().__init__()
        self.fc1 = torch.nn.Linear(Dim * 2, H_Dim1)
        self.fc2 = torch.nn.Linear(H_Dim1, H_Dim2)
        self.fc3 = torch.nn.Linear(H_Dim2, H_Dim1)
        self.fc4 = torch.nn.Linear(H_Dim1, Dim)
        self.tanh = torch.nn.Tanh()
        self.sigmoid = torch.nn.Sigmoid()
        self.init_weight()

    def init_weight(self):
        """Re-initialise every linear layer's weight matrix with Xavier-normal values."""
        for linear in (self.fc1, self.fc2, self.fc3, self.fc4):
            torch.nn.init.xavier_normal_(linear.weight)

    def forward(self, x, m):
        """Concatenate ``x`` and ``m`` along dim 1 and return the sigmoid output."""
        hidden = torch.cat((x, m), dim=1)
        for linear in (self.fc1, self.fc2, self.fc3):
            hidden = self.tanh(linear(hidden))
        return self.sigmoid(self.fc4(hidden))  # values in (0, 1)

#### GAIN Functions

In [None]:
# Hint Vector Generation
def sample_M(m, n, p):
    """Draw an ``m x n`` float array of independent 0/1 indicators.

    Each entry is 1.0 when a uniform(0, 1) draw exceeds ``p`` and 0.0
    otherwise, so ``p`` is the probability of a 0. The training loop calls it
    with ``m=mb_size``, ``n=Dim`` and ``p=1 - p_hint``.
    """
    draws = np.random.uniform(0., 1., size=[m, n])
    return (draws > p).astype(float)


# Random sample generator for Z
def sample_Z(m, n):
    """Return an ``m x n`` array of noise drawn uniformly from [0, 0.01)."""
    noise_shape = [m, n]
    return np.random.uniform(low=0., high=0.01, size=noise_shape)


# Mini-batch generation
def sample_idx(m, n):
    """Return the first ``n`` entries of a random permutation of ``range(m)``.

    When ``n`` exceeds ``m`` the slice simply yields all ``m`` indices.
    """
    return np.random.permutation(m)[:n]

In [None]:
def discriminator_loss(netG, netD, M, X, H, hint):
    """Binary cross-entropy between the discriminator output and the mask.

    Args:
        netG: Generator, called as ``netG(X, M)``.
        netD: Discriminator, called as ``netD(data, H, hint)``.
        M: Mask tensor (1 = observed, 0 = missing).
        X: Generator input batch (the training loop passes the batch whose
            missing entries were filled with noise).
        H: Hint tensor forwarded to the discriminator.
        hint: Flag forwarded to the discriminator.

    Returns:
        Scalar tensor with the discriminator loss.
    """
    # The generator is only a data source for this update, so its output is
    # detached: the loss value and the discriminator gradients are unchanged,
    # but backward() no longer walks through (and fills the grads of) netG.
    G_sample = netG(X, M).detach()
    # Keep observed entries, use generated values where data is missing.
    Hat_New_X = X * M + G_sample * (1 - M)
    D_prob = netD(Hat_New_X, H, hint)
    # 1e-8 keeps log() finite when the probability saturates at 0 or 1.
    D_loss = -torch.mean(M * torch.log(D_prob + 1e-8) + (1 - M) * torch.log(1. - D_prob + 1e-8))
    return D_loss


def generator_loss(netG, netD, X, M, New_X, H, Lg, Lm, hint, alpha=alpha):
    """Compute the generator objective plus two MSE diagnostics.

    Args:
        netG: Generator, called as ``netG(New_X, M)``.
        netD: Discriminator, called as ``netD(data, H, hint)``.
        X: Ground-truth batch (used only for the test metric).
        M: Mask tensor (1 = observed, 0 = missing).
        New_X: Generator input batch.
        H: Hint tensor forwarded to the discriminator.
        Lg: When it equals ``False`` (and ``Lm`` does not), only the weighted
            reconstruction term is used.
        Lm: When it equals ``False``, only the adversarial term is used
            (this check comes first, so it also wins if both are ``False``).
        hint: Flag forwarded to the discriminator.
        alpha: Weight of the reconstruction term.

    Returns:
        ``(G_loss, MSE_train_loss, MSE_test_loss)``.
    """
    missing = 1 - M

    # Generator output, merged with the input on the observed positions.
    G_sample = netG(New_X, M)
    Hat_New_X = New_X * M + G_sample * missing
    D_prob = netD(Hat_New_X, H, hint)

    # Adversarial term: evaluated on the missing positions only.
    adversarial = -torch.mean(missing * torch.log(D_prob + 1e-8))
    # Reconstruction term: squared error on the observed positions only
    # (where M == 1), normalised by the fraction of observed entries.
    MSE_train_loss = torch.mean((M * New_X - M * G_sample) ** 2) / torch.mean(M)

    # Ablation switches select which terms drive the generator update.
    if Lm == False:
        G_loss = adversarial
    elif Lg == False:
        G_loss = alpha * MSE_train_loss
    elif (Lm == True) and (Lg == True):
        G_loss = adversarial + alpha * MSE_train_loss

    # Performance metric: squared error between the ground truth X and the
    # generated values on the missing positions, normalised by their fraction.
    MSE_test_loss = torch.mean((missing * X - missing * G_sample) ** 2) / torch.mean(missing)
    return G_loss, MSE_train_loss, MSE_test_loss


def test_loss(netG, netD, X, M, New_X):
    # %% Structure
    # Generator
    G_sample = netG(New_X, M)

    # MSE Performance metric
    # The difference between real data , X and the Imputed Data
    MSE_test_loss = torch.mean(((1-M) * X - (1-M)*G_sample)**2) / torch.mean(1-M)
    return MSE_test_loss, G_sample

In [None]:
def initializations(dataset_file, size=False, lnth=0, impute=False):
    """Load a CSV dataset, min-max scale it and draw a random missingness mask.

    Reads the module-level ``p_miss`` (missing rate) and ``train_rate``.
    The NumPy global RNG is re-seeded with 1234 on every call, so the shuffle,
    the mask and the train/test permutation are reproducible for a given
    file and ``p_miss``.

    Args:
        dataset_file: Path to a comma-separated file with one header row.
        size: When ``True`` keep only the first ``lnth`` shuffled rows.
        lnth: Number of rows to keep when ``size`` is ``True``.
        impute: When ``True`` the last column is split off as the label
            vector and returned as ``Data_y``.

    Returns:
        ``(Data, No, Dim, H_Dim1, H_Dim2, Min_Val, Max_Val, p_miss_vec,
        Missing, idx, Train_No, Test_No)``; with ``impute=True`` the tuple
        has ``Data_y`` inserted right after ``Data``.
    """
    Data = np.loadtxt(dataset_file, delimiter=",", skiprows=1)

    # Fixed seed -> identical shuffle/mask/split on every call.
    np.random.seed(1234)
    np.random.shuffle(Data)

    if size == True:
        Data = Data[:lnth]  # subset used for the data-size vs RMSE plot

    if impute == True:
        label_col = Data.shape[1] - 1
        Data_y = Data[:, label_col]
        Data = Data[:, :label_col]

    # Dataset shape: No rows, Dim feature columns.
    No = len(Data)
    Dim = len(Data[0, :])

    # Hidden layer widths of the networks.
    H_Dim1, H_Dim2 = Dim, Dim // 2

    # Column-wise scaling to [0, 1): z = (x - min) / (range + eps).
    # Max_Val holds the column maximum *after* the minimum was subtracted;
    # eps = 1e-6 avoids a division by zero for constant columns.
    Min_Val = np.zeros(Dim)
    Max_Val = np.zeros(Dim)
    for col in range(Dim):
        col_min = np.min(Data[:, col])
        Min_Val[col] = col_min
        Data[:, col] = Data[:, col] - col_min
        col_range = np.max(Data[:, col])
        Max_Val[col] = col_range
        Data[:, col] = Data[:, col] / (col_range + 1e-6)

    # Per-column missing probability (Dim x 1), all equal to p_miss.
    p_miss_vec = p_miss * np.ones((Dim, 1))

    # Mask, drawn column by column: 1 = observed, 0 = missing.
    Missing = np.zeros((No, Dim))
    for col in range(Dim):
        draws = np.random.uniform(0., 1., size=[len(Data),])
        Missing[:, col] = 1. * (draws > p_miss_vec[col])

    # Hold-out split bookkeeping: a row permutation and the two split sizes.
    idx = np.random.permutation(No)
    Train_No = int(No * train_rate)
    Test_No = No - Train_No  # remaining rows

    if impute == False:
        return Data, No, Dim, H_Dim1, H_Dim2, Min_Val, Max_Val, p_miss_vec, Missing, idx, Train_No, Test_No
    else:
        return Data, Data_y, No, Dim, H_Dim1, H_Dim2, Min_Val, Max_Val, p_miss_vec, Missing, idx, Train_No, Test_No

In [None]:
def trainGAIN(netG, netD, Dim, trainX, trainM, optimG, optimD, cv=True, Lg=True, Lm=True, hint=True,
              alpha=alpha, epochs=epochs, printLoss=False):
    """Alternate discriminator and generator updates for ``epochs`` iterations.

    Reads the module-level ``mb_size``, ``p_hint`` and ``use_gpu``.

    Args:
        netG, netD: Generator and discriminator networks (updated in place).
        Dim: Number of feature columns.
        trainX: Training data array (rows x Dim).
        trainM: Training mask array, same shape (1 = observed, 0 = missing).
        optimG, optimD: Optimizers for the generator / discriminator.
        cv: Kept for backward compatibility; it no longer changes behaviour.
            Mini-batches are always drawn from ``range(len(trainX))``.
        Lg, Lm, hint, alpha: Forwarded to the loss functions.
        epochs: Number of training iterations.
        printLoss: When ``True`` print the losses every 100 iterations.
    """
    device = 'cuda' if use_gpu is True else 'cpu'
    n_train = len(trainX)

    for it in tqdm(range(epochs)):
        # %% Inputs
        mb_idx = sample_idx(n_train, mb_size)      # row indices of this mini-batch
        # sample_idx returns fewer than mb_size rows when the training set is
        # smaller than mb_size; size noise and hint by the rows actually drawn
        # so all arrays share one shape.
        batch = len(mb_idx)
        X_mb = trainX[mb_idx, :]

        Z_mb = sample_Z(batch, Dim)                # uniform noise in [0, 0.01)
        M_mb = trainM[mb_idx, :]                   # 0/1 mask of the mini-batch
        H_mb1 = sample_M(batch, Dim, 1 - p_hint)   # 1 with probability p_hint
        # Hint: 0 on every missing entry and on a random (1 - p_hint) share
        # of the observed entries, 1 on the remaining observed entries.
        H_mb = M_mb * H_mb1

        # Keep observed values, put noise where data is missing.
        New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        X_mb = torch.tensor(X_mb, device=device).float()
        M_mb = torch.tensor(M_mb, device=device).float()
        H_mb = torch.tensor(H_mb, device=device).float()
        New_X_mb = torch.tensor(New_X_mb, device=device).float()

        # Train D
        optimD.zero_grad()
        D_loss = discriminator_loss(netG, netD, M=M_mb, X=New_X_mb, H=H_mb, hint=hint)
        D_loss.backward()
        optimD.step()

        # Train G
        optimG.zero_grad()
        G_loss, G_mse_loss, G_mse_test = generator_loss(netG, netD, X=X_mb, M=M_mb, New_X=New_X_mb, H=H_mb,
                                                        Lg=Lg, Lm=Lm, hint=hint, alpha=alpha)
        G_loss.backward()
        optimG.step()

        # %% Intermediate Losses
        if printLoss == True and it % 100 == 0:
            print('Iter: {}'.format(it),end='\t')
            print('Train_loss: {:.4}'.format(G_mse_loss),end='\t')
            print('Test_loss: {:.4}'.format(G_mse_test),end='\t')
            print('D_loss: {:.4}'.format(D_loss))


def testGAIN(netG, netD, Dim, testM, testX, cv=True, impute=False):
    """Run the generator on a whole split and return its RMSE or its output.

    Reads the module-level ``use_gpu``. Note the argument order: the mask
    comes *before* the data.

    Args:
        netG: Trained generator.
        netD: Forwarded to ``test_loss``.
        Dim: Number of feature columns.
        testM: Mask array (1 = observed, 0 = missing).
        testX: Data array of the same shape.
        cv: Kept for backward compatibility; it no longer changes behaviour.
            The noise is always sized by ``len(testM)``.
        impute: ``False`` returns the RMSE on the missing entries as a
            float; anything else returns the generator output tensor.

    Returns:
        ``np.sqrt`` of the test MSE, or the generator output for all rows.
    """
    device = 'cuda' if use_gpu is True else 'cpu'

    # All rows of the split are processed in one batch.
    Z_mb = sample_Z(len(testM), Dim)
    New_X_mb = testM * testX + (1 - testM) * Z_mb  # noise on the missing entries

    X_mb = torch.tensor(testX, device=device).float()
    M_mb = torch.tensor(testM, device=device).float()
    New_X_mb = torch.tensor(New_X_mb, device=device).float()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        MSE_final, Sample = test_loss(netG, netD, X=X_mb, M=M_mb, New_X=New_X_mb)

    if impute == False:
        return np.sqrt(MSE_final.item())
    else:
        return Sample

In [None]:
def _build_gain(Dim, H_Dim1, H_Dim2, learning_rate):
    """Create a fresh discriminator/generator pair and one Adam optimizer for each."""
    netD = NetD(Dim, H_Dim1, H_Dim2)
    netG = NetG(Dim, H_Dim1, H_Dim2)
    if use_gpu is True:
        netD, netG = netD.cuda(), netG.cuda()
    optimD = torch.optim.Adam(netD.parameters(), lr=learning_rate)  # discriminator optimizer
    optimG = torch.optim.Adam(netG.parameters(), lr=learning_rate)  # generator optimizer
    return netG, netD, optimG, optimD


def _fold_auroc(netG, netD, Dim, trainX, trainM, trainY, testX, testM, testY):
    """Impute both splits with the generator, fit a logistic regression, return its AUROC."""
    # testGAIN expects the mask before the data.
    train_features = testGAIN(netG, netD, Dim, trainM, trainX, impute=True)
    test_features = testGAIN(netG, netD, Dim, testM, testX, impute=True)
    # .cpu() makes the conversion work when the networks live on the GPU.
    train_features = train_features.detach().cpu().numpy()
    test_features = test_features.detach().cpu().numpy()

    clf = LogisticRegression(random_state=0, solver='lbfgs',
                             max_iter=2000).fit(train_features, trainY)
    # A ROC curve needs continuous scores; hard 0/1 predictions collapse it
    # to a single operating point.
    scores = clf.decision_function(test_features)
    fpr, tpr, _ = roc_curve(testY, scores)
    return auc(fpr, tpr)


def experiment(dataset_file, Lg=True, Lm=True, hint=True, alpha=alpha, epochs=epochs,
               learning_rate=learning_rate, size=False, lnth=0, impute=False, printLoss=False):
    """Run 2 repetitions of 5-fold cross-validated GAIN training on one dataset.

    For every fold a fresh generator/discriminator pair is trained on the
    training rows and evaluated on the held-out rows.

    Args:
        dataset_file: CSV path handed to ``initializations``.
        Lg, Lm, hint, alpha, epochs, printLoss: Forwarded to ``trainGAIN``.
        learning_rate: Learning rate of both Adam optimizers.
        size, lnth: Row-subsetting options forwarded to ``initializations``
            (only used when ``impute`` is ``False``).
        impute: ``False`` evaluates the imputation RMSE; otherwise the last
            CSV column is treated as the label and a logistic regression
            trained on the generator output is scored by AUROC.

    Returns:
        With ``impute=False``: ``(total_avg, total_std)``, the mean over the
        repetitions of the per-repetition mean RMSE and the standard
        deviation of those per-repetition means.
        Otherwise: a list with the mean fold AUROC of each repetition.
    """
    # %% Initializations
    if impute == False:
        init = initializations(dataset_file, size=size, lnth=lnth)
        Data, _, Dim, H_Dim1, H_Dim2, _, _, _, Missing = init[:9]
        Data_y = None
    else:
        init = initializations(dataset_file, impute=impute)
        Data, Data_y, _, Dim, H_Dim1, H_Dim2, _, _, _, Missing = init[:10]

    # %% Cross Validation
    kf = KFold(n_splits=5)
    experiment_rmse_mean = []
    experiment_rmse_std = []
    auroc_mean = []

    for _ in range(2):  # Number of experiments
        fold_scores = []
        for train_index, test_index in kf.split(Data):
            # Train / Test features and missing indicators
            trainX, testX = Data[train_index], Data[test_index]
            trainM, testM = Missing[train_index], Missing[test_index]

            netG, netD, optimG, optimD = _build_gain(Dim, H_Dim1, H_Dim2, learning_rate)

            # %% Training
            trainGAIN(netG, netD, Dim, trainX, trainM, optimG, optimD, Lg=Lg, Lm=Lm, hint=hint,
                      alpha=alpha, epochs=epochs, printLoss=printLoss)

            # %% Testing
            if impute == False:
                fold_scores.append(testGAIN(netG, netD, Dim, testM, testX))
            else:
                fold_scores.append(_fold_auroc(netG, netD, Dim,
                                               trainX, trainM, Data_y[train_index],
                                               testX, testM, Data_y[test_index]))

        if impute == False:
            # %% Mean and (population) std of the fold RMSEs
            mean = sum(fold_scores) / len(fold_scores)
            print("Average RMSE score: {}".format(mean))

            variance = sum((rmse - mean) ** 2 for rmse in fold_scores) / len(fold_scores)
            std = np.sqrt(variance)
            print("Standard Deviation of RMSE score: {}".format(std))

            experiment_rmse_mean.append(mean)
            experiment_rmse_std.append(std)
        else:
            auroc_mean.append(np.mean(fold_scores))

    if impute == False:
        total_avg = sum(experiment_rmse_mean) / len(experiment_rmse_mean)
        total_std = np.sqrt(sum((rmse - total_avg )**2 for rmse in experiment_rmse_mean)/len(experiment_rmse_mean))
        print("Total average: {}".format(total_avg))
        print("Total std: {}".format(total_std))
        return total_avg, total_std
    else:
        return auroc_mean

### Gain_Loss_Functions 

#### Datasets (without labels)

In [None]:
#%% Data (without labels)
# CSV paths relative to the working directory. The order (breast, spam,
# letter, credit, news) is the order used by the result lists and plot
# columns in the cells below.
dataset_file_list = ['data_no_labels/breast.csv', 'data_no_labels/Spam.csv', 'data_no_labels/Letter.csv', 
                     'data_no_labels/credit.csv', 'data_no_labels/OnlineNewsPopularity.csv']

### Experiments 

#### RMSE without Cross Validation

Using all loss functions

In [None]:
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    # Load, scale and mask the dataset; the returned names are bound at
    # module level.
    Data, No, Dim, H_Dim1, H_Dim2, Min_Val, Max_Val, p_miss_vec, Missing, idx, Train_No, Test_No = initializations(dataset_file)

    # Single hold-out split: the first Train_No permuted rows train,
    # the remaining rows test (features and missing indicators alike).
    trainX, testX = Data[idx[:Train_No], :], Data[idx[Train_No:], :]
    trainM, testM = Missing[idx[:Train_No], :], Missing[idx[Train_No:], :]

    # Fresh networks for every dataset, moved to the GPU on request.
    netD = NetD(Dim, H_Dim1, H_Dim2)
    netG = NetG(Dim, H_Dim1, H_Dim2)
    if use_gpu is True:
        netD = netD.cuda()
        netG = netG.cuda()

    # One Adam optimizer per network.
    optimD = torch.optim.Adam(netD.parameters(), lr=learning_rate)
    optimG = torch.optim.Adam(netG.parameters(), lr=learning_rate)

    # Train with every loss term and the hint enabled, then score the
    # imputation on the held-out rows.
    trainGAIN(netG, netD, Dim, trainX, trainM, optimG, optimD, cv=False, Lg=True, Lm=True, hint=True,
              alpha=alpha, epochs=epochs)
    rmse_score = testGAIN(netG, netD, Dim, testM, testX, cv=False)

    print('Final Test RMSE: ' + str(rmse_score))
    print("-------------------------------------")
    print()

#### RMSE avg +/- std 2 times on 5 Fold Cross Validation

Using all loss functions

In [None]:
# Baseline: adversarial + reconstruction loss with the hint enabled.
# For every dataset, store the mean and std returned by experiment().
total_avg_list_1 = []
total_std_list_1 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=True, 
                                      epochs=epochs, learning_rate=learning_rate, printLoss=False)
    total_avg_list_1.append(total_avg)
    total_std_list_1.append(total_std)
    print("-------------------------------------")
    print()

In [None]:
# Per-dataset mean RMSE, then per-dataset std (same order as dataset_file_list).
print(total_avg_list_1)
print(total_std_list_1)

#### RMSE avg +- std 2 times on 5 fold cross validation

Without Lg, the generator's adversarial loss, i.e. the cross-entropy between the discriminator's output and the mask.

We use only the below equation to Update Generator:

    alpha*mse(x_missing_data,x_imputed)

In [None]:
# Ablation: Lg=False, so generator_loss uses only alpha * MSE_train_loss.
# For every dataset, store the mean and std returned by experiment().
total_avg_list_2 = []
total_std_list_2 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg, total_std = experiment(dataset_file, Lg=False, Lm=True, hint=True, 
                                      epochs=epochs, learning_rate=learning_rate, printLoss=False)
    total_avg_list_2.append(total_avg)
    total_std_list_2.append(total_std)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then per-dataset mean RMSE and per-dataset std.
print("breast | spam | letter | credit | news")
print(total_avg_list_2)
print(total_std_list_2)

#### RMSE avg +- std 2 times on 5 Fold Cross Validation

Without Lm, the reconstruction loss alpha*mse(x_missing_data,x_imputed); the generator is updated with the adversarial loss only.

In [None]:
# Ablation: Lm=False, so generator_loss uses only the adversarial term.
# For every dataset, store the mean and std returned by experiment().
total_avg_list_3 = []
total_std_list_3 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg, total_std = experiment(dataset_file, Lg=True, Lm=False, hint=True, 
                                      epochs=epochs, learning_rate=learning_rate, printLoss=False)
    total_avg_list_3.append(total_avg)
    total_std_list_3.append(total_std)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then per-dataset mean RMSE and per-dataset std.
print("breast | spam | letter | credit | news")
print(total_avg_list_3)
print(total_std_list_3)

#### RMSE avg +- std 2 times on 5 Fold Cross Validation

Without Hint Vector

In [None]:
# Ablation: hint=False, so the discriminator uses its hint-free input layer.
# For every dataset, store the mean and std returned by experiment().
total_avg_list_4 = []
total_std_list_4 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=False, 
                                      epochs=epochs, learning_rate=learning_rate, printLoss=False)
    total_avg_list_4.append(total_avg)
    total_std_list_4.append(total_std)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then per-dataset mean RMSE and per-dataset std.
print("breast | spam | letter | credit | news")
print(total_avg_list_4)
print(total_std_list_4)

#### RMSE avg +- std 2 times on 5 Fold Cross Validation

Without Hint and Lm loss

In [None]:
# Ablation: no hint and Lm=False (adversarial generator loss only).
# For every dataset, store the mean and std returned by experiment().
total_avg_list_5 = []
total_std_list_5 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg, total_std = experiment(dataset_file, Lg=True, Lm=False, hint=False, 
                                      epochs=epochs, learning_rate=learning_rate, printLoss=False)
    total_avg_list_5.append(total_avg)
    total_std_list_5.append(total_std)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then per-dataset mean RMSE and per-dataset std.
print("breast | spam | letter | credit | news")
print(total_avg_list_5)
print(total_std_list_5)

### Gain_plots 

#### Plotting RMSE VS Missing Data ,  RMSE VS #samples , RMSE vs #features

In [None]:
# Output directory for the figures saved below. exist_ok=True makes the call
# a no-op when the directory is already there and avoids the gap between a
# separate existence check and the creation.
os.makedirs('plots/gain_td/', exist_ok=True)

#### Plot for Missing Data

In [None]:
# Reset System Parameters: rebind the module-level hyperparameters to their
# defaults so that values changed by an earlier cell do not leak into this sweep.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

In [None]:
# Sweep the missing rate: for every dataset collect the mean cross-validated
# RMSE at each rate. total_6[d][r] is dataset d at missing_rates[r].
missing_rates = [0.2, 0.4, 0.6, 0.8]

total_6 = []
for dataset_file in dataset_file_list:
    print('Dataset: {}'.format(str(dataset_file)))
    total_avg_list_6 = []
    for mr in missing_rates:
        # Missing rate: initializations() reads the module-level p_miss, so
        # rebinding the global here changes the mask drawn inside experiment().
        p_miss = mr
        total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=True, 
                                          epochs=epochs, learning_rate=learning_rate, printLoss=False)
        total_avg_list_6.append(total_avg)
    total_6.append(total_avg_list_6)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then one list of RMSEs (one per missing rate) for each dataset.
print("breast | spam | letter | credit | news")
print(total_6)

In [None]:
# Wide table: one row per missing rate, one column per dataset.
records = list(zip(missing_rates, total_6[0], total_6[1],
                   total_6[2], total_6[3], total_6[4]))
df = pd.DataFrame(records,
                  columns=['Missing rate', 'breast', 'spam', 'letter', 'credit', 'news'])

# Long table (Missing rate, Datasets, RMSE) so seaborn can colour by dataset.
df = df.melt('Missing rate', var_name='Datasets', value_name='RMSE')
grid = sns.catplot(x='Missing rate', y="RMSE", hue='Datasets', data=df, kind='point')
grid.savefig("plots/gain_td/missing.png")

#### Plot with Data Size

In [None]:
# Reset System Parameters: rebind the module-level hyperparameters to their
# defaults so that values changed by an earlier cell (e.g. p_miss) do not
# leak into this sweep.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

In [None]:
# Sweep the dataset size: total_7[d][s] is the mean cross-validated RMSE of
# dataset d trained on the first data_sizes[s] share of its shuffled rows.
total_7 = []
data_sizes = [0.1, 0.3, 0.5, 0.7]
for dataset_file in dataset_file_list:
    total_avg_list_7 = []
    print('Dataset: {}'.format(str(dataset_file)))

    # The breast dataset gets a smaller mini-batch (module-level mb_size is
    # read by the training loop) — presumably because its smallest subsets
    # hold fewer than 64 training rows; confirm.
    mb_size = 28 if str(dataset_file) == 'data_no_labels/breast.csv' else 64

    # Turn the fractions into absolute row counts for this dataset.
    Data_full = np.loadtxt(dataset_file, delimiter=",", skiprows=1)
    n_rows = len(Data_full)
    data_size = [int(fraction * n_rows) for fraction in data_sizes]

    for lnth in data_size:
        total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=True, epochs=epochs,
                                          learning_rate=learning_rate, size=True, lnth=lnth, printLoss=False)
        total_avg_list_7.append(total_avg)
    total_7.append(total_avg_list_7)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then one list of RMSEs (one per data size) for each dataset.
print("breast | spam | letter | credit | news")
print(total_7)

In [None]:
data_sizes = [0.1, 0.3, 0.5, 0.7]

# Wide table: one row per data-size fraction, one column per dataset.
records = list(zip(data_sizes, total_7[0], total_7[1], total_7[2], total_7[3], total_7[4]))
df = pd.DataFrame(records,
                  columns=['Data Size', 'breast', 'spam', 'letter', 'credit', 'news'])

# Long table (Data Size, Datasets, RMSE) so seaborn can colour by dataset.
df = df.melt('Data Size', var_name='Datasets', value_name='RMSE')
grid = sns.catplot(x='Data Size', y="RMSE", hue='Datasets', data=df, kind='point')
grid.savefig("plots/gain_td/datasize.png")

#### Plot for Alpha hyperparameter

In [None]:
# Reset System Parameters: rebind the module-level hyperparameters to their
# defaults so that values changed by an earlier cell (e.g. mb_size) do not
# leak into this sweep.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

In [None]:
# Sweep the reconstruction-loss weight: total_8[d][k] is the mean
# cross-validated RMSE of dataset d trained with alpha = alpha_list[k].
alpha_list = [0, 3, 5, 6, 8, 10, 15, 20]

total_8 = []
for dataset_file in dataset_file_list:
    total_avg_list_8 = []
    print('Dataset: {}'.format(str(dataset_file)))
    for a in alpha_list:
        # alpha is passed explicitly, overriding experiment()'s default.
        total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=True, 
                                          epochs=epochs, learning_rate=learning_rate, alpha=a, printLoss=False)
        total_avg_list_8.append(total_avg)
    total_8.append(total_avg_list_8)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then one list of RMSEs (one per alpha) for each dataset.
print("breast | spam | letter | credit | news")
print(total_8)

In [None]:
# Wide table: one row per alpha value, one column per dataset.
records = list(zip(alpha_list, total_8[0], total_8[1],
                   total_8[2], total_8[3], total_8[4]))
df = pd.DataFrame(records,
                  columns=['alpha hyperparameter', 'breast', 'spam', 'letter', 'credit', 'news'])

# Long table (alpha hyperparameter, Datasets, RMSE) so seaborn can colour by dataset.
df = df.melt('alpha hyperparameter', var_name='Datasets', value_name='RMSE')
grid = sns.catplot(x='alpha hyperparameter', y="RMSE", hue='Datasets', data=df, kind='point')
grid.savefig("plots/gain_td/alpha.png")

#### Plot for Learning Rates

In [None]:
# Reset System Parameters: rebind the module-level hyperparameters to their
# defaults before the learning-rate sweep.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

In [None]:
# Sweep the learning rate: total_9[d][k] is the mean cross-validated RMSE of
# dataset d trained with learning_rate_list[k].
learning_rate_list = [0.1, 0.01, 0.001]

total_9 = []
for dataset_file in dataset_file_list:
    total_avg_list_9 = []
    print('Dataset: {}'.format(str(dataset_file)))
    for lr in learning_rate_list:
        # The rate is passed explicitly, overriding experiment()'s default.
        total_avg, total_std = experiment(dataset_file, Lg=True, Lm=True, hint=True, 
                                          epochs=epochs, learning_rate=lr, printLoss=False)
        total_avg_list_9.append(total_avg)
    total_9.append(total_avg_list_9)
    print("-------------------------------------")
    print()

In [None]:
# Column legend, then one list of RMSEs (one per learning rate) for each dataset.
print("breast | spam | letter | credit | news")
print(total_9)

In [None]:
# Wide table: one row per learning rate, one column per dataset.
records = list(zip(learning_rate_list, total_9[0], total_9[1],
                   total_9[2], total_9[3], total_9[4]))
df = pd.DataFrame(records,
                  columns=['learning rate', 'breast', 'spam', 'letter', 'credit', 'news'])

# Long table (learning rate, Datasets, RMSE) so seaborn can colour by dataset.
df = df.melt('learning rate', var_name='Datasets', value_name='RMSE')
grid = sns.catplot(x='learning rate', y="RMSE", hue='Datasets', data=df, kind='point')
grid.savefig("plots/gain_td/lrates.png")

### Gain_AUROC

#### Datasets (with labels)

In [None]:
#%% Data (with labels)
# CSV paths relative to the working directory, in the order breast, spam,
# credit, news. With impute=True, initializations() treats the last column
# of each file as the label.
dataset_file_list_labels = ['data_with_labels/breast_with_label.csv', 'data_with_labels/Spam_with_label.csv', 
                            'data_with_labels/credit_with_label.csv', 
                            'data_with_labels/OnlineNewsPopularity_with_label.csv']

In [None]:
# NOTE(review): this encoder is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
le = preprocessing.LabelEncoder()

#### Plotting missing data over auroc score

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html

In [None]:
# Reset System Parameters: rebind the module-level hyperparameters to their
# defaults before the AUROC sweep.
mb_size, p_miss, p_hint, alpha, train_rate, learning_rate, epochs = init_params()

In [None]:
# Sweep the missing rate on the labelled datasets and record the AUROC of a
# logistic regression trained on the imputed features.
missing_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
auroc_mean_list = []

for dataset_file in dataset_file_list_labels:
    print('Dataset: {}'.format(str(dataset_file)))
    auroc_mean = []
    for mr in missing_rates:
        # Missing rate: initializations() reads the module-level p_miss, so
        # rebinding the global here changes the mask drawn inside experiment().
        p_miss = mr
        # With impute=True, experiment() returns one mean AUROC per repetition.
        auroc_mean.append(experiment(dataset_file, Lg=True, Lm=True, hint=True,
                                     epochs=epochs, learning_rate=learning_rate, impute=True, printLoss=False))
    auroc_mean_list.append(auroc_mean)
    print("-------------------------------------")
    print()

# auroc_mean_list has shape (datasets, missing rates, repetitions). Average
# over the repetitions so that row d holds dataset d's AUROC per missing
# rate. The previous transpose/reshape stacked the repetitions as extra rows,
# and only the rows of the first repetition were read afterwards.
auroc_mean_ = np.array(auroc_mean_list).mean(axis=2)

In [None]:
# Row legend (dataset order), then the AUROC table built in the previous cell.
print("breast | spam | credit | news")
print(auroc_mean_)

In [None]:
# Wide table: one row per missing rate, one column per dataset
# (rows 0-3 of auroc_mean_ are read as breast, spam, credit, news).
records = list(zip(missing_rates, list(auroc_mean_[0]), list(auroc_mean_[1]),
                   list(auroc_mean_[2]), list(auroc_mean_[3])))
df = pd.DataFrame(records,
                  columns=['Missing Rate', 'breast', 'spam', 'credit', 'news'])

# Long table (Missing Rate, Datasets, AUC score) so seaborn can colour by dataset.
df = df.melt('Missing Rate', var_name='Datasets', value_name='AUC score')
grid = sns.catplot(x='Missing Rate', y="AUC score", hue='Datasets', data=df, kind='point')
grid.savefig("plots/gain_td/auroc.png")