In [1]:
import torch
import os
os.chdir("..")
import sys
import numpy as np
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from GAIN_imputer_utility import xavier_init,MyDataset,preprocess,load_dataloader,Imputation_model,get_dataset_loaders,set_all_BN_layers_tracking_state,loss,impute_with_prediction,check_and_fill_nan
from sklearn.impute import SimpleImputer
from MNAR.missing_process.block_rules import *
os.chdir("BatchNorm")


In [2]:
missing_type = "quantile"

  # set it to True to use GPU and False to use CPU
use_BN = False
states = [True,True]

dataset_file = 'california'

missing_rule = ["Q1_complete","Q1_partial","Q2_complete","Q2_partial","Q3_complete","Q3_partial","Q4_complete","Q4_partial",
"Q1_Q2_complete","Q1_Q2_partial","Q1_Q3_complete","Q1_Q3_partial","Q1_Q4_complete","Q1_Q4_partial","Q2_Q3_complete","Q2_Q3_partial",
"Q2_Q4_complete","Q2_Q4_partial","Q3_Q4_complete","Q3_Q4_partial"]

missing_rule = ["C0_lower","C0_upper","C0_double","C1_lower","C1_upper","C1_double", 
                "C2_lower","C2_upper","C2_double", "C3_lower","C3_upper", "C3_double",
                "C4_lower","C4_upper","C4_double","C5_lower","C5_upper","C5_double",
                "C6_lower","C6_upper","C6_double","C7_lower","C7_upper","C7_double",
]

missing_rule = ["C0_lower"]

missing_type = "BN"

In [3]:
batch_size = 32
epoch = 300

In [4]:
def loss(truth, mask, data,imputer):

    generated = imputer(data, mask)

    #print(generated[1,:])

    return  torch.mean(((1 - mask) * truth - (1 - mask) * generated) ** 2) / torch.mean(1 - mask), generated

In [5]:
class Imputation_model(nn.Module):
    def __init__(self, dim, hidden_dim1, hidden_dim2,use_BN):
        super(Imputation_model, self).__init__()
    
        self.G_W1 = nn.Parameter(torch.tensor(xavier_init([dim * 2, hidden_dim1]), dtype=torch.float32), requires_grad=True)
        self.G_b1 = nn.Parameter(torch.zeros(hidden_dim1, dtype=torch.float32), requires_grad=True)
        self.G_bn1 = nn.BatchNorm1d(hidden_dim1)

        self.G_W2 = nn.Parameter(torch.tensor(xavier_init([hidden_dim1, hidden_dim2]), dtype=torch.float32), requires_grad=True)
        self.G_b2 = nn.Parameter(torch.zeros(hidden_dim2, dtype=torch.float32), requires_grad=True)
        self.G_bn2 = nn.BatchNorm1d(hidden_dim2)

        self.G_W3 = nn.Parameter(torch.tensor(xavier_init([hidden_dim2, dim]), dtype=torch.float32), requires_grad=True)
        self.G_b3 = nn.Parameter(torch.zeros(dim, dtype=torch.float32), requires_grad=True)

        self.use_BN = use_BN
        self.batch_mean1 = None
        self.batch_var1 = None

        self.batch_mean2 = None
        self.batch_var2 = None

    def forward(self, data, mask):
        inputs = torch.cat(dim=1, tensors=[data, mask])  # Mask + Data Concatenate
        inputs = inputs.float()  
        G_h1 = F.relu(torch.matmul(inputs, self.G_W1.float()) + self.G_b1.float())
        if self.use_BN:
            G_h1 = self.G_bn1(G_h1)  # Batch Normalization
        G_h2 = F.relu(torch.matmul(G_h1, self.G_W2.float()) + self.G_b2.float())
        if self.use_BN:
            G_h2 = self.G_bn2(G_h2)  # Batch Normalization
        G_prob = torch.sigmoid(torch.matmul(G_h2, self.G_W3.float()) + self.G_b3.float())  # [0,1] normalized Output

        if self.use_BN:
            self.batch_mean1 = self.G_bn1.running_mean
            self.batch_var1 = self.G_bn1.running_var
            self.batch_mean2 = self.G_bn2.running_mean
            self.batch_var2 = self.G_bn2.running_var

        return G_prob

In [6]:
class Imputation_model(nn.Module):
    def __init__(self, dim, hidden_dim1, hidden_dim2, use_BN):
        super(Imputation_model, self).__init__()

        self.G_W1 = nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([dim * 2, hidden_dim1])), requires_grad=True)
        self.G_b1 = nn.Parameter(torch.zeros(hidden_dim1), requires_grad=True)

        self.G_W2 = nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([hidden_dim1, hidden_dim2])), requires_grad=True)
        self.G_b2 = nn.Parameter(torch.zeros(hidden_dim2), requires_grad=True)

        self.G_W3 = nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([hidden_dim2, dim])), requires_grad=True)
        self.G_b3 = nn.Parameter(torch.zeros(dim), requires_grad=True)

        self.use_BN = use_BN

        if self.use_BN:
            self.batch_norm_layers = nn.ModuleList([nn.BatchNorm1d(hidden_dim1), nn.BatchNorm1d(hidden_dim2)])

    def forward(self, data, mask):
        inputs = torch.cat([data, mask], dim=1).float()
        G_h1 = nn.ReLU()(torch.matmul(inputs, self.G_W1) + self.G_b1)

        if self.use_BN:
            G_h1 = self.batch_norm_layers[0](G_h1)

        G_h2 = nn.ReLU()(torch.matmul(G_h1, self.G_W2) + self.G_b2)

        if self.use_BN:
            G_h2 = self.batch_norm_layers[1](G_h2)

        G_prob = torch.sigmoid(torch.matmul(G_h2, self.G_W3) + self.G_b3)

        return G_prob

In [7]:
patience = 10  # Number of epochs to wait for improvement
best_validation_loss = float('inf')
best_model_state = None
early_stopping_counter = 0

In [12]:
def run(dataset_file,missing_rule, use_BN):
    
   

    Imputer_RMSE = []
    baseline_RMSE = []
    
    for rule_name in missing_rule:
        print(dataset_file,rule_name,use_BN,states)
        trainX, testX, train_Mask, test_Mask, train_input, test_input, No, Dim,train_H, test_H,Xval_org, Xval_org_mask, val_input, val_H = load_dataloader(dataset_file,missing_type, rule_name)

    
        # train_dataset, test_dataset = MyDataset(trainX, train_Mask,train_input,train_H), MyDataset(testX, test_Mask, test_input,test_H)

        # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        imputer = Imputation_model(Dim, Dim, Dim, use_BN)
        #imputer = Simple_imputer(Dim)
        optimizer = torch.optim.Adam(params=imputer.parameters())

        train_loader , test_loader, test_loader = get_dataset_loaders(trainX, train_Mask,train_input,testX, test_Mask,test_input,train_H, test_H,Xval_org, Xval_org_mask, val_input, val_H )


        for it in tqdm(range(epoch)):
            imputer.train()
            total_loss = 0
            batch_no = 0
            for truth_X, mask, data_X,x_hat in train_loader:
                batch_no += 1

                # print("======Batch {} Start======".format(batch_no))
                # print('Running first pass:')

                set_all_BN_layers_tracking_state(imputer,True)

                optimizer.zero_grad()

                Imputer_loss = loss(truth=truth_X, mask=mask, data=data_X, imputer = imputer)[0]
  
                total_loss += Imputer_loss
                Imputer_loss.backward()
                optimizer.step()

                # print('1st BatchNorm Mean: {:.4} Var:{:.4}'.format(torch.mean(imputer.batch_mean1), torch.mean(imputer.batch_var1)))
                # print('2nt BatchNorm Mean: {:.4} Var:{:.4}'.format(torch.mean(imputer.batch_mean2), torch.mean(imputer.batch_var2)), end='\n\n')

                # print('Running second pass:')

                set_all_BN_layers_tracking_state(imputer,True)

                # prediction = loss(truth=truth_X, mask=mask, data=data_X,imputer = imputer)[1]

                # imputed_data = impute_with_prediction(truth_X, mask, prediction)

                # _ = imputer(imputed_data, mask)


                # print('1st BatchNorm Mean: {:.4} Var:{:.4}'.format(torch.mean(imputer.batch_mean1), torch.mean(imputer.batch_var1)))
                # print('2nt BatchNorm Mean: {:.4} Var:{:.4}'.format(torch.mean(imputer.batch_mean2), torch.mean(imputer.batch_var2)), end='\n\n')


                # print("======Batch {} End======\n\n".format(batch_no))


            print('Iter: {}'.format(it), end='\t')
            print(total_loss.item(),batch_no)
            print('Train_loss: {:.4}'.format(np.sqrt(total_loss.item()/batch_no)))


        


        # Evaluation

        with torch.no_grad():
            imputer.eval()
            RMSE_total = []
            for truth_X, mask, data_X, x_hat in test_loader:

                RMSE, prediction =  loss(truth=truth_X, mask=mask, data=data_X,imputer = imputer)
                imputed_data = impute_with_prediction(truth_X, mask, prediction)
                RMSE_total.append(RMSE)


        RMSE_tensor = torch.tensor(RMSE_total)
        rmse_final = torch.mean(RMSE_tensor)

        Imputer_RMSE.append(round(rmse_final.item(),5))

        print('Final Test RMSE: {:.4f}'.format(rmse_final.item()))



    result = pd.DataFrame({"Missing_Rule":[rule_name for rule_name in missing_rule],"Imputer RMSE":Imputer_RMSE,"Baseline RMSE":baseline_RMSE})
    print(result)
    #result.to_csv("results/GAIN_imputer/{}_0pass.csv".format(dataset_file),index=False)


In [13]:
# Training loop with early stopping
def train_with_early_stopping(imputer, train_loader, test_loader, epoch, patience):
    # Define early stopping parameters
    best_validation_loss = float('inf')  # Initialize with a high value
    best_model_state = None
    early_stopping_counter = 0

    optimizer = torch.optim.Adam(params=imputer.parameters())

    for it in tqdm(range(epoch)):
        imputer.train()
        total_loss = 0
        batch_no = 0

        for truth_X, mask, data_X, x_hat in train_loader:
            batch_no += 1

            optimizer.zero_grad()

            Imputer_loss = loss(truth=truth_X, mask=mask, data=data_X, imputer=imputer)[0]
            total_loss += Imputer_loss.item()
            Imputer_loss.backward()
            optimizer.step()

        # Calculate average training loss for the epoch
        avg_train_loss = np.sqrt(total_loss / batch_no)

        # Validation step
        imputer.eval()
        with torch.no_grad():
            total_val_loss = 0
            val_batch_no = 0

            for truth_X_val, mask_val, data_X_val, x_hat_val in test_loader:
                val_batch_no += 1

                val_loss = loss(truth=truth_X_val, mask=mask_val, data=data_X_val, imputer=imputer)[0]
                total_val_loss += val_loss.item()

        avg_val_loss = np.sqrt(total_val_loss / val_batch_no)

        # Check for improvement in validation loss
        if avg_val_loss < best_validation_loss:
            best_validation_loss = avg_val_loss
            best_model_state = imputer.state_dict()
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        # Print current epoch's training and validation loss
        print('Epoch: {}'.format(it), end='\t')
        print('Train_loss: {:.4}'.format(avg_train_loss), end='\t')
        print('Val_loss: {:.4}'.format(avg_val_loss), end='\n')

        # Check for early stopping
        if early_stopping_counter >= patience:
            print("Early stopping! No improvement in validation loss for {} epochs.".format(patience))
            break

    # Load the best model state after training loop completes
    imputer.load_state_dict(best_model_state)

In [24]:
def run2(dataset_file,missing_rule, use_BN):
    
   

    Imputer_RMSE = []
    baseline_RMSE = []
    
    for rule_name in missing_rule:
        print(dataset_file,rule_name,use_BN,states)
        trainX, testX, train_Mask, test_Mask, train_input, test_input, No, Dim,train_H, test_H,Xval_org, Xval_org_mask, val_input, val_H = load_dataloader(dataset_file,missing_type, rule_name)

    
        # train_dataset, test_dataset = MyDataset(trainX, train_Mask,train_input,train_H), MyDataset(testX, test_Mask, test_input,test_H)

        # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        imputer = Imputation_model(Dim, Dim, Dim, use_BN)
        #imputer = Simple_imputer(Dim)
        optimizer = torch.optim.Adam(params=imputer.parameters())

        train_loader , test_loader, test_loader = get_dataset_loaders(trainX, train_Mask,train_input,testX, test_Mask,test_input,train_H, test_H,Xval_org, Xval_org_mask, val_input, val_H )
        

        train_with_early_stopping(imputer, train_loader, test_loader, epoch=10, patience=10)

        with torch.no_grad():
            imputer.eval()
            RMSE_total = []
            for truth_X, mask, data_X, x_hat in test_loader:

                RMSE, prediction =  loss(truth=truth_X, mask=mask, data=data_X,imputer = imputer)
                imputed_data = impute_with_prediction(truth_X, mask, prediction)
                RMSE_total.append(RMSE)


        RMSE_tensor = torch.tensor(RMSE_total)
        rmse_final = torch.mean(RMSE_tensor)

        Imputer_RMSE.append(round(rmse_final.item(),5))

        print('Final Test RMSE: {:.4f}'.format(rmse_final.item()))


    print([rule_name for rule_name in missing_rule])
    print(Imputer_RMSE,baseline_RMSE)

    result = pd.DataFrame({"Missing_Rule":[rule_name for rule_name in missing_rule],"Imputer RMSE":Imputer_RMSE,"Baseline RMSE":baseline_RMSE})
    print(Imputer_RMSE)
    result.to_csv("results/GAIN_imputer/{}_0pass.csv".format(dataset_file),index=False)

In [25]:
run2(dataset_file,missing_rule,use_BN)

california C0_lower False [True, True]


  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:00<00:06,  1.34it/s]

Epoch: 0	Train_loss: 0.3041	Val_loss: 0.1151


 20%|██        | 2/10 [00:01<00:05,  1.38it/s]

Epoch: 1	Train_loss: 0.09684	Val_loss: 0.0898


 30%|███       | 3/10 [00:02<00:05,  1.38it/s]

Epoch: 2	Train_loss: nan	Val_loss: nan


 40%|████      | 4/10 [00:02<00:04,  1.40it/s]

Epoch: 3	Train_loss: nan	Val_loss: nan


 50%|█████     | 5/10 [00:03<00:03,  1.40it/s]

Epoch: 4	Train_loss: nan	Val_loss: nan


 60%|██████    | 6/10 [00:04<00:02,  1.40it/s]

Epoch: 5	Train_loss: nan	Val_loss: nan


 70%|███████   | 7/10 [00:05<00:02,  1.41it/s]

Epoch: 6	Train_loss: nan	Val_loss: nan


 80%|████████  | 8/10 [00:05<00:01,  1.40it/s]

Epoch: 7	Train_loss: nan	Val_loss: nan


 90%|█████████ | 9/10 [00:06<00:00,  1.39it/s]

Epoch: 8	Train_loss: nan	Val_loss: nan


100%|██████████| 10/10 [00:07<00:00,  1.39it/s]

Epoch: 9	Train_loss: nan	Val_loss: nan
Final Test RMSE: nan
['C0_lower']
[nan] []





ValueError: All arrays must be of the same length