# Import Statements

In [2]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from os import sys, path 
# Make sure the parent module (imputation.py) is stored where this import can find it for the demo!
from imputation import Imputation

# Class GAIN 

Initialization of the hyperparameters: mini-batch size, missing rate, hint rate, alpha, and train rate

In [3]:
class GAIN(Imputation):
    """GAIN imputation model; this first cell only stores the hyperparameters."""

    def __init__(self, mb_size, p_miss, p_hint, alpha, train_rate):
        """Record the training configuration on the instance.

        Args:
            mb_size: Number of rows sampled for each training mini-batch.
            p_miss: Threshold used when drawing the missingness mask; a cell
                is marked observed when its uniform draw exceeds this value.
            p_hint: Hint rate; the hint mask is sampled with threshold
                ``1 - p_hint``.
            alpha: Weight of the reconstruction (MSE) term in the generator
                loss.
            train_rate: Fraction of the rows assigned to the training split.
        """
        # Hidden-layer widths are unknown until the data has been loaded;
        # preprocess() sets both of them to the number of columns.
        self.H_Dim1 = self.H_Dim2 = None

        self.mb_size, self.p_miss, self.p_hint = mb_size, p_miss, p_hint
        self.alpha, self.train_rate = alpha, train_rate
        print('Init Done')

Functions for normalization, introducing missingness, and the train/test split

In [4]:
class GAIN(GAIN):
    """Extends GAIN with scaling, mask generation and train/test splitting."""

    def normalize(self, data, dimension):
        """Min-max scale the first ``dimension`` columns of ``data`` in place.

        Each column is shifted so that its minimum becomes 0 and is then
        divided by its (shifted) maximum plus 1e-6.

        Args:
            data: 2-D NumPy array; it is modified in place.
            dimension: Number of leading columns to scale.

        Returns:
            The same ``data`` object, after scaling.
        """
        for col in range(dimension):
            # Shift the column so that its smallest value is 0.
            data[:, col] = data[:, col] - np.min(data[:, col])
            # The 1e-6 keeps the division defined when the column is constant.
            data[:, col] = data[:, col] / (np.max(data[:, col]) + 1e-6)
        print('Norm Done')
        return data

    def introduce_missingness(self, Dim, No, Data):
        """Draw a random 0/1 mask of shape ``(No, Dim)``.

        A cell is 1.0 when its uniform(0, 1) draw is greater than
        ``self.p_miss`` and 0.0 otherwise.

        Args:
            Dim: Number of columns of the mask.
            No: Number of rows of the mask.
            Data: Dataset; only its length is used, to size each column's
                batch of random draws.

        Returns:
            Float array of shape ``(No, Dim)`` holding 0.0 / 1.0 entries.
        """
        rows = len(Data)
        Missing = np.zeros((No, Dim))
        for col in range(Dim):
            # One batch of draws per column, in column order.
            draws = np.random.uniform(0., 1., size=[rows])
            Missing[:, col] = (draws > self.p_miss).astype(float)
        print('Missing Done')
        return Missing

    def train_test_split(self, No, Data, Missing):
        """Shuffle the row indices and split data and mask with the same order.

        Args:
            No: Number of rows to permute.
            Data: 2-D array of samples.
            Missing: 2-D mask array, indexed with the same row indices.

        Returns:
            Tuple ``(trainX, testX, trainM, testM, Train_No, Test_No)`` where
            ``Train_No = int(No * self.train_rate)`` and
            ``Test_No = No - Train_No``.
        """
        order = np.random.permutation(No)
        Train_No = int(No * self.train_rate)
        Test_No = No - Train_No

        # The same index sets are applied to the data and to its mask.
        train_rows, test_rows = order[:Train_No], order[Train_No:]
        trainX, testX = Data[train_rows, :], Data[test_rows, :]
        trainM, testM = Missing[train_rows, :], Missing[test_rows, :]

        print('Train/Test Done')
        return trainX, testX, trainM, testM, Train_No, Test_No

Implementing the GAIN architecture, generator, and discriminator

In [5]:
class GAIN(GAIN):
    """Extends GAIN with the graph inputs, parameters and the two networks."""

    def gain_architecture(self, Dim):
        """Create the placeholders and the trainable parameters.

        Both networks use three fully connected layers with weight shapes
        ``[2*Dim, H_Dim1]``, ``[H_Dim1, H_Dim2]`` and ``[H_Dim2, Dim]``, so
        ``self.H_Dim1`` and ``self.H_Dim2`` must be set before calling this.

        Args:
            Dim: Number of columns of the data.

        Returns:
            Tuple ``(theta_D, theta_G, X, M, H, New_X)``. Each ``theta`` list
            is ordered ``[W1, W2, W3, b1, b2, b3]``; the remaining four items
            are float32 placeholders of shape ``[None, Dim]``.
        """
        def new_input():
            return tf.placeholder(tf.float32, shape=[None, Dim])

        X = new_input()
        M = new_input()
        H = new_input()
        New_X = new_input()

        # (fan_in, fan_out) per layer; the first layer sees two concatenated
        # Dim-wide inputs (data + hint for D, data + mask for G).
        layer_shapes = [(Dim * 2, self.H_Dim1),
                        (self.H_Dim1, self.H_Dim2),
                        (self.H_Dim2, Dim)]

        def new_parameters():
            # Variables are created layer by layer (weight, then bias) and
            # returned with all weights first, followed by all biases.
            weights, biases = [], []
            for fan_in, fan_out in layer_shapes:
                weights.append(tf.Variable(self.xavier_init([fan_in, fan_out])))
                biases.append(tf.Variable(tf.zeros(shape=[fan_out])))
            return weights + biases

        theta_D = new_parameters()
        theta_G = new_parameters()
        print('Gain Arch Done')
        return theta_D, theta_G, X, M, H, New_X

    @staticmethod
    def generator(new_x, m, G_W1, G_W2, G_W3, G_b1, G_b2, G_b3):
        """Build the generator: two ReLU layers followed by a sigmoid layer.

        Args:
            new_x: Data tensor.
            m: Mask tensor, concatenated to ``new_x`` along axis 1.
            G_W1, G_W2, G_W3: Weight matrices of the three layers.
            G_b1, G_b2, G_b3: Bias vectors of the three layers.

        Returns:
            Tensor of sigmoid outputs, i.e. values in [0, 1].
        """
        hidden = tf.concat([new_x, m], axis=1)  # data and mask side by side
        for weight, bias in ((G_W1, G_b1), (G_W2, G_b2)):
            hidden = tf.nn.relu(tf.matmul(hidden, weight) + bias)
        G_prob = tf.nn.sigmoid(tf.matmul(hidden, G_W3) + G_b3)
        print('Gen Samp Done')
        return G_prob

    @staticmethod
    def discriminator(new_x, h, D_W1, D_W2, D_W3, D_b1, D_b2, D_b3):
        """Build the discriminator: two ReLU layers followed by a sigmoid layer.

        Args:
            new_x: Data tensor.
            h: Hint tensor, concatenated to ``new_x`` along axis 1.
            D_W1, D_W2, D_W3: Weight matrices of the three layers.
            D_b1, D_b2, D_b3: Bias vectors of the three layers.

        Returns:
            Tensor of sigmoid outputs, i.e. values in [0, 1].
        """
        hidden = tf.concat([new_x, h], axis=1)  # data and hint side by side
        for weight, bias in ((D_W1, D_b1), (D_W2, D_b2)):
            hidden = tf.nn.relu(tf.matmul(hidden, weight) + bias)
        logits = tf.matmul(hidden, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(logits)
        print('Disc Sample Done')
        return D_prob

Preprocessing the input data: load, get dimensions, normalize

In [6]:
class GAIN(GAIN):
    """Extends GAIN with loading and normalizing a CSV dataset."""

    def preprocess(self, inputData):
        """Load a comma-separated file, size the hidden layers and normalize.

        Args:
            inputData: File passed to ``np.loadtxt``; its first line is
                skipped (``skiprows=1``).

        Returns:
            Tuple ``(normalized_data, No, Dim)``: the scaled array, its number
            of rows and its number of columns.
        """
        # skiprows=1 drops the first line of the file (presumably a header
        # row; use skiprows=0 for files without one).
        Data = np.loadtxt(inputData, delimiter=",", skiprows=1)
        No = len(Data)
        Dim = Data.shape[1]

        # Both hidden layers are as wide as the data itself.
        self.H_Dim1 = self.H_Dim2 = Dim

        normalized_data = self.normalize(Data, Dim)
        print('Preprocess Done')
        return normalized_data, No, Dim

Static methods: Xavier initialization and sample generation

In [7]:
class GAIN(GAIN):
    """Extends GAIN with stateless initialization and sampling helpers."""

    @staticmethod
    def xavier_init(size):
        """Return a random-normal tensor of shape ``size``.

        The standard deviation is ``1 / sqrt(size[0] / 2)``.
        """
        fan_in = size[0]
        stddev = 1. / tf.sqrt(fan_in / 2.)
        return tf.random_normal(shape=size, stddev=stddev)

    @staticmethod
    def sample_M(m, n, p):
        """Return an ``m x n`` float array with 1.0 where a uniform(0, 1) draw exceeds ``p``, else 0.0."""
        draws = np.random.uniform(0., 1., size=[m, n])
        return 1. * (draws > p)

    @staticmethod
    def sample_Z(m, n):
        """Return an ``m x n`` array of noise drawn uniformly from [0, 0.01)."""
        noise = np.random.uniform(0., 0.01, size=[m, n])
        return noise

    @staticmethod
    def sample_idx(m, n):
        """Return the first ``n`` entries of a random permutation of ``range(m)``."""
        return np.random.permutation(m)[:n]

Train function

In [8]:
class GAIN(GAIN):
    """Extends GAIN with the adversarial training loop."""

    def train(self, normalized_data, No, Dim, iterations=5000):
        """Build the GAIN graph and train it on a random split of the data.

        All hyperparameters and helpers are read from ``self``; the method
        does not depend on any module-level variable.

        Args:
            normalized_data: 2-D array of samples, as returned by
                ``preprocess``.
            No: Number of rows in ``normalized_data``.
            Dim: Number of columns in ``normalized_data``.
            iterations: Number of training iterations (default 5000).

        Returns:
            Tuple ``(Test_No, testM, testX, MSE_test_loss, G_sample, sess, X,
            M, New_X)``: the held-out split, the graph tensors needed to score
            it, and the still-open ``tf.Session`` holding the trained
            variables.
        """
        missing_matrix = self.introduce_missingness(Dim, No, normalized_data)
        trainX, testX, trainM, testM, Train_No, Test_No = self.train_test_split(
            No, normalized_data, missing_matrix)

        theta_D, theta_G, X, M, H, New_X = self.gain_architecture(Dim)
        G_sample = self.generator(New_X, M, *theta_G)
        # Keep the observed entries and fill the masked-out ones with the
        # generator output.
        Hat_New_X = New_X * M + G_sample * (1 - M)
        D_prob = self.discriminator(Hat_New_X, H, *theta_D)

        # 1e-8 keeps the logarithms finite.
        D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1 - M) * tf.log(1. - D_prob + 1e-8))
        G_loss1 = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))
        # Reconstruction error on entries where M == 1.
        MSE_train_loss = tf.reduce_mean((M * New_X - M * G_sample) ** 2) / tf.reduce_mean(M)
        D_loss = D_loss1
        G_loss = G_loss1 + self.alpha * MSE_train_loss
        # Error against the true values X on entries where M == 0.
        MSE_test_loss = tf.reduce_mean(((1 - M) * X - (1 - M) * G_sample) ** 2) / tf.reduce_mean(1 - M)

        D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
        G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

        # The session is returned to the caller, so it is not closed here.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        for it in tqdm(range(iterations)):
            mb_idx = self.sample_idx(Train_No, self.mb_size)
            X_mb = trainX[mb_idx, :]
            Z_mb = self.sample_Z(self.mb_size, Dim)
            M_mb = trainM[mb_idx, :]
            # The hint mask is a random subset of the mask itself.
            H_mb1 = self.sample_M(self.mb_size, Dim, 1 - self.p_hint)
            H_mb = M_mb * H_mb1
            # Replace the masked-out entries with small random noise.
            New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

            sess.run(D_solver, feed_dict={M: M_mb, New_X: New_X_mb, H: H_mb})
            _, MSE_train_loss_curr, MSE_test_loss_curr = sess.run(
                [G_solver, MSE_train_loss, MSE_test_loss],
                feed_dict={X: X_mb, M: M_mb, New_X: New_X_mb, H: H_mb})

            if it % 100 == 0:
                print('Iter: {}'.format(it))
                print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr)))
                print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr)))
                print()
        return Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X

Test, impute, and evaluate functions. [Evaluation results are reported by `test`; `impute` and `evaluate` are still empty placeholders.]

In [9]:
class GAIN(GAIN):
    """Extends GAIN with held-out evaluation and the imputation entry points."""

    def test(self, Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X):
        """Print the RMSE of the trained generator on the held-out split.

        The number of columns is taken from ``testX`` and the noise sampler
        from ``self``, so the method does not rely on module-level variables.

        Args:
            Test_No: Number of rows in the held-out split.
            testM: Mask of the held-out split.
            testX: Data of the held-out split (2-D array).
            MSE_test_loss: Loss tensor to evaluate.
            G_sample: Generator output tensor (accepted for interface
                compatibility; it is not fetched here).
            sess: Session holding the trained variables.
            X, M, New_X: Placeholders fed with the data, the mask and the
                noise-filled data respectively.
        """
        Dim = testX.shape[1]
        Z_mb = self.sample_Z(Test_No, Dim)
        # Replace the masked-out entries with small random noise.
        New_X_mb = testM * testX + (1 - testM) * Z_mb
        MSE_final = sess.run(MSE_test_loss, feed_dict={X: testX, M: testM, New_X: New_X_mb})
        print('Final Test RMSE: ' + str(np.sqrt(MSE_final)))

    def impute(self, trained_model, input):
        """Placeholder: not implemented yet, does nothing and returns None."""
        pass

    def evaluate(self, trained_model, input):
        """Placeholder: not implemented yet, does nothing and returns None."""
        pass

# Main function: create GAIN with its initial parameters and run it on a dataset stored as a .csv file

In [10]:
if __name__ == '__main__': 
    # Hyperparameters are passed by keyword so each value is self-describing.
    gain_obj = GAIN(mb_size=128, p_miss=0.2, p_hint=0.9, alpha=10, train_rate=0.8)

    # Load and scale the dataset.
    normalized_data, No, Dim = gain_obj.preprocess('Letter.csv')

    # Train; the returned tensors and session are what test() needs.
    (Test_No, testM, testX, MSE_test_loss,
     G_sample, sess, X, M, New_X) = gain_obj.train(normalized_data, No, Dim)

    # Report the RMSE on the held-out split.
    gain_obj.test(Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X)

Init Done
Norm Done
Preprocess Done
Missing Done
Train/Test Done
Gain Arch Done
Gen Samp Done
Disc Sample Done


  1%|          | 29/5000 [00:00<16:05,  5.15it/s]

Iter: 0
Train_loss: 0.2802
Test_loss: 0.2659



  3%|▎         | 134/5000 [00:00<04:01, 20.16it/s]

Iter: 100
Train_loss: 0.1699
Test_loss: 0.1777



  5%|▍         | 244/5000 [00:01<01:09, 68.10it/s]

Iter: 200
Train_loss: 0.1476
Test_loss: 0.1476



  6%|▋         | 325/5000 [00:01<00:35, 132.49it/s]

Iter: 300
Train_loss: 0.152
Test_loss: 0.1465



  9%|▊         | 432/5000 [00:01<00:22, 205.98it/s]

Iter: 400
Train_loss: 0.1319
Test_loss: 0.154



 11%|█         | 539/5000 [00:02<00:20, 222.58it/s]

Iter: 500
Train_loss: 0.1317
Test_loss: 0.15



 13%|█▎        | 641/5000 [00:02<00:18, 239.06it/s]

Iter: 600
Train_loss: 0.1203
Test_loss: 0.1523



 15%|█▌        | 752/5000 [00:03<00:16, 264.05it/s]

Iter: 700
Train_loss: 0.1121
Test_loss: 0.1333



 17%|█▋        | 833/5000 [00:03<00:16, 253.06it/s]

Iter: 800
Train_loss: 0.1073
Test_loss: 0.1377



 19%|█▊        | 928/5000 [00:04<00:20, 195.55it/s]

Iter: 900
Train_loss: 0.1079
Test_loss: 0.1406



 21%|██        | 1046/5000 [00:04<00:17, 227.57it/s]

Iter: 1000
Train_loss: 0.1029
Test_loss: 0.1467



 22%|██▏       | 1117/5000 [00:04<00:20, 189.99it/s]

Iter: 1100
Train_loss: 0.0897
Test_loss: 0.1332



 25%|██▍       | 1237/5000 [00:05<00:20, 185.03it/s]

Iter: 1200
Train_loss: 0.09721
Test_loss: 0.1295



 27%|██▋       | 1333/5000 [00:06<00:16, 220.68it/s]

Iter: 1300
Train_loss: 0.09611
Test_loss: 0.1318



 29%|██▊       | 1426/5000 [00:06<00:16, 210.87it/s]

Iter: 1400
Train_loss: 0.08711
Test_loss: 0.1213



 31%|███       | 1534/5000 [00:06<00:13, 254.32it/s]

Iter: 1500
Train_loss: 0.08299
Test_loss: 0.1314



 33%|███▎      | 1650/5000 [00:07<00:12, 271.90it/s]

Iter: 1600
Train_loss: 0.08309
Test_loss: 0.1448



 35%|███▍      | 1733/5000 [00:07<00:12, 257.27it/s]

Iter: 1700
Train_loss: 0.07892
Test_loss: 0.1388



 37%|███▋      | 1840/5000 [00:08<00:12, 249.39it/s]

Iter: 1800
Train_loss: 0.08161
Test_loss: 0.1289



 39%|███▉      | 1952/5000 [00:08<00:11, 271.52it/s]

Iter: 1900
Train_loss: 0.07863
Test_loss: 0.1334



 41%|████      | 2040/5000 [00:08<00:10, 276.10it/s]

Iter: 2000
Train_loss: 0.08204
Test_loss: 0.1414



 42%|████▏     | 2122/5000 [00:09<00:12, 237.14it/s]

Iter: 2100
Train_loss: 0.07398
Test_loss: 0.1257



 45%|████▍     | 2227/5000 [00:09<00:11, 245.95it/s]

Iter: 2200
Train_loss: 0.07824
Test_loss: 0.1194



 47%|████▋     | 2328/5000 [00:10<00:11, 223.93it/s]

Iter: 2300
Train_loss: 0.06992
Test_loss: 0.1269



 49%|████▊     | 2436/5000 [00:10<00:10, 253.87it/s]

Iter: 2400
Train_loss: 0.07251
Test_loss: 0.1231



 51%|█████     | 2542/5000 [00:10<00:09, 255.57it/s]

Iter: 2500
Train_loss: 0.07139
Test_loss: 0.1156



 53%|█████▎    | 2646/5000 [00:11<00:09, 245.27it/s]

Iter: 2600
Train_loss: 0.07362
Test_loss: 0.1239



 55%|█████▍    | 2745/5000 [00:11<00:09, 234.52it/s]

Iter: 2700
Train_loss: 0.07666
Test_loss: 0.129



 57%|█████▋    | 2843/5000 [00:12<00:09, 237.25it/s]

Iter: 2800
Train_loss: 0.07178
Test_loss: 0.114



 59%|█████▉    | 2938/5000 [00:12<00:09, 209.67it/s]

Iter: 2900
Train_loss: 0.07283
Test_loss: 0.1272



 61%|██████    | 3029/5000 [00:13<00:09, 215.36it/s]

Iter: 3000
Train_loss: 0.07336
Test_loss: 0.1283



 63%|██████▎   | 3148/5000 [00:13<00:08, 222.78it/s]

Iter: 3100
Train_loss: 0.07096
Test_loss: 0.1277



 65%|██████▍   | 3236/5000 [00:13<00:08, 205.93it/s]

Iter: 3200
Train_loss: 0.06664
Test_loss: 0.1215



 66%|██████▋   | 3321/5000 [00:14<00:08, 198.96it/s]

Iter: 3300
Train_loss: 0.06717
Test_loss: 0.1262



 68%|██████▊   | 3421/5000 [00:14<00:08, 183.81it/s]

Iter: 3400
Train_loss: 0.06829
Test_loss: 0.1309



 71%|███████   | 3527/5000 [00:15<00:07, 207.81it/s]

Iter: 3500
Train_loss: 0.06763
Test_loss: 0.1291



 73%|███████▎  | 3648/5000 [00:16<00:05, 228.49it/s]

Iter: 3600
Train_loss: 0.06577
Test_loss: 0.1238



 74%|███████▍  | 3720/5000 [00:16<00:05, 224.49it/s]

Iter: 3700
Train_loss: 0.06368
Test_loss: 0.1364



 77%|███████▋  | 3828/5000 [00:16<00:05, 196.27it/s]

Iter: 3800
Train_loss: 0.06413
Test_loss: 0.141



 78%|███████▊  | 3913/5000 [00:17<00:05, 189.94it/s]

Iter: 3900
Train_loss: 0.06571
Test_loss: 0.1199



 81%|████████  | 4054/5000 [00:18<00:04, 210.65it/s]

Iter: 4000
Train_loss: 0.07105
Test_loss: 0.127



 82%|████████▏ | 4103/5000 [00:18<00:04, 207.53it/s]

Iter: 4100
Train_loss: 0.06929
Test_loss: 0.127



 84%|████████▍ | 4220/5000 [00:19<00:05, 144.50it/s]

Iter: 4200
Train_loss: 0.06636
Test_loss: 0.1215



 87%|████████▋ | 4343/5000 [00:19<00:03, 198.22it/s]

Iter: 4300
Train_loss: 0.06702
Test_loss: 0.1352



 89%|████████▉ | 4443/5000 [00:20<00:02, 225.77it/s]

Iter: 4400
Train_loss: 0.06947
Test_loss: 0.1129



 91%|█████████ | 4543/5000 [00:20<00:01, 240.10it/s]

Iter: 4500
Train_loss: 0.06479
Test_loss: 0.1262



 93%|█████████▎| 4644/5000 [00:21<00:01, 244.77it/s]

Iter: 4600
Train_loss: 0.07082
Test_loss: 0.1297



 95%|█████████▌| 4750/5000 [00:21<00:00, 255.88it/s]

Iter: 4700
Train_loss: 0.06723
Test_loss: 0.123



 97%|█████████▋| 4832/5000 [00:21<00:00, 264.42it/s]

Iter: 4800
Train_loss: 0.06602
Test_loss: 0.136



 99%|█████████▉| 4938/5000 [00:22<00:00, 256.75it/s]

Iter: 4900
Train_loss: 0.06591
Test_loss: 0.1221



100%|██████████| 5000/5000 [00:22<00:00, 220.22it/s]


Final Test RMSE: 0.12433786
