# Import Statements

In [3]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from os import sys, path 
#Make Sure Parent File is stored properly for demo!
from Imputation import Imputation

# Class GAIN 

Initialization of the mini-batch size, missing rate, hint rate, alpha, and train rate.

In [4]:
class GAIN(Imputation):
    """GAIN imputation model; this cell defines only the constructor."""

    def __init__(self, mb_size, p_miss, p_hint, alpha, train_rate):
        """Record the hyper-parameters on the instance.

        Args:
            mb_size: mini-batch size.
            p_miss: missing rate.
            p_hint: hint rate.
            alpha: alpha hyper-parameter.
            train_rate: fraction of the rows assigned to the training split.
        """
        # Same assignment order as before, just grouped.
        self.mb_size, self.p_miss, self.p_hint = mb_size, p_miss, p_hint
        self.alpha, self.train_rate = alpha, train_rate
        # Hidden-layer widths are unknown until they are set later.
        self.H_Dim1 = self.H_Dim2 = None
        print('Init Done')

Methods for normalization, introducing missingness, and the train/test split.

In [5]:
class GAIN(GAIN):
    def normalize(self, data, dimension):
        """Min-max scale the first `dimension` columns of `data`.

        Every selected column is shifted so that its minimum is 0 and is then
        divided by its shifted maximum plus 1e-6 (the epsilon avoids a division
        by zero for constant columns).

        Args:
            data: 2-D numeric array, one row per sample.
            dimension: number of leading columns to scale.

        Returns:
            A new float array of the same shape as `data`. The input array is
            left untouched.
        """
        # Work on a float copy: the caller's array is not overwritten and an
        # integer input is not truncated by the division.
        normalized = np.array(data, dtype=float)
        leading = normalized[:, :dimension]  # view into `normalized`
        leading -= leading.min(axis=0)
        leading /= leading.max(axis=0) + 1e-6
        print('Norm Done')
        return normalized

    def introduce_missingness(self, Dim, No, Data):
        """Draw a random 0/1 mask with the same shape as the data.

        An entry is 1. when its uniform draw exceeds `self.p_miss`, else 0.

        Args:
            Dim: number of columns of the mask.
            No: number of rows of the mask.
            Data: the data set; only its length is used, as a consistency check.

        Returns:
            Float array of shape (No, Dim) holding 0. and 1. values.

        Raises:
            ValueError: if `len(Data)` differs from `No`.
        """
        if len(Data) != No:
            raise ValueError(
                'Data has {} rows but No={} was given'.format(len(Data), No))
        # Drawn as (Dim, No) and transposed so that the values are generated
        # column by column, in the same order as a per-column loop.
        draws = np.random.uniform(0., 1., size=(Dim, No))
        Missing = 1. * (np.ascontiguousarray(draws.T) > self.p_miss)
        print('Missing Done')
        return Missing

    def train_test_split(self, No, Data, Missing):
        """Randomly split the rows of `Data` and `Missing` into train and test.

        Args:
            No: number of rows to permute.
            Data: 2-D data array.
            Missing: 2-D mask array with the same row count as `Data`.

        Returns:
            Tuple (trainX, testX, trainM, testM, Train_No, Test_No), where
            Train_No is int(No * self.train_rate) and Test_No is the remainder.
        """
        idx = np.random.permutation(No)
        Train_No = int(No * self.train_rate)
        Test_No = No - Train_No
        train_idx, test_idx = idx[:Train_No], idx[Train_No:]
        trainX, testX = Data[train_idx, :], Data[test_idx, :]
        trainM, testM = Missing[train_idx, :], Missing[test_idx, :]
        print('Train/Test Done')
        return trainX, testX, trainM, testM, Train_No, Test_No

Implementing gain architecture, generator, discriminator

In [6]:
class GAIN(GAIN):
    def gain_architecture(self, Dim):
        """Create the placeholders and the trainable variables of both networks.

        Args:
            Dim: number of data columns.

        Returns:
            Tuple (theta_D, theta_G, X, M, H, New_X). Each theta list is ordered
            [W1, W2, W3, b1, b2, b3]; the remaining items are float32
            placeholders of shape [None, Dim].
        """
        # Placeholders are created in the order X, M, H, New_X.
        X, M, H, New_X = (
            tf.placeholder(tf.float32, shape = [None, Dim]) for _ in range(4)
        )

        # Both networks take 2*Dim inputs (data + hint for the discriminator,
        # data + mask for the generator, with random noise in the missing
        # components) and produce a Dim-wide, multi-variate output.
        layer_shapes = [
            (Dim*2, self.H_Dim1),
            (self.H_Dim1, self.H_Dim2),
            (self.H_Dim2, Dim),
        ]

        def build_stack():
            # Variables are created layer by layer (weight, then bias) but
            # returned grouped as weights followed by biases.
            weights, biases = [], []
            for fan_in, fan_out in layer_shapes:
                weights.append(tf.Variable(self.xavier_init([fan_in, fan_out])))
                biases.append(tf.Variable(tf.zeros(shape = [fan_out])))
            return weights + biases

        theta_D = build_stack()
        theta_G = build_stack()
        print('Gain Arch Done')
        return theta_D, theta_G, X, M, H, New_X

    @staticmethod
    def generator(new_x, m, G_W1, G_W2, G_W3, G_b1, G_b2, G_b3):
        """Generator forward pass: two ReLU layers followed by a sigmoid layer.

        `new_x` and `m` are concatenated along axis 1 to form the input.
        Returns the sigmoid output tensor.
        """
        gen_input = tf.concat([new_x, m], axis = 1)  # data + mask
        hidden1 = tf.nn.relu(tf.matmul(gen_input, G_W1) + G_b1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, G_W2) + G_b2)
        print('Gen Samp Done')
        # Sigmoid keeps the output in [0, 1].
        return tf.nn.sigmoid(tf.matmul(hidden2, G_W3) + G_b3)

    @staticmethod
    def discriminator(new_x, h, D_W1, D_W2, D_W3, D_b1, D_b2, D_b3):
        """Discriminator forward pass: two ReLU layers and a sigmoid output.

        `new_x` and `h` are concatenated along axis 1 to form the input.
        Returns the element-wise sigmoid of the final logits.
        """
        disc_input = tf.concat([new_x, h], axis = 1)  # data + hint
        hidden1 = tf.nn.relu(tf.matmul(disc_input, D_W1) + D_b1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, D_W2) + D_b2)
        logits = tf.matmul(hidden2, D_W3) + D_b3
        probabilities = tf.nn.sigmoid(logits)  # values in [0, 1]
        print('Disc Sample Done')
        return probabilities

Preprocessing the input data: load, get dimensions, normalize.

In [7]:
class GAIN(GAIN):
    def preprocess(self, inputData):
        """Load a comma-separated numeric file and normalize it.

        The first line of the file is skipped (header row). Both hidden-layer
        widths (`self.H_Dim1`, `self.H_Dim2`) are set to the number of columns.

        Args:
            inputData: path or file object accepted by `np.loadtxt`.

        Returns:
            Tuple (normalized_data, No, Dim): the output of `self.normalize`,
            the number of rows and the number of columns.
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single data row or a single column, so the shape unpacking below
        # cannot fail on such files.
        Data = np.loadtxt(inputData, delimiter=",", skiprows=1, ndmin=2)
        No, Dim = Data.shape
        self.H_Dim1 = Dim
        self.H_Dim2 = Dim
        normalized_data = self.normalize(Data, Dim)
        print('Preprocess Done')
        return normalized_data, No, Dim

Static methods: xavier initialization, sample generation 

In [8]:
class GAIN(GAIN):
    @staticmethod
    def xavier_init(size):
        """Return a random-normal tensor of shape `size`.

        The standard deviation is 1 / sqrt(size[0] / 2).
        """
        fan_in = size[0]
        return tf.random_normal(shape = size, stddev = 1. / tf.sqrt(fan_in / 2.))

    @staticmethod
    def sample_M(m, n, p):
        """Return an (m, n) float array that is 1. where a uniform draw exceeds p."""
        return 1. * (np.random.uniform(0., 1., size = [m, n]) > p)

    @staticmethod
    def sample_Z(m, n):
        """Return an (m, n) array of uniform noise drawn from [0, 0.01)."""
        noise = np.random.uniform(0., 0.01, size = [m, n])
        return noise

    @staticmethod
    def sample_idx(m, n):
        """Return the first n entries of a random permutation of range(m)."""
        return np.random.permutation(m)[:n]

Train function

In [9]:
class GAIN(GAIN):
    def train(self, normalized_data, No, Dim, n_iterations=5000):
        """Build the GAIN graph and train it on a random train split.

        Steps: draw the missingness mask, split rows into train and test, build
        both networks and their losses, then alternate one discriminator step
        and one generator step per iteration. Progress is printed every 100
        iterations.

        Args:
            normalized_data: 2-D array of shape (No, Dim).
            No: number of rows.
            Dim: number of columns.
            n_iterations: number of training iterations (default 5000).

        Returns:
            Tuple (Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M,
            New_X). The session is returned open so that it can be reused for
            evaluation; closing it is left to the caller.
        """
        missing_matrix = self.introduce_missingness(Dim, No, normalized_data)
        trainX, testX, trainM, testM, Train_No, Test_No = self.train_test_split(
            No, normalized_data, missing_matrix)
        theta_D, theta_G, X, M, H, New_X = self.gain_architecture(Dim)

        G_sample = self.generator(New_X, M, *theta_G)
        # Keep the observed entries, take the generator output elsewhere.
        Hat_New_X = New_X * M + G_sample * (1-M)
        D_prob = self.discriminator(Hat_New_X, H, *theta_D)

        D_loss = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1-M) * tf.log(1. - D_prob + 1e-8))
        G_loss1 = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
        # Reconstruction error restricted to the observed entries (M == 1).
        MSE_train_loss = tf.reduce_mean((M * New_X - M * G_sample)**2) / tf.reduce_mean(M)
        # Use this instance's alpha rather than a module-level object.
        G_loss = G_loss1 + self.alpha * MSE_train_loss
        # Imputation error restricted to the masked entries (M == 0).
        MSE_test_loss = tf.reduce_mean(((1-M) * X - (1-M)*G_sample)**2) / tf.reduce_mean(1-M)

        D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
        G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        for it in tqdm(range(n_iterations)):
            mb_idx = self.sample_idx(Train_No, self.mb_size)
            # The batch is shorter than mb_size when the training split is
            # small; size the noise and the hint from the actual batch.
            batch = len(mb_idx)
            X_mb = trainX[mb_idx,:]
            Z_mb = self.sample_Z(batch, Dim)
            M_mb = trainM[mb_idx,:]
            H_mb1 = self.sample_M(batch, Dim, 1-self.p_hint)
            H_mb = M_mb * H_mb1
            New_X_mb = M_mb * X_mb + (1-M_mb) * Z_mb  # noise in the missing entries

            sess.run(D_solver, feed_dict = {M: M_mb, New_X: New_X_mb, H: H_mb})
            _, MSE_train_loss_curr, MSE_test_loss_curr = sess.run(
                [G_solver, MSE_train_loss, MSE_test_loss],
                feed_dict = {X: X_mb, M: M_mb, New_X: New_X_mb, H: H_mb})

            if it % 100 == 0:
                # 'Test_loss' here is the masked-entry error on the current
                # training mini-batch, not on the held-out test rows.
                print('Iter: {}'.format(it))
                print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr)))
                print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr)))
                print()
        return Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X

Test method, plus the `impute` and `evaluate` stubs (not yet implemented). The evaluation result (final test RMSE) is reported by `test`.

In [10]:
class GAIN(GAIN):
    def test(self, Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X):
        """Print the RMSE of the imputed values on the held-out rows.

        Args:
            Test_No: number of test rows.
            testM: 2-D mask for the test rows (multiplies `testX` below).
            testX: 2-D test data.
            MSE_test_loss: tensor evaluated to obtain the mean squared error.
            G_sample: generator output tensor; not evaluated here, kept so the
                signature stays unchanged.
            sess: session used to evaluate the tensor.
            X, M, New_X: placeholders fed with the test data, the mask and the
                noise-filled input.
        """
        # Take the width from the data instead of a module-level variable.
        Dim = np.shape(testX)[1]
        Z_mb = self.sample_Z(Test_No, Dim)
        New_X_mb = testM * testX + (1-testM) * Z_mb  # noise in the missing entries
        MSE_final = sess.run(MSE_test_loss, feed_dict = {X: testX, M: testM, New_X: New_X_mb})
        print('Final Test RMSE: ' + str(np.sqrt(MSE_final)))

    def impute(self, trained_model, input):
        """Placeholder: not implemented, returns None."""
        pass

    def evaluate(self, trained_model, input):
        """Placeholder: not implemented, returns None."""
        pass

# Main: create a GAIN instance with its initial parameters and run it on a dataset stored as a .csv file

In [12]:
if __name__ == '__main__':
    # Every name bound here lives at module level, so it stays available
    # after this cell has run.
    gain_obj = GAIN(mb_size=128, p_miss=0.2, p_hint=0.9, alpha=10, train_rate=0.8)

    # Load and scale the data set.
    normalized_data, No, Dim = gain_obj.preprocess('Letter.csv')

    # Train, then report the error on the held-out rows.
    (Test_No, testM, testX, MSE_test_loss, G_sample,
     sess, X, M, New_X) = gain_obj.train(normalized_data, No, Dim)
    gain_obj.test(Test_No, testM, testX, MSE_test_loss, G_sample, sess, X, M, New_X)

Init Done
Norm Done
Preprocess Done
Missing Done
Train/Test Done
Gain Arch Done
Gen Samp Done
Disc Sample Done


  1%|          | 27/5000 [00:00<14:29,  5.72it/s]

Iter: 0
Train_loss: 0.3289
Test_loss: 0.3305



  3%|▎         | 141/5000 [00:00<03:37, 22.39it/s]

Iter: 100
Train_loss: 0.1767
Test_loss: 0.1831



  5%|▍         | 239/5000 [00:01<01:06, 71.48it/s]

Iter: 200
Train_loss: 0.1494
Test_loss: 0.1553



  7%|▋         | 355/5000 [00:01<00:27, 167.86it/s]

Iter: 300
Train_loss: 0.1392
Test_loss: 0.1418



  9%|▉         | 439/5000 [00:01<00:21, 215.00it/s]

Iter: 400
Train_loss: 0.1285
Test_loss: 0.1486



 11%|█         | 557/5000 [00:02<00:16, 268.96it/s]

Iter: 500
Train_loss: 0.1164
Test_loss: 0.1374



 13%|█▎        | 647/5000 [00:02<00:15, 288.16it/s]

Iter: 600
Train_loss: 0.1162
Test_loss: 0.1521



 15%|█▍        | 738/5000 [00:02<00:14, 294.76it/s]

Iter: 700
Train_loss: 0.1018
Test_loss: 0.1297



 17%|█▋        | 859/5000 [00:03<00:13, 297.87it/s]

Iter: 800
Train_loss: 0.09752
Test_loss: 0.1269



 18%|█▊        | 919/5000 [00:03<00:14, 279.63it/s]

Iter: 900
Train_loss: 0.09116
Test_loss: 0.1248



 21%|██        | 1034/5000 [00:04<00:14, 269.43it/s]

Iter: 1000
Train_loss: 0.09196
Test_loss: 0.1396



 23%|██▎       | 1146/5000 [00:04<00:15, 255.54it/s]

Iter: 1100
Train_loss: 0.08556
Test_loss: 0.1293



 25%|██▌       | 1261/5000 [00:04<00:13, 277.40it/s]

Iter: 1200
Train_loss: 0.07883
Test_loss: 0.1408



 27%|██▋       | 1351/5000 [00:05<00:12, 289.60it/s]

Iter: 1300
Train_loss: 0.07718
Test_loss: 0.1297



 29%|██▉       | 1439/5000 [00:05<00:13, 260.12it/s]

Iter: 1400
Train_loss: 0.07889
Test_loss: 0.1374



 31%|███       | 1553/5000 [00:05<00:12, 271.41it/s]

Iter: 1500
Train_loss: 0.07253
Test_loss: 0.1305



 33%|███▎      | 1635/5000 [00:06<00:13, 245.97it/s]

Iter: 1600
Train_loss: 0.07534
Test_loss: 0.129



 35%|███▍      | 1746/5000 [00:06<00:12, 265.04it/s]

Iter: 1700
Train_loss: 0.07019
Test_loss: 0.1155



 37%|███▋      | 1830/5000 [00:07<00:11, 272.29it/s]

Iter: 1800
Train_loss: 0.07149
Test_loss: 0.1309



 39%|███▉      | 1945/5000 [00:07<00:10, 281.38it/s]

Iter: 1900
Train_loss: 0.06848
Test_loss: 0.128



 41%|████      | 2032/5000 [00:07<00:10, 282.56it/s]

Iter: 2000
Train_loss: 0.07152
Test_loss: 0.1281



 43%|████▎     | 2149/5000 [00:08<00:10, 283.33it/s]

Iter: 2100
Train_loss: 0.06919
Test_loss: 0.1288



 45%|████▍     | 2234/5000 [00:08<00:10, 268.67it/s]

Iter: 2200
Train_loss: 0.06553
Test_loss: 0.1427



 47%|████▋     | 2342/5000 [00:08<00:10, 263.26it/s]

Iter: 2300
Train_loss: 0.07171
Test_loss: 0.1249



 49%|████▉     | 2455/5000 [00:09<00:09, 274.54it/s]

Iter: 2400
Train_loss: 0.06705
Test_loss: 0.1249



 51%|█████     | 2538/5000 [00:09<00:09, 261.06it/s]

Iter: 2500
Train_loss: 0.06336
Test_loss: 0.1271



 53%|█████▎    | 2642/5000 [00:10<00:09, 247.38it/s]

Iter: 2600
Train_loss: 0.06793
Test_loss: 0.1283



 55%|█████▌    | 2755/5000 [00:10<00:08, 270.56it/s]

Iter: 2700
Train_loss: 0.06227
Test_loss: 0.1233



 57%|█████▋    | 2843/5000 [00:10<00:07, 279.70it/s]

Iter: 2800
Train_loss: 0.06099
Test_loss: 0.1226



 59%|█████▉    | 2961/5000 [00:11<00:07, 286.17it/s]

Iter: 2900
Train_loss: 0.06427
Test_loss: 0.1246



 61%|██████    | 3043/5000 [00:11<00:08, 241.93it/s]

Iter: 3000
Train_loss: 0.06245
Test_loss: 0.1178



 63%|██████▎   | 3154/5000 [00:11<00:06, 264.12it/s]

Iter: 3100
Train_loss: 0.06055
Test_loss: 0.1191



 65%|██████▍   | 3233/5000 [00:12<00:07, 250.48it/s]

Iter: 3200
Train_loss: 0.05801
Test_loss: 0.1218



 67%|██████▋   | 3343/5000 [00:12<00:06, 259.14it/s]

Iter: 3300
Train_loss: 0.06217
Test_loss: 0.1327



 69%|██████▉   | 3446/5000 [00:13<00:06, 244.36it/s]

Iter: 3400
Train_loss: 0.05892
Test_loss: 0.1064



 71%|███████   | 3533/5000 [00:13<00:05, 269.74it/s]

Iter: 3500
Train_loss: 0.06042
Test_loss: 0.1266



 73%|███████▎  | 3641/5000 [00:13<00:05, 233.39it/s]

Iter: 3600
Train_loss: 0.06157
Test_loss: 0.1269



 75%|███████▍  | 3736/5000 [00:14<00:05, 226.33it/s]

Iter: 3700
Train_loss: 0.05782
Test_loss: 0.1312



 77%|███████▋  | 3850/5000 [00:14<00:04, 265.41it/s]

Iter: 3800
Train_loss: 0.05971
Test_loss: 0.1385



 79%|███████▊  | 3930/5000 [00:15<00:04, 250.33it/s]

Iter: 3900
Train_loss: 0.05591
Test_loss: 0.1332



 81%|████████  | 4046/5000 [00:15<00:03, 278.62it/s]

Iter: 4000
Train_loss: 0.0544
Test_loss: 0.1183



 83%|████████▎ | 4139/5000 [00:15<00:02, 297.93it/s]

Iter: 4100
Train_loss: 0.05348
Test_loss: 0.1193



 85%|████████▍ | 4232/5000 [00:16<00:02, 300.27it/s]

Iter: 4200
Train_loss: 0.05797
Test_loss: 0.1273



 87%|████████▋ | 4356/5000 [00:16<00:02, 301.22it/s]

Iter: 4300
Train_loss: 0.05584
Test_loss: 0.1261



 89%|████████▉ | 4448/5000 [00:16<00:01, 300.05it/s]

Iter: 4400
Train_loss: 0.05453
Test_loss: 0.1303



 91%|█████████ | 4542/5000 [00:17<00:01, 305.06it/s]

Iter: 4500
Train_loss: 0.05198
Test_loss: 0.1267



 93%|█████████▎| 4635/5000 [00:17<00:01, 302.14it/s]

Iter: 4600
Train_loss: 0.05181
Test_loss: 0.1168



 95%|█████████▌| 4759/5000 [00:17<00:00, 304.80it/s]

Iter: 4700
Train_loss: 0.04897
Test_loss: 0.1212



 97%|█████████▋| 4852/5000 [00:18<00:00, 301.72it/s]

Iter: 4800
Train_loss: 0.05407
Test_loss: 0.1293



 99%|█████████▉| 4945/5000 [00:18<00:00, 301.97it/s]

Iter: 4900
Train_loss: 0.05227
Test_loss: 0.1235



100%|██████████| 5000/5000 [00:18<00:00, 267.52it/s]


Final Test RMSE: 0.12908025
