In [3]:
import numpy as np
import scipy.io as sio
import torch
from sklearn import preprocessing
import sys
import h5py
import os
import datetime

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        m.weight.data.normal_(0.0, 0.02)
        m.bias.data.fill_(0)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
        


class DATA_LOADER(object):
    def __init__(self, opt):
        self.read_dataset(opt)
        self.index_in_epoch = 0
        self.epochs_completed = 0

    def read_dataset(self, opt):
        file_path = opt.dataroot + "/"

        self.train_feature = np.load(file_path+"features/attention_feature_train_512.npy") #(8000,512)
        self.train_label = np.load(file_path+"train_origin/y_train.npy").reshape(-1)       #(1,8000*2048)
        self.test_unseen_feature = np.load(file_path+"features/attention_feature_test_512.npy")
        self.test_unseen_label = np.load(file_path+"test_origin/y_test.npy").reshape(-1)
        self.attribute = np.load(file_path+"semantic/semantic_all_28_2.npy")               #(8,28)
#         self.attribute = np.load(file_path+"semantic/semantic_all_fre_tim_ae.npy")


        if opt.preprocessing:
            if opt.standardization:
                print('standardization...')
                scaler = preprocessing.StandardScaler()
            else:
                scaler = preprocessing.MinMaxScaler() 
                
        self.train_feature = scaler.fit_transform(self.train_feature) 
        
        self.test_unseen_feature = scaler.fit_transform(self.test_unseen_feature)
        
    #         self.test_unseen_feature = torch.from_numpy(feature[val_unseen_loc]).float()
    #         self.test_unseen_label = torch.from_numpy(label[val_unseen_loc]).long()
        
    
        self.train_feature = torch.from_numpy(self.train_feature).float()
#         mx = self.train_feature.max()
#         self.train_feature.mul_(1 / mx)
        self.train_label = torch.from_numpy(self.train_label).long() 
        
        self.test_seen_feature = self.train_feature # gzsl的test_seen
        self.test_seen_label = self.train_label
        
        self.test_unseen_feature = torch.from_numpy(self.test_unseen_feature).float()
#         mx = self.test_unseen_feature.max()
#         self.test_unseen_feature.mul_(1 / mx)
        self.test_unseen_label = torch.from_numpy(self.test_unseen_label).long()
        self.attribute = torch.from_numpy(self.attribute).float()
        
        self.seenclasses = torch.from_numpy(np.unique(self.train_label))
        self.unseenclasses = torch.from_numpy(np.unique(self.test_unseen_label))
        print(self.seenclasses)
        print(self.unseenclasses)

#         self.ntrain = self.train_feature.shape[0]

        self.ntrain = self.train_feature.size()[0]
        self.ntrain_class = self.seenclasses.size(0)
        self.ntest_class = self.unseenclasses.size(0)
        self.train_class = self.seenclasses.clone()
        self.allclasses = torch.arange(0, self.ntrain_class + self.ntest_class).long() 
        print(self.allclasses)
        self.attribute_seen = self.attribute[self.seenclasses]

        # collect the data of each class

        self.train_samples_class_index = torch.tensor([self.train_label.eq(i_class).sum().float() for i_class in self.train_class])


    def next_batch(self, batch_size):
        idx = torch.randperm(self.ntrain)[0:batch_size] 
        batch_feature = self.train_feature[idx]
        batch_label = self.train_label[idx]
        batch_att = self.attribute[batch_label].squeeze()
        return batch_feature, batch_label, batch_att

In [4]:
from __future__ import print_function 
import torch
import torch.nn as nn

class SupConLoss_clear(nn.Module):
    def __init__(self, temperature=0.07):
        super(SupConLoss_clear, self).__init__()
        self.temperature = temperature

    def forward(self, features, labels):

#         device = (torch.device('cuda')
#                   if features.is_cuda
#                   else torch.device('cpu'))

        batch_size = features.shape[0]
        '''
        示例: 
        labels: 
            tensor([[1.],
                    [2.],
                    [1.],
                    [1.]])
        mask:  # 两个样本i,j的label相等时，mask_{i,j}=1
            tensor([[1., 0., 1., 1.],
                    [0., 1., 0., 0.],
                    [1., 0., 1., 1.],
                    [1., 0., 1., 1.]]) 
        '''
        labels = labels.contiguous().view(-1, 1)
        mask = torch.eq(labels, labels.T).float() 
#         mask = torch.eq(labels, labels.T).float().to(device)

        anchor_dot_contrast = torch.div(
            torch.matmul(features, features.T),
            self.temperature)

        # normalize the logits for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()
        exp_logits = torch.exp(logits)
        '''
        logits是anchor_dot_contrast减去每一行的最大值得到的最终相似度
        示例: logits: torch.size([4,4])
        logits:
            tensor([[ 0.0000, -0.0471, -0.3352, -0.2156],
                    [-1.2576,  0.0000, -0.3367, -0.0725],
                    [-1.3500, -0.1409, -0.1420,  0.0000],
                    [-1.4312, -0.0776, -0.2009,  0.0000]])       
        '''
        # 构建mask 
        logits_mask = torch.ones_like(mask) - torch.eye(batch_size)     
        positives_mask = mask * logits_mask
        negatives_mask = 1. - mask 
        '''
        但是对于计算Loss而言，(i,i)位置表示样本本身的相似度，对Loss是没用的，所以要mask掉
        # 第ind行第ind位置填充为0
        得到logits_mask:
            tensor([[0., 1., 1., 1.],
                    [1., 0., 1., 1.],
                    [1., 1., 0., 1.],
                    [1., 1., 1., 0.]])
        positives_mask:
        tensor([[0., 0., 1., 1.],
                [0., 0., 0., 0.],
                [1., 0., 0., 1.],
                [1., 0., 1., 0.]])
        negatives_mask:
        tensor([[0., 1., 0., 0.],
                [1., 0., 1., 1.],
                [0., 1., 0., 0.],
                [0., 1., 0., 0.]])
        '''        
        num_positives_per_row  = torch.sum(positives_mask , axis=1) # 除了自己之外，正样本的个数  [2 0 2 2]       
        denominator = torch.sum(
        exp_logits * negatives_mask, axis=1, keepdims=True) + torch.sum(
            exp_logits * positives_mask, axis=1, keepdims=True)  
        
        log_probs = logits - torch.log(denominator)
        if torch.any(torch.isnan(log_probs)):
            raise ValueError("Log_prob has nan!")
        

        log_probs = torch.sum(
            log_probs*positives_mask , axis=1)[num_positives_per_row > 0] / num_positives_per_row[num_positives_per_row > 0]
        '''
        计算正样本平均的log-likelihood
        考虑到一个类别可能只有一个样本，就没有正样本了 比如我们labels的第二个类别 labels[1,2,1,1]
        所以这里只计算正样本个数>0的    
        '''
        # loss
        loss = -log_probs
        loss *= self.temperature
        loss = loss.mean()
        return loss

In [5]:
class LINEAR_LOGSOFTMAX1(nn.Module):
    def __init__(self, input_dim, nclass):
        super(LINEAR_LOGSOFTMAX1, self).__init__()
        self.fc = nn.Linear(input_dim, nclass)
        self.logic = nn.LogSoftmax(dim=1)
    def forward(self, x): 
        o = self.logic(self.fc(x)) 
        return o 
    
class CLASSIFIER_TRAIN:
    # train_Y is interger 
    def __init__(self, _train_X, _train_Y, map_net, resSize, data_loader, _nclass, _cuda, _lr=0.00001, _beta1=0.5, _nepoch=20, _batch_size=100, generalized=True):
        self.train_X =  _train_X 
        self.train_Y = _train_Y 
        
        self.test_seen_feature = data_loader.test_seen_feature 
        self.test_seen_label = data_loader.test_seen_label 
        
        self.test_unseen_feature = data_loader.test_unseen_feature 
        self.test_unseen_label = data_loader.test_unseen_label 
        
        self.seenclasses = data_loader.seenclasses
        self.unseenclasses = data_loader.unseenclasses
        
        self.MapNet=map_net #
        
        self.batch_size = _batch_size
        self.nepoch = _nepoch
        
        self.nclass = _nclass
        self.input_dim = resSize
        self.cuda = _cuda
        self.model =  LINEAR_LOGSOFTMAX1(self.input_dim, self.nclass)

        self.model.apply(weights_init)
        self.criterion = nn.NLLLoss() 
        
        self.input = torch.FloatTensor(_batch_size, _train_X.size(1))
        self.label = torch.LongTensor(_batch_size) 
        
        self.lr = _lr
        self.beta1 = _beta1
        # setup optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=_lr, betas=(_beta1, 0.999))

        # if self.cuda:
            # self.model.cuda()
            # self.criterion.cuda()
            # self.input = self.input.cuda()
            # self.label = self.label.cuda()

        self.index_in_epoch = 0
        self.epochs_completed = 0
        self.ntrain = self.train_X.size()[0]

        if generalized:
            self.acc_seen, self.acc_unseen, self.H = self.fit()
        else:
            self.acc = self.fit_zsl()
    
    
    def fit_zsl(self):
        best_acc = 0
        mean_loss = 0
        for epoch in range(self.nepoch):
            for i in range(0, self.ntrain, self.batch_size):      
                self.model.zero_grad()
                batch_input, batch_label = self.next_batch(self.batch_size)
                self.input.copy_(batch_input)
                self.label.copy_(batch_label)  
#                 embed, _=self.MapNet(self.input)
#                 output = self.model(embed)
                output = self.model(self.input)
                loss = self.criterion(output, self.label)
                mean_loss += loss.data
                loss.backward()
                self.optimizer.step()
            acc = self.val(self.test_seen_feature, self.test_seen_label, self.seenclasses)
            if acc > best_acc:
                best_acc = acc
        print('Training classifier_train loss= %.4f' % (loss))
        return best_acc 

    def fit(self):
        best_H = 0
        best_seen = 0
        best_unseen = 0
        for epoch in range(self.nepoch):
            for i in range(0, self.ntrain, self.batch_size):      
                self.model.zero_grad()
                batch_input, batch_label = self.next_batch(self.batch_size) 
                self.input.copy_(batch_input)
                self.label.copy_(batch_label)

#                 embed, _ = self.MapNet(self.input)
#                 output = self.model(embed)
                output = self.model(self.input)
                loss = self.criterion(output, self.label)

                loss.backward()
                self.optimizer.step()

            acc_seen = self.val_gzsl(self.test_seen_feature, self.test_seen_label, self.seenclasses)
            acc_unseen = self.val_gzsl(self.test_unseen_feature, self.test_unseen_label, self.unseenclasses)
            if (acc_seen+acc_unseen)==0:  
                print('a bug')
                H=0
            else:
                H = 2*acc_seen*acc_unseen / (acc_seen+acc_unseen)
            if H > best_H:
                best_seen = acc_seen
                best_unseen = acc_unseen
                best_H = H
        return best_seen, best_unseen, best_H
                     
    def next_batch(self, batch_size): #
        start = self.index_in_epoch
        # shuffle the data at the first epoch 随机
        if self.epochs_completed == 0 and start == 0: 
            perm = torch.randperm(self.ntrain) 
            self.train_X = self.train_X[perm]
            self.train_Y = self.train_Y[perm]
        if start + batch_size > self.ntrain:
            self.epochs_completed += 1
            rest_num_examples = self.ntrain - start
            if rest_num_examples > 0:
                X_rest_part = self.train_X[start:self.ntrain]
                Y_rest_part = self.train_Y[start:self.ntrain]
            # shuffle the data
            perm = torch.randperm(self.ntrain)
            self.train_X = self.train_X[perm]
            self.train_Y = self.train_Y[perm]
            # start next epoch ，last part
            start = 0
            self.index_in_epoch = batch_size - rest_num_examples
            end = self.index_in_epoch
            X_new_part = self.train_X[start:end]
            Y_new_part = self.train_Y[start:end]
            #print(start, end)
            if rest_num_examples > 0:
                return torch.cat((X_rest_part, X_new_part), 0) , torch.cat((Y_rest_part, Y_new_part), 0)
            else:
                return X_new_part, Y_new_part
        else:
            self.index_in_epoch += batch_size
            end = self.index_in_epoch
            #print(start, end)
            # from index start to index end-1
            return self.train_X[start:end], self.train_Y[start:end]


    def val_gzsl(self, test_X, test_label, target_classes): 
        start = 0
        ntest = test_X.size()[0]
        predicted_label = torch.LongTensor(test_label.size())
        for i in range(0, ntest, self.batch_size):
            end = min(ntest, start+self.batch_size)
            with torch.no_grad():
                if self.cuda:
#                     embed, _ = self.MapNet(test_X[start:end].cuda())
#                     output = self.model(embed)
                    output = self.model(test_X[start:end].cuda())
                else:
#                     embed, _ = self.MapNet(test_X[start:end])
#                     output = self.model(embed)
                    output = self.model(test_X[start:end])
            _, predicted_label[start:end] = torch.max(output, 1)
        
#             print("-----------以下为预测标签---------")
#             print(predicted_label)
#             print("+++++++++++以下为测试标签++++++++++")
#             print(test_label)
#             print("===========以下为映射标签===========")
#             print(map_label(test_label, target_classes))
#             print("???????????????????end???????????????")
            start = end

        acc = self.compute_per_class_acc_gzsl(test_label, predicted_label, target_classes)
        return acc

    def compute_per_class_acc_gzsl(self, test_label, predicted_label, target_classes):
        acc_per_class = 0
        for i in target_classes:
            idx = (test_label == i)
#             print("===========以下为相同类别的idx===========")
#             print(target_classes)
#             print(idx)
#             print("===========以下为相同类别的idx===========")
            acc_per_class += float(torch.sum(test_label[idx] == predicted_label[idx])) / float(torch.sum(idx))
        acc_per_class /= target_classes.size(0)
        return acc_per_class 

    # test_label is integer 
    def val(self, test_X, test_label, target_classes):
        start = 0
        ntest = test_X.size()[0]
        predicted_label = torch.LongTensor(test_label.size())
        for i in range(0, ntest, self.batch_size):
            end = min(ntest, start+self.batch_size)
            with torch.no_grad():
                if self.cuda:
#                     embed, _ = self.MapNet(test_X[start:end].cuda())
#                     output = self.model(embed)
                    output = self.model(test_X[start:end].cuda())
                else:
#                     embed, _ = self.MapNet(test_X[start:end])
#                     output = self.model(embed)
                    output = self.model(test_X[start:end])
            _, predicted_label[start:end] = torch.max(output, 1)
#             print("-----------以下为预测标签---------")
#             print(predicted_label)
#             print("+++++++++++以下为测试标签++++++++++")
#             print(test_label)
            start = end

        acc = self.compute_per_class_acc(test_label, predicted_label, target_classes.size(0))
        
        return acc

    def compute_per_class_acc(self, test_label, predicted_label, nclass):
        acc_per_class = torch.FloatTensor(nclass).fill_(0)
        for i in range(nclass):
            idx = (test_label == i)
#             print("===========以下为相同类别的idx===========")
#             print(nclass)
#             print(idx)
#             print("===========以下为相同类别的idx===========")
            acc_per_class[i] = float(torch.sum(test_label[idx]==predicted_label[idx])) / float(torch.sum(idx))
        return acc_per_class.mean()

In [6]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
# import util
from sklearn.preprocessing import MinMaxScaler 
import sys

class LINEAR_LOGSOFTMAX(nn.Module):
    def __init__(self, input_dim, nclass):
        super(LINEAR_LOGSOFTMAX, self).__init__()
        self.fc = nn.Linear(input_dim, nclass)
        self.logic = nn.LogSoftmax(dim=1)
    def forward(self, x): 
        o = self.logic(self.fc(x)) 
        return o 

class CLASSIFIER:
    # train_Y is interger 
    def __init__(self, _train_X, _train_Y, map_net, resSize, data_loader, _nclass, _cuda, _lr=0.00001, _beta1=0.5, _nepoch=20, _batch_size=100, generalized=True):
        self.train_X =  _train_X 
        self.train_Y = _train_Y 
        
        self.test_seen_feature = data_loader.test_seen_feature
        self.test_seen_label = data_loader.test_seen_label 
        
        self.test_unseen_feature = data_loader.test_unseen_feature
        self.test_unseen_label = data_loader.test_unseen_label 
        
        self.seenclasses = data_loader.seenclasses
        self.unseenclasses = data_loader.unseenclasses
        
        self.MapNet=map_net
        
        self.batch_size = _batch_size
        self.nepoch = _nepoch
        #分几类
        self.nclass = _nclass
        self.input_dim = resSize
        self.cuda = _cuda
        self.model =  LINEAR_LOGSOFTMAX(self.input_dim, self.nclass)

        self.model.apply(weights_init)
        self.criterion = nn.NLLLoss() 
        
        self.input = torch.FloatTensor(_batch_size, _train_X.size(1))
        self.label = torch.LongTensor(_batch_size) 
        
        self.lr = _lr
        self.beta1 = _beta1
        # setup optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=_lr, betas=(_beta1, 0.999))

        # if self.cuda:
            # self.model.cuda()
            # self.criterion.cuda()
            # self.input = self.input.cuda()
            # self.label = self.label.cuda()

        self.index_in_epoch = 0
        self.epochs_completed = 0
        self.ntrain = self.train_X.size()[0]

        
        if generalized:
            self.acc_seen, self.acc_unseen, self.H = self.fit()
        else:
            self.acc, self.label, self.preLabel = self.fit_zsl()
    
    
    def fit_zsl(self):
        best_acc = 0
        pre_label = 0
        tes_label = 0
        mean_loss = 0
        for epoch in range(self.nepoch):
            for i in range(0, self.ntrain, self.batch_size):      
                self.model.zero_grad()
                batch_input, batch_label = self.next_batch(self.batch_size) 
                self.input.copy_(batch_input) 
                self.label.copy_(batch_label)
#                 embed, _=self.MapNet(self.input)
#                 output = self.model(embed)
                output = self.model(self.input)
                loss = self.criterion(output, self.label)
                mean_loss += loss.data
                loss.backward()
                self.optimizer.step()
            acc,test_label,predicted_label = self.val(self.test_unseen_feature, self.test_unseen_label, self.unseenclasses)
            if acc > best_acc:
                best_acc = acc
                tes_label = test_label
                pre_label = predicted_label
        print('Training classifier loss= %.4f' % (loss))
        return best_acc, tes_label, pre_label

    def fit(self):
        best_H = 0
        best_seen = 0
        best_unseen = 0
        for epoch in range(self.nepoch):
            for i in range(0, self.ntrain, self.batch_size):      
                self.model.zero_grad()
                batch_input, batch_label = self.next_batch(self.batch_size) 
                self.input.copy_(batch_input)
                self.label.copy_(batch_label)
#                 embed, _ = self.MapNet(self.input)
#                 output = self.model(embed)
                output = self.model(self.input)
                loss = self.criterion(output, self.label)
#               mean_loss += loss.data
                loss.backward()
                self.optimizer.step()

            acc_seen = self.val_gzsl(self.test_seen_feature, self.test_seen_label, self.seenclasses)
            acc_unseen = self.val_gzsl(self.test_unseen_feature, self.test_unseen_label, self.unseenclasses)
            if (acc_seen+acc_unseen)==0:
                print('a bug')
                H=0
            else:
                H = 2*acc_seen*acc_unseen / (acc_seen+acc_unseen)
            if H > best_H:
                best_seen = acc_seen
                best_unseen = acc_unseen
                best_H = H
        return best_seen, best_unseen, best_H
                     
    def next_batch(self, batch_size):
        start = self.index_in_epoch
        # shuffle the data at the first epoch
        if self.epochs_completed == 0 and start == 0:
            perm = torch.randperm(self.ntrain)
            self.train_X = self.train_X[perm]
            self.train_Y = self.train_Y[perm]
        # the last batch
        if start + batch_size > self.ntrain:
            self.epochs_completed += 1
            rest_num_examples = self.ntrain - start
            if rest_num_examples > 0:
                X_rest_part = self.train_X[start:self.ntrain]
                Y_rest_part = self.train_Y[start:self.ntrain]
            # shuffle the data
            perm = torch.randperm(self.ntrain)
            self.train_X = self.train_X[perm]
            self.train_Y = self.train_Y[perm]
            # start next epoch
            start = 0
            self.index_in_epoch = batch_size - rest_num_examples
            end = self.index_in_epoch
            X_new_part = self.train_X[start:end]
            Y_new_part = self.train_Y[start:end]
            #print(start, end)
            if rest_num_examples > 0:
                return torch.cat((X_rest_part, X_new_part), 0) , torch.cat((Y_rest_part, Y_new_part), 0)
            else:
                return X_new_part, Y_new_part
        else:
            self.index_in_epoch += batch_size
            end = self.index_in_epoch
            #print(start, end)
            # from index start to index end-1
            return self.train_X[start:end], self.train_Y[start:end]


    def val_gzsl(self, test_X, test_label, target_classes): 
        start = 0
        ntest = test_X.size()[0]
        predicted_label = torch.LongTensor(test_label.size())
        for i in range(0, ntest, self.batch_size):
            end = min(ntest, start+self.batch_size)
            with torch.no_grad():
                if self.cuda:
#                     embed, _ = self.MapNet(test_X[start:end].cuda())
#                     output = self.model(embed)
                    output = self.model(test_X[start:end].cuda())
                else:
#                     embed, _ = self.MapNet(test_X[start:end])
#                     output = self.model(embed)
                    output = self.model(test_X[start:end])
            _, predicted_label[start:end] = torch.max(output, 1)
        
#             print("-----------以下为预测标签---------")
#             print(predicted_label)
#             print("+++++++++++以下为测试标签++++++++++")
#             print(test_label)
#             print("===========以下为映射标签===========")
#             print(map_label(test_label, target_classes))
#             print("???????????????????end???????????????")
            start = end

        acc = self.compute_per_class_acc_gzsl(test_label, predicted_label, target_classes)
        return acc

    def compute_per_class_acc_gzsl(self, test_label, predicted_label, target_classes):
        acc_per_class = 0
        for i in target_classes:
            idx = (test_label == i)
#             print("===========以下为相同类别的idx===========")
#             print(target_classes)
#             print(idx)
#             print("===========以下为相同类别的idx===========")
            acc_per_class += float(torch.sum(test_label[idx] == predicted_label[idx])) / float(torch.sum(idx))
        acc_per_class /= target_classes.size(0)
        return acc_per_class 

    # test_label is integer 
    def val(self, test_X, test_label, target_classes): 
        start = 0
        ntest = test_X.size()[0]
        predicted_label = torch.LongTensor(test_label.size())
        for i in range(0, ntest, self.batch_size):
            end = min(ntest, start+self.batch_size)
            with torch.no_grad():
                if self.cuda:
#                     embed, _ = self.MapNet(test_X[start:end].cuda())
#                     output = self.model(embed)
                    output = self.model(test_X[start:end].cuda())
                else:
#                     embed, _ = self.MapNet(test_X[start:end])
#                     output = self.model(embed)
                    output = self.model(test_X[start:end])
            _, predicted_label[start:end] = torch.max(output, 1)
#             print("-----------以下为预测标签---------")
#             print(predicted_label)
#             print("+++++++++++以下为测试标签++++++++++")
#             print(test_label)
            start = end

        acc = self.compute_per_class_acc(test_label, predicted_label, target_classes.size(0))
        
        return acc,test_label,predicted_label

    def compute_per_class_acc(self, test_label, predicted_label, nclass):
        acc_per_class = torch.FloatTensor(nclass).fill_(0)
        for i in range(nclass):
            idx = (test_label == i)
#             print("===========以下为相同类别的idx===========")
#             print(nclass)
#             print(idx)
#             print("===========以下为相同类别的idx===========")
            acc_per_class[i] = float(torch.sum(test_label[idx]==predicted_label[idx])) / float(torch.sum(idx))
        return acc_per_class.mean()  

In [3]:
import torch.nn as nn
import torch
import torch.nn.functional as F

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        m.weight.data.normal_(0.0, 0.02)
        m.bias.data.fill_(0)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

def reparameter(mu,sigma):
    return (torch.randn_like(mu) *sigma) + mu

class Embedding_Net(nn.Module):
    def __init__(self, opt):
        super(Embedding_Net, self).__init__()

        self.fc1 = nn.Linear(opt.attSize + opt.nz, opt.embedSize) 
        self.fc2 = nn.Linear(opt.embedSize, opt.resSize)
        self.lrelu = nn.LeakyReLU(0.2, True)
        self.relu = nn.ReLU(True)
        self.apply(weights_init)

    def forward(self, noise, att):
        h = torch.cat((noise, att), 1)
        embedding= F.normalize(self.relu(self.fc1(h)), dim=1)
        out_z = self.fc2(embedding)
        return embedding,out_z

class MLP_G(nn.Module): 
    def __init__(self, opt):
        super(MLP_G, self).__init__()
        self.fc1 = nn.Linear(opt.embedSize + opt.nz, opt.ngh)
        self.fc2 = nn.Linear(opt.ngh, opt.resSize)
        self.relu = nn.ReLU(True)

        self.apply(weights_init)

    def forward(self, noise, att):
        h = torch.cat((noise, att), 1)
        ngh = F.normalize(self.fc1(h), dim=1)
        h = self.relu(self.fc2(ngh))
        return ngh,h

class MLP_CRITIC(nn.Module):
    def __init__(self, opt):
        super(MLP_CRITIC, self).__init__()
        self.fc1 = nn.Linear(opt.resSize + opt.embedSize, opt.ndh)
        self.fc2 = nn.Linear(opt.ndh, 1)
        self.lrelu = nn.LeakyReLU(0.2, True)
#         self.relu = nn.ReLU(True)
        self.apply(weights_init)

    def forward(self, x, att):
        h = torch.cat((x, att), 1)
#         h = self.lrelu(self.fc1(h))
#         ndh = self.lrelu(self.fc1(h))
        ndh = F.normalize(self.lrelu(self.fc1(h)), dim=1)
        h = self.fc2(ndh)
        return ndh,h

class Dis_Embed_Att(nn.Module):
    def __init__(self, opt):
        super(Dis_Embed_Att, self).__init__()
        self.fc1 = nn.Linear(opt.ndh+opt.attSize, opt.nhF)
        #self.fc2 = nn.Linear(opt.ndh, opt.ndh)
        self.fc2 = nn.Linear(opt.nhF, 1)
        self.lrelu = nn.LeakyReLU(0.2, True)
        self.apply(weights_init)

    def forward(self, input):
        h = self.lrelu(self.fc1(input))
        h = self.fc2(h)
        return h

In [None]:
from __future__ import print_function
import argparse
import sys
sys.path.append("..")
import os
import random
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
# import classifier_embed_contras
import torch.nn.functional as F

parser = argparse.ArgumentParser()

parser.add_argument('--dataroot', default='./datas', help='path to dataset')
parser.add_argument('--syn_num', type=int, default=100, help='number features to generate per class')
parser.add_argument('--gzsl', type=bool, default=False, help='enable generalized zero-shot learning')
parser.add_argument('--preprocessing', type=bool, default=True, help='enbale MinMaxScaler on visual features')
parser.add_argument('--standardization', action='store_true', default=False)
parser.add_argument('--validation', action='store_true', default=False, help='enable cross validation mode')
parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)
# parser.add_argument('--batch_size', type=int, default=2048, help='input batch size')
parser.add_argument('--batch_size', type=int, default=40, help='input batch size')
parser.add_argument('--resSize', type=int, default=512, help='size of visual features')
# parser.add_argument('--attSize', type=int, default=156 , help='size of semantic features')
parser.add_argument('--attSize', type=int, default=28 , help='size of semantic features')
parser.add_argument('--nz', type=int, default=2, help='noise for generation')
parser.add_argument('--embedSize', type=int, default=28, help='size of embedding h')
parser.add_argument('--outzSize', type=int, default=256, help='size of non-liner projection z')

## network architechure
parser.add_argument('--ngh', type=int, default=256, help='size of the hidden units in generator G')
parser.add_argument('--ndh', type=int, default=256, help='size of the hidden units in discriminator D')
parser.add_argument('--nhF', type=int, default=128, help='size of the hidden units comparator network F')

parser.add_argument('--ins_weight', type=float, default=0.001, help='weight of the classification loss when learning G')
parser.add_argument('--cls_weight', type=float, default=0.001, help='weight of the score function when learning G')
parser.add_argument('--ins_temp', type=float, default=0.1, help='temperature in instance-level supervision')
parser.add_argument('--cls_temp', type=float, default=0.1, help='temperature in class-level supervision')

parser.add_argument('--nepoch', type=int, default=223, help='number of epochs to train for')
parser.add_argument('--critic_iter', type=int, default=5, help='critic iteration, following WGAN-GP')
# parser.add_argument('--lr', type=float, default=0.000007, help='learning rate to training')
parser.add_argument('--lr', type=float, default=0.000009, help='learning rate to training')
parser.add_argument('--lr_decay_epoch', type=int, default=100, help='conduct learning rate decay after every 100 epochs')
parser.add_argument('--lr_dec_rate', type=float, default=0.99, help='learning rate decay rate')
parser.add_argument('--lambda1', type=float, default=10, help='gradient penalty regularizer, following WGAN-GP')
parser.add_argument('--classifier_lr', type=float, default=0.0001, help='learning rate to train softmax classifier')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
# parser.add_argument('--cuda', action='store_true', default=True, help='enables cuda')
parser.add_argument('--cuda', action='store_true', default=False, help='enables cuda')
parser.add_argument('--manualSeed', type=int, default=3483, help='manual seed')
parser.add_argument('--nclass_all', type=int, default=8, help='number of all classes')
parser.add_argument('--nclass_seen', type=int, default=4, help='number of all classes')

parser.add_argument('--gpus', default='0', help='the number of the GPU to use')
# opt = parser.parse_args()
opt = parser.parse_known_args()[0]

print(opt)

G_loss_list = []
D_loss_list = []
train_acc_list = []
test_acc_list = []

os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus

if opt.manualSeed is None:
    opt.manualSeed = random.randint(1, 10000)
print("Random Seed: ", opt.manualSeed)
random.seed(opt.manualSeed)
torch.manual_seed(opt.manualSeed)
if opt.cuda:
    torch.cuda.manual_seed_all(opt.manualSeed)

cudnn.benchmark = True

if torch.cuda.is_available() and not opt.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")

# load data
# data = util.DATA_LOADER(opt)
data = DATA_LOADER(opt)
print("# of training samples: ", data.ntrain)


netG = MLP_G(opt)
netMap = Embedding_Net(opt)
netD = MLP_CRITIC(opt)
F_ha = Dis_Embed_Att(opt)

model_path = './models/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

contras_criterion = SupConLoss_clear(opt.ins_temp)
mse_criterion = torch.nn.MSELoss(reduction='mean')

input_res = torch.FloatTensor(opt.batch_size, opt.resSize)
input_att = torch.FloatTensor(opt.batch_size, opt.attSize)
noise_gen = torch.FloatTensor(opt.batch_size, opt.nz)
input_label = torch.LongTensor(opt.batch_size)

def sample():
    batch_feature, batch_label, batch_att = data.next_batch(opt.batch_size)
    input_res.copy_(batch_feature)
    input_att.copy_(batch_att)
    input_label.copy_(batch_label)


def generate_syn_test_feature(netG, map_net, classes, attribute, num):
    nclass = classes.size(0)
    #init tensor
    syn_feature = torch.FloatTensor(nclass * num, opt.resSize)
    syn_label = torch.LongTensor(nclass * num)
    syn_att = torch.FloatTensor(num, opt.attSize)
    syn_noise = torch.FloatTensor(num, opt.nz)
#     if opt.cuda:
#         syn_att = syn_att.cuda()
#         syn_noise = syn_noise.cuda()

    for i in range(nclass):
        iclass = classes[i]
        iclass_att = attribute[iclass+4] 
#         print("============================")
#         print(iclass_att)
#         print("============================")
        syn_att.copy_(iclass_att.repeat(num, 1))
        syn_noise.normal_(0, 1)
        with torch.no_grad():
            embed_att,_ = map_net(syn_noise,syn_att)
            _,output = netG(syn_noise, embed_att)
        syn_feature.narrow(0, i * num, num).copy_(output.data.cpu())
        syn_label.narrow(0, i * num, num).fill_(iclass)
    return syn_feature, syn_label

def generate_syn_train_feature(netG, map_net, classes, attribute, num):
    nclass = classes.size(0)
    syn_feature = torch.FloatTensor(nclass * num, opt.resSize)
    syn_label = torch.LongTensor(nclass * num)
    syn_att = torch.FloatTensor(num, opt.attSize)
    syn_noise = torch.FloatTensor(num, opt.nz)
#     if opt.cuda:
#         syn_att = syn_att.cuda()
#         syn_noise = syn_noise.cuda()

    for i in range(nclass):
        iclass = classes[i]
        iclass_att = attribute[iclass]
#         print("---------------------------")
#         print(iclass_att)
#         print("---------------------------")
        syn_att.copy_(iclass_att.repeat(num, 1))
        syn_noise.normal_(0, 1)
        with torch.no_grad():
            embed_att,_ = map_net(syn_noise,syn_att)
            _,output = netG(syn_noise, embed_att)
        syn_feature.narrow(0, i * num, num).copy_(output.data.cpu())
        syn_label.narrow(0, i * num, num).fill_(iclass)
    return syn_feature, syn_label


# setup optimizer
import itertools
optimizerD = optim.Adam(itertools.chain(netD.parameters(), netMap.parameters(), F_ha.parameters()), lr=opt.lr,
                        betas=(opt.beta1, 0.999))
# optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))

def calc_gradient_penalty(netD, real_data, fake_data, input_att):
    # print real_data.size()
    alpha = torch.rand(opt.batch_size, 1)
    alpha = alpha.expand(real_data.size())
#     if opt.cuda:
#         alpha = alpha.cuda()
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
#     if opt.cuda:
#         interpolates = interpolates.cuda()
    interpolates = Variable(interpolates, requires_grad=True)
    _,disc_interpolates = netD(interpolates, input_att)
    ones = torch.ones(disc_interpolates.size())
#     if opt.cuda:
#         ones = ones.cuda()
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=ones,
                              create_graph=True, retain_graph=True, only_inputs=True)[0]
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * opt.lambda1
    return gradient_penalty

# use the for-loop to save the GPU-memory
def class_scores_for_loop(embed, input_label, relation_net):
    #all_scores=torch.FloatTensor(embed.shape[0],opt.nclass_seen).cuda()
    all_scores=torch.FloatTensor(embed.shape[0],opt.nclass_seen)
    for i, i_embed in enumerate(embed):
        expand_embed = i_embed.repeat(opt.nclass_seen, 1)#.reshape(embed.shape[0] * opt.nclass_seen, -1)
        #all_scores[i]=(torch.div(relation_net(torch.cat((expand_embed, data.attribute_seen.cuda()), dim=1)),opt.cls_temp).squeeze())
        all_scores[i]=(torch.div(relation_net(torch.cat((expand_embed, data.attribute_seen), dim=1)),opt.cls_temp).squeeze())
    score_max, _ = torch.max(all_scores, dim=1, keepdim=True)
    # normalize the scores for stable training
    scores_norm = all_scores - score_max.detach()
    #mask = F.one_hot(input_label, num_classes=opt.nclass_seen).float().cuda()
    mask = F.one_hot(input_label, num_classes=opt.nclass_seen).float()
    exp_scores = torch.exp(scores_norm)
    log_scores = scores_norm - torch.log(exp_scores.sum(1, keepdim=True))
    cls_loss = -((mask * log_scores).sum(1) / mask.sum(1)).mean()
    return cls_loss

# It is much faster to use the matrix, but it cost much GPU memory.
def class_scores_in_matrix(embed, input_label, relation_net):
    expand_embed = embed.unsqueeze(dim=1).repeat(1, opt.nclass_seen, 1).reshape(embed.shape[0] * opt.nclass_seen, -1)
    expand_att = data.attribute_seen.unsqueeze(dim=0).repeat(embed.shape[0], 1, 1).reshape(
        embed.shape[0] * opt.nclass_seen, -1).cuda()
    all_scores = torch.div(relation_net(torch.cat((expand_embed, expand_att), dim=1)),opt.cls_temp).reshape(embed.shape[0],
                                                                                                    opt.nclass_seen)
    score_max, _ = torch.max(all_scores, dim=1, keepdim=True)
    scores_norm = all_scores - score_max.detach()
    #mask = F.one_hot(input_label, num_classes=opt.nclass_seen).float().cuda()
    mask = F.one_hot(input_label, num_classes=opt.nclass_seen).float()
    exp_scores = torch.exp(scores_norm)
    log_scores = scores_norm - torch.log(exp_scores.sum(1, keepdim=True))
    cls_loss = -((mask * log_scores).sum(1) / mask.sum(1)).mean()
    return cls_loss


for epoch in range(opt.nepoch):
    FP = 0
    mean_lossD = 0
    mean_lossG = 0
    for i in range(0, data.ntrain, opt.batch_size):
#     for i in range(0, 4* opt.batch_size, opt.batch_size):
#     for i in range(0, 4, opt.batch_size):
        ############################
        # (1) Update D network: optimize WGAN-GP objective, Equation (2)
        ###########################
        for p in netD.parameters():  # reset requires_grad
            p.requires_grad = True  # they are set to False below in netG update
        for p in netMap.parameters():  # reset requires_grad
            p.requires_grad = True
        for p in F_ha.parameters():  # reset requires_grad
            p.requires_grad = True

        for iter_d in range(opt.critic_iter):
            sample()
            netD.zero_grad()
            netMap.zero_grad()
            #
            # train with realG
            # sample a mini-batch
            noise_gen.normal_(0, 1)
            embed_att, out_res = netMap(noise_gen, input_att)
            embed_fea,criticD_real = netD(input_res, embed_att)
            criticD_real = criticD_real.mean()
            
            real_ins_contras_loss = contras_criterion(embed_fea, input_label)
            real_att_contras_loss = contras_criterion(embed_att, input_label)
#             print("========================================")
#             print(real_ins_contras_loss)
#             print("========================================")

            # train with fakeG
            noise_gen.normal_(0, 1)
            embed_att, out_res = netMap(noise_gen,input_att)
            _,fake = netG(noise_gen, embed_att)
            _,criticD_fake = netD(fake.detach(), embed_att)
            criticD_fake = criticD_fake.mean()
            
            # gradient penalty
            gradient_penalty = calc_gradient_penalty(netD, input_res, fake.data, embed_att)
            Wasserstein_D = criticD_real - criticD_fake
            res_L = mse_criterion(input_res,out_res)

            cls_loss_real = class_scores_for_loop(embed_fea, input_label, F_ha)

            # D_cost = criticD_fake - criticD_real + gradient_penalty + real_ins_contras_loss + cls_loss_real + input_res.mean()- outz_real.mean()
            D_cost = criticD_fake - criticD_real + gradient_penalty + cls_loss_real + real_ins_contras_loss + real_att_contras_loss + res_L

            D_cost.backward()
            optimizerD.step()
        ############################
        # (2) Update G network: optimize WGAN-GP objective, Equation (2)
        ###########################
        for p in netD.parameters():  # reset requires_grad
            p.requires_grad = False  # avoid computation
        for p in netMap.parameters():  # reset requires_grad
            p.requires_grad = False
        for p in F_ha.parameters():  # reset requires_grad
            p.requires_grad = False

        netG.zero_grad()
        noise_gen.normal_(0, 1)

        embed_att, out_res = netMap(noise_gen, input_att)
        embed_G_fake,fake = netG(noise_gen, embed_att)
        
        embed_fea1,criticG_real = netD(input_res, embed_att)
        embed_fea2,criticG_fake = netD(fake, embed_att)
        criticG_fake = criticG_fake.mean()
        G_cost = -criticG_fake
        
        contras_loss = contras_criterion(embed_G_fake, input_label)

        all_outz = torch.cat((embed_fea2, embed_fea1.detach()), dim=0)
        fake_ins_contras_loss = contras_criterion(all_outz, torch.cat((input_label, input_label), dim=0))
        cls_loss_fake = class_scores_for_loop(embed_fea2, input_label, F_ha)

#         print("+++++++++++++++++++++++++++++++++")
#         print(G_cost)
#         print(contras_loss)
#         print(fake_ins_contras_loss)
#         print(cls_loss_fake)
#         print("+++++++++++++++++++++++++++++++++")


        errG = G_cost + contras_loss + opt.ins_weight * fake_ins_contras_loss + opt.cls_weight * cls_loss_fake  # + opt.ins_weight * c_errG
#         errG = G_cost
        errG.backward()
        optimizerG.step()

    F_ha.zero_grad()

    if (epoch + 1) % opt.lr_decay_epoch == 0:
        for param_group in optimizerD.param_groups:
            param_group['lr'] = param_group['lr'] * opt.lr_dec_rate
        for param_group in optimizerG.param_groups:
            param_group['lr'] = param_group['lr'] * opt.lr_dec_rate

    mean_lossG /= data.ntrain / opt.batch_size
    mean_lossD /= data.ntrain / opt.batch_size
    print('[%d/%d] Loss_D: %.4f Loss_G: %.4f, Wasserstein_dist: %.4f, real_ins_contras_loss:%.4f, fake_ins_contras_loss:%.4f, cls_loss_real: %.4f, cls_loss_fake: %.4f, contras_loss: %.4f'% (epoch, opt.nepoch, D_cost, G_cost, Wasserstein_D, real_ins_contras_loss, fake_ins_contras_loss, cls_loss_real, cls_loss_fake, contras_loss))
    # print('[%d/%d] Loss_D: %.4f Loss_G: %.4f, Wasserstein_dist: %.4f, cls_loss_real: %.4f'% (epoch, opt.nepoch, D_cost, G_cost, Wasserstein_D, cls_loss_real))
    
# evaluate the model, set G to evaluation mode
    netG.eval()

    for p in netMap.parameters():  # reset requires_grad
        p.requires_grad = False

    if opt.gzsl: # Generalized zero-shot learning
        syn_feature, syn_label = generate_syn_feature(netG, data.unseenclasses, data.attribute, opt.syn_num)

        train_X = torch.cat((data.train_feature, syn_feature), 0)
        train_Y = torch.cat((data.train_label, syn_label), 0)

        nclass = opt.nclass_all

        cls = CLASSIFIER(train_X, train_Y, netMap, opt.resSize, data, nclass, opt.cuda, opt.classifier_lr, 0.5, 25, 80, True)
        print('unseen=%.4f, seen=%.4f, h=%.4f' % (cls.acc_unseen, cls.acc_seen, cls.H))

    else:  # conventional zero-shot learning
        syn_test_feature, syn_test_label = generate_syn_test_feature(netG, netMap, data.unseenclasses, data.attribute, opt.syn_num) 
        syn_train_feature, syn_train_label = generate_syn_train_feature(netG, netMap, data.seenclasses, data.attribute, opt.syn_num)

        cls_seen = CLASSIFIER_TRAIN(syn_train_feature, syn_train_label, netMap,opt.resSize, data,data.seenclasses.size(0), opt.cuda, opt.classifier_lr, 0.5, 100,80,False)
        cls_unseen = CLASSIFIER(syn_test_feature, syn_test_label, netMap,opt.resSize, data,data.unseenclasses.size(0), opt.cuda, opt.classifier_lr, 0.5, 100,80,False)
        
        seen_acc = cls_seen.acc
        unseen_acc = cls_unseen.acc
        test_label = cls_unseen.label
        pre_label = cls_unseen.preLabel
        
        print('seen class accuracy=%.4f, unseen class accuracy=%.4f '%(seen_acc, unseen_acc))
        
        if unseen_acc > 0 :
            print("-------------------test_label----------------------")
            print(test_label)
            print(test_label[0:20])
            print(test_label[2000:2020])
            print(test_label[4000:4020])
            print(test_label[6000:6020])
            print("-------------------pre_label----------------------")
            print(pre_label)
            print(pre_label[0:20])
            print(pre_label[2000:2020])
            print(pre_label[4000:4020])
            print(pre_label[6000:6020])

Namespace(dataroot='./datas', syn_num=100, gzsl=False, preprocessing=True, standardization=False, validation=False, workers=2, batch_size=40, resSize=512, attSize=28, nz=2, embedSize=28, outzSize=256, ngh=256, ndh=256, nhF=128, ins_weight=0.001, cls_weight=0.001, ins_temp=0.1, cls_temp=0.1, nepoch=223, critic_iter=5, lr=9e-06, lr_decay_epoch=100, lr_dec_rate=0.99, lambda1=10, classifier_lr=0.0001, beta1=0.5, cuda=False, manualSeed=3483, nclass_all=8, nclass_seen=4, gpus='0')
Random Seed:  3483
tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3, 4, 5, 6, 7])
# of training samples:  8000
[0/223] Loss_D: 1.3814 Loss_G: 0.0442, Wasserstein_dist: 0.1826, real_ins_contras_loss:0.2544, fake_ins_contras_loss:0.6651, cls_loss_real: 0.5950, cls_loss_fake: 1.3613, contras_loss: 0.5248
Training classifier_train loss= 1.3338
Training classifier loss= 1.3709
seen class accuracy=0.2500, unseen class accuracy=0.3970 
-------------------test_label----------------------
tensor([0, 0, 0,  ..., 