In [None]:
import os
import sys
import time
import argparse
import pickle
import numpy as np
import re
import inspect

import torch
from torch import optim
from torch.autograd import Variable
import torch.nn as nn

#from data import get_nli, get_batch, build_vocab
#from mutils import get_optimizer
#from models import NLINet

# Location of the pre-trained GloVe vectors passed to orig() below.
W2V_PATH = "/home/dc/cs230_project/dataset/GloVe/glove.840B.300d.txt"


# Command-line configuration for the training run. Every option has a
# default, so the script also runs with no arguments (e.g. from a notebook).
parser = argparse.ArgumentParser(description='NLI training')
# paths
parser.add_argument("--nlipath", type=str, default='/home/dc/cs230_project/dataset/SNLI', help="NLI data path (SNLI or MultiNLI)")
parser.add_argument("--outputdir", type=str, default='savedir/', help="Output directory")
parser.add_argument("--outputmodelname", type=str, default='Hierattn.pickle')


# training
parser.add_argument("--n_epochs", type=int, default=40)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--dpout_model", type=float, default=0., help="encoder dropout")
parser.add_argument("--dpout_fc", type=float, default=0., help="classifier dropout")
parser.add_argument("--nonlinear_fc", type=float, default=0, help="use nonlinearity in fc")
parser.add_argument("--optimizer", type=str, default="sgd,lr=0.1", help="adam or sgd,lr=0.1")
parser.add_argument("--lrshrink", type=float, default=5, help="shrink factor for sgd")
parser.add_argument("--weight_decay", type=float, default=5e-4, help="weight decay for sgd")

parser.add_argument("--decay", type=float, default=0.99, help="lr decay")
parser.add_argument("--minlr", type=float, default=1e-5, help="minimum lr")
parser.add_argument("--max_norm", type=float, default=5., help="max norm (grad clipping)")

# model
parser.add_argument("--encoder_type", type=str, default='InnerAttentionNAACLEncoder', help="see list of encoders")
parser.add_argument("--enc_lstm_dim", type=int, default=2048, help="encoder nhid dimension")
parser.add_argument("--n_enc_layers", type=int, default=1, help="encoder num layers")
parser.add_argument("--fc_dim", type=int, default=512, help="nhid of fc layers")
parser.add_argument("--n_classes", type=int, default=3, help="entailment/neutral/contradiction")
parser.add_argument("--pool_type", type=str, default='max', help="max or mean")
# Integer options take integer defaults (previously the strings '300' and '1',
# which only worked because argparse converts string defaults through `type`).
parser.add_argument("--word_emb_dim", type=int, default=300, help="embedding dim")
parser.add_argument("--LSTM_num_layers", type=int, default=1, help="LSTM num layers")
parser.add_argument("--data_dir", type=str, default='/home/dc/cs230_project/dataset', help="store duplicate questions")

# gpu
parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID")
parser.add_argument("--seed", type=int, default=1234, help="seed")


# parse_known_args() ignores arguments this parser does not define, so extra
# launcher arguments (such as a notebook kernel's "-f <file>") do not abort.
params, _ = parser.parse_known_args()

# set gpu device
# Selects the CUDA device with index --gpu_id as the current device; this call
# requires a CUDA-capable PyTorch installation.
torch.cuda.set_device(params.gpu_id)

# print parameters passed, and all parameters
# The "togrep" prefix tags the line so it can be grepped out of the log.
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)

"""
SEED
"""
# Seed NumPy (used for the training-set shuffle) and PyTorch (CPU and current
# CUDA device) from the same --seed value.
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)


#data formatting
# Directory expected to hold quora_duplicate_questions.tsv (see clean_quora).
QUORA_PATH="/home/dc/cs230_project/dataset"

def clean_quora(quora_path):
    """Load the Quora duplicate-questions TSV and drop unusable rows.

    Reads ``quora_duplicate_questions.tsv`` from ``quora_path``, removes the
    ``id``/``qid1``/``qid2`` columns, drops rows with missing values, and keeps
    only rows where BOTH questions are longer than 10 characters. Intermediate
    heads/counts are printed as a progress trace.

    Args:
        quora_path: directory containing ``quora_duplicate_questions.tsv``.

    Returns:
        pandas.DataFrame with the remaining rows plus two helper columns,
        ``q1_len`` and ``q2_len`` (character length of each question).
    """
    # pandas is not imported at the top of this file; import it locally so the
    # function no longer fails with NameError on `pd`.
    import pandas as pd

    df = pd.read_csv(os.path.join(quora_path, "quora_duplicate_questions.tsv"), sep="\t")
    print(df.head())
    df = df.drop(["id", "qid1", "qid2"], axis=1)
    print(df.count())
    df = df.dropna()
    print(df.count())
    df['q1_len'] = df['question1'].apply(len)
    df['q2_len'] = df['question2'].apply(len)
    print(df.head())
    # Strictly greater than 10: questions of exactly 10 characters are removed.
    df = df.loc[(df['q1_len'] > 10) & (df['q2_len'] > 10)]
    print(df.count())
    return df


def get_nli(data_path):
    """Read the train/dev/test splits of an NLI corpus from ``data_path``.

    For each split ``X`` in (train, dev, test) three line-aligned files are
    read: ``s1.X`` (first sentences), ``s2.X`` (second sentences) and
    ``labels.X`` (one of entailment / neutral / contradiction per line).

    Args:
        data_path: directory containing the nine files.

    Returns:
        (train, dev, test): three dicts with keys ``'s1'`` and ``'s2'``
        (lists of right-stripped sentence strings) and ``'label'`` (NumPy
        array of class ids: entailment=0, neutral=1, contradiction=2).

    Raises:
        KeyError: a label line is not one of the three known labels.
        AssertionError: the three files of a split differ in line count.
    """
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

    def read_lines(path, strip_chars=None):
        # `with` closes the handle promptly; the corpus is read as UTF-8
        # rather than whatever the platform default encoding happens to be.
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.rstrip(strip_chars) for line in handle]

    s1 = {}
    s2 = {}
    target = {}

    for data_type in ['train', 'dev', 'test']:
        s1[data_type], s2[data_type], target[data_type] = {}, {}, {}
        s1[data_type]['path'] = os.path.join(data_path, 's1.' + data_type)
        s2[data_type]['path'] = os.path.join(data_path, 's2.' + data_type)
        target[data_type]['path'] = os.path.join(data_path,
                                                 'labels.' + data_type)

        s1[data_type]['sent'] = read_lines(s1[data_type]['path'])
        s2[data_type]['sent'] = read_lines(s2[data_type]['path'])
        # Labels are stripped of the newline only, then mapped to class ids.
        target[data_type]['data'] = np.array(
            [dico_label[label]
             for label in read_lines(target[data_type]['path'], '\n')])

        assert len(s1[data_type]['sent']) == len(s2[data_type]['sent']) == \
            len(target[data_type]['data'])

        print('** {0} DATA : Found {1} pairs of {2} sentences.'.format(
                data_type.upper(), len(s1[data_type]['sent']), data_type))

    train = {'s1': s1['train']['sent'], 's2': s2['train']['sent'],
             'label': target['train']['data']}
    dev = {'s1': s1['dev']['sent'], 's2': s2['dev']['sent'],
           'label': target['dev']['data']}
    test = {'s1': s1['test']['sent'], 's2': s2['test']['sent'],
            'label': target['test']['data']}
    return train, dev, test


def load_single_file(filename):
    """Unpickle ``<params.data_dir>/<filename>.pkl`` and return its content.

    Args:
        filename: file stem, without the ``.pkl`` extension.

    Returns:
        Whatever object was pickled into the file.
    """
    path = os.path.join(params.data_dir, filename + '.pkl')
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point --data_dir at files from a trusted source.
    # `with` guarantees the handle is closed even if unpickling raises.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

def load_data():
    """Load the six pickled arrays of the pre-split dataset.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``, each
        element read by load_single_file() from the file of the same name.
    """
    stems = ("X_train", "X_valid", "X_test", "y_train", "y_valid", "y_test")
    return tuple(load_single_file(stem) for stem in stems)
    
def format_data(X_train, X_valid,X_test, y_train,y_valid,y_test):
    """Repackage pre-split arrays into the train/dev/test dict layout.

    Each ``X_*`` is indexed as a 2-D array whose column 0 is the first
    sentence and column 1 the second; each ``y_*`` is indexed as a 2-D array
    and passed through as a full slice.

    Returns:
        (train, dev, test): dicts with keys ``'s1'``, ``'s2'`` and ``'label'``.
    """
    def as_split(pairs, labels):
        # Column 0 / column 1 hold the two sides of every sentence pair.
        return {'s1': pairs[:, 0], 's2': pairs[:, 1], 'label': labels[:, :]}

    train = as_split(X_train, y_train)
    dev = as_split(X_valid, y_valid)
    test = as_split(X_test, y_test)
    return train, dev, test


def get_word_dict(sentences):
    """Collect the vocabulary of ``sentences`` as a dict used like a set.

    Sentences are split on whitespace. The markers ``<s>``, ``</s>`` and
    ``<p>`` are always present in the result. Every value is the empty string.
    """
    vocab = {}
    for sentence in sentences:
        # dict.update keeps the position of words that were already seen.
        vocab.update(dict.fromkeys(sentence.split(), ''))
    for marker in ('<s>', '</s>', '<p>'):
        vocab[marker] = ''
    return vocab


def get_glove(word_dict, glove_path):
    """Load the GloVe vectors of the words present in ``word_dict``.

    Each line of the file is ``<word> <v1> <v2> ...`` separated by single
    spaces. Lines whose word is not a key of ``word_dict`` are skipped.

    Args:
        word_dict: container supporting ``in`` with the wanted words.
        glove_path: path of the text-format vector file.

    Returns:
        dict mapping each found word to a 1-D NumPy float array.
    """
    # create word_vec with glove vectors
    word_vec = {}
    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in word_dict:
                word_vec[word] = np.array(list(map(float, vec.split())))
    print('Found {0}(/{1}) words with glove vectors'.format(
                len(word_vec), len(word_dict)))
    return word_vec


def build_vocab(sentences, glove_path):
    """Return the GloVe vectors for every word occurring in ``sentences``.

    The vocabulary comes from get_word_dict(); the vectors are looked up in
    the file at ``glove_path`` by get_glove(). The resulting size is printed.
    """
    word_vec = get_glove(get_word_dict(sentences), glove_path)
    print('Vocab size : {0}'.format(len(word_vec)))
    return word_vec

def orig(params,W2V_PATH):
    """Load the NLI corpus and tokenise it against the GloVe vocabulary.

    Args:
        params: namespace providing ``nlipath`` (corpus directory).
        W2V_PATH: path of the GloVe text file.

    Returns:
        (train, valid, test, word_vec). In each split dict, ``'s1'`` and
        ``'s2'`` are replaced by 1-D NumPy object arrays whose elements are
        token lists of the form ``['<s>', w1, ..., '</s>']``; words without a
        GloVe vector are dropped. ``word_vec`` maps word -> vector.
    """
    print(f"loading from:{params.nlipath}")
    train, valid, test = get_nli(params.nlipath)
    word_vec = build_vocab(train['s1'] + train['s2'] +
                           valid['s1'] + valid['s2'] +
                           test['s1'] + test['s2'], W2V_PATH)

    def tokenise(sentences):
        # Fill a pre-allocated object array element by element. Calling
        # np.array() on lists of different lengths raises on current NumPy,
        # and would silently build a 2-D array when all lengths coincide.
        tokens = np.empty(len(sentences), dtype=object)
        for i, sent in enumerate(sentences):
            tokens[i] = (['<s>'] +
                         [word for word in sent.split() if word in word_vec] +
                         ['</s>'])
        return tokens

    # Reference the split dicts directly instead of resolving their names
    # through eval().
    for dataset in (train, valid, test):
        for split in ('s1', 's2'):
            dataset[split] = tokenise(dataset[split])
    return train,valid,test,word_vec
    
def quora():
    """Load the pickled dataset and return it as (train, dev, test) dicts."""
    # load_data() yields the six arrays in exactly the order format_data()
    # takes them, so they can be forwarded positionally.
    return format_data(*load_data())

# Load the NLI corpus (tokenised) together with its GloVe vectors.
train, valid, test, word_vec = orig(params, W2V_PATH)

#train, valid, test = quora()


"""
MODEL
"""
# Model configuration handed to NLINet and, through it, to the encoder.
# Everything except n_words and use_cuda is taken from the command line.
config_nli_model = dict(
    n_words=300,
    word_emb_dim=params.word_emb_dim,
    enc_lstm_dim=params.enc_lstm_dim,
    n_enc_layers=params.n_enc_layers,
    dpout_model=params.dpout_model,
    dpout_fc=params.dpout_fc,
    fc_dim=params.fc_dim,
    bsize=params.batch_size,
    n_classes=params.n_classes,
    pool_type=params.pool_type,
    nonlinear_fc=params.nonlinear_fc,
    encoder_type=params.encoder_type,
    use_cuda=True,
)


class InnerAttentionNAACLEncoder(nn.Module):
    """Bidirectional-LSTM sentence encoder with attention pooling.

    The LSTM outputs are pooled into one vector per sentence by a softmax
    attention whose single query vector is learned (``query_embedding``).

    Config keys read: ``bsize``, ``word_emb_dim``, ``enc_lstm_dim``,
    ``pool_type`` (stored but not used by this encoder).

    The module follows the device of its input, so it runs on CPU as well as
    on CUDA.
    """

    def __init__(self, config):
        super(InnerAttentionNAACLEncoder, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True)
        # Zero initial hidden/cell state (2 = number of directions). Allocated
        # on CPU here; forward() re-creates it on the input's device/size.
        self.init_lstm = torch.zeros(2, self.bsize, self.enc_lstm_dim)

        self.proj_key = nn.Linear(2*self.enc_lstm_dim, 2*self.enc_lstm_dim,
                                  bias=False)
        self.proj_lstm = nn.Linear(2*self.enc_lstm_dim, 2*self.enc_lstm_dim,
                                   bias=False)
        self.query_embedding = nn.Embedding(1, 2*self.enc_lstm_dim)
        # The attention scores are 2-D (batch, seqlen); dim=1 is what the
        # deprecated implicit-dim Softmax() picked for 2-D input.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, sent_tuple):
        """Encode a padded batch of sentences.

        Args:
            sent_tuple: ``(sent, sent_len)`` where ``sent`` is a tensor of
                shape (seqlen, batch, word_emb_dim) and ``sent_len`` an
                integer array with the true length of each sentence.

        Returns:
            Tensor of shape (batch, 2 * enc_lstm_dim).
        """
        sent, sent_len = sent_tuple
        bsize = sent.size(1)
        device = sent.device

        # Re-create the zero state when the batch size or device changed.
        if self.init_lstm.size(1) != bsize or self.init_lstm.device != device:
            self.init_lstm = sent.new_zeros(2, bsize, self.enc_lstm_dim)

        # Sort by decreasing length (required for packing), keep the indices.
        sent_len = np.asarray(sent_len)
        idx_sort = np.argsort(-sent_len)
        # .copy(): the reversed view has negative strides, which torch cannot
        # convert into a tensor.
        sent_len_sorted = np.sort(sent_len)[::-1].copy()
        idx_unsort = np.argsort(idx_sort)

        sent = sent.index_select(
            1, torch.from_numpy(idx_sort).long().to(device))
        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed,
                                    (self.init_lstm, self.init_lstm))[0]
        # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]
        # Un-sort by length
        sent_output = sent_output.index_select(
            1, torch.from_numpy(idx_unsort).long().to(device))

        # batch x seqlen x 2*nhid
        sent_output = sent_output.transpose(0, 1).contiguous()
        flat_output = sent_output.view(-1, 2*self.enc_lstm_dim)

        sent_output_proj = self.proj_lstm(flat_output).view(
            bsize, -1, 2*self.enc_lstm_dim)

        sent_key_proj = self.proj_key(flat_output).view(
            bsize, -1, 2*self.enc_lstm_dim)
        # NAACL paper: u_it=tanh(W_w.h_it + b_w)  (bsize, seqlen, 2nhid)
        sent_key_proj = torch.tanh(sent_key_proj)

        # The same query vector (embedding row 0) for every sentence:
        # (bsize, 2*nhid, 1)
        query_idx = torch.zeros(bsize, dtype=torch.long, device=device)
        sent_w = self.query_embedding(query_idx).unsqueeze(2)

        Temp = 2
        keys = sent_key_proj.bmm(sent_w).squeeze(2) / Temp

        # Set probas of padding to zero in softmax: padded steps have an
        # all-zero LSTM output, hence a score of exactly 0 (no bias in proj_key).
        keys = keys + ((keys == 0).float()*-10000)

        # NOTE(review): the temperature is applied a second time here, as in
        # the original code; kept unchanged to preserve model behaviour.
        alphas = self.softmax(keys/Temp).unsqueeze(2).expand_as(sent_output)
        # Diagnostic sampling: prints during one wall-clock second out of 100.
        if int(time.time()) % 100 == 0:
            print('w', torch.max(sent_w), torch.min(sent_w))
            print('alphas', alphas[0, :, 0])
        emb = torch.sum(alphas * sent_output_proj, 1).squeeze(1)

        return emb


class NLINet(nn.Module):
    """Sentence-pair classifier: a shared encoder followed by an MLP head.

    Both sentences go through the same encoder; the two embeddings ``u`` and
    ``v`` are combined into ``[u, v, |u - v|, u * v]`` and classified.

    Config keys read: ``nonlinear_fc``, ``fc_dim``, ``n_classes``,
    ``enc_lstm_dim``, ``encoder_type``, ``dpout_fc`` (the full config is also
    passed on to the encoder's constructor).
    """

    def __init__(self, config):
        super().__init__()

        # classifier
        self.nonlinear_fc = config['nonlinear_fc']
        self.fc_dim = config['fc_dim']
        self.n_classes = config['n_classes']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.encoder_type = config['encoder_type']
        self.dpout_fc = config['dpout_fc']

        # NOTE(review): eval() turns the configured name into a class; make
        # sure encoder_type never comes from an untrusted source.
        self.encoder = eval(self.encoder_type)(config)

        # Width of the concatenated pair features fed to the classifier.
        feature_dim = 4*2*self.enc_lstm_dim
        if self.encoder_type in ["ConvNetEncoder", "InnerAttentionMILAEncoder"]:
            feature_dim = 4*feature_dim
        if self.encoder_type == "LSTMEncoder":
            feature_dim = int(feature_dim/2)
        self.inputdim = feature_dim

        if self.nonlinear_fc:
            layers = [
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Tanh(),
                nn.Dropout(p=self.dpout_fc),
                nn.Linear(self.fc_dim, self.n_classes),
            ]
        else:
            print(f"self.inputdim:{self.inputdim}, self.fc_dim:{self.fc_dim}")
            print(type(self.inputdim),type(self.fc_dim))
            # Purely linear head: three stacked Linear layers, no activation.
            layers = [
                nn.Linear(self.inputdim, self.fc_dim),
                nn.Linear(self.fc_dim, self.fc_dim),
                nn.Linear(self.fc_dim, self.n_classes),
            ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, s1, s2):
        """Return the class scores for a batch of sentence pairs.

        ``s1`` and ``s2`` are passed to the encoder as-is (for the encoders of
        this file: a ``(padded_batch, lengths)`` tuple each).
        """
        u = self.encoder(s1)
        v = self.encoder(s2)
        pair_features = torch.cat((u, v, torch.abs(u-v), u*v), 1)
        return self.classifier(pair_features)

    def encode(self, s1):
        """Return the encoder output for ``s1`` without classifying it."""
        return self.encoder(s1)


def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"

    Returns:
        (optim_fn, optim_params): the torch.optim class and a dict of float
        keyword arguments for its constructor.

    Raises:
        AssertionError: a parameter is not of the form ``name=<number>``, or
            "sgd" is requested without ``lr``.
        Exception: unknown optimizer name, or a parameter the optimizer's
            constructor does not accept.
    """
    if "," in s:
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            # Raw string: "\d" in a normal string is an invalid escape.
            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    optimizers = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }
    if method not in optimizers:
        raise Exception('Unknown optimization method: "%s"' % method)
    optim_fn = optimizers[method]
    if method == 'sgd':
        # SGD is only accepted with an explicit learning rate.
        assert 'lr' in optim_params

    # check that we give good parameters to the optimizer
    # inspect.signature replaces inspect.getargspec, which was removed in
    # Python 3.11; it also lists keyword-only constructor arguments.
    expected_args = list(inspect.signature(optim_fn.__init__).parameters)
    assert expected_args[:2] == ['self', 'params']
    if not all(k in expected_args[2:] for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params




# model
# Names accepted for --encoder_type. NLINet resolves the name to a class, so
# the chosen encoder class must be defined in (or imported into) this file.
encoder_types = ['InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
                 'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
                 'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder']
assert params.encoder_type in encoder_types, "encoder_type must be in " + \
                                             str(encoder_types)
nli_net = NLINet(config_nli_model)
print(nli_net)


# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
#BCE next w2 categories
# Sum the loss over the batch: trainepoch() divides the gradients by the
# actual batch size itself. Assigning `loss_fn.size_average = False` after
# construction no longer has any effect in current PyTorch (the loss stayed a
# mean, so gradients were scaled down by the batch size twice); the reduction
# has to be requested through the constructor.
loss_fn = nn.CrossEntropyLoss(weight=weight, reduction='sum')

# optimizer
optim_fn, optim_params = get_optimizer(params.optimizer)
optimizer = optim_fn(nli_net.parameters(), **optim_params)

# cuda by default
nli_net.cuda()
loss_fn.cuda()




"""
TRAIN
"""
# Training state shared with evaluate() through `global`.
val_acc_best = -1e10   # best validation accuracy so far (sentinel: none yet)
adam_stop = False      # set after the first accuracy drop when using adam
stop_training = False  # set by evaluate() to end the training loop
lr = optim_params['lr'] if 'sgd' in params.optimizer else None


def get_batch(batch, word_vec):
    """Turn a batch of token lists into a zero-padded embedding tensor.

    Args:
        batch: sequence of sentences, each a sequence of words that are all
            keys of ``word_vec``.
        word_vec: dict mapping word -> 1-D vector; all vectors must have the
            same length.

    Returns:
        (embed, lengths): ``embed`` is a float tensor of shape
        (max_len, len(batch), emb_dim), zero beyond each sentence's end;
        ``lengths`` is a NumPy array with the length of every sentence.
    """
    lengths = np.array([len(x) for x in batch])
    max_len = np.max(lengths)
    # Take the embedding width from the vectors themselves instead of
    # hard-coding 300 (kept as the fallback for an empty vocabulary).
    emb_dim = len(next(iter(word_vec.values()))) if word_vec else 300
    embed = np.zeros((max_len, len(batch), emb_dim))

    for i, sent in enumerate(batch):
        for j, word in enumerate(sent):
            embed[j, i, :] = word_vec[word]

    return torch.from_numpy(embed).float(), lengths



def trainepoch(epoch):
    """Run one training epoch over the shuffled training set.

    Uses the module-level ``nli_net``, ``optimizer``, ``loss_fn``, ``train``,
    ``word_vec`` and ``params``. For SGD, the learning rate is multiplied by
    ``params.decay`` at the start of every epoch after the first. A progress
    line is printed every 100 batches.

    Args:
        epoch: 1-based epoch number.

    Returns:
        Training accuracy of the epoch in percent, rounded to 2 decimals.
    """
    print('\nTRAINING : Epoch ' + str(epoch))
    nli_net.train()
    all_costs = []
    logs = []
    words_count = 0

    last_time = time.time()
    # Plain Python int, so the accuracy arithmetic below never involves
    # tensors (round() on a tensor is not supported).
    correct = 0
    # shuffle the data
    permutation = np.random.permutation(len(train['s1']))

    s1 = train['s1'][permutation]
    s2 = train['s2'][permutation]
    target = train['label'][permutation]

    if epoch > 1 and 'sgd' in params.optimizer:
        optimizer.param_groups[0]['lr'] = \
            optimizer.param_groups[0]['lr'] * params.decay
    print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))

    for stidx in range(0, len(s1), params.batch_size):
        # prepare batch
        s1_batch, s1_len = get_batch(s1[stidx:stidx + params.batch_size],
                                     word_vec)
        s2_batch, s2_len = get_batch(s2[stidx:stidx + params.batch_size],
                                     word_vec)
        s1_batch, s2_batch = s1_batch.cuda(), s2_batch.cuda()
        tgt_batch = torch.LongTensor(
            target[stidx:stidx + params.batch_size]).cuda()
        k = s1_batch.size(1)  # actual batch size

        # model forward
        output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
        #verify for BCE?
        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).sum().item()
        assert len(pred) == len(s1[stidx:stidx + params.batch_size])

        # loss
        loss = loss_fn(output, tgt_batch)
        all_costs.append(loss.item())
        words_count += (s1_batch.nelement() + s2_batch.nelement()) / params.word_emb_dim

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping by global norm (threshold: params.max_norm)
        shrink_factor = 1
        total_norm = 0.0

        for p in nli_net.parameters():
            # Parameters that took no part in the forward pass have no grad.
            if p.requires_grad and p.grad is not None:
                # divide by the actual batch size
                # NOTE: this assumes loss_fn returns a loss summed (not
                # averaged) over the batch.
                p.grad.data.div_(k)
                # .item() yields a Python float, so no NumPy function is ever
                # applied to a (possibly CUDA) tensor.
                total_norm += p.grad.data.norm().item() ** 2
        total_norm = np.sqrt(total_norm)

        if total_norm > params.max_norm:
            shrink_factor = params.max_norm / total_norm
        # Clipping is implemented by temporarily scaling the learning rate.
        current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
        optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update

        # optimizer step
        optimizer.step()
        optimizer.param_groups[0]['lr'] = current_lr

        if len(all_costs) == 100:
            elapsed = time.time() - last_time
            logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
                            stidx, round(np.mean(all_costs), 2),
                            int(len(all_costs) * params.batch_size / elapsed),
                            int(words_count * 1.0 / elapsed),
                            round(100. * correct / (stidx + k), 2)))
            print(logs[-1])
            last_time = time.time()
            words_count = 0
            all_costs = []
    train_acc = round(100 * correct / len(s1), 2)
    print('results : epoch {0} ; mean accuracy train : {1}'
          .format(epoch, train_acc))
    return train_acc


def evaluate(epoch, eval_type='valid', final_eval=False):
    """Measure accuracy on the validation or test split.

    When called on the validation split with ``epoch <= params.n_epochs`` it
    also drives the training schedule: an improved accuracy saves the model's
    state dict to ``params.outputdir/params.outputmodelname``; otherwise the
    SGD learning rate is divided by ``params.lrshrink`` (training stops once
    it falls below ``params.minlr``), and with adam training stops at the
    second non-improving evaluation.

    Args:
        epoch: current epoch; pass a value above ``params.n_epochs`` to
            evaluate without saving or touching the schedule.
        eval_type: ``'valid'`` for the validation split, anything else for
            the test split.
        final_eval: selects the "finalgrep" log line instead of "togrep".

    Returns:
        Accuracy in percent, rounded to 2 decimals.
    """
    global val_acc_best, lr, stop_training, adam_stop

    nli_net.eval()
    correct = 0

    if eval_type == 'valid':
        print('\nVALIDATION : Epoch {0}'.format(epoch))

    s1 = valid['s1'] if eval_type == 'valid' else test['s1']
    s2 = valid['s2'] if eval_type == 'valid' else test['s2']
    target = valid['label'] if eval_type == 'valid' else test['label']

    # Only the argmax of the scores is needed: skip autograd bookkeeping so
    # that evaluation does not hold on to computation graphs.
    with torch.no_grad():
        for i in range(0, len(s1), params.batch_size):
            # prepare batch
            s1_batch, s1_len = get_batch(s1[i:i + params.batch_size], word_vec)
            s2_batch, s2_len = get_batch(s2[i:i + params.batch_size], word_vec)
            s1_batch, s2_batch = s1_batch.cuda(), s2_batch.cuda()
            tgt_batch = torch.LongTensor(target[i:i + params.batch_size]).cuda()

            # model forward
            output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

            pred = output.data.max(1)[1]
            correct += pred.long().eq(tgt_batch.data.long()).sum().item()

    # save model
    eval_acc = round(100 * correct / len(s1), 2)
    if final_eval:
        print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
    else:
        print('togrep : results : epoch {0} ; mean accuracy {1} :\
              {2}'.format(epoch, eval_type, eval_acc))

    if eval_type == 'valid' and epoch <= params.n_epochs:
        if eval_acc > val_acc_best:
            print('saving model at epoch {0}'.format(epoch))
            if not os.path.exists(params.outputdir):
                os.makedirs(params.outputdir)
            torch.save(nli_net.state_dict(), os.path.join(params.outputdir,
                       params.outputmodelname))
            val_acc_best = eval_acc
        else:
            if 'sgd' in params.optimizer:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / params.lrshrink
                print('Shrinking lr by : {0}. New lr = {1}'
                      .format(params.lrshrink,
                              optimizer.param_groups[0]['lr']))
                if optimizer.param_groups[0]['lr'] < params.minlr:
                    stop_training = True
            if 'adam' in params.optimizer:
                # early stopping (at 2nd decrease in accuracy)
                stop_training = adam_stop
                adam_stop = True
    return eval_acc


"""
Train model on Natural Language Inference task
"""
epoch = 1

# Train until the schedule in evaluate() asks to stop or n_epochs is reached.
while not stop_training and epoch <= params.n_epochs:
    train_acc = trainepoch(epoch)
    eval_acc = evaluate(epoch, 'valid')
    epoch += 1

# Run best model on test set.
# Reload the checkpoint written by evaluate() at the best validation accuracy;
# without this the final numbers describe the last epoch's weights instead.
# load_state_dict() needs the loaded dict, hence torch.load() on the path.
# The val_acc_best test makes sure the file was written during this run.
best_model_path = os.path.join(params.outputdir, params.outputmodelname)
if val_acc_best > -1e10 and os.path.isfile(best_model_path):
    nli_net.load_state_dict(torch.load(best_model_path))

print('\nTEST : Epoch {0}'.format(epoch))
# epoch=1e6 exceeds n_epochs, so this validation pass neither saves the model
# nor changes the learning-rate schedule.
evaluate(1e6, 'valid', True)
evaluate(0, 'test', True)

# Save encoder instead of full model
# The directory may not exist yet if no checkpoint was written above.
os.makedirs(params.outputdir, exist_ok=True)
torch.save(nli_net.encoder.state_dict(), os.path.join(params.outputdir, params.outputmodelname + '.encoder.pkl'))
#save entire model...


print("fin")



togrep : ['-f', '/run/user/1000/jupyter/kernel-d26b709f-8d73-462c-a71e-4a231a03dde9.json']

Namespace(LSTM_num_layers=1, batch_size=64, data_dir='/home/dc/cs230_project/dataset', decay=0.99, dpout_fc=0.0, dpout_model=0.0, enc_lstm_dim=2048, encoder_type='InnerAttentionNAACLEncoder', fc_dim=512, gpu_id=0, lrshrink=5, max_norm=5.0, minlr=1e-05, n_classes=3, n_enc_layers=1, n_epochs=40, nlipath='/home/dc/cs230_project/dataset/SNLI', nonlinear_fc=0, optimizer='sgd,lr=0.1', outputdir='savedir/', outputmodelname='Hierattn.pickle', pool_type='max', seed=1234, weight_decay=0.0005, word_emb_dim=300)
loading from:/home/dc/cs230_project/dataset/SNLI
** TRAIN DATA : Found 549367 pairs of train sentences.
** DEV DATA : Found 9842 pairs of dev sentences.
** TEST DATA : Found 9824 pairs of test sentences.
Found 38957(/43479) words with glove vectors
Vocab size : 38957
self.inputdim:16384, self.fc_dim:512
<class 'int'> <class 'int'>
NLINet(
  (encoder): InnerAttentionNAACLEncoder(
    (enc_lstm): LST




TRAINING : Epoch 1
type(permutation):<class 'numpy.ndarray'>
type(train['s1']):<class 'numpy.ndarray'>
Learning rate : 0.1




<class 'torch.Tensor'> tensor(2128) 2128
6336 ; loss 1.1 accuracy:33.25 ;
<class 'torch.Tensor'> tensor(4314) 4314
12736 ; loss 1.1 accuracy:33.7 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1248, 0.1064, 0.1060, 0.1144, 0.0880, 0.0751, 0.0907, 0.1035, 0.0901,
        0.1011, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1409, 0.1178, 0.1226, 0.0865, 0.1128, 0.1025, 0.0998, 0.1007, 0.1164,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1358, 0.1302, 0.1058, 0.1106, 0.0908, 0.1107, 0.1119, 0.0954, 0.1089,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1407, 0.1356, 0.1098, 0.1158, 0.0919, 0.1110, 0.0938, 0.0928, 0.1084,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) te

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1450, 0.1318, 0.1121, 0.0888, 0.0939, 0.1202, 0.1017, 0.0971, 0.1093,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2108, 0.1299, 0.1375, 0.1479, 0.1734, 0.2005, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0669, 0.0637, 0.0309, 0.0375, 0.0

<class 'torch.Tensor'> tensor(58561) 58561
172736 ; loss 1.1 accuracy:33.89 ;
<class 'torch.Tensor'> tensor(60930) 60930
179136 ; loss 1.1 accuracy:34.0 ;
<class 'torch.Tensor'> tensor(63193) 63193
185536 ; loss 1.1 accuracy:34.05 ;
<class 'torch.Tensor'> tensor(65551) 65551
191936 ; loss 1.1 accuracy:34.14 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0705, 0.0747, 0.0482, 0.0472, 0.0525, 0.0464, 0.0519, 0.0631, 0.0606,
        0.0658, 0.0514, 0.0661, 0.0576, 0.0459, 0.0490, 0.0484, 0.0478, 0.0530,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0904, 0.0800, 0.0831, 0.0604, 0.0757, 0.0774, 0.0849, 0.0744, 0.0858,
        0.0618, 0.0529, 0.

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1316, 0.1055, 0.0918, 0.0823, 0.0942, 0.1066, 0.1050, 0.0900, 0.0936,
        0.0995, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1008, 0.0947, 0.0970, 0.0884, 0.1081, 0.1024, 0.0855, 0.0912, 0.0812,
        0.0733, 0.0774, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.12

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0626, 0.0529, 0.0398, 0.0454, 0.0413, 0.0384, 0.0513, 0.0479, 0.0580,
        0.0508, 0.0466, 0.0502, 0.0595, 0.0520, 0.0653, 0.0553, 0.0642, 0.0594,
        0.0593, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2398, 0.2088, 0.1757, 0.1695, 0.2062, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0337, 0.0266, 0.0276, 0.0250, 0.0250, 0.0387, 0.0288, 0.0371, 0.0318,
   

<class 'torch.Tensor'> tensor(122614) 122614
345536 ; loss 1.1 accuracy:35.48 ;
<class 'torch.Tensor'> tensor(124956) 124956
351936 ; loss 1.1 accuracy:35.5 ;
<class 'torch.Tensor'> tensor(127460) 127460
358336 ; loss 1.1 accuracy:35.56 ;
<class 'torch.Tensor'> tensor(129968) 129968
364736 ; loss 1.1 accuracy:35.63 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0529, 0.0416, 0.0328, 0.0251, 0.0373, 0.0409, 0.0571, 0.0586, 0.0485,
        0.0470, 0.0446, 0.0551, 0.0469, 0.0494, 0.0503, 0.0497, 0.0571, 0.0552,
        0.0597, 0.0446, 0.0453, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,

<class 'torch.Tensor'> tensor(150635) 150635
415936 ; loss 1.1 accuracy:36.21 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0329, 0.0275, 0.0195, 0.0317, 0.0256, 0.0297, 0.0254, 0.0304, 0.0373,
        0.0324, 0.0401, 0.0362, 0.0422, 0.0418, 0.0425, 0.0418, 0.0453, 0.0393,
        0.0383, 0.0282, 0.0195, 0.0305, 0.0345, 0.0310, 0.0377, 0.0304, 0.0333,
        0.0356, 0.0299, 0.0296, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1007, 0.0823, 0.0564, 0.0716, 0.0659, 0.0821, 0.0778, 0.0553, 0.0955,
        0.1164, 0.1034, 0.0926, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) te

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0678, 0.0566, 0.0432, 0.0727, 0.0720, 0.0884, 0.0684, 0.0796, 0.0717,
        0.0985, 0.1026, 0.0626, 0.0581, 0.0579, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1463, 0.1560, 0.1146, 0.1362, 0.1319, 0.1092, 0.1028, 0.1030, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(174472) 174472
473536 ; loss 1.1 accuracy:36.84 ;
<class 'torch.Tensor'> tensor(177040) 177040
479936 ; loss 1.1 accuracy:36.88 

<class 'torch.Tensor'> tensor(190568) 190568
518336 ; loss 1.1 accuracy:36.76 ;
<class 'torch.Tensor'> tensor(192758) 192758
524736 ; loss 1.1 accuracy:36.73 ;
<class 'torch.Tensor'> tensor(195335) 195335
531136 ; loss 1.1 accuracy:36.77 ;
<class 'torch.Tensor'> tensor(197929) 197929
537536 ; loss 1.1 accuracy:36.82 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0492, 0.0438, 0.0467, 0.0519, 0.0450, 0.0280, 0.0328, 0.0435, 0.0533,
        0.0456, 0.0574, 0.0543, 0.0580, 0.0492, 0.0310, 0.0453, 0.0408, 0.0626,
        0.0450, 0.0421, 0.0370, 0.0375, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1011, 0.0969, 0.0980, 0.0783, 0.0782, 0.0973, 0.1133, 0.0921, 0.0879,
        0.0777, 0.0791, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0485, 0.0394, 0.0356, 0.0418, 0.0437, 0.0434, 0.0435, 0.0428, 0.0461,
        0.0550, 0.0569, 0.0492, 0.0413, 0.0406, 0.0708, 0.0513, 0.0617, 0.0498,
        0.0499, 0.0449, 0.0438, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1508, 0.1374, 0.1118, 0.1195, 0.1005, 0.1406, 0.1196, 0.1198, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0436, 0.0446, 0.0400, 0.0389, 0.0

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0774, 0.0514, 0.0354, 0.0453, 0.0681, 0.0844, 0.0703, 0.0866, 0.0809,
        0.0531, 0.0716, 0.0605, 0.0732, 0.0678, 0.0739, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1348, 0.0924, 0.0687, 0.0961, 0.1370, 0.1143, 0.1213, 0.1134, 0.1221,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, devi

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0822, 0.0578, 0.0670, 0.0518, 0.0552, 0.0688, 0.0685, 0.0797, 0.0767,
        0.0707, 0.0989, 0.0810, 0.0685, 0.0731, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1393, 0.1010, 0.0973, 0.0835, 0.1022, 0.0964, 0.0836, 0.0903, 0.0940,
        0.1124, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(70621) 70621
166336 ; loss 1.1 accuracy:42.44 ;
<class 'torch.Tensor'> tensor(73545) 73545
172736 ; loss 1.1 accuracy:42.56 ;
<class 'torch.Ten

<class 'torch.Tensor'> tensor(94234) 94234
217536 ; loss 1.1 accuracy:43.31 ;
<class 'torch.Tensor'> tensor(97201) 97201
223936 ; loss 1.1 accuracy:43.39 ;
<class 'torch.Tensor'> tensor(100241) 100241
230336 ; loss 1.1 accuracy:43.51 ;
<class 'torch.Tensor'> tensor(103256) 103256
236736 ; loss 1.1 accuracy:43.6 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0680, 0.0609, 0.0631, 0.0601, 0.0576, 0.0701, 0.0624, 0.0725, 0.0852,
        0.0909, 0.1048, 0.0911, 0.0540, 0.0594, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0539, 0.0292, 0.0261, 0.0253, 0.0275, 0.0387, 0.0649, 0.0512, 0.0558,
        0.0735, 0.0889, 0.0663, 0.0511, 0.0640, 0.0455, 0.0590, 0.0785, 0.0463,
        0.0542, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1821, 0.0990, 0.0899, 0.0900, 0.1040, 0.1600, 0.1161, 0.1588, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.03

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0649, 0.0437, 0.0283, 0.0281, 0.0344, 0.0392, 0.0515, 0.0671, 0.0929,
        0.1137, 0.0759, 0.0716, 0.0555, 0.0646, 0.0602, 0.0452, 0.0630, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1095, 0.0556, 0.0979, 0.0640, 0.0962, 0.0921, 0.0842, 0.1239, 0.1012,
        0.0730, 0.1023, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(154221) 154221
345536 ; loss 1.1 accuracy:44.62 ;
<class 'torch.Tensor'> tensor(157201) 157201
351936 ; loss 1.1 accuracy:44.66 ;
<class 'torch.Tensor'> tensor(160309) 160309
358336 ; loss 1.1 accuracy:44.73 ;
<class 'torch

<class 'torch.Tensor'> tensor(181056) 181056
403136 ; loss 1.1 accuracy:44.9 ;
<class 'torch.Tensor'> tensor(183908) 183908
409536 ; loss 1.1 accuracy:44.9 ;
<class 'torch.Tensor'> tensor(186721) 186721
415936 ; loss 1.1 accuracy:44.88 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0774, 0.0449, 0.0317, 0.0443, 0.0682, 0.0966, 0.0801, 0.0764, 0.0562,
        0.0949, 0.0577, 0.0598, 0.0523, 0.0517, 0.0403, 0.0676, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0987, 0.0572, 0.0481, 0.0722, 0.1298, 0.1181, 0.1128, 0.1222, 0.0859,
        0.0597, 0.0953, 0.0000, 0.0000, 0.0000, 0.0000, 0.

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1205, 0.0604, 0.0524, 0.0733, 0.1005, 0.1554, 0.1704, 0.1150, 0.1520,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1668, 0.0772, 0.0869, 0.1189, 0.1751, 0.1240, 0.0866, 0.1645, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, dev

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1049, 0.0240, 0.0176, 0.0277, 0.0422, 0.0478, 0.0610, 0.1024, 0.1279,
        0.1168, 0.1025, 0.0645, 0.0490, 0.1117, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1192, 0.0475, 0.0337, 0.1117, 0.1611, 0.1661, 0.1316, 0.0819, 0.1471,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(231193) 231193
518336 ; loss 1.09 accuracy:44.6 ;
<class 'torch.Tensor'> tensor(233937) 233937

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0620, 0.0283, 0.0281, 0.0293, 0.0604, 0.0874, 0.1173, 0.1166, 0.1317,
        0.1064, 0.0850, 0.0559, 0.0264, 0.0651, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2013, 0.0366, 0.0284, 0.0468, 0.1040, 0.0854, 0.1456, 0.0903, 0.0660,
        0.1956, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0420, 0.00

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2047, 0.0357, 0.0551, 0.0885, 0.0533, 0.1026, 0.0687, 0.0623, 0.0984,
        0.0547, 0.1760, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2786, 0.0931, 0.1215, 0.0630, 0.0714, 0.0711, 0.0641, 0.2371, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(40972) 40972
89536 ; loss 1.09 accuracy:45.73 ;
<class 'torch.Tensor'> tensor(43877) 43877
95936 ; loss 1.09 accuracy:45.71 ;
<class 'torch.Tensor'> tensor(46845) 46845
102336 ; loss 1.09 ac

<class 'torch.Tensor'> tensor(69897) 69897
153536 ; loss 1.09 accuracy:45.51 ;
<class 'torch.Tensor'> tensor(72805) 72805
159936 ; loss 1.09 accuracy:45.5 ;
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2357, 0.0253, 0.0223, 0.1184, 0.0911, 0.1306, 0.0963, 0.0535, 0.2268,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2099, 0.0561, 0.0317, 0.0623, 0.0971, 0.0809, 0.1295, 0.0908, 0.0461,
        0.1956, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<Se

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0627, 0.0098, 0.0107, 0.0178, 0.0231, 0.0161, 0.0473, 0.0214, 0.0117,
        0.0344, 0.0324, 0.0253, 0.0261, 0.0189, 0.0338, 0.0443, 0.0703, 0.0735,
        0.0572, 0.0766, 0.0917, 0.0513, 0.0354, 0.0234, 0.0252, 0.0088, 0.0508,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0914, 0.0195, 0.0174, 0.0447, 0.0442, 0.0358, 0.0612, 0.0527, 0.0558,
        0.0498, 0.0746, 0.0462, 0.0823, 0.1247, 0.0494, 0.0434, 0.0130, 0.0202,
        0.0105, 0.0631, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1272, 0.0099, 0.01

w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0720, 0.0039, 0.0022, 0.0057, 0.0107, 0.0139, 0.0269, 0.0386, 0.0702,
        0.0298, 0.0086, 0.0146, 0.0201, 0.0154, 0.0206, 0.0502, 0.0549, 0.1129,
        0.0410, 0.0706, 0.0649, 0.0385, 0.0315, 0.0310, 0.0485, 0.0182, 0.0109,
        0.0121, 0.0075, 0.0542], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1773, 0.0226, 0.0223, 0.1952, 0.1211, 0.0780, 0.0608, 0.0579, 0.0913,
        0.0401, 0.1334, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3648, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0737, 0.0043, 0.0095, 0.0056, 0.0184, 0.0572, 0.0410, 0.0591, 0.1216,
        0.1629, 0.2864, 0.0725, 0.0

<class 'torch.Tensor'> tensor(141143) 141143
313536 ; loss 1.07 accuracy:45.01 ;
<class 'torch.Tensor'> tensor(143990) 143990
319936 ; loss 1.07 accuracy:45.0 ;
<class 'torch.Tensor'> tensor(146797) 146797
326336 ; loss 1.07 accuracy:44.97 ;
<class 'torch.Tensor'> tensor(149696) 149696
332736 ; loss 1.07 accuracy:44.98 ;
w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1966, 0.0119, 0.0129, 0.0128, 0.0211, 0.0399, 0.0583, 0.0582, 0.0358,
        0.0890, 0.0913, 0.0990, 0.0391, 0.0567, 0.0284, 0.0195, 0.0180, 0.0108,
        0.1005, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0813, 0.0049, 0.0053, 0.0049, 0.0224, 0.0359, 0.0628, 0.0594, 0.3052,
        0.0761

w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2850, 0.0138, 0.0224, 0.0095, 0.0525, 0.0428, 0.0792, 0.0271, 0.0589,
        0.1042, 0.1281, 0.0583, 0.0119, 0.1064, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3291, 0.0080, 0.0057, 0.0375, 0.0345, 0.0563, 0.0262, 0.0616, 0.1122,
        0.1386, 0.0630, 0.0129, 0.1144, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4250, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3869, 0.0219, 0.0183, 0.0390, 0.0383, 0.1360, 0.1279, 0.0504, 0.0286,
    

w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1300, 0.0120, 0.0127, 0.0094, 0.0139, 0.0603, 0.0412, 0.4402, 0.2184,
        0.0230, 0.0055, 0.0334, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3649, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2511, 0.0237, 0.0263, 0.0214, 0.0425, 0.1322, 0.2989, 0.0511, 0.0381,
        0.0457, 0.0689, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(198595) 198595
441536 ; loss 1.05 accuracy:44.97 ;
<class 'torch.Tensor'> tenso

<class 'torch.Tensor'> tensor(221980) 221980
492736 ; loss 1.03 accuracy:45.04 ;
<class 'torch.Tensor'> tensor(225031) 225031
499136 ; loss 1.03 accuracy:45.08 ;
<class 'torch.Tensor'> tensor(227989) 227989
505536 ; loss 1.03 accuracy:45.09 ;
<class 'torch.Tensor'> tensor(231002) 231002
511936 ; loss 1.03 accuracy:45.12 ;
w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1227, 0.0100, 0.0184, 0.0300, 0.2609, 0.1537, 0.0533, 0.1993, 0.0704,
        0.0242, 0.0283, 0.0057, 0.0230, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.5911, 0.0322, 0.0355, 0.0399, 0.1245, 0.1767, 0.000

w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0601, 0.0053, 0.0052, 0.0035, 0.0081, 0.0128, 0.3702, 0.0093, 0.0058,
        0.0090, 0.0258, 0.0490, 0.0111, 0.1119, 0.1701, 0.1125, 0.0103, 0.0084,
        0.0033, 0.0018, 0.0066, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.5596, 0.0290, 0.0813, 0.0312, 0.0543, 0.0405, 0.0587, 0.0260, 0.0215,
        0.0979, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.

w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.7665, 0.0059, 0.0214, 0.0324, 0.0149, 0.0510, 0.0197, 0.0207, 0.0145,
        0.0530, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3650, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2058, 0.0016, 0.0063, 0.0103, 0.0055, 0.0237, 0.0131, 0.0236, 0.2411,
        0.4282, 0.0156, 0.0064, 0.0041, 0.0147, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(30676) 30676
63936 ; los

<class 'torch.Tensor'> tensor(58756) 58756
121536 ; loss 1.0 accuracy:48.32 ;
<class 'torch.Tensor'> tensor(61894) 61894
127936 ; loss 1.0 accuracy:48.35 ;
<class 'torch.Tensor'> tensor(65115) 65115
134336 ; loss 0.99 accuracy:48.45 ;
<class 'torch.Tensor'> tensor(80797) 80797
166336 ; loss 1.0 accuracy:48.56 ;
<class 'torch.Tensor'> tensor(84002) 84002
172736 ; loss 0.99 accuracy:48.61 ;
<class 'torch.Tensor'> tensor(87150) 87150
179136 ; loss 0.99 accuracy:48.63 ;
<class 'torch.Tensor'> tensor(90372) 90372
185536 ; loss 0.99 accuracy:48.69 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3730, 0.0041, 0.0068, 0.0057, 0.0172, 0.0161, 0.0303, 0.3568, 0.1371,
        0.0182, 0.0349, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3450, 0.0132, 0.1334, 0.2211, 0.1999, 0.0199, 0.0368, 0.0102, 0.0206,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2363, 0.0166, 0.1384, 0.0328, 0.1359, 0.1021, 0.0231, 0.2928, 0.0085,
        0.0134, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<Max

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0365, 0.0005, 0.0005, 0.0011, 0.0014, 0.0305, 0.0005, 0.0021, 0.0031,
        0.0110, 0.0084, 0.0040, 0.0227, 0.0046, 0.0011, 0.0076, 0.0116, 0.3499,
        0.3820, 0.1100, 0.0053, 0.0056, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2438, 0.0258, 0.0296, 0.0197, 0.0340, 0.1192, 0.4261, 0.1018, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4251, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.5527, 0.0075, 0.0067, 0.0147, 0.0170, 0.0172, 0.0606, 0.0327, 0.0270,
        0.

<class 'torch.Tensor'> tensor(167548) 167548
339136 ; loss 0.98 accuracy:49.4 ;
<class 'torch.Tensor'> tensor(170763) 170763
345536 ; loss 0.98 accuracy:49.41 ;
<class 'torch.Tensor'> tensor(173985) 173985
351936 ; loss 0.98 accuracy:49.43 ;
<class 'torch.Tensor'> tensor(177267) 177267
358336 ; loss 0.98 accuracy:49.46 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1145, 0.0019, 0.0069, 0.0096, 0.0494, 0.0070, 0.0088, 0.0528, 0.0515,
        0.1535, 0.0456, 0.0251, 0.0172, 0.2657, 0.1609, 0.0136, 0.0161, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2677, 0.0034, 0.0024, 0.0026, 0.0067, 0.0268, 0.0169, 0.0705, 0.0519,
        0.0654, 0.0148, 0.0059, 0.0102, 0.0241, 0.0116, 0.0081, 0.0104, 0.0325,
        0.0961

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2308, 0.0060, 0.0636, 0.0208, 0.1604, 0.0598, 0.0098, 0.0081, 0.0079,
        0.0087, 0.0102, 0.0226, 0.0272, 0.0186, 0.0044, 0.0036, 0.1723, 0.1370,
        0.0121, 0.0162, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.6034, 0.0502, 0.0100, 0.0230, 0.2446, 0.0264, 0.0424, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.7436, 0.0151, 0.0153, 0.0172, 0.0249, 0.0261, 0.0

<class 'torch.Tensor'> tensor(232857) 232857
467136 ; loss 0.97 accuracy:49.84 ;
<class 'torch.Tensor'> tensor(236044) 236044
473536 ; loss 0.97 accuracy:49.84 ;
<class 'torch.Tensor'> tensor(239298) 239298
479936 ; loss 0.97 accuracy:49.85 ;
<class 'torch.Tensor'> tensor(242661) 242661
486336 ; loss 0.95 accuracy:49.89 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.6445, 0.1009, 0.0216, 0.0282, 0.0091, 0.0144, 0.0098, 0.0125, 0.0142,
        0.0184, 0.0017, 0.0007, 0.0041, 0.0028, 0.0218, 0.0127, 0.0137, 0.0286,
        0.0402, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.7867, 0.0142, 0.0086, 0.0122, 0.0092, 0.0562, 0.0487, 0.064

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3579, 0.0113, 0.0092, 0.0177, 0.1512, 0.0181, 0.0279, 0.0340, 0.0080,
        0.0471, 0.2562, 0.0318, 0.0297, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0224, 0.0008, 0.0009, 0.0031, 0.0737, 0.0123, 0.0025, 0.0011, 0.0183,
        0.7744, 0.0235, 0.0626, 0.0044, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3207, 0.0672, 0.1

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.6166, 0.0205, 0.0169, 0.0452, 0.0581, 0.0488, 0.1051, 0.0468, 0.0421,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0705, 0.0023, 0.0020, 0.0047, 0.0470, 0.0247, 0.0025, 0.0520, 0.7269,
        0.0209, 0.0356, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(19766) 19766
38336 ; loss 0.97 accuracy:51.47 ;
<class 'torch.Tensor'> tensor(23114) 23114
44736 ; loss 0.97 a

<class 'torch.Tensor'> tensor(46298) 46298
89536 ; loss 0.96 accuracy:51.67 ;
<class 'torch.Tensor'> tensor(49600) 49600
95936 ; loss 0.97 accuracy:51.67 ;
<class 'torch.Tensor'> tensor(52921) 52921
102336 ; loss 0.96 accuracy:51.68 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2742, 0.0526, 0.0215, 0.0396, 0.3694, 0.0397, 0.0200, 0.1239, 0.0245,
        0.0196, 0.0150, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       dev

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2538, 0.0461, 0.0130, 0.0062, 0.0093, 0.0160, 0.0570, 0.0125, 0.0262,
        0.0115, 0.0283, 0.0624, 0.0102, 0.0096, 0.0130, 0.1093, 0.0258, 0.0084,
        0.0090, 0.0339, 0.0198, 0.1864, 0.0204, 0.0121, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1466, 0.0363, 0.0234, 0.0209, 0.2316, 0.0320, 0.1841, 0.2932, 0.0320,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2279, 0.0530, 0.0227, 0.0185, 0.0139, 0.0063, 0.0215, 0.0199, 0.0082,
        0.0305, 0.0373, 0.0375, 0.4348, 0.0502, 0.0179, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3031, 0.0809, 0.0853, 0.0686, 0.0648, 0.0593, 0.2890, 0.0316, 0.0174,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(109453) 109453
211136 ; loss 0.96 accuracy:51.82 ;
<class 'torch.Tensor'> tensor(112794

<class 'torch.Tensor'> tensor(136182) 136182
262336 ; loss 0.97 accuracy:51.9 ;
<class 'torch.Tensor'> tensor(139577) 139577
268736 ; loss 0.95 accuracy:51.93 ;
<class 'torch.Tensor'> tensor(142902) 142902
275136 ; loss 0.96 accuracy:51.93 ;
<class 'torch.Tensor'> tensor(146216) 146216
281536 ; loss 0.96 accuracy:51.92 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4662, 0.0210, 0.0576, 0.0114, 0.0102, 0.0252, 0.0182, 0.0174, 0.0581,
        0.2687, 0.0293, 0.0167, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3833, 0.0174, 0.0479, 0.0096, 0.0087, 0.022

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2888, 0.0149, 0.0145, 0.0588, 0.0130, 0.0371, 0.0191, 0.0215, 0.0804,
        0.1812, 0.0561, 0.0748, 0.1058, 0.0216, 0.0124, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.7424, 0.0366, 0.0321, 0.0783, 0.0701, 0.0404, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', 

<class 'torch.Tensor'> tensor(202903) 202903
390336 ; loss 0.95 accuracy:51.97 ;
<class 'torch.Tensor'> tensor(206294) 206294
396736 ; loss 0.94 accuracy:51.99 ;
<class 'torch.Tensor'> tensor(209644) 209644
403136 ; loss 0.95 accuracy:52.0 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2748, 0.0118, 0.0413, 0.1556, 0.1163, 0.0301, 0.0102, 0.0072, 0.0111,
        0.0116, 0.0212, 0.0080, 0.0206, 0.0933, 0.1396, 0.0319, 0.0154, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2404, 0.0103, 0.0360, 0.1358, 0.1014, 0.0262, 0.0089, 0.0062, 0.0097,
        0.0103, 0.0193, 0.0077, 0.0225, 0.1900, 0.1342, 0.0279, 0.0131, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4160, 0.0185, 0.1046, 0.0516, 0.0116, 0.0191, 0.2353, 0.0366, 0.0210,
        0.0326, 0.0144, 0.0216, 0.0169, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.6581, 0.0444, 0.0391, 0.0461, 0.1593, 0.0531, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3749, 0.2338, 0.0178, 0.0349, 0.0990, 0.0203, 0.06

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0849, 0.0031, 0.0027, 0.0008, 0.0062, 0.0340, 0.0018, 0.0109, 0.0429,
        0.0117, 0.0615, 0.1006, 0.0257, 0.0948, 0.0232, 0.1164, 0.3540, 0.0174,
        0.0074, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1718, 0.0251, 0.0178, 0.0042, 0.0343, 0.2716, 0.0327, 0.1284, 0.0384,
        0.0225, 0.0779, 0.0905, 0.0603, 0.0148, 0.0099],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(266650) 266650
511936 ; loss 0.95 accuracy:52.08 ;
<class 'torch.Tensor'> tensor(269988) 269988
518336 ; loss 0.95 accuracy:52.08 ;
<class 'torch.Tensor'> tensor(273420) 273420
524736 ; loss 0.94 accuracy:52.

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2207, 0.0170, 0.0422, 0.1228, 0.1753, 0.0321, 0.0235, 0.1454, 0.0223,
        0.0764, 0.0160, 0.0180, 0.0481, 0.0300, 0.0101, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3552, 0.0189, 0.0260, 0.0850, 0.0502, 0.1931, 0.1403, 0.0998, 0.0315,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4252, device='cuda:0', grad_fn=<MinBackwar

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4008, 0.0169, 0.0128, 0.0076, 0.0108, 0.0131, 0.0304, 0.0229, 0.0170,
        0.0289, 0.1846, 0.0322, 0.0180, 0.0238, 0.1099, 0.0499, 0.0202, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3392, 0.0141, 0.0113, 0.0187, 0.5410, 0.0560, 0.0198, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(53988) 53988
102336 ; loss 0.94 accuracy:52.72 ;
<class 'torch.Tensor'> tensor(57376) 57376
108736 ; loss 0.94 accuracy:52.74 ;
w tensor(3.3652, device

<class 'torch.Tensor'> tensor(84581) 84581
159936 ; loss 0.94 accuracy:52.86 ;
<class 'torch.Tensor'> tensor(87979) 87979
166336 ; loss 0.94 accuracy:52.87 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3696, 0.0170, 0.0204, 0.0229, 0.0143, 0.1278, 0.0280, 0.0458, 0.1211,
        0.0176, 0.0148, 0.0222, 0.0121, 0.0083, 0.0037, 0.0092, 0.0280, 0.0448,
        0.0354, 0.0242, 0.0130, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3211, 0.0149, 0.0178, 0.0198, 0.0116, 0.0368, 0.0226, 0.0561, 0.3325,
        0.0275, 0.0121, 0.0075, 0.0031, 0.0074, 0.0205, 0.0125, 0.0140, 0.0070,
        0.0284, 0.0169, 0.0101, 0.0000],
       device='cuda:0', grad_fn=<SelectBac

<class 'torch.Tensor'> tensor(115121) 115121
217536 ; loss 0.95 accuracy:52.9 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0701, 0.0044, 0.0046, 0.0035, 0.0038, 0.0208, 0.0142, 0.0973, 0.3371,
        0.1144, 0.0130, 0.0368, 0.0900, 0.1162, 0.0134, 0.0176, 0.0285, 0.0112,
        0.0031, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.5741, 0.0316, 0.0248, 0.0276, 0.0678, 0.1245, 0.1150, 0.0346, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0836, 0.0059, 0.0

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2078, 0.0500, 0.0305, 0.1269, 0.0330, 0.0294, 0.0869, 0.0122, 0.0242,
        0.0212, 0.0333, 0.0378, 0.2713, 0.0276, 0.0080, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2978, 0.1717, 0.0776, 0.4106, 0.0422, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(149175) 149175
281536 ; loss 0.94 accuracy:52.97 ;
<class 'torch.Tensor'> tensor(152619) 152619


w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4746, 0.0268, 0.0197, 0.0236, 0.0574, 0.0275, 0.0282, 0.0733, 0.0064,
        0.0135, 0.0300, 0.0226, 0.0851, 0.0544, 0.0422, 0.0144, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3625, 0.0256, 0.0260, 0.0250, 

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1580, 0.0118, 0.0213, 0.0096, 0.0056, 0.0068, 0.0014, 0.0006, 0.0045,
        0.0070, 0.0153, 0.0348, 0.4485, 0.0456, 0.0228, 0.0631, 0.1217, 0.0175,
        0.0041, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2062, 0.0132, 0.0192, 0.0171, 0.1429, 0.1279, 0.2514, 0.1482, 0.0603,
        0.0137, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(211195) 211195
396736 ; loss 0.94 accuracy:53.22 ;
<class 'torch.Tensor'> tensor(214675) 214675
403136 ; loss 

<class 'torch.Tensor'> tensor(235479) 235479
441536 ; loss 0.93 accuracy:53.32 ;
<class 'torch.Tensor'> tensor(238928) 238928
447936 ; loss 0.93 accuracy:53.33 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1154, 0.0221, 0.0164, 0.0174, 0.0696, 0.2259, 0.5113, 0.0220, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1408, 0.0386, 0.0143, 0.0759, 0.0161, 0.0245, 0.0114, 0.0452, 0.0279,
        0.0325, 0.2936, 0.0329, 0.0317, 0.1936, 0.0169, 0.0040, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) te

<class 'torch.Tensor'> tensor(280938) 280938
524736 ; loss 0.94 accuracy:53.53 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1632, 0.0113, 0.0122, 0.0067, 0.0111, 0.1759, 0.0764, 0.0573, 0.1251,
        0.3097, 0.0427, 0.0086, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1557, 0.0101, 0.0122, 0.0100, 0.0071, 0.0013, 0.0085, 0.0186, 0.3670,
        0.0177, 0.0364, 0.1987, 0.0418, 0.0838, 0.0244, 0.0066, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0836, 0.0051, 0.0060, 0.0030, 0.

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1449, 0.0102, 0.0278, 0.0078, 0.0040, 0.0039, 0.0130, 0.0341, 0.0158,
        0.1921, 0.0342, 0.1055, 0.0445, 0.3215, 0.0334, 0.0073, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2280, 0.0312, 0.0166, 0.0160, 0.0168, 0.0459, 0.0192, 0.0159, 0.2699,
        0.0613, 0.2296, 0.0401, 0.0094, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(13962) 13962
25536 ; loss 0.94 accuracy:54.54 ;
<class 'torch.Tensor'> tensor(17514) 17514
31936 ; loss 0.93 accuracy:54.73 ;
w tensor(3.3652, 

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1337, 0.0242, 0.0194, 0.0698, 0.0158, 0.2324, 0.0344, 0.0028, 0.0981,
        0.0549, 0.2980, 0.0128, 0.0036, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0832, 0.0099, 0.0205, 0.0155, 0.8290, 0.0356, 0.0063, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(38625) 38625
70336 ; loss 0.93 accuracy:54.87 ;
<class 'torch.Tensor'> tensor(42060) 42060
76736

<class 'torch.Tensor'> tensor(70703) 70703
127936 ; loss 0.93 accuracy:55.24 ;
<class 'torch.Tensor'> tensor(74187) 74187
134336 ; loss 0.94 accuracy:55.2 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0521, 0.0064, 0.0093, 0.0030, 0.0030, 0.0129, 0.0037, 0.0023, 0.0162,
        0.0244, 0.1899, 0.6414, 0.0309, 0.0044, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1111, 0.0184, 0.0461, 0.1366, 0.0306, 0.1365, 0.4585, 0.0543, 0.0078,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device=

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1974, 0.0102, 0.0052, 0.0122, 0.0141, 0.0290, 0.1278, 0.0538, 0.3661,
        0.0514, 0.0571, 0.0635, 0.0124, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4124, 0.0287, 0.0114, 0.0104, 0.0394, 0.4075, 0.0717, 0.0186, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2517, 0.0164, 0.0209, 0.

<class 'torch.Tensor'> tensor(157052) 157052
281536 ; loss 0.91 accuracy:55.77 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0786, 0.0017, 0.0130, 0.2271, 0.1366, 0.5187, 0.0216, 0.0028, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1363, 0.0243, 0.2446, 0.0246, 0.0179, 0.5175, 0.0305, 0.0043, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2261, 0.0185, 0

<class 'torch.Tensor'> tensor(186251) 186251
332736 ; loss 0.92 accuracy:55.96 ;
<class 'torch.Tensor'> tensor(189904) 189904
339136 ; loss 0.91 accuracy:55.99 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1158, 0.0101, 0.0248, 0.0062, 0.0475, 0.1178, 0.0143, 0.0189, 0.0422,
        0.0163, 0.0069, 0.0079, 0.0041, 0.0022, 0.0024, 0.0088, 0.0058, 0.0477,
        0.4641, 0.0324, 0.0037, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4181, 0.0366, 0.1384, 0.2243, 0.0383, 0.0877, 0.0316, 0.0197, 0.0053,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000,

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1273, 0.0384, 0.0299, 0.0447, 0.0668, 0.0218, 0.0525, 0.5369, 0.0090,
        0.0247, 0.0102, 0.0275, 0.0085, 0.0017, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4253, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0602, 0.0068, 0.0189, 0.0116, 0.0250, 0.1074, 0.7059, 0.0583, 0.0059,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(267723) 267723
473536 ; loss 0.91 accuracy:56.53 ;
<class 'torch.Tensor'> tensor(271496) 271496
479936

<class 'torch.Tensor'> tensor(294080) 294080
518336 ; loss 0.89 accuracy:56.73 ;
<class 'torch.Tensor'> tensor(297848) 297848
524736 ; loss 0.89 accuracy:56.75 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1047, 0.0156, 0.0568, 0.0133, 0.0067, 0.0492, 0.0062, 0.0110, 0.0090,
        0.1429, 0.3009, 0.0942, 0.1746, 0.0129, 0.0020, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3579, 0.0216, 0.0129, 0.0069, 0.0116, 0.0113, 0.0190, 0.1527, 0.3626,
        0.0381, 0.0053, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<M

<class 'torch.Tensor'> tensor(213981) 213981
358336 ; loss 0.86 accuracy:59.7 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1902, 0.0124, 0.0457, 0.0148, 0.0117, 0.1211, 0.0388, 0.0178, 0.0321,
        0.0374, 0.0865, 0.0469, 0.0654, 0.1351, 0.1208, 0.0203, 0.0029, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1260, 0.0091, 0.0608, 0.0202, 0.0129, 0.3485, 0.1674, 0.0721, 0.0233,
        0.0253, 0.1084, 0.0226, 0.0033, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas te

<class 'torch.Tensor'> tensor(249103) 249103
415936 ; loss 0.86 accuracy:59.88 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0687, 0.0564, 0.0636, 0.1553, 0.4367, 0.1012, 0.0182, 0.0170, 0.0713,
        0.0106, 0.0009, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0440, 0.0369, 0.0405, 0.1704, 0.0271, 0.0659, 0.0187, 0.0327, 0.5607,
        0.0033, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3

<class 'torch.Tensor'> tensor(280487) 280487
467136 ; loss 0.86 accuracy:60.04 ;
<class 'torch.Tensor'> tensor(284378) 284378
473536 ; loss 0.86 accuracy:60.05 ;
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1731, 0.0564, 0.0160, 0.0591, 0.3195, 0.1967, 0.1096, 0.0394, 0.0214,
        0.0073, 0.0016, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1168, 0.0510, 0.0226, 0.1322, 0.0775, 0.0468, 0.0577, 0.0414, 0.0132,
        0.0062, 0.0251, 0.1551, 0.0492, 0.0326, 0.1298, 0.0192, 0.0186, 0.0045,
        0.0007, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_f

w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1917, 0.0172, 0.0309, 0.0215, 0.0201, 0.0573, 0.0070, 0.0090, 0.0137,
        0.0278, 0.2112, 0.0589, 0.0184, 0.0115, 0.0079, 0.0229, 0.1263, 0.0311,
        0.0975, 0.0163, 0.0018, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3652, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1405, 0.0128, 0.0238, 0.0174, 0.0176, 0.0579, 0.0089, 0.0464, 0.0295,
        0.0152, 0.5892, 0.0379, 0.0027, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(324096) 324096
537536 ; loss 0.84 accuracy:60.29 ;
w tensor(3.3652, device='cuda:0', grad_fn=<

<class 'torch.Tensor'> tensor(83404) 83404
134336 ; loss 0.84 accuracy:62.06 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1550, 0.0165, 0.0915, 0.0888, 0.0186, 0.0324, 0.1820, 0.2202, 0.1393,
        0.0187, 0.0253, 0.0104, 0.0013, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2941, 0.0324, 0.1982, 0.1731, 0.0312, 0.0879, 0.0789, 0.0161, 0.0597,
        0.0255, 0.0030, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0260, 0.0128, 0.0332, 0.0154, 0.2740, 0.2188, 0.0137, 0.0071, 0.0702,
        0.1035, 0.1140, 0.0294, 0.0782, 0.0035, 0.0003, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0816, 0.0107, 0.0958, 0.0551, 0.0411, 0.0356, 0.6369, 0.0407, 0.0026,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0301, 0.0049, 0.0387, 0.0780, 0.0

<class 'torch.Tensor'> tensor(151655) 151655
243136 ; loss 0.83 accuracy:62.36 ;
<class 'torch.Tensor'> tensor(155695) 155695
249536 ; loss 0.83 accuracy:62.38 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1768, 0.0172, 0.2951, 0.0911, 0.0148, 0.0159, 0.0166, 0.0137, 0.0153,
        0.0310, 0.0196, 0.0138, 0.0124, 0.0236, 0.0109, 0.0018, 0.0062, 0.0295,
        0.0222, 0.1480, 0.0221, 0.0026, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2496, 0.0204, 0.2479, 0.0379, 0.1451, 0.0353, 0.0491, 0.1312, 0.0627,
        0.0179, 0.0030, 0.0000, 0.0000, 0.0000

<class 'torch.Tensor'> tensor(187996) 187996
300736 ; loss 0.84 accuracy:62.5 ;
<class 'torch.Tensor'> tensor(192069) 192069
307136 ; loss 0.82 accuracy:62.52 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.3155, 0.0336, 0.2115, 0.1685, 0.0097, 0.0280, 0.0160, 0.0091, 0.0340,
        0.0375, 0.0322, 0.0146, 0.0108, 0.0555, 0.0211, 0.0025, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.4005, 0.1265, 0.0320, 0.3022, 0.1314, 0.0074, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w ten

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0725, 0.0089, 0.0235, 0.0169, 0.0959, 0.0993, 0.0536, 0.1663, 0.0050,
        0.0116, 0.0632, 0.0582, 0.0517, 0.0362, 0.0211, 0.0363, 0.1653, 0.0133,
        0.0013, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1547, 0.0195, 0.0484, 0.0208, 0.0214, 0.0951, 0.0136, 0.0359, 0.1506,
        0.1865, 0.0611, 0.0316, 0.0240, 0.1113, 0.0233, 0.0023, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(224550) 224550
358336 ; loss 0.82 accuracy:62.65 ;
<class 'torch.Tensor'> tensor(228651) 228651
364736 ; loss 0.82 accuracy:62.68 ;
w tensor(3.

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0431, 0.0053, 0.0114, 0.0236, 0.0200, 0.0307, 0.0122, 0.5311, 0.0268,
        0.0338, 0.2540, 0.0071, 0.0009, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0500, 0.0047, 0.0064, 0.0069, 0.0329, 0.0497, 0.0365, 0.7965, 0.0151,
        0.0013, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(261393) 261393
415936 ; loss 0.81 accuracy:62.83 ;
<class 'torch.Tensor'> tensor(265515) 265515
422336 ; loss 0.81 accuracy:62.86 ;
w tensor(3.

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0413, 0.0046, 0.0069, 0.0023, 0.0076, 0.0383, 0.0913, 0.0128, 0.0206,
        0.2223, 0.2357, 0.0415, 0.0166, 0.2453, 0.0118, 0.0012, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0314, 0.0284, 0.0168, 0.2817, 0.2025, 0.0106, 0.0152, 0.4084, 0.0047,
        0.0004, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(298044) 298044
473536 ; loss 0.81 accuracy:62.93 ;
<class 'torch.Tensor'> tensor(302089) 302089
479936 ; loss 0.82 accuracy:62.94 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBack

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1221, 0.0113, 0.0671, 0.0946, 0.0737, 0.0467, 0.1690, 0.0122, 0.0435,
        0.3337, 0.0228, 0.0035, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1401, 0.0079, 0.0275, 0.0448, 0.7516, 0.0281, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(335012) 335012
531136 ; loss 0.81 accuracy:63.07 ;
<class 'torch.Tensor'> tensor(338988) 338988
537536 ; loss 

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0919, 0.0108, 0.0223, 0.0118, 0.0107, 0.0288, 0.0121, 0.0233, 0.0906,
        0.0340, 0.0204, 0.1243, 0.0749, 0.4118, 0.0294, 0.0029, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0722, 0.0097, 0.0343, 0.0090, 0.0125, 0.3342, 0.0413, 0.0122, 0.0480,
        0.0188, 0.0118, 0.0743, 0.0458, 0.2556, 0.0184, 0.0018, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.1248, 0.0478, 0.0295, 0.0498, 0.1643, 0.1382, 0.0290, 0.0393, 0.0800,
        0.0223, 0.0144, 0.0110, 0.0089, 0.0028, 0.0008, 0.0076, 0.0181, 0.1977,
        0.0117, 0.0016, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0862, 0.0044, 0.0201, 0.0271, 0.8410, 0.0191, 0.0022, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       d

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0572, 0.0108, 0.0890, 0.0646, 0.0336, 0.0236, 0.0247, 0.0130, 0.0083,
        0.0146, 0.0187, 0.0536, 0.0343, 0.5380, 0.0146, 0.0014, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0463, 0.0083, 0.0600, 0.0334, 0.3948, 0.0210, 0.0160, 0.0696, 0.3427,
        0.0071, 0.0008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(82695) 8

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2290, 0.0189, 0.1499, 0.0929, 0.0788, 0.0620, 0.0196, 0.0198, 0.0820,
        0.1818, 0.0583, 0.0070, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(120051) 120051
185536 ; loss 0.82 accuracy:64.68 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2051, 0.0294, 0.0224, 0.0137,

w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0965, 0.0123, 0.0445, 0.1251, 0.0287, 0.1925, 0.0369, 0.1790, 0.0470,
        0.0388, 0.0346, 0.0568, 0.0061, 0.0862, 0.0131, 0.0019, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0898, 0.0136, 0.0380, 0.0175, 0.0586, 0.3496, 0.0767, 0.0262, 0.3078,
        0.0201, 0.0019, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000], device='cuda:0', grad_fn=<SelectBackward>)
<class 'torch.Tensor'> tensor(157233) 157233
243136 ; loss 0.79 accuracy:64.65 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, de

<class 'torch.Tensor'> tensor(190681) 190681
294336 ; loss 0.78 accuracy:64.77 ;
<class 'torch.Tensor'> tensor(194790) 194790
300736 ; loss 0.8 accuracy:64.76 ;
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.2030, 0.2085, 0.5049, 0.0145, 0.0153, 0.0456, 0.0081, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward>)
w tensor(3.3651, device='cuda:0', grad_fn=<MaxBackward1>) tensor(-3.4254, device='cuda:0', grad_fn=<MinBackward1>)
alphas tensor([0.0436, 0.0109, 0.0058, 0.9119, 0.0258, 0.0021, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 

<class 'torch.Tensor'> tensor(228406) 228406
351936 ; loss 0.79 accuracy:64.89 ;
