In [1]:
import argparse, yaml
from openvqa.models.model_loader import CfgLoader
from utils1.exec1 import Execution

In [3]:
import os, copy
import sys
from openvqa.datasets.dataset_loader import DatasetLoader

class Execution:
    def __init__(self, __C):
        self.__C = __C

        print('Loading dataset........')
        self.dataset = DatasetLoader(__C).DataSet()

        # If trigger the evaluation after every epoch
        # Will create a new cfgs with RUN_MODE = 'val'
        self.dataset_eval = None
        if __C.EVAL_EVERY_EPOCH:
            __C_eval = copy.deepcopy(__C)
            setattr(__C_eval, 'RUN_MODE', 'val')

            print('Loading validation set for per-epoch evaluation........')
            self.dataset_eval = DatasetLoader(__C_eval).DataSet()


    def run(self, run_mode):
        
        if run_mode == 'train':
            if self.__C.RESUME is False:
                self.empty_log(self.__C.VERSION)
            train_engine(self.__C, self.dataset, self.dataset_eval)

        elif run_mode == 'val':
            test_engine(self.__C, self.dataset, validation=True)

        elif run_mode == 'test':
            test_engine(self.__C, self.dataset)

        else:
            exit(-1)


    def empty_log(self, version):
        print('Initializing log file........')
        if (os.path.exists(self.__C.LOG_PATH + '/log_run_' + version + '.txt')):
            os.remove(self.__C.LOG_PATH + '/log_run_' + version + '.txt')
        print('Finished!')
        print('')



In [4]:
def parse_args():
    '''
    Parse input arguments
    '''
    parser = argparse.ArgumentParser(description='OpenVQA Args')

    parser.add_argument('--RUN', dest='RUN_MODE',
                      choices=['train', 'val', 'test'],
                      help='{train, val, test}',
                      default='train',
                      type=str, required=False)

    parser.add_argument('--MODEL', dest='MODEL',
                      choices=[
                           'mcan_small',
                           'mcan_small_wa',
                           'mcan_large',
                           'ban_4',
                           #Edits
                           'ban_8_wa',
                           'baseline_wa',
                           #End of Edits
                           'ban_8',
                           'mfb',
                           'mfb_wa',
                           'mfh',
                           'mfh_wa',
                           'mem',
                           'butd',
                           'butd_wa',
                           'baseline',
                           'baseline_wa_no_fusion',
                           'positional',
                           'mcan_large_wa',
                           'mcan_small_augmented',
                           'mcan_small_without_a'
                           ]
                        ,
                      help='{'
                           'mcan_small,'
                           'mcan_small_wa,'
                           'mcan_large,'
                            #Edits
                           'ban_wa,'
                           'baseline_wa,'
                           #End of Edits
                           'ban_4,'
                           'ban_8,'
                           'mfb,'
                           'mfb_wa,'
                           'mfh,'
                           'mfh_wa,'
                           'butd,'
                           'butd_wa,'
                           'baseline,'
                           'baseline_wa_no_fusion,'
                           'positional,'
                           '}'
                        ,
                      type=str, required=True)

    parser.add_argument('--DATASET', dest='DATASET',
                      choices=['vqa', 'gqa', 'clevr'],
                      help='{'
                           'vqa,'
                           'gqa,'
                           'clevr,'
                           '}'
                        ,
                      default='vqa',  
                      type=str, required=False)

    parser.add_argument('--SPLIT', dest='TRAIN_SPLIT',
                      choices=['train', 'train+val', 'train+val+vg'],
                      help="set training split, "
                           "vqa: {'train', 'train+val', 'train+val+vg'}"
                           "gqa: {'train', 'train+val'}"
                           "clevr: {'train', 'train+val'}"
                        ,
                        default='train', required=False,
                      type=str)

    parser.add_argument('--EVAL_EE', dest='EVAL_EVERY_EPOCH',
                      choices=['True', 'False'],
                      help='True: evaluate the val split when an epoch finished,'
                           'False: do not evaluate on local',
                           default='True',
                           required=False,
                      type=str)

    parser.add_argument('--SAVE_PRED', dest='TEST_SAVE_PRED',
                      choices=['True', 'False'],
                      help='True: save the prediction vectors,'
                           'False: do not save the prediction vectors',
                      default='True',
                      required=False,
                      type=str)

    parser.add_argument('--BS', dest='BATCH_SIZE',
                      help='batch size in training',
                      type=int)

    parser.add_argument('--GPU', dest='GPU',
                      help="gpu choose, eg.'0, 1, 2, ...'",
                      default='0, 1',
                      type=str)

    parser.add_argument('--SEED', dest='SEED',
                      help='fix random seed',
                      type=int)

    parser.add_argument('--VERSION', dest='VERSION',
                      help='Enter descriptive name here (eg baseline_wa_gru), will be used for WANDB and for version',
                      required=True,
                      type=str)

    parser.add_argument('--RESUME', dest='RESUME',
                      choices=['True', 'False'],
                      help='True: use checkpoint to resume training,'
                           'False: start training with random init',
                      type=str)

    parser.add_argument('--CKPT_V', dest='CKPT_VERSION',
                      help='checkpoint version',
                      type=str)

    parser.add_argument('--CKPT_E', dest='CKPT_EPOCH',
                      help='checkpoint epoch',
                      type=int)

    parser.add_argument('--CKPT_PATH', dest='CKPT_PATH',
                      help='load checkpoint path, we '
                           'recommend that you use '
                           'CKPT_VERSION and CKPT_EPOCH '
                           'instead, it will override'
                           'CKPT_VERSION and CKPT_EPOCH',
                      type=str)

    parser.add_argument('--ACCU', dest='GRAD_ACCU_STEPS',
                      help='split batch to reduce gpu memory usage',
                      type=int)

    parser.add_argument('--NW', dest='NUM_WORKERS',
                      help='multithreaded loading to accelerate IO',
                      type=int)

    parser.add_argument('--PINM', dest='PIN_MEM',
                      choices=['True', 'False'],
                      help='True: use pin memory, False: not use pin memory',
                      type=str)

    parser.add_argument('--VERB', dest='VERBOSE',
                      choices=['True', 'False'],
                      help='True: verbose print, False: simple print',
                      type=str)

    parser.add_argument('--USE_NEW_QUESTION', dest='USE_NEW_QUESTION',
                      choices=['True', 'False'],
                      help='whether to use new question while testing',
                      default='False',
                      type=str)

    parser.add_argument('--NEW_QUESTION', dest='NEW_QUESTION',
                      help='the new question to be asked while testing',
                      type=str)

    parser.add_argument('--IMAGE_ID', dest='IMAGE_ID',
                      help='image id on which the questions to be asked',
                      type=str)
    
    ######################################################
    #########  CHANGE MODEL AND VERSION HERE #############
    ######################################################
    args = parser.parse_args(args=['--MODEL', 'baseline_wa', '--VERSION', 'hakku'])
    return args

In [5]:
args = parse_args()
print(args)

Namespace(BATCH_SIZE=None, CKPT_EPOCH=None, CKPT_PATH=None, CKPT_VERSION=None, DATASET='vqa', EVAL_EVERY_EPOCH='True', GPU='0, 1', GRAD_ACCU_STEPS=None, IMAGE_ID=None, MODEL='baseline_wa', NEW_QUESTION=None, NUM_WORKERS=None, PIN_MEM=None, RESUME=None, RUN_MODE='train', SEED=None, TEST_SAVE_PRED='True', TRAIN_SPLIT='train', USE_NEW_QUESTION='False', VERBOSE=None, VERSION='hakku')


In [6]:
cfg_file = "configs/{}/{}.yml".format(args.DATASET, args.MODEL)
with open(cfg_file, 'r') as f:

    # Loads the yaml file
    yaml_dict = yaml.load(f)

# Loads the model_cfgs + base_cfgs
__C = CfgLoader(yaml_dict['MODEL_USE']).load()

# Loads the command line cfgs
args = __C.str_to_bool(args)
args_dict = __C.parse_to_dict(args)

# {**dict1, **dict2} creates a new dictionary by merging dict1 and dict2, using dict2 for key clashes
args_dict = {**yaml_dict, **args_dict}
__C.add_args(args_dict)
__C.proc()

# FINAL PREFERENCE OF CFGS:
# COMMAND LINE > YAML FILE > MODEL CFGS > BASE CFGS

print('Hyper Parameters:')
print(__C)

Checking dataset ........
Finished!

Hyper Parameters:
{ ALPHA             }->1
{ ANS_STDDEV        }->0.01
{ AUGMENTED_ANSWER  }->False
{ BATCH_SIZE        }->64
{ BBOXFEAT_EMB_SIZE }->2048
{ BETA              }->30
{ CACHE_PATH        }->./results/cache
{ CAP_DIST          }->0.3
{ CKPTS_PATH        }->./ckpts
{ CKPT_EPOCH        }->0
{ CKPT_PATH         }->None
{ CKPT_VERSION      }->343612
{ DATASET           }->vqa
{ DATA_PATH         }->{'clevr': './data/clevr', 'vqa': './data/vqa', 'gqa': './data/gqa'}
{ DATA_ROOT         }->./data
{ DEVICES           }->[0, 1]
{ DROPOUT_R         }->0.1
{ EVAL_BATCH_SIZE   }->32
{ EVAL_EVERY_EPOCH  }->True
{ FEATS_PATH        }->{'clevr': {'test': './data/clevr/feats/test', 'val': './data/clevr/feats/val', 'train': './data/clevr/feats/train'}, 'vqa': {'test': './data/vqa/feats/test2015', 'val': './data/vqa/feats/val2014', 'train': './data/vqa/feats/train2014'}, 'gqa': {'default-frcn': './data/gqa/feats/gqa-frcn', 'default-grid': './data/gqa/fea

  """


In [7]:
execution = Execution(__C)

Loading dataset........
Loading all questions (for statistics)
Loading all image features
Loading split questions and answers

Tokenising questions
Tokenising answers
Finished!

Loading validation set for per-epoch evaluation........
Loading all questions (for statistics)
Loading all image features
Loading split questions and answers

Tokenising questions
Finished!



In [None]:
import os, torch, datetime, shutil, time
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import wandb
from openvqa.models.model_loader import ModelLoader
from openvqa.utils.optim import get_optim, adjust_lr
from utils1.test_engine import test_engine, ckpt_proc
from vis import plotter, vis_func
from multiprocessing import Pool
import multiprocessing
import sys
from torchviz import make_dot, make_dot_from_trace

def train_engine(__C, dataset, dataset_eval=None):

    data_size = dataset.data_size
    token_size = dataset.token_size
    ans_size = dataset.ans_size
    pretrained_emb = dataset.pretrained_emb

    #Edits
    pretrained_emb_ans = dataset.pretrained_emb_ans
    token_size_ans = dataset.token_size_ans #End of Edits

    print("Model being used is {}".format(__C.MODEL_USE))

    net = ModelLoader(__C).Net(
        __C,
        pretrained_emb,
        token_size,
        ans_size,
        pretrained_emb_ans,
        token_size_ans
    )

    net.cuda()
    net.train()

    if __C.N_GPU > 1:
        net = nn.DataParallel(net, device_ids=__C.DEVICES)

    # Define Loss Function
    loss_fn = eval('torch.nn.' + __C.LOSS_FUNC_NAME_DICT[__C.LOSS_FUNC] + "(reduction='" + __C.LOSS_REDUCTION + "').cuda()")


    # creating a folder for saving the numpy visualization arrays
    if (__C.WITH_ANSWER and ((__C.VERSION) not in os.listdir(__C.SAVED_PATH))):
        os.mkdir(__C.SAVED_PATH + '/' + __C.VERSION)


    # Load checkpoint if resume training
    if __C.RESUME:
        print(' ========== Resume training')

        if __C.CKPT_PATH is not None:
            print('Warning: Now using CKPT_PATH args, '
                  'CKPT_VERSION and CKPT_EPOCH will not work')
            path = __C.CKPT_PATH
        else:
            path = __C.CKPTS_PATH + \
                   '/ckpt_' + __C.CKPT_VERSION + \
                   '/epoch' + str(__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('Loading ckpt from {}'.format(path))
        ckpt = torch.load(path)
        print('Finish!')

        if __C.N_GPU > 1:
            net.load_state_dict(ckpt_proc(ckpt['state_dict']))
        else:
            net.load_state_dict(ckpt['state_dict'])
        start_epoch = ckpt['epoch']

        # Load the optimizer paramters
        optim = get_optim(__C, net, data_size, ckpt['lr_base'])
        optim._step = int(data_size / __C.BATCH_SIZE * start_epoch)
        optim.optimizer.load_state_dict(ckpt['optimizer'])
        
        if ('ckpt_' + __C.VERSION) not in os.listdir(__C.CKPTS_PATH):
            os.mkdir(__C.CKPTS_PATH + '/ckpt_' + __C.VERSION)

    else:
        if ('ckpt_' + __C.VERSION) not in os.listdir(__C.CKPTS_PATH):
            #shutil.rmtree(__C.CKPTS_PATH + '/ckpt_' + __C.VERSION)
            os.mkdir(__C.CKPTS_PATH + '/ckpt_' + __C.VERSION)

        optim = get_optim(__C, net, data_size)
        start_epoch = 0

    loss_sum = 0
    named_params = list(net.named_parameters())
    grad_norm = np.zeros(len(named_params))

    # Define multi-thread dataloader
    # if __C.SHUFFLE_MODE in ['external']:
    #     dataloader = Data.DataLoader(
    #         dataset,
    #         batch_size=__C.BATCH_SIZE,
    #         shuffle=False,
    #         num_workers=__C.NUM_WORKERS,
    #         pin_memory=__C.PIN_MEM,
    #         drop_last=True
    #     )
    # else:
    dataloader = Data.DataLoader(
        dataset,
        batch_size=__C.BATCH_SIZE,
        shuffle=True,
        num_workers=__C.NUM_WORKERS,
        pin_memory=__C.PIN_MEM,
        drop_last=True
    )

    logfile = open(
        __C.LOG_PATH +
        '/log_run_' + __C.VERSION + '.txt',
        'a+'
    )
    logfile.write(str(__C))
    logfile.close()

    # For dry runs
    # os.environ['WANDB_MODE'] = 'dryrun' 

    # initializing the wandb project
    # TODO to change the name of project later, once the proper coding starts
    #wandb.init(project="openvqa", name=__C.VERSION, config=__C)

    # obtain histogram of each gradients in network as it trains
    #wandb.watch(net, log="all")

    #wandb.save("./openvqa/models/" + str(__C.MODEL_USE) + "/net.py")
    #wandb.save("./utils1/train_engine.py")

    # Training script
    for epoch in range(start_epoch, __C.MAX_EPOCH):

        # Save log to file
        logfile = open(
            __C.LOG_PATH +
            '/log_run_' + __C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            '=====================================\nnowTime: ' +
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
            '\n'
        )
        logfile.close()

        # Learning Rate Decay
        if epoch in __C.LR_DECAY_LIST:
            adjust_lr(optim, __C.LR_DECAY_R)

        # Externally shuffle data list
        # if __C.SHUFFLE_MODE == 'external':
        #     dataset.shuffle_list(dataset.ans_list)

        time_start = time.time()
        # Iteration
        for step, (
                frcn_feat_iter,
                grid_feat_iter,
                bbox_feat_iter,
                ques_ix_iter,

                #Edits
                ans_ix_iter,
                #End of Edits

                ans_iter,
                ques_type

        ) in enumerate(dataloader):

            optim.zero_grad()

            frcn_feat_iter = frcn_feat_iter.cuda()
            grid_feat_iter = grid_feat_iter.cuda()
            bbox_feat_iter = bbox_feat_iter.cuda()
            ques_ix_iter = ques_ix_iter.cuda()
            #Edits
            ans_ix_iter = ans_ix_iter.cuda()
            #End of Edits
            ans_iter = ans_iter.cuda()

            loss_tmp = 0

            loss_img_ques_tmp = 0
            loss_ans_tmp = 0
            loss_interp_tmp = 0
            loss_fusion_tmp = 0

            for accu_step in range(__C.GRAD_ACCU_STEPS):
                loss_tmp = 0
                loss_img_ques_tmp = 0
                loss_ans_tmp = 0
                loss_interp_tmp = 0
                loss_fusion_tmp = 0

                sub_frcn_feat_iter = \
                    frcn_feat_iter[accu_step * __C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * __C.SUB_BATCH_SIZE]
                sub_grid_feat_iter = \
                    grid_feat_iter[accu_step * __C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * __C.SUB_BATCH_SIZE]
                sub_bbox_feat_iter = \
                    bbox_feat_iter[accu_step * __C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * __C.SUB_BATCH_SIZE]
                sub_ques_ix_iter = \
                    ques_ix_iter[accu_step * __C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * __C.SUB_BATCH_SIZE]
                #Edits
                sub_ans_ix_iter = \
                    ans_ix_iter[accu_step * __C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * __C.SUB_BATCH_SIZE]
                #End of Edits

                sub_ans_iter = \
                    ans_iter[accu_step * __C.SUB_BATCH_SIZE:
                             (accu_step + 1) * __C.SUB_BATCH_SIZE]

                
                # when making predictions also pass the ans_iter which is a dictionary from which you
                # can extract answers and pass them through decoders

                if (__C.WITH_ANSWER):
                    pred_img_ques, pred_ans, pred_fused, z_img_ques, z_ans, z_fused = net(
                        sub_frcn_feat_iter,
                        sub_grid_feat_iter,
                        sub_bbox_feat_iter,
                        sub_ques_ix_iter,
                        sub_ans_ix_iter,
                        step,
                        epoch
                    )
                else:
                     pred_img_ques = net(
                        sub_frcn_feat_iter,
                        sub_grid_feat_iter,
                        sub_bbox_feat_iter,
                        sub_ques_ix_iter,
                        sub_ans_ix_iter,
                        step,
                        epoch
                    )


                # we need to change the loss terms accordingly
                # now we need to modify the loss terms for the same
                
                #Edits: creating the loss items for each of the prediction vector

                loss_item_img_ques = [pred_img_ques, sub_ans_iter]

                # only calculate the ans and interp loss in case of WITH_ANSWER
                if (__C.WITH_ANSWER):
                    loss_item_ans = [pred_ans, sub_ans_iter]
                    loss_item_interp = [pred_fused, sub_ans_iter]

                
                loss_nonlinear_list = __C.LOSS_FUNC_NONLINEAR[__C.LOSS_FUNC]
                
                # applying the same transformation on the all three
                # althought for 'bce' loss the following does nothing
                for item_ix, loss_nonlinear in enumerate(loss_nonlinear_list):
                    if loss_nonlinear in ['flat']:
                        loss_item_img_ques[item_ix] = loss_item_img_ques[item_ix].view(-1)
                    elif loss_nonlinear:
                        loss_item_img_ques[item_ix] = eval('F.' + loss_nonlinear + '(loss_item_img_ques[item_ix], dim=1)')

                for item_ix, loss_nonlinear in enumerate(loss_nonlinear_list):
                    if loss_nonlinear in ['flat'] and __C.WITH_ANSWER:
                        loss_item_ans[item_ix] = loss_item_ans[item_ix].view(-1)
                    elif loss_nonlinear and __C.WITH_ANSWER:
                        loss_item_ans[item_ix] = eval('F.' + loss_nonlinear + '(loss_item_ans[item_ix], dim=1)')

                for item_ix, loss_nonlinear in enumerate(loss_nonlinear_list):
                    if loss_nonlinear in ['flat'] and __C.WITH_ANSWER:
                        loss_item_interp[item_ix] = loss_item_interp[item_ix].view(-1)
                    elif loss_nonlinear and __C.WITH_ANSWER:
                        loss_item_interp[item_ix] = eval('F.' + loss_nonlinear + '(loss_item_interp[item_ix], dim=1)')


                # Now we create all the four losses and then add them
                #print("shape of loss_item_img_ques[0] is {} and of loss_item_img_ques[1] is {}".format(loss_item_img_ques[0],loss_item_img_ques[1]))
                loss_img_ques = loss_fn(loss_item_img_ques[0], loss_item_img_ques[1])


                loss = 0
                loss += loss_img_ques
                
                if (__C.WITH_ANSWER):

                    # loss for the prediction from the answer
                    #print("shape of loss_item_ans[0] is {} and of loss_item_ans[1] is {}".format(loss_item_ans[0],loss_item_ans[1]))
                    loss_ans = loss_fn(loss_item_ans[0], loss_item_ans[1])
                
                    # Loss for the prediction from the fused vector
                    # I am keeping the loss same as bce but we can change it later for more predictions
                    # loss_fused = interpolation loss
                    #print("shape of loss_item_interp[0] is {} and of loss_item_interp[1] is {}".format(loss_item_interp[0],loss_item_interp[1]))
                    loss_interp = loss_fn(loss_item_interp[0], loss_item_interp[1])
                    
                    # we also need to multiply this fused loss by a hyperparameter alpha
                    # put the alpha in the config and uncomment the following line
                    loss_interp *= __C.ALPHA
                    loss += loss_ans + loss_interp

                    print("\n----------  FIRST LOSS  --------")
                    optim.zero_grad()
                    loss.backward(retain_graph=True)
                    params1 = list(net.parameters())
                    '''
                    lays = 0
                    for name, i in net.named_parameters():
                        print(name, end = ' ')
                        if (i.requires_grad == False):
                            print ("WWWWWWWWWWWWHHHHHHHHHHHAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTT")
                        if ((i.grad != 0).sum() == 0):
                            print(' NOT UPDATING')
                            lays += 1
                        else:
                            print(' ')
                    print ("Gradient not updating in: ", lays, ' of total: ', len(params1))
                    '''



                    if (__C.WITH_FUSION_LOSS):

                        # Now calculate the fusion loss
                        #1. Higher loss for higher distance between vectors predicted
                        # by different models for same example

                        dist_calc = (z_img_ques - z_ans).pow(2).sum(1).sqrt()
                        #print("Count of distances being clipped (true is clipped): ", np.unique((dist_calc > __C.CAP_DIST).cpu().numpy(), return_counts=True))

                        '''
                        loss_fusion = torch.min(
                                torch.tensor(__C.CAP_DIST).cuda(),
                                dist_calc
                                ).mean()

                        #2. Lower loss for more distance between two pred vectors of same model
                        loss_fusion -= torch.min(
                                torch.tensor(__C.CAP_DIST).cuda(), 
                                torch.pdist(z_img_ques, 2)
                                ).mean() 

                        loss_fusion -= torch.min(
                                torch.tensor(__C.CAP_DIST).cuda(), 
                                torch.pdist(z_ans, 2)
                                ).mean() 
                        '''

                        loss_fusion = dist_calc.mean()

                        #2. Lower loss for more distance between two pred vectors of same model
                        '''
                        calculating pairwise intra distance on same type questions
                        '''
                        '''
                        types = ['other', 'yes/no', 'number']
                        for i in range(3):
                            j = (i+1)%3
                            indices_i = [k for k, val in enumerate(ques_type) if val == types[i]]
                            indices_j = [k for k, val in enumerate(ques_type) if val == types[j]]
                            if ((indices_i != []) and (indices_j != [])):
                                loss_fusion -= torch.cdist(z_img_ques[indices_i], z_img_ques[indices_j]).mean()
                                loss_fusion -= torch.cdist(z_ans[indices_i], z_ans[indices_j]).mean()
                            if (indices_i != []):
                                loss_fusion += torch.pdist(z_img_ques[indices_i], 2).mean()
                                loss_fusion += torch.pdist(z_ans[indices_i], 2).mean()
                        '''
                        loss_fusion -= torch.pdist(z_img_ques, 2).mean() 

                        loss_fusion -= torch.pdist(z_ans, 2).mean() 


                        # Multiply the loss fusion with hyperparameter beta
                        loss_fusion *= __C.BETA

                        #print('fusion loss is : {}'.format(loss_fusion))

                        print("\n----------  FUSION LOSS  --------")
                        optim.zero_grad()
                        loss_fusion.backward(retain_graph=True)
                        params2 = list(net.parameters())
                        lays = 0
                        dot_sum = 0
                        for idx, (name, i) in enumerate(net.named_parameters()):
                            dot_sum += (params1[idx] * params2[idx]).sum()
                            print(name, ' dot: ', (params1[idx].grad * i.grad).sum().item(), end = '   ')
                            if ((i.grad != 0).sum() == 0):
                                print(' NOT UPDATING')
                                lays += 1
                            else:
                                print(' ')
                        print ("Gradient not updating in: ", lays, ' of total: ', len(params2))
                        print("@@@@@@@@@@@@@@@@@@@ Overall dot product: ", dot_sum.item())

                        loss += loss_fusion

                loss /= __C.GRAD_ACCU_STEPS

                print("\n----------  MAIN LOSS  --------")
                optim.zero_grad()
                loss.backward()
                params = list(net.parameters())
                '''
                lays = 0
                for name, i in net.named_parameters():
                    print(name, end = ' ')
                    if (i.requires_grad == False):
                        print ("WWWWWWWWWWWWHHHHHHHHHHHAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTT")
                    if ((i.grad != 0).sum() == 0):
                        print(' NOT UPDATING')
                        lays += 1
                    else:
                        print(' ')
                print ("Gradient not updating in: ", lays, ' of total: ', len(params))
                '''

                loss_tmp += loss.cpu().data.numpy() * __C.GRAD_ACCU_STEPS
                loss_sum += loss.cpu().data.numpy() * __C.GRAD_ACCU_STEPS

                # calculating temp loss of each type
                if __C.WITH_ANSWER:
                    loss_img_ques_tmp += loss_img_ques.cpu().data.numpy() * __C.GRAD_ACCU_STEPS
                    loss_ans_tmp += loss_ans.cpu().data.numpy() * __C.GRAD_ACCU_STEPS
                    loss_interp_tmp += loss_interp.cpu().data.numpy() * __C.GRAD_ACCU_STEPS
                    if (__C.WITH_FUSION_LOSS):
                        loss_fusion_tmp += loss_fusion.cpu().data.numpy() * __C.GRAD_ACCU_STEPS


            if __C.VERBOSE:
                if dataset_eval is not None:
                    mode_str = __C.SPLIT['train'] + '->' + __C.SPLIT['val']
                else:
                    mode_str = __C.SPLIT['train'] + '->' + __C.SPLIT['test']

                print("\r[Version %s][Epoch %2d][Step %4d/%4d] Loss: %.4f [iq: %.4f,ans: %.4f,interp: %.4f,fusion: %.4f]" % (
                    __C.VERSION,
                    epoch + 1,
                    step,
                    int(data_size / __C.BATCH_SIZE),
                    loss_tmp / __C.SUB_BATCH_SIZE,
                    loss_img_ques_tmp / __C.SUB_BATCH_SIZE,
                    loss_ans_tmp / __C.SUB_BATCH_SIZE,
                    loss_interp_tmp / __C.SUB_BATCH_SIZE,
                    loss_fusion_tmp / __C.SUB_BATCH_SIZE
                ), end = '          ')

            # Gradient norm clipping
            if __C.GRAD_NORM_CLIP > 0:
                nn.utils.clip_grad_norm_(
                    net.parameters(),
                    __C.GRAD_NORM_CLIP
                )

            # Save the gradient information
            for name in range(len(named_params)):
                norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                    if named_params[name][1].grad is not None else 0
                grad_norm[name] += norm_v * __C.GRAD_ACCU_STEPS
                # print('Param %-3s Name %-80s Grad_Norm %-20s'%
                #       (str(grad_wt),
                #        params[grad_wt][0],
                #        str(norm_v)))

            optim.step()

        time_end = time.time()
        elapse_time = time_end-time_start
        print('Finished in {}s'.format(int(elapse_time)))
        epoch_finish = epoch + 1

        # Save checkpoint
        if __C.N_GPU > 1:
            state = {
                'state_dict': net.module.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base,
                'epoch': epoch_finish
            }
        else:
            state = {
                'state_dict': net.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'lr_base': optim.lr_base,
                'epoch': epoch_finish
            }
        torch.save(
            state,
            __C.CKPTS_PATH +
            '/ckpt_' + __C.VERSION +
            '/epoch' + str(epoch_finish) +
            '.pkl'
        )

        '''
        wandb.save(
            __C.CKPTS_PATH +
            '/ckpt_' + __C.VERSION +
            '/epoch' + str(epoch_finish) +
            '.h5'
        )
        '''
        
        # Logging
        logfile = open(
            __C.LOG_PATH +
            '/log_run_' + __C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'Epoch: ' + str(epoch_finish) +
            ', Loss: ' + str(loss_sum / data_size) +
            ', Lr: ' + str(optim._rate) + '\n' +
            'Elapsed time: ' + str(int(elapse_time)) + 
            ', Speed(s/batch): ' + str(elapse_time / step) +
            '\n\n'
        )
        logfile.close()

        '''
        wandb.log({
            'Loss': float(loss_sum / data_size),
            'Learning Rate': optim._rate,
            'Elapsed time': int(elapse_time) 
            })
        '''

        # ---------------------------------------------- #
        # ---- Create visualizations in new processes----#
        # ---------------------------------------------- #
        dic = {}
        dic['version'] = __C.VERSION
        dic['epoch'] = epoch 
        dic['num_samples'] = 1000

        p = Pool(processes= 1)
        p.map_async(vis_func, (dic, ))
        p.close()

        # Eval after every epoch
        epoch_dict = {
                'current_epoch': epoch
                }
        __C.add_args(epoch_dict)
        if dataset_eval is not None:
            test_engine(
                __C,
                dataset_eval,
                state_dict=net.state_dict(),
                validation=True,
                epoch = 0
            )
        p.join()

        # if self.__C.VERBOSE:
        #     logfile = open(
        #         self.__C.LOG_PATH +
        #         '/log_run_' + self.__C.VERSION + '.txt',
        #         'a+'
        #     )
        #     for name in range(len(named_params)):
        #         logfile.write(
        #             'Param %-3s Name %-80s Grad_Norm %-25s\n' % (
        #                 str(name),
        #                 named_params[name][0],
        #                 str(grad_norm[name] / data_size * self.__C.BATCH_SIZE)
        #             )
        #         )
        #     logfile.write('\n')
        #     logfile.close()

        loss_sum = 0
        grad_norm = np.zeros(len(named_params))



In [None]:
execution.run(__C.RUN_MODE)

Initializing log file........
Finished!

Model being used is baseline_wa

----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2467.458740234375    
module.ans_embedding.weight  dot:  1037403.5    
module.lstm.weight_ih_l0  dot:  3137748.5    
module.lstm.weight_hh_l0  dot:  127333.890625    
module.lstm.bias_ih_l0  dot:  264328.875    
module.lstm.bias_hh_l0  dot:  264328.875    
module.ans_lstm.weight_ih_l0  dot:  169729856.0    
module.ans_lstm.we

[Version hakku][Epoch  1][Step    3/6933] Loss: 6225.2241 [iq: 2162.2109,ans: 2167.1626,interp: 2163.0522,fusion: -267.2014]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2848.955810546875    
module.ans_embedding.weight  dot:  1162367.75    
module.lstm.weight_ih_l0  dot:  3254530.0    
module.lstm.weight_hh_l0  dot:  121089.1875    
module.lstm.bias_ih_l0  dot:  268234.0    
module.lstm.bias_hh_l0  dot:  268234.0    
module.ans_lst

[Version hakku][Epoch  1][Step    7/6933] Loss: 6121.9053 [iq: 2153.7622,ans: 2164.7441,interp: 2159.1809,fusion: -355.7815]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2608.907958984375    
module.ans_embedding.weight  dot:  923786.875    
module.lstm.weight_ih_l0  dot:  2856563.5    
module.lstm.weight_hh_l0  dot:  113841.0625    
module.lstm.bias_ih_l0  dot:  235744.703125    
module.lstm.bias_hh_l0  dot:  235744.703125    
modu

module.attflat_ans.linear_merge.weight  dot:  119186360.0    
module.attflat_ans.linear_merge.bias  dot:  364977792.0    
module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452232.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   11/6933] Loss: 6088.2695 [iq: 2142.7622,ans: 2161.8909,interp: 2159.2998,fusion: -375.6831]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear

module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452234.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   15/6933] Loss: 6002.6875 [iq: 2126.3984,ans: 2158.1035,interp: 2137.9878,fusion: -419.8025]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot: 

module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452236.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   19/6933] Loss: 5944.6069 [iq: 2102.2744,ans: 2153.1033,interp: 2134.0044,fusion: -444.7751]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.lin

[Version hakku][Epoch  1][Step   23/6933] Loss: 5817.0576 [iq: 2061.3066,ans: 2146.0342,interp: 2078.8979,fusion: -469.1809]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1254.58447265625    
module.ans_embedding.weight  dot:  757556.3125    
module.lstm.weight_ih_l0  dot:  1317795.375    
module.lstm.weight_hh_l0  dot:  52314.203125    
module.lstm.bias_ih_l0  dot:  105009.75    
module.lstm.bias_hh_l0  dot:  105009.75    
module.an

[Version hakku][Epoch  1][Step   27/6933] Loss: 5730.8154 [iq: 1985.9347,ans: 2136.3118,interp: 2075.9507,fusion: -467.3818]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  976.1968994140625    
module.ans_embedding.weight  dot:  840209.75    
module.lstm.weight_ih_l0  dot:  1321334.75    
module.lstm.weight_hh_l0  dot:  59965.171875    
module.lstm.bias_ih_l0  dot:  109348.09375    
module.lstm.bias_hh_l0  dot:  109348.09375    
modul

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452246.125

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   31/6933] Loss: 5394.5122 [iq: 1828.9491,ans: 2122.9958,interp: 1894.2913,fusion: -451.7242]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1040.481201171875    
module.ans_emb

[Version hakku][Epoch  1][Step   34/6933] Loss: 5250.3301 [iq: 1641.1619,ans: 2109.6470,interp: 1931.7969,fusion: -432.2753]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1422.8682861328125    
module.ans_embedding.weight  dot:  691284.0625    
module.lstm.weight_ih_l0  dot:  1375407.625    
module.lstm.weight_hh_l0  dot:  62337.28125    
module.lstm.bias_ih_l0  dot:  110602.09375    
module.lstm.bias_hh_l0  dot:  110602.09375    
mo

[Version hakku][Epoch  1][Step   38/6933] Loss: 4393.3970 [iq: 1254.3284,ans: 2086.2007,interp: 1435.0889,fusion: -382.2212]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  805.3101806640625    
module.ans_embedding.weight  dot:  507428.625    
module.lstm.weight_ih_l0  dot:  933117.5    
module.lstm.weight_hh_l0  dot:  45042.69921875    
module.lstm.bias_ih_l0  dot:  79108.09375    
module.lstm.bias_hh_l0  dot:  79108.09375    
module

module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  17  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452257.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   42/6933] Loss: 3985.1199 [iq: 773.0341,ans: 2054.1919,interp: 1495.0764,fusion: -337.1823]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decode

[Version hakku][Epoch  1][Step   46/6933] Loss: 2872.1672 [iq: 359.0984,ans: 2012.2535,interp: 802.6775,fusion: -301.8624]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  314.4257507324219    
module.ans_embedding.weight  dot:  248952.03125    
module.lstm.weight_ih_l0  dot:  381874.5625    
module.lstm.weight_hh_l0  dot:  21462.53125    
module.lstm.bias_ih_l0  dot:  31437.171875    
module.lstm.bias_hh_l0  dot:  31437.171875    
modu

[Version hakku][Epoch  1][Step   50/6933] Loss: 2149.6411 [iq: 147.7231,ans: 1956.4690,interp: 315.1154,fusion: -269.6665]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  225.4712371826172    
module.ans_embedding.weight  dot:  319526.75    
module.lstm.weight_ih_l0  dot:  214888.9375    
module.lstm.weight_hh_l0  dot:  11696.392578125    
module.lstm.bias_ih_l0  dot:  17391.498046875    
module.lstm.bias_hh_l0  dot:  17391.498046875  

[Version hakku][Epoch  1][Step   54/6933] Loss: 2151.9915 [iq: 62.9501,ans: 1890.1833,interp: 450.5147,fusion: -251.6567]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  149.52638244628906    
module.ans_embedding.weight  dot:  218840.8125    
module.lstm.weight_ih_l0  dot:  147939.671875    
module.lstm.weight_hh_l0  dot:  8928.92578125    
module.lstm.bias_ih_l0  dot:  12009.099609375    
module.lstm.bias_hh_l0  dot:  12009.099609375

module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452277.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   58/6933] Loss: 1645.2644 [iq: 33.8178,ans: 1816.3811,interp: 37.4429,fusion: -242.3774]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear

[Version hakku][Epoch  1][Step   62/6933] Loss: 1537.9084 [iq: 22.8660,ans: 1730.9224,interp: 25.9304,fusion: -241.8104]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  116.12076568603516    
module.ans_embedding.weight  dot:  142932.171875    
module.lstm.weight_ih_l0  dot:  85157.0859375    
module.lstm.weight_hh_l0  dot:  6023.52490234375    
module.lstm.bias_ih_l0  dot:  6947.34521484375    
module.lstm.bias_hh_l0  dot:  6947.34521

[Version hakku][Epoch  1][Step   66/6933] Loss: 1440.6378 [iq: 20.0436,ans: 1645.4587,interp: 16.9294,fusion: -241.7940]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  121.89979553222656    
module.ans_embedding.weight  dot:  203988.34375    
module.lstm.weight_ih_l0  dot:  92632.3515625    
module.lstm.weight_hh_l0  dot:  6152.88037109375    
module.lstm.bias_ih_l0  dot:  7527.68115234375    
module.lstm.bias_hh_l0  dot:  7527.681152

[Version hakku][Epoch  1][Step   70/6933] Loss: 1337.6266 [iq: 24.0274,ans: 1547.9241,interp: 16.8351,fusion: -251.1600]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  104.89613342285156    
module.ans_embedding.weight  dot:  109550.0859375    
module.lstm.weight_ih_l0  dot:  70392.8046875    
module.lstm.weight_hh_l0  dot:  4826.96484375    
module.lstm.bias_ih_l0  dot:  5579.6728515625    
module.lstm.bias_hh_l0  dot:  5579.67285156

[Version hakku][Epoch  1][Step   74/6933] Loss: 1851.5452 [iq: 23.5828,ans: 1455.6638,interp: 621.7372,fusion: -249.4385]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  129.69638061523438    
module.ans_embedding.weight  dot:  219200.28125    
module.lstm.weight_ih_l0  dot:  75855.578125    
module.lstm.weight_hh_l0  dot:  4779.01025390625    
module.lstm.bias_ih_l0  dot:  6220.8359375    
module.lstm.bias_hh_l0  dot:  6220.8359375   

[Version hakku][Epoch  1][Step   78/6933] Loss: 1083.1750 [iq: 21.1502,ans: 1301.9347,interp: 12.3582,fusion: -252.2682]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  118.20719909667969    
module.ans_embedding.weight  dot:  103600.71875    
module.lstm.weight_ih_l0  dot:  75659.0    
module.lstm.weight_hh_l0  dot:  5073.51171875    
module.lstm.bias_ih_l0  dot:  5998.7880859375    
module.lstm.bias_hh_l0  dot:  5998.7880859375    
m

module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452299.0

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   82/6933] Loss: 1260.2974 [iq: 35.9323,ans: 1183.2963,interp: 309.9850,fusion: -268.9161]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  129.09902954101562    
module.ans_embedding.weight  dot:  93739.5546875    
module.lstm.wei


----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   86/6933] Loss: 853.9058 [iq: 36.1645,ans: 1010.3627,interp: 69.8422,fusion: -262.4636]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  128.7012176513672    
module.ans_embedding.weight  dot:  129814.40625    
module.lstm.weight_ih_l0  dot:  77635.84375    
module.lstm.weight_hh_l0  dot:  4688.3251953125    
module.lstm.bias_ih_l0  dot:  6337.41748046875    
module.lstm.b

[Version hakku][Epoch  1][Step   90/6933] Loss: 648.2698 [iq: 36.3876,ans: 855.9554,interp: 31.2665,fusion: -275.3398]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  127.9031982421875    
module.ans_embedding.weight  dot:  98853.140625    
module.lstm.weight_ih_l0  dot:  80266.4375    
module.lstm.weight_hh_l0  dot:  5671.396484375    
module.lstm.bias_ih_l0  dot:  6328.08740234375    
module.lstm.bias_hh_l0  dot:  6328.08740234375   

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452309.75

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step   94/6933] Loss: 529.4342 [iq: 40.8335,ans: 745.9913,interp: 23.6058,fusion: -280.9964]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  166.58706665039062    
module.ans_embedding

[Version hakku][Epoch  1][Step   98/6933] Loss: 347.9850 [iq: 41.9413,ans: 558.8450,interp: 21.5866,fusion: -274.3878]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  184.60733032226562    
module.ans_embedding.weight  dot:  162782.140625    
module.lstm.weight_ih_l0  dot:  107109.46875    
module.lstm.weight_hh_l0  dot:  7557.232421875    
module.lstm.bias_ih_l0  dot:  8663.720703125    
module.lstm.bias_hh_l0  dot:  8663.720703125   

[Version hakku][Epoch  1][Step  102/6933] Loss: 192.2789 [iq: 33.4680,ans: 423.4139,interp: 19.4005,fusion: -284.0034]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  198.60894775390625    
module.ans_embedding.weight  dot:  306366.25    
module.lstm.weight_ih_l0  dot:  107465.5859375    
module.lstm.weight_hh_l0  dot:  6865.93359375    
module.lstm.bias_ih_l0  dot:  9048.443359375    
module.lstm.bias_hh_l0  dot:  9048.443359375    
m

module.attflat_ans.mlp.linear.bias  dot:  8.97082408357619e-12    
module.attflat_ans.linear_merge.weight  dot:  29621488.0    
module.attflat_ans.linear_merge.bias  dot:  62878408.0    
module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  17  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452319.75

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  105/6933] Loss: 170.6628 [iq: 54.9212,ans: 373.2015,interp: 47.8659,fusion: -305.3259]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.b

[Version hakku][Epoch  1][Step  109/6933] Loss: 45.0569 [iq: 56.4036,ans: 269.7537,interp: 23.9950,fusion: -305.0954]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  239.4518585205078    
module.ans_embedding.weight  dot:  260277.90625    
module.lstm.weight_ih_l0  dot:  136128.5625    
module.lstm.weight_hh_l0  dot:  8193.5322265625    
module.lstm.bias_ih_l0  dot:  10592.193359375    
module.lstm.bias_hh_l0  dot:  10592.193359375    

module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452326.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  113/6933] Loss: -31.9979 [iq: 51.4190,ans: 209.3339,interp: 26.9921,fusion: -319.7430]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.b

[Version hakku][Epoch  1][Step  117/6933] Loss: -126.2689 [iq: 43.4065,ans: 124.8581,interp: 11.4359,fusion: -305.9695]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  301.59832763671875    
module.ans_embedding.weight  dot:  332953.75    
module.lstm.weight_ih_l0  dot:  174063.234375    
module.lstm.weight_hh_l0  dot:  11451.697265625    
module.lstm.bias_ih_l0  dot:  14672.4267578125    
module.lstm.bias_hh_l0  dot:  14672.4267578125

[Version hakku][Epoch  1][Step  121/6933] Loss: -170.4897 [iq: 42.6586,ans: 102.1634,interp: 10.3989,fusion: -325.7106]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  359.04229736328125    
module.ans_embedding.weight  dot:  399747.90625    
module.lstm.weight_ih_l0  dot:  192859.125    
module.lstm.weight_hh_l0  dot:  12498.16015625    
module.lstm.bias_ih_l0  dot:  16001.291015625    
module.lstm.bias_hh_l0  dot:  16001.291015625   

[Version hakku][Epoch  1][Step  124/6933] Loss: -163.8782 [iq: 43.8102,ans: 111.4726,interp: 22.7415,fusion: -341.9025]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  260.42120361328125    
module.ans_embedding.weight  dot:  532508.25    
module.lstm.weight_ih_l0  dot:  160034.84375    
module.lstm.weight_hh_l0  dot:  9694.208984375    
module.lstm.bias_ih_l0  dot:  13032.1796875    
module.lstm.bias_hh_l0  dot:  13032.1796875    
mod

[Version hakku][Epoch  1][Step  128/6933] Loss: -209.0042 [iq: 46.4181,ans: 69.2694,interp: 27.0519,fusion: -351.7436]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  249.6814422607422    
module.ans_embedding.weight  dot:  554320.4375    
module.lstm.weight_ih_l0  dot:  179085.40625    
module.lstm.weight_hh_l0  dot:  10701.5986328125    
module.lstm.bias_ih_l0  dot:  14705.60546875    
module.lstm.bias_hh_l0  dot:  14705.60546875    

[Version hakku][Epoch  1][Step  132/6933] Loss: -253.6816 [iq: 40.1576,ans: 67.4153,interp: 15.7050,fusion: -376.9595]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  267.8083801269531    
module.ans_embedding.weight  dot:  501903.9375    
module.lstm.weight_ih_l0  dot:  170700.0    
module.lstm.weight_hh_l0  dot:  9635.5400390625    
module.lstm.bias_ih_l0  dot:  13897.4716796875    
module.lstm.bias_hh_l0  dot:  13897.4716796875    


[Version hakku][Epoch  1][Step  136/6933] Loss: -259.5749 [iq: 45.5606,ans: 62.0587,interp: 13.0156,fusion: -380.2097]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  179.58294677734375    
module.ans_embedding.weight  dot:  536538.8125    
module.lstm.weight_ih_l0  dot:  148968.75    
module.lstm.weight_hh_l0  dot:  8374.380859375    
module.lstm.bias_ih_l0  dot:  12028.240234375    
module.lstm.bias_hh_l0  dot:  12028.240234375    
m

[Version hakku][Epoch  1][Step  140/6933] Loss: -296.1827 [iq: 28.6204,ans: 47.5103,interp: 13.1065,fusion: -385.4200]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  180.82017517089844    
module.ans_embedding.weight  dot:  882417.25    
module.lstm.weight_ih_l0  dot:  160687.0625    
module.lstm.weight_hh_l0  dot:  9283.693359375    
module.lstm.bias_ih_l0  dot:  12799.708984375    
module.lstm.bias_hh_l0  dot:  12799.708984375    
m

[Version hakku][Epoch  1][Step  144/6933] Loss: -308.0001 [iq: 21.8574,ans: 36.4003,interp: 10.0209,fusion: -376.2786]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  159.9107208251953    
module.ans_embedding.weight  dot:  1210764.0    
module.lstm.weight_ih_l0  dot:  132168.21875    
module.lstm.weight_hh_l0  dot:  6609.09716796875    
module.lstm.bias_ih_l0  dot:  10813.841796875    
module.lstm.bias_hh_l0  dot:  10813.841796875    

[Version hakku][Epoch  1][Step  148/6933] Loss: -305.8547 [iq: 29.4096,ans: 51.4111,interp: 20.0140,fusion: -406.6894]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  125.58256530761719    
module.ans_embedding.weight  dot:  1090953.375    
module.lstm.weight_ih_l0  dot:  105382.21875    
module.lstm.weight_hh_l0  dot:  5160.181640625    
module.lstm.bias_ih_l0  dot:  8362.0234375    
module.lstm.bias_hh_l0  dot:  8362.0234375    
modu

[Version hakku][Epoch  1][Step  152/6933] Loss: -306.4172 [iq: 29.9400,ans: 42.4948,interp: 25.4358,fusion: -404.2878]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  191.4086456298828    
module.ans_embedding.weight  dot:  953549.625    
module.lstm.weight_ih_l0  dot:  151783.421875    
module.lstm.weight_hh_l0  dot:  7898.43603515625    
module.lstm.bias_ih_l0  dot:  12153.3369140625    
module.lstm.bias_hh_l0  dot:  12153.3369140625

[Version hakku][Epoch  1][Step  156/6933] Loss: -340.9801 [iq: 22.5040,ans: 31.7668,interp: 13.3380,fusion: -408.5890]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  114.72671508789062    
module.ans_embedding.weight  dot:  900961.0625    
module.lstm.weight_ih_l0  dot:  95686.171875    
module.lstm.weight_hh_l0  dot:  4604.7158203125    
module.lstm.bias_ih_l0  dot:  7637.0869140625    
module.lstm.bias_hh_l0  dot:  7637.0869140625  

[Version hakku][Epoch  1][Step  160/6933] Loss: -340.7031 [iq: 23.1825,ans: 34.0500,interp: 12.6028,fusion: -410.5383]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  135.35011291503906    
module.ans_embedding.weight  dot:  1337194.125    
module.lstm.weight_ih_l0  dot:  94135.0390625    
module.lstm.weight_hh_l0  dot:  4671.2724609375    
module.lstm.bias_ih_l0  dot:  7334.943359375    
module.lstm.bias_hh_l0  dot:  7334.943359375   

[Version hakku][Epoch  1][Step  164/6933] Loss: -343.8809 [iq: 20.8465,ans: 33.3073,interp: 12.6065,fusion: -410.6413]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  157.89065551757812    
module.ans_embedding.weight  dot:  1173670.75    
module.lstm.weight_ih_l0  dot:  112031.9921875    
module.lstm.weight_hh_l0  dot:  5276.173828125    
module.lstm.bias_ih_l0  dot:  8735.8642578125    
module.lstm.bias_hh_l0  dot:  8735.8642578125  

[Version hakku][Epoch  1][Step  168/6933] Loss: -350.4663 [iq: 25.4184,ans: 31.7711,interp: 19.5256,fusion: -427.1815]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  120.29449462890625    
module.ans_embedding.weight  dot:  1261656.25    
module.lstm.weight_ih_l0  dot:  83925.984375    
module.lstm.weight_hh_l0  dot:  4087.46728515625    
module.lstm.bias_ih_l0  dot:  6441.7060546875    
module.lstm.bias_hh_l0  dot:  6441.7060546875  

[Version hakku][Epoch  1][Step  172/6933] Loss: -372.8026 [iq: 17.1626,ans: 25.3086,interp: 16.4536,fusion: -431.7274]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  120.68145751953125    
module.ans_embedding.weight  dot:  1225718.5    
module.lstm.weight_ih_l0  dot:  80372.3359375    
module.lstm.weight_hh_l0  dot:  3738.753173828125    
module.lstm.bias_ih_l0  dot:  6427.76953125    
module.lstm.bias_hh_l0  dot:  6427.76953125    


[Version hakku][Epoch  1][Step  175/6933] Loss: -369.0586 [iq: 24.8741,ans: 33.7272,interp: 18.3593,fusion: -446.0191]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  129.35272216796875    
module.ans_embedding.weight  dot:  1452442.875    
module.lstm.weight_ih_l0  dot:  87768.265625    
module.lstm.weight_hh_l0  dot:  3922.010009765625    
module.lstm.bias_ih_l0  dot:  7171.736328125    
module.lstm.bias_hh_l0  dot:  7171.736328125  

[Version hakku][Epoch  1][Step  179/6933] Loss: -389.3849 [iq: 18.2710,ans: 36.4875,interp: 10.6847,fusion: -454.8281]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  219.42193603515625    
module.ans_embedding.weight  dot:  1323149.25    
module.lstm.weight_ih_l0  dot:  152509.5625    
module.lstm.weight_hh_l0  dot:  6757.1337890625    
module.lstm.bias_ih_l0  dot:  11612.33984375    
module.lstm.bias_hh_l0  dot:  11612.33984375    
m

[Version hakku][Epoch  1][Step  183/6933] Loss: -398.0518 [iq: 18.0809,ans: 25.9378,interp: 13.4799,fusion: -455.5504]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  127.07054138183594    
module.ans_embedding.weight  dot:  1231706.375    
module.lstm.weight_ih_l0  dot:  67437.0859375    
module.lstm.weight_hh_l0  dot:  3299.85546875    
module.lstm.bias_ih_l0  dot:  5330.3583984375    
module.lstm.bias_hh_l0  dot:  5330.3583984375   

module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452364.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  187/6933] Loss: -400.9186 [iq: 22.4243,ans: 25.7293,interp: 13.1892,fusion: -462.2614]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  116.15219116210938    
module.ans_embedding.weight  dot:  1514512.25    
module.lstm.weight_

[Version hakku][Epoch  1][Step  191/6933] Loss: -412.5171 [iq: 17.1968,ans: 26.3502,interp: 13.4176,fusion: -469.4818]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  259.52252197265625    
module.ans_embedding.weight  dot:  1620742.0    
module.lstm.weight_ih_l0  dot:  119744.3046875    
module.lstm.weight_hh_l0  dot:  5207.69580078125    
module.lstm.bias_ih_l0  dot:  9583.9150390625    
module.lstm.bias_hh_l0  dot:  9583.9150390625 

[Version hakku][Epoch  1][Step  195/6933] Loss: -406.5345 [iq: 14.6573,ans: 23.3832,interp: 10.2981,fusion: -454.8731]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  211.50881958007812    
module.ans_embedding.weight  dot:  1353014.125    
module.lstm.weight_ih_l0  dot:  113324.1875    
module.lstm.weight_hh_l0  dot:  5411.0986328125    
module.lstm.bias_ih_l0  dot:  8842.787109375    
module.lstm.bias_hh_l0  dot:  8842.787109375    


[Version hakku][Epoch  1][Step  199/6933] Loss: -404.5577 [iq: 22.4601,ans: 34.2264,interp: 19.8977,fusion: -481.1419]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  317.8357849121094    
module.ans_embedding.weight  dot:  1201477.625    
module.lstm.weight_ih_l0  dot:  114031.7578125    
module.lstm.weight_hh_l0  dot:  5309.82666015625    
module.lstm.bias_ih_l0  dot:  8250.232421875    
module.lstm.bias_hh_l0  dot:  8250.232421875  

[Version hakku][Epoch  1][Step  203/6933] Loss: -433.1963 [iq: 16.5222,ans: 25.1929,interp: 10.2931,fusion: -485.2045]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  333.66473388671875    
module.ans_embedding.weight  dot:  798185.25    
module.lstm.weight_ih_l0  dot:  174569.453125    
module.lstm.weight_hh_l0  dot:  7934.9697265625    
module.lstm.bias_ih_l0  dot:  13808.7421875    
module.lstm.bias_hh_l0  dot:  13808.7421875    
mo

[Version hakku][Epoch  1][Step  207/6933] Loss: -442.3467 [iq: 16.3089,ans: 26.7323,interp: 11.2886,fusion: -496.6765]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  359.51025390625    
module.ans_embedding.weight  dot:  1137204.5    
module.lstm.weight_ih_l0  dot:  190214.25    
module.lstm.weight_hh_l0  dot:  8905.1298828125    
module.lstm.bias_ih_l0  dot:  15114.3984375    
module.lstm.bias_hh_l0  dot:  15114.3984375    
module.an

[Version hakku][Epoch  1][Step  211/6933] Loss: -435.7277 [iq: 20.4856,ans: 23.9053,interp: 16.2369,fusion: -496.3555]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  351.10150146484375    
module.ans_embedding.weight  dot:  1163076.875    
module.lstm.weight_ih_l0  dot:  136869.640625    
module.lstm.weight_hh_l0  dot:  6450.451171875    
module.lstm.bias_ih_l0  dot:  10587.1845703125    
module.lstm.bias_hh_l0  dot:  10587.1845703125

[Version hakku][Epoch  1][Step  215/6933] Loss: -445.4285 [iq: 19.7853,ans: 29.0504,interp: 14.0465,fusion: -508.3107]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  532.286865234375    
module.ans_embedding.weight  dot:  1108949.5    
module.lstm.weight_ih_l0  dot:  175473.515625    
module.lstm.weight_hh_l0  dot:  8287.078125    
module.lstm.bias_ih_l0  dot:  13298.16796875    
module.lstm.bias_hh_l0  dot:  13298.16796875    
module

[Version hakku][Epoch  1][Step  219/6933] Loss: -454.7981 [iq: 19.3888,ans: 24.8598,interp: 13.8864,fusion: -512.9331]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  371.91552734375    
module.ans_embedding.weight  dot:  886806.75    
module.lstm.weight_ih_l0  dot:  108822.03125    
module.lstm.weight_hh_l0  dot:  5525.36279296875    
module.lstm.bias_ih_l0  dot:  7736.298828125    
module.lstm.bias_hh_l0  dot:  7736.298828125    
mod

module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  17  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452383.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  223/6933] Loss: -458.1111 [iq: 14.9184,ans: 25.9018,interp: 9.0483,fusion: -507.9796]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
mod

[Version hakku][Epoch  1][Step  227/6933] Loss: -475.5763 [iq: 15.3484,ans: 24.6521,interp: 10.4268,fusion: -526.0035]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  525.5235595703125    
module.ans_embedding.weight  dot:  714793.25    
module.lstm.weight_ih_l0  dot:  142566.59375    
module.lstm.weight_hh_l0  dot:  6792.81689453125    
module.lstm.bias_ih_l0  dot:  10482.0556640625    
module.lstm.bias_hh_l0  dot:  10482.0556640625  

[Version hakku][Epoch  1][Step  231/6933] Loss: -475.0854 [iq: 16.8878,ans: 25.0556,interp: 13.6914,fusion: -530.7202]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  871.707275390625    
module.ans_embedding.weight  dot:  744466.5    
module.lstm.weight_ih_l0  dot:  172218.25    
module.lstm.weight_hh_l0  dot:  9094.369140625    
module.lstm.bias_ih_l0  dot:  11133.2861328125    
module.lstm.bias_hh_l0  dot:  11133.2861328125    
modu

[Version hakku][Epoch  1][Step  235/6933] Loss: -491.2316 [iq: 15.4106,ans: 22.9874,interp: 8.9298,fusion: -538.5593]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  714.6357421875    
module.ans_embedding.weight  dot:  470741.625    
module.lstm.weight_ih_l0  dot:  155893.5625    
module.lstm.weight_hh_l0  dot:  8133.34375    
module.lstm.bias_ih_l0  dot:  10429.7265625    
module.lstm.bias_hh_l0  dot:  10429.7265625    
module.ans_ls

[Version hakku][Epoch  1][Step  239/6933] Loss: -490.7462 [iq: 17.4135,ans: 25.3944,interp: 11.6116,fusion: -545.1658]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  782.0081787109375    
module.ans_embedding.weight  dot:  627633.75    
module.lstm.weight_ih_l0  dot:  152732.25    
module.lstm.weight_hh_l0  dot:  7367.134765625    
module.lstm.bias_ih_l0  dot:  10732.82421875    
module.lstm.bias_hh_l0  dot:  10732.82421875    
module

[Version hakku][Epoch  1][Step  243/6933] Loss: -497.1166 [iq: 12.8220,ans: 20.3759,interp: 10.9216,fusion: -541.2361]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  889.8275146484375    
module.ans_embedding.weight  dot:  551120.125    
module.lstm.weight_ih_l0  dot:  230405.796875    
module.lstm.weight_hh_l0  dot:  11987.359375    
module.lstm.bias_ih_l0  dot:  16639.99609375    
module.lstm.bias_hh_l0  dot:  16639.99609375    
mod

[Version hakku][Epoch  1][Step  247/6933] Loss: -490.1440 [iq: 18.2254,ans: 20.8030,interp: 12.1974,fusion: -541.3697]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  943.2442016601562    
module.ans_embedding.weight  dot:  651036.5625    
module.lstm.weight_ih_l0  dot:  211529.5625    
module.lstm.weight_hh_l0  dot:  10148.595703125    
module.lstm.bias_ih_l0  dot:  15252.3671875    
module.lstm.bias_hh_l0  dot:  15252.3671875    
mod

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452401.75

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  251/6933] Loss: -509.7182 [iq: 14.5685,ans: 18.9500,interp: 13.5848,fusion: -556.8215]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1051.18701171875    
module.ans_embedding.w

[Version hakku][Epoch  1][Step  255/6933] Loss: -501.8850 [iq: 17.9201,ans: 20.1341,interp: 13.6385,fusion: -553.5777]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1339.481201171875    
module.ans_embedding.weight  dot:  410571.0    
module.lstm.weight_ih_l0  dot:  271867.625    
module.lstm.weight_hh_l0  dot:  15513.318359375    
module.lstm.bias_ih_l0  dot:  18277.005859375    
module.lstm.bias_hh_l0  dot:  18277.005859375    
mod

[Version hakku][Epoch  1][Step  259/6933] Loss: -515.0745 [iq: 14.1723,ans: 18.7464,interp: 12.8102,fusion: -560.8033]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1624.387451171875    
module.ans_embedding.weight  dot:  397053.1875    
module.lstm.weight_ih_l0  dot:  167068.859375    
module.lstm.weight_hh_l0  dot:  10601.5576171875    
module.lstm.bias_ih_l0  dot:  10818.41796875    
module.lstm.bias_hh_l0  dot:  10818.41796875   

[Version hakku][Epoch  1][Step  263/6933] Loss: -527.3441 [iq: 16.8859,ans: 21.2164,interp: 10.8701,fusion: -576.3166]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1814.10205078125    
module.ans_embedding.weight  dot:  428212.71875    
module.lstm.weight_ih_l0  dot:  268691.5625    
module.lstm.weight_hh_l0  dot:  15993.8994140625    
module.lstm.bias_ih_l0  dot:  18438.3203125    
module.lstm.bias_hh_l0  dot:  18438.3203125    
mo

[Version hakku][Epoch  1][Step  267/6933] Loss: -527.2816 [iq: 14.6871,ans: 19.0718,interp: 14.3319,fusion: -575.3724]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1938.8890380859375    
module.ans_embedding.weight  dot:  278911.375    
module.lstm.weight_ih_l0  dot:  262688.84375    
module.lstm.weight_hh_l0  dot:  16668.142578125    
module.lstm.bias_ih_l0  dot:  17571.02734375    
module.lstm.bias_hh_l0  dot:  17571.02734375    



----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  271/6933] Loss: -529.3931 [iq: 13.3528,ans: 18.7366,interp: 10.0414,fusion: -571.5238]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2914.625    
module.ans_embedding.weight  dot:  378526.375    
module.lstm.weight_ih_l0  dot:  291495.625    
module.lstm.weight_hh_l0  dot:  20959.87890625    
module.lstm.bias_ih_l0  dot:  19509.03125    
module.lstm.bias_hh_l0  dot:  19

[Version hakku][Epoch  1][Step  275/6933] Loss: -537.3971 [iq: 12.0915,ans: 16.7768,interp: 9.3240,fusion: -575.5894]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3103.34375    
module.ans_embedding.weight  dot:  269898.5    
module.lstm.weight_ih_l0  dot:  252721.4375    
module.lstm.weight_hh_l0  dot:  18784.40625    
module.lstm.bias_ih_l0  dot:  14997.103515625    
module.lstm.bias_hh_l0  dot:  14997.103515625    
module.ans_lst

[Version hakku][Epoch  1][Step  279/6933] Loss: -548.2023 [iq: 12.6178,ans: 19.0986,interp: 9.9863,fusion: -589.9052]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4195.703125    
module.ans_embedding.weight  dot:  253185.1875    
module.lstm.weight_ih_l0  dot:  305254.4375    
module.lstm.weight_hh_l0  dot:  31617.44921875    
module.lstm.bias_ih_l0  dot:  16600.2421875    
module.lstm.bias_hh_l0  dot:  16600.2421875    
module.ans_

[Version hakku][Epoch  1][Step  283/6933] Loss: -536.8102 [iq: 17.6612,ans: 18.7424,interp: 13.0408,fusion: -586.2546]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4686.4150390625    
module.ans_embedding.weight  dot:  228266.875    
module.lstm.weight_ih_l0  dot:  298959.1875    
module.lstm.weight_hh_l0  dot:  26975.6796875    
module.lstm.bias_ih_l0  dot:  16852.5234375    
module.lstm.bias_hh_l0  dot:  16852.5234375    
module.a

[Version hakku][Epoch  1][Step  287/6933] Loss: -549.4780 [iq: 14.2026,ans: 17.4800,interp: 11.0437,fusion: -592.2043]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4472.94189453125    
module.ans_embedding.weight  dot:  433801.3125    
module.lstm.weight_ih_l0  dot:  635425.9375    
module.lstm.weight_hh_l0  dot:  29263.341796875    
module.lstm.bias_ih_l0  dot:  52462.20703125    
module.lstm.bias_hh_l0  dot:  52462.20703125    
mo

module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452431.75

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  291/6933] Loss: -564.2881 [iq: 13.3087,ans: 17.5261,interp: 9.6702,fusion: -604.7930]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weig

[Version hakku][Epoch  1][Step  295/6933] Loss: -547.0823 [iq: 14.1167,ans: 16.9490,interp: 12.4428,fusion: -590.5908]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6690.8203125    
module.ans_embedding.weight  dot:  271835.875    
module.lstm.weight_ih_l0  dot:  419059.65625    
module.lstm.weight_hh_l0  dot:  34296.0390625    
module.lstm.bias_ih_l0  dot:  27719.671875    
module.lstm.bias_hh_l0  dot:  27719.671875    
module.ans_l

module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452438.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  299/6933] Loss: -555.8971 [iq: 13.3928,ans: 16.1574,interp: 10.5990,fusion: -596.0463]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
mod

[Version hakku][Epoch  1][Step  303/6933] Loss: -561.7501 [iq: 15.2753,ans: 16.9225,interp: 11.3145,fusion: -605.2624]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  9339.7021484375    
module.ans_embedding.weight  dot:  194201.53125    
module.lstm.weight_ih_l0  dot:  429432.4375    
module.lstm.weight_hh_l0  dot:  63297.3359375    
module.lstm.bias_ih_l0  dot:  19679.015625    
module.lstm.bias_hh_l0  dot:  19679.015625    
module.a

[Version hakku][Epoch  1][Step  307/6933] Loss: -563.0848 [iq: 12.0601,ans: 14.9351,interp: 11.0722,fusion: -601.1522]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  8471.0478515625    
module.ans_embedding.weight  dot:  170066.40625    
module.lstm.weight_ih_l0  dot:  353508.0625    
module.lstm.weight_hh_l0  dot:  41282.9375    
module.lstm.bias_ih_l0  dot:  19989.05859375    
module.lstm.bias_hh_l0  dot:  19989.05859375    
module.

[Version hakku][Epoch  1][Step  311/6933] Loss: -552.7241 [iq: 11.1462,ans: 14.7050,interp: 9.3419,fusion: -587.9172]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  14740.7548828125    
module.ans_embedding.weight  dot:  258075.59375    
module.lstm.weight_ih_l0  dot:  624224.625    
module.lstm.weight_hh_l0  dot:  75080.953125    
module.lstm.bias_ih_l0  dot:  32239.703125    
module.lstm.bias_hh_l0  dot:  32239.703125    
module.ans

[Version hakku][Epoch  1][Step  315/6933] Loss: -570.4171 [iq: 13.8866,ans: 16.5394,interp: 11.8564,fusion: -612.6995]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  17881.650390625    
module.ans_embedding.weight  dot:  287038.28125    
module.lstm.weight_ih_l0  dot:  753158.375    
module.lstm.weight_hh_l0  dot:  108524.5234375    
module.lstm.bias_ih_l0  dot:  43105.58203125    
module.lstm.bias_hh_l0  dot:  43105.58203125    
modu

[Version hakku][Epoch  1][Step  319/6933] Loss: -574.7988 [iq: 11.5595,ans: 14.3743,interp: 9.6197,fusion: -610.3522]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  20778.18359375    
module.ans_embedding.weight  dot:  186615.40625    
module.lstm.weight_ih_l0  dot:  1313314.625    
module.lstm.weight_hh_l0  dot:  243178.578125    
module.lstm.bias_ih_l0  dot:  71810.9921875    
module.lstm.bias_hh_l0  dot:  71810.9921875    
module.a

[Version hakku][Epoch  1][Step  323/6933] Loss: -580.3611 [iq: 12.6512,ans: 14.7369,interp: 10.2150,fusion: -617.9643]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  24973.38671875    
module.ans_embedding.weight  dot:  174924.09375    
module.lstm.weight_ih_l0  dot:  757076.1875    
module.lstm.weight_hh_l0  dot:  140619.015625    
module.lstm.bias_ih_l0  dot:  38206.91796875    
module.lstm.bias_hh_l0  dot:  38206.91796875    
modul

[Version hakku][Epoch  1][Step  327/6933] Loss: -582.6220 [iq: 11.2147,ans: 13.5886,interp: 8.8543,fusion: -616.2796]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  30858.24609375    
module.ans_embedding.weight  dot:  182137.53125    
module.lstm.weight_ih_l0  dot:  830848.9375    
module.lstm.weight_hh_l0  dot:  228231.84375    
module.lstm.bias_ih_l0  dot:  39530.6171875    
module.lstm.bias_hh_l0  dot:  39530.6171875    
module.an

[Version hakku][Epoch  1][Step  331/6933] Loss: -588.4113 [iq: 12.4297,ans: 12.9871,interp: 10.4464,fusion: -624.2745]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  39912.19140625    
module.ans_embedding.weight  dot:  149189.5625    
module.lstm.weight_ih_l0  dot:  1124409.25    
module.lstm.weight_hh_l0  dot:  282196.375    
module.lstm.bias_ih_l0  dot:  53503.33984375    
module.lstm.bias_hh_l0  dot:  53503.33984375    
module.ans

[Version hakku][Epoch  1][Step  335/6933] Loss: -592.8856 [iq: 13.2389,ans: 12.8775,interp: 9.1942,fusion: -628.1962]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  40566.07421875    
module.ans_embedding.weight  dot:  153804.65625    
module.lstm.weight_ih_l0  dot:  956701.4375    
module.lstm.weight_hh_l0  dot:  301960.375    
module.lstm.bias_ih_l0  dot:  38417.3984375    
module.lstm.bias_hh_l0  dot:  38417.3984375    
module.ans_

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452473.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  339/6933] Loss: -591.0844 [iq: 11.1468,ans: 12.6301,interp: 9.8479,fusion: -624.7092]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  54893.17578125    
module.ans_embedding.weight  dot:  150059.875    
module.lstm.weight_ih_l0  dot:  1746224.25    
module.lstm.weight_hh_l0  dot:  510656.28125    
module.lstm.bias_ih_l

[Version hakku][Epoch  1][Step  343/6933] Loss: -584.7982 [iq: 12.3929,ans: 12.9732,interp: 11.2891,fusion: -621.4534]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  69013.6953125    
module.ans_embedding.weight  dot:  206408.171875    
module.lstm.weight_ih_l0  dot:  1182298.375    
module.lstm.weight_hh_l0  dot:  252753.90625    
module.lstm.bias_ih_l0  dot:  45290.96875    
module.lstm.bias_hh_l0  dot:  45290.96875    
module.ans_l

[Version hakku][Epoch  1][Step  347/6933] Loss: -598.3936 [iq: 9.2398,ans: 12.2021,interp: 8.8571,fusion: -628.6926]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  97414.46875    
module.ans_embedding.weight  dot:  103531.1484375    
module.lstm.weight_ih_l0  dot:  2897438.25    
module.lstm.weight_hh_l0  dot:  1155570.875    
module.lstm.bias_ih_l0  dot:  114683.546875    
module.lstm.bias_hh_l0  dot:  114683.546875    
module.ans_ls

[Version hakku][Epoch  1][Step  351/6933] Loss: -600.9882 [iq: 12.1526,ans: 11.8993,interp: 9.7436,fusion: -634.7836]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  125415.703125    
module.ans_embedding.weight  dot:  242671.953125    
module.lstm.weight_ih_l0  dot:  3267573.0    
module.lstm.weight_hh_l0  dot:  1708696.75    
module.lstm.bias_ih_l0  dot:  141968.171875    
module.lstm.bias_hh_l0  dot:  141968.171875    
module.ans_ls

[Version hakku][Epoch  1][Step  355/6933] Loss: -596.8127 [iq: 13.5368,ans: 12.3682,interp: 11.8736,fusion: -634.5914]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  197489.015625    
module.ans_embedding.weight  dot:  191485.34375    
module.lstm.weight_ih_l0  dot:  4129792.5    
module.lstm.weight_hh_l0  dot:  1573024.5    
module.lstm.bias_ih_l0  dot:  185181.40625    
module.lstm.bias_hh_l0  dot:  185181.40625    
module.ans_lstm.

[Version hakku][Epoch  1][Step  359/6933] Loss: -605.0215 [iq: 12.9267,ans: 12.7689,interp: 12.6376,fusion: -643.3548]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  165569.6875    
module.ans_embedding.weight  dot:  185064.6875    
module.lstm.weight_ih_l0  dot:  4199825.0    
module.lstm.weight_hh_l0  dot:  1825412.75    
module.lstm.bias_ih_l0  dot:  221887.59375    
module.lstm.bias_hh_l0  dot:  221887.59375    
module.ans_lstm.we

[Version hakku][Epoch  1][Step  363/6933] Loss: -617.2994 [iq: 12.2607,ans: 12.1804,interp: 11.4289,fusion: -653.1694]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  202693.625    
module.ans_embedding.weight  dot:  154866.09375    
module.lstm.weight_ih_l0  dot:  4635979.0    
module.lstm.weight_hh_l0  dot:  2155394.5    
module.lstm.bias_ih_l0  dot:  225274.859375    
module.lstm.bias_hh_l0  dot:  225274.859375    
module.ans_lstm.w

[Version hakku][Epoch  1][Step  367/6933] Loss: -627.4234 [iq: 11.4198,ans: 11.1808,interp: 9.6178,fusion: -659.6418]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  230807.515625    
module.ans_embedding.weight  dot:  109295.2265625    
module.lstm.weight_ih_l0  dot:  5758881.5    
module.lstm.weight_hh_l0  dot:  1643249.25    
module.lstm.bias_ih_l0  dot:  296078.8125    
module.lstm.bias_hh_l0  dot:  296078.8125    
module.ans_lstm.

[Version hakku][Epoch  1][Step  370/6933] Loss: -619.7704 [iq: 11.4108,ans: 11.2651,interp: 9.4088,fusion: -651.8551]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  229205.71875    
module.ans_embedding.weight  dot:  221937.96875    
module.lstm.weight_ih_l0  dot:  6186970.0    
module.lstm.weight_hh_l0  dot:  1225520.5    
module.lstm.bias_ih_l0  dot:  364668.53125    
module.lstm.bias_hh_l0  dot:  364668.53125    
module.ans_lstm.we

[Version hakku][Epoch  1][Step  374/6933] Loss: -625.6906 [iq: 9.9921,ans: 11.4315,interp: 8.9794,fusion: -656.0935]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  285209.375    
module.ans_embedding.weight  dot:  118325.8359375    
module.lstm.weight_ih_l0  dot:  5503416.0    
module.lstm.weight_hh_l0  dot:  1141747.0    
module.lstm.bias_ih_l0  dot:  336826.9375    
module.lstm.bias_hh_l0  dot:  336826.9375    
module.ans_lstm.weigh

[Version hakku][Epoch  1][Step  378/6933] Loss: -653.4945 [iq: 11.1414,ans: 11.9847,interp: 9.7570,fusion: -686.3776]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  397271.3125    
module.ans_embedding.weight  dot:  221247.046875    
module.lstm.weight_ih_l0  dot:  4338555.5    
module.lstm.weight_hh_l0  dot:  1424851.75    
module.lstm.bias_ih_l0  dot:  213412.65625    
module.lstm.bias_hh_l0  dot:  213412.65625    
module.ans_lstm.w

[Version hakku][Epoch  1][Step  382/6933] Loss: -635.9786 [iq: 11.3175,ans: 11.0121,interp: 9.9197,fusion: -668.2279]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  284482.84375    
module.ans_embedding.weight  dot:  189167.625    
module.lstm.weight_ih_l0  dot:  2832945.0    
module.lstm.weight_hh_l0  dot:  942691.6875    
module.lstm.bias_ih_l0  dot:  115148.046875    
module.lstm.bias_hh_l0  dot:  115148.046875    
module.ans_lstm.

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452518.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  386/6933] Loss: -667.6873 [iq: 9.4231,ans: 10.0273,interp: 8.4771,fusion: -695.6147]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  354027.4375    
module.ans_embedding.weight  dot:  430050.8125    
module.lstm.weight_ih_l0  dot:  9709410.0    
module.lstm.weight_hh_l0  dot:  1747857.5    
module.lstm.bias_ih_l0  dot: 

[Version hakku][Epoch  1][Step  390/6933] Loss: -644.7365 [iq: 12.0527,ans: 11.9850,interp: 10.8897,fusion: -679.6639]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  598485.6875    
module.ans_embedding.weight  dot:  293755.40625    
module.lstm.weight_ih_l0  dot:  5159307.0    
module.lstm.weight_hh_l0  dot:  1533292.75    
module.lstm.bias_ih_l0  dot:  234118.78125    
module.lstm.bias_hh_l0  dot:  234118.78125    
module.ans_lstm.w

[Version hakku][Epoch  1][Step  394/6933] Loss: -664.9534 [iq: 8.8251,ans: 9.6778,interp: 7.9410,fusion: -691.3972]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1026921.5    
module.ans_embedding.weight  dot:  237409.140625    
module.lstm.weight_ih_l0  dot:  22760400.0    
module.lstm.weight_hh_l0  dot:  6360850.5    
module.lstm.bias_ih_l0  dot:  1425700.0    
module.lstm.bias_hh_l0  dot:  1425700.0    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  398/6933] Loss: -660.4463 [iq: 11.7111,ans: 10.9042,interp: 8.5787,fusion: -691.6403]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  763765.75    
module.ans_embedding.weight  dot:  408610.78125    
module.lstm.weight_ih_l0  dot:  34373912.0    
module.lstm.weight_hh_l0  dot:  2494620.5    
module.lstm.bias_ih_l0  dot:  2436187.0    
module.lstm.bias_hh_l0  dot:  2436187.0    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  402/6933] Loss: -676.6677 [iq: 13.9267,ans: 11.4148,interp: 11.2686,fusion: -713.2778]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1853476.75    
module.ans_embedding.weight  dot:  314904.71875    
module.lstm.weight_ih_l0  dot:  98337200.0    
module.lstm.weight_hh_l0  dot:  7337005.0    
module.lstm.bias_ih_l0  dot:  7136620.0    
module.lstm.bias_hh_l0  dot:  7136620.0    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  406/6933] Loss: -671.4876 [iq: 10.2462,ans: 9.9713,interp: 9.6462,fusion: -701.3512]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  591717.125    
module.ans_embedding.weight  dot:  861676.3125    
module.lstm.weight_ih_l0  dot:  20519996.0    
module.lstm.weight_hh_l0  dot:  2497881.0    
module.lstm.bias_ih_l0  dot:  1326821.625    
module.lstm.bias_hh_l0  dot:  1326821.625    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step  410/6933] Loss: -692.9707 [iq: 12.2678,ans: 11.6458,interp: 10.6702,fusion: -727.5545]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1522063.875    
module.ans_embedding.weight  dot:  430709.6875    
module.lstm.weight_ih_l0  dot:  16783700.0    
module.lstm.weight_hh_l0  dot:  5015337.0    
module.lstm.bias_ih_l0  dot:  780175.75    
module.lstm.bias_hh_l0  dot:  780175.75    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  414/6933] Loss: -703.3603 [iq: 12.3509,ans: 11.7195,interp: 12.0821,fusion: -739.5128]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1136200.5    
module.ans_embedding.weight  dot:  558517.25    
module.lstm.weight_ih_l0  dot:  8342025.5    
module.lstm.weight_hh_l0  dot:  2899405.5    
module.lstm.bias_ih_l0  dot:  133704.3125    
module.lstm.bias_hh_l0  dot:  133704.3125    
module.ans_lstm.weight_ih

module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452546.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  418/6933] Loss: -703.0875 [iq: 11.6034,ans: 9.6391,interp: 8.3961,fusion: -732.7262]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias

[Version hakku][Epoch  1][Step  422/6933] Loss: -698.5793 [iq: 14.3962,ans: 11.5025,interp: 10.7599,fusion: -735.2379]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6484947.0    
module.ans_embedding.weight  dot:  592802.5    
module.lstm.weight_ih_l0  dot:  175639904.0    
module.lstm.weight_hh_l0  dot:  32638898.0    
module.lstm.bias_ih_l0  dot:  11402536.0    
module.lstm.bias_hh_l0  dot:  11402536.0    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  426/6933] Loss: -737.3248 [iq: 10.2477,ans: 9.8831,interp: 9.1772,fusion: -766.6328]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  913150.375    
module.ans_embedding.weight  dot:  668359.875    
module.lstm.weight_ih_l0  dot:  5965280.0    
module.lstm.weight_hh_l0  dot:  2509195.0    
module.lstm.bias_ih_l0  dot:  109693.5625    
module.lstm.bias_hh_l0  dot:  109693.5625    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  430/6933] Loss: -711.5820 [iq: 11.3238,ans: 11.0792,interp: 10.6837,fusion: -744.6687]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3297313.75    
module.ans_embedding.weight  dot:  721006.0    
module.lstm.weight_ih_l0  dot:  39581112.0    
module.lstm.weight_hh_l0  dot:  7114795.5    
module.lstm.bias_ih_l0  dot:  2237337.5    
module.lstm.bias_hh_l0  dot:  2237337.5    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  434/6933] Loss: -745.1240 [iq: 10.3994,ans: 9.8570,interp: 8.8245,fusion: -774.2048]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  544509.4375    
module.ans_embedding.weight  dot:  1154469.25    
module.lstm.weight_ih_l0  dot:  31303304.0    
module.lstm.weight_hh_l0  dot:  3478977.75    
module.lstm.bias_ih_l0  dot:  2111747.0    
module.lstm.bias_hh_l0  dot:  2111747.0    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  438/6933] Loss: -727.7452 [iq: 11.3762,ans: 9.8522,interp: 9.5321,fusion: -758.5057]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2689099.25    
module.ans_embedding.weight  dot:  873677.875    
module.lstm.weight_ih_l0  dot:  60851808.0    
module.lstm.weight_hh_l0  dot:  4984290.5    
module.lstm.bias_ih_l0  dot:  3643419.0    
module.lstm.bias_hh_l0  dot:  3643419.0    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  442/6933] Loss: -715.6427 [iq: 8.8372,ans: 8.8585,interp: 8.2805,fusion: -741.6188]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  8813217.0    
module.ans_embedding.weight  dot:  710823.375    
module.lstm.weight_ih_l0  dot:  252600176.0    
module.lstm.weight_hh_l0  dot:  36530136.0    
module.lstm.bias_ih_l0  dot:  16811118.0    
module.lstm.bias_hh_l0  dot:  16811118.0    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  445/6933] Loss: -741.5770 [iq: 8.5455,ans: 8.1467,interp: 8.3619,fusion: -766.6311]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2280567.0    
module.ans_embedding.weight  dot:  786078.5625    
module.lstm.weight_ih_l0  dot:  38245164.0    
module.lstm.weight_hh_l0  dot:  6170921.5    
module.lstm.bias_ih_l0  dot:  2262827.75    
module.lstm.bias_hh_l0  dot:  2262827.75    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  449/6933] Loss: -744.7769 [iq: 11.4263,ans: 9.9127,interp: 10.4240,fusion: -776.5398]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2352959.75    
module.ans_embedding.weight  dot:  1052868.25    
module.lstm.weight_ih_l0  dot:  25203436.0    
module.lstm.weight_hh_l0  dot:  7429800.5    
module.lstm.bias_ih_l0  dot:  1189744.5    
module.lstm.bias_hh_l0  dot:  1189744.5    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  453/6933] Loss: -729.6428 [iq: 11.9144,ans: 10.3656,interp: 10.9261,fusion: -762.8488]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  12114380.0    
module.ans_embedding.weight  dot:  719234.125    
module.lstm.weight_ih_l0  dot:  155764000.0    
module.lstm.weight_hh_l0  dot:  30678706.0    
module.lstm.bias_ih_l0  dot:  9001200.0    
module.lstm.bias_hh_l0  dot:  9001200.0    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  457/6933] Loss: -760.4559 [iq: 9.1743,ans: 8.5161,interp: 8.5995,fusion: -786.7457]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  561619.0    
module.ans_embedding.weight  dot:  647218.25    
module.lstm.weight_ih_l0  dot:  11353817.0    
module.lstm.weight_hh_l0  dot:  1810832.75    
module.lstm.bias_ih_l0  dot:  683624.125    
module.lstm.bias_hh_l0  dot:  683624.125    
module.ans_lstm.weight_ih_l0 

[Version hakku][Epoch  1][Step  461/6933] Loss: -749.9094 [iq: 11.2024,ans: 9.6131,interp: 9.2423,fusion: -779.9672]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6053277.5    
module.ans_embedding.weight  dot:  1062842.75    
module.lstm.weight_ih_l0  dot:  62335024.0    
module.lstm.weight_hh_l0  dot:  11926478.0    
module.lstm.bias_ih_l0  dot:  2989309.0    
module.lstm.bias_hh_l0  dot:  2989309.0    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  465/6933] Loss: -776.4067 [iq: 12.5956,ans: 9.5147,interp: 9.6211,fusion: -808.1381]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3511561.25    
module.ans_embedding.weight  dot:  822718.375    
module.lstm.weight_ih_l0  dot:  90836576.0    
module.lstm.weight_hh_l0  dot:  8122164.0    
module.lstm.bias_ih_l0  dot:  6013706.0    
module.lstm.bias_hh_l0  dot:  6013706.0    
module.ans_lstm.weight_ih_l0

module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452582.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  469/6933] Loss: -781.5250 [iq: 8.3220,ans: 8.7843,interp: 8.3898,fusion: -807.0211]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    N


----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  473/6933] Loss: -782.1883 [iq: 9.6419,ans: 8.3049,interp: 8.5628,fusion: -808.6980]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  20174714.0    
module.ans_embedding.weight  dot:  981935.75    
module.lstm.weight_ih_l0  dot:  316342208.0    
module.lstm.weight_hh_l0  dot:  46759760.0    
module.lstm.bias_ih_l0  dot:  21488672.0    
module.lstm.bias_hh_l0  dot:  21488672

[Version hakku][Epoch  1][Step  477/6933] Loss: -744.5490 [iq: 11.4935,ans: 9.4423,interp: 9.7615,fusion: -775.2462]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1734668.0    
module.ans_embedding.weight  dot:  1237807.5    
module.lstm.weight_ih_l0  dot:  23491436.0    
module.lstm.weight_hh_l0  dot:  4526763.5    
module.lstm.bias_ih_l0  dot:  1451880.75    
module.lstm.bias_hh_l0  dot:  1451880.75    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  481/6933] Loss: -749.2629 [iq: 8.1872,ans: 8.5332,interp: 8.3204,fusion: -774.3036]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  5684254.5    
module.ans_embedding.weight  dot:  1073575.5    
module.lstm.weight_ih_l0  dot:  102619616.0    
module.lstm.weight_hh_l0  dot:  17218824.0    
module.lstm.bias_ih_l0  dot:  6438484.0    
module.lstm.bias_hh_l0  dot:  6438484.0    
module.ans_lstm.weight_ih_l0 

[Version hakku][Epoch  1][Step  484/6933] Loss: -778.8770 [iq: 10.7567,ans: 9.3961,interp: 8.5466,fusion: -807.5762]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2964572.0    
module.ans_embedding.weight  dot:  1448500.5    
module.lstm.weight_ih_l0  dot:  36820872.0    
module.lstm.weight_hh_l0  dot:  5585377.0    
module.lstm.bias_ih_l0  dot:  2094068.625    
module.lstm.bias_hh_l0  dot:  2094068.625    
module.ans_lstm.weight_ih_

module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452594.75

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  487/6933] Loss: -765.9350 [iq: 14.3134,ans: 10.4861,interp: 10.2861,fusion: -801.0206]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.wei

[Version hakku][Epoch  1][Step  491/6933] Loss: -755.3730 [iq: 9.2591,ans: 8.6787,interp: 8.7751,fusion: -782.0860]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4739780.5    
module.ans_embedding.weight  dot:  2118172.0    
module.lstm.weight_ih_l0  dot:  28936002.0    
module.lstm.weight_hh_l0  dot:  7376527.5    
module.lstm.bias_ih_l0  dot:  1388867.5    
module.lstm.bias_hh_l0  dot:  1388867.5    
module.ans_lstm.weight_ih_l0  d

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452599.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  495/6933] Loss: -758.5987 [iq: 12.4508,ans: 10.0904,interp: 11.5069,fusion: -792.6468]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  641262.5    
module.ans_embedding.weight  dot:  1329324.875    
module.lstm.weight_ih_l0  dot:  10599054.0    
module.lstm.weight_hh_l0  dot:  4189565.75    
module.lstm.bias_ih_l0  dot

[Version hakku][Epoch  1][Step  499/6933] Loss: -798.5118 [iq: 9.9506,ans: 9.7328,interp: 9.1326,fusion: -827.3278]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  457870.84375    
module.ans_embedding.weight  dot:  1531491.75    
module.lstm.weight_ih_l0  dot:  1945851.25    
module.lstm.weight_hh_l0  dot:  854161.5625    
module.lstm.bias_ih_l0  dot:  90917.015625    
module.lstm.bias_hh_l0  dot:  90917.015625    
module.ans_lstm.wei

[Version hakku][Epoch  1][Step  503/6933] Loss: -793.1208 [iq: 9.7510,ans: 9.4805,interp: 10.1386,fusion: -822.4908]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  13335653.0    
module.ans_embedding.weight  dot:  1108879.5    
module.lstm.weight_ih_l0  dot:  63921460.0    
module.lstm.weight_hh_l0  dot:  9141542.0    
module.lstm.bias_ih_l0  dot:  657373.375    
module.lstm.bias_hh_l0  dot:  657373.375    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  507/6933] Loss: -787.1838 [iq: 11.2590,ans: 9.3942,interp: 9.2625,fusion: -817.0995]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2060812.625    
module.ans_embedding.weight  dot:  944221.0    
module.lstm.weight_ih_l0  dot:  8415854.0    
module.lstm.weight_hh_l0  dot:  3243903.0    
module.lstm.bias_ih_l0  dot:  236645.9375    
module.lstm.bias_hh_l0  dot:  236645.9375    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  511/6933] Loss: -767.5863 [iq: 10.7148,ans: 8.6582,interp: 8.6031,fusion: -795.5624]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  747614.0625    
module.ans_embedding.weight  dot:  932961.1875    
module.lstm.weight_ih_l0  dot:  3759124.5    
module.lstm.weight_hh_l0  dot:  1841650.75    
module.lstm.bias_ih_l0  dot:  158798.21875    
module.lstm.bias_hh_l0  dot:  158798.21875    
module.ans_lstm.weig

[Version hakku][Epoch  1][Step  515/6933] Loss: -810.7844 [iq: 9.0718,ans: 8.7088,interp: 9.0854,fusion: -837.6504]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1507975.875    
module.ans_embedding.weight  dot:  1367006.125    
module.lstm.weight_ih_l0  dot:  18820242.0    
module.lstm.weight_hh_l0  dot:  5468317.0    
module.lstm.bias_ih_l0  dot:  1153219.5    
module.lstm.bias_hh_l0  dot:  1153219.5    
module.ans_lstm.weight_ih_l

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452614.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  519/6933] Loss: -806.9862 [iq: 9.5350,ans: 8.8412,interp: 9.4114,fusion: -834.7737]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight 

[Version hakku][Epoch  1][Step  523/6933] Loss: -814.5839 [iq: 9.4703,ans: 8.3247,interp: 9.4040,fusion: -841.7829]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1200628.125    
module.ans_embedding.weight  dot:  1063612.875    
module.lstm.weight_ih_l0  dot:  6159940.0    
module.lstm.weight_hh_l0  dot:  2861660.0    
module.lstm.bias_ih_l0  dot:  249335.5    
module.lstm.bias_hh_l0  dot:  249335.5    
module.ans_lstm.weight_ih_l0  

module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452620.125

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  527/6933] Loss: -792.1409 [iq: 11.7430,ans: 8.9320,interp: 8.9606,fusion: -821.7766]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    

[Version hakku][Epoch  1][Step  531/6933] Loss: -827.5744 [iq: 9.3603,ans: 8.8071,interp: 8.7854,fusion: -854.5272]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  452963.25    
module.ans_embedding.weight  dot:  1536395.25    
module.lstm.weight_ih_l0  dot:  13015166.0    
module.lstm.weight_hh_l0  dot:  4549964.0    
module.lstm.bias_ih_l0  dot:  819752.875    
module.lstm.bias_hh_l0  dot:  819752.875    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  535/6933] Loss: -789.8673 [iq: 12.3687,ans: 9.1661,interp: 9.5041,fusion: -820.9062]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4876040.5    
module.ans_embedding.weight  dot:  1066130.75    
module.lstm.weight_ih_l0  dot:  147068512.0    
module.lstm.weight_hh_l0  dot:  12938473.0    
module.lstm.bias_ih_l0  dot:  8706757.0    
module.lstm.bias_hh_l0  dot:  8706757.0    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  539/6933] Loss: -795.7589 [iq: 11.5330,ans: 9.7035,interp: 10.1523,fusion: -827.1477]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  425285.6875    
module.ans_embedding.weight  dot:  1127562.125    
module.lstm.weight_ih_l0  dot:  10764786.0    
module.lstm.weight_hh_l0  dot:  5441485.0    
module.lstm.bias_ih_l0  dot:  744552.5    
module.lstm.bias_hh_l0  dot:  744552.5    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  543/6933] Loss: -836.1661 [iq: 10.4740,ans: 8.6824,interp: 8.1848,fusion: -863.5073]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  826478.5625    
module.ans_embedding.weight  dot:  1091279.75    
module.lstm.weight_ih_l0  dot:  7313216.0    
module.lstm.weight_hh_l0  dot:  2131269.25    
module.lstm.bias_ih_l0  dot:  308949.9375    
module.lstm.bias_hh_l0  dot:  308949.9375    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step  547/6933] Loss: -813.3806 [iq: 11.4564,ans: 9.4714,interp: 9.1427,fusion: -843.4511]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  373502.875    
module.ans_embedding.weight  dot:  1361551.0    
module.lstm.weight_ih_l0  dot:  2660146.75    
module.lstm.weight_hh_l0  dot:  1143905.375    
module.lstm.bias_ih_l0  dot:  110768.484375    
module.lstm.bias_hh_l0  dot:  110768.484375    
module.ans_lstm.wei

[Version hakku][Epoch  1][Step  551/6933] Loss: -786.5781 [iq: 10.5303,ans: 9.7626,interp: 11.8007,fusion: -818.6717]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1592789.125    
module.ans_embedding.weight  dot:  1220780.0    
module.lstm.weight_ih_l0  dot:  18779132.0    
module.lstm.weight_hh_l0  dot:  3497743.75    
module.lstm.bias_ih_l0  dot:  961219.125    
module.lstm.bias_hh_l0  dot:  961219.125    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  555/6933] Loss: -807.2782 [iq: 10.3607,ans: 7.5830,interp: 9.1976,fusion: -834.4194]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  880516.75    
module.ans_embedding.weight  dot:  1083147.375    
module.lstm.weight_ih_l0  dot:  3813255.75    
module.lstm.weight_hh_l0  dot:  1608957.625    
module.lstm.bias_ih_l0  dot:  118673.7421875    
module.lstm.bias_hh_l0  dot:  118673.7421875    
module.ans_lstm.

[Version hakku][Epoch  1][Step  559/6933] Loss: -788.3686 [iq: 10.8472,ans: 8.9431,interp: 11.5356,fusion: -819.6945]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  919523.5625    
module.ans_embedding.weight  dot:  1669327.875    
module.lstm.weight_ih_l0  dot:  4676632.0    
module.lstm.weight_hh_l0  dot:  2798791.5    
module.lstm.bias_ih_l0  dot:  174957.71875    
module.lstm.bias_hh_l0  dot:  174957.71875    
module.ans_lstm.weig

[Version hakku][Epoch  1][Step  563/6933] Loss: -821.4415 [iq: 11.4990,ans: 8.6810,interp: 9.0873,fusion: -850.7088]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1479321.75    
module.ans_embedding.weight  dot:  900389.875    
module.lstm.weight_ih_l0  dot:  18754566.0    
module.lstm.weight_hh_l0  dot:  3220816.75    
module.lstm.bias_ih_l0  dot:  1148404.25    
module.lstm.bias_hh_l0  dot:  1148404.25    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  567/6933] Loss: -822.8224 [iq: 8.9160,ans: 8.4498,interp: 8.0112,fusion: -848.1993]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6692206.0    
module.ans_embedding.weight  dot:  1034165.375    
module.lstm.weight_ih_l0  dot:  294132992.0    
module.lstm.weight_hh_l0  dot:  26206544.0    
module.lstm.bias_ih_l0  dot:  16918556.0    
module.lstm.bias_hh_l0  dot:  16918556.0    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  571/6933] Loss: -799.2667 [iq: 11.0252,ans: 8.7014,interp: 8.5919,fusion: -827.5853]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2311626.0    
module.ans_embedding.weight  dot:  1563979.5    
module.lstm.weight_ih_l0  dot:  22082988.0    
module.lstm.weight_hh_l0  dot:  4537286.0    
module.lstm.bias_ih_l0  dot:  1153322.75    
module.lstm.bias_hh_l0  dot:  1153322.75    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  575/6933] Loss: -696.1163 [iq: 11.6913,ans: 9.7179,interp: 10.3176,fusion: -727.8430]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4487341.0    
module.ans_embedding.weight  dot:  1169392.625    
module.lstm.weight_ih_l0  dot:  102409904.0    
module.lstm.weight_hh_l0  dot:  23592304.0    
module.lstm.bias_ih_l0  dot:  6917555.0    
module.lstm.bias_hh_l0  dot:  6917555.0    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  579/6933] Loss: -822.8895 [iq: 8.9054,ans: 8.0612,interp: 8.0317,fusion: -847.8878]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6752364.5    
module.ans_embedding.weight  dot:  1562663.5    
module.lstm.weight_ih_l0  dot:  113458736.0    
module.lstm.weight_hh_l0  dot:  8953020.0    
module.lstm.bias_ih_l0  dot:  5226392.5    
module.lstm.bias_hh_l0  dot:  5226392.5    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  583/6933] Loss: -830.6541 [iq: 10.7854,ans: 8.4442,interp: 8.0627,fusion: -857.9464]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2838211.25    
module.ans_embedding.weight  dot:  1133634.75    
module.lstm.weight_ih_l0  dot:  62025832.0    
module.lstm.weight_hh_l0  dot:  7752058.0    
module.lstm.bias_ih_l0  dot:  4786405.0    
module.lstm.bias_hh_l0  dot:  4786405.0    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  587/6933] Loss: -841.4775 [iq: 7.8296,ans: 8.2591,interp: 8.0817,fusion: -865.6478]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  504504.75    
module.ans_embedding.weight  dot:  1374786.0    
module.lstm.weight_ih_l0  dot:  9362054.0    
module.lstm.weight_hh_l0  dot:  1486753.75    
module.lstm.bias_ih_l0  dot:  631846.75    
module.lstm.bias_hh_l0  dot:  631846.75    
module.ans_lstm.weight_ih_l0  d

[Version hakku][Epoch  1][Step  591/6933] Loss: -842.6597 [iq: 11.1547,ans: 8.3084,interp: 8.2027,fusion: -870.3254]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3620198.5    
module.ans_embedding.weight  dot:  1237065.5    
module.lstm.weight_ih_l0  dot:  43326084.0    
module.lstm.weight_hh_l0  dot:  6925321.5    
module.lstm.bias_ih_l0  dot:  2282024.5    
module.lstm.bias_hh_l0  dot:  2282024.5    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  595/6933] Loss: -833.2371 [iq: 9.6903,ans: 8.5832,interp: 11.6282,fusion: -863.1388]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4305305.5    
module.ans_embedding.weight  dot:  1563795.875    
module.lstm.weight_ih_l0  dot:  49058400.0    
module.lstm.weight_hh_l0  dot:  10456875.0    
module.lstm.bias_ih_l0  dot:  2794826.5    
module.lstm.bias_hh_l0  dot:  2794826.5    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  599/6933] Loss: -854.5499 [iq: 9.5463,ans: 7.5784,interp: 7.2592,fusion: -878.9339]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  151858.71875    
module.ans_embedding.weight  dot:  1198227.75    
module.lstm.weight_ih_l0  dot:  7433584.5    
module.lstm.weight_hh_l0  dot:  1693141.5    
module.lstm.bias_ih_l0  dot:  606829.4375    
module.lstm.bias_hh_l0  dot:  606829.4375    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  603/6933] Loss: -802.4435 [iq: 10.0301,ans: 8.7965,interp: 11.2644,fusion: -832.5344]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  233166.640625    
module.ans_embedding.weight  dot:  1599846.625    
module.lstm.weight_ih_l0  dot:  1475648.25    
module.lstm.weight_hh_l0  dot:  991060.25    
module.lstm.bias_ih_l0  dot:  56478.609375    
module.lstm.bias_hh_l0  dot:  56478.609375    
module.ans_lstm.w

[Version hakku][Epoch  1][Step  607/6933] Loss: -810.1124 [iq: 11.5169,ans: 8.8703,interp: 9.2853,fusion: -839.7850]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  470449.3125    
module.ans_embedding.weight  dot:  879090.5625    
module.lstm.weight_ih_l0  dot:  3720897.5    
module.lstm.weight_hh_l0  dot:  1363541.875    
module.lstm.bias_ih_l0  dot:  171968.4375    
module.lstm.bias_hh_l0  dot:  171968.4375    
module.ans_lstm.weigh

[Version hakku][Epoch  1][Step  611/6933] Loss: -857.6496 [iq: 8.0763,ans: 7.5946,interp: 8.7912,fusion: -882.1118]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  238681.0    
module.ans_embedding.weight  dot:  828559.1875    
module.lstm.weight_ih_l0  dot:  1851403.75    
module.lstm.weight_hh_l0  dot:  839092.75    
module.lstm.bias_ih_l0  dot:  71145.4140625    
module.lstm.bias_hh_l0  dot:  71145.4140625    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  615/6933] Loss: -861.4007 [iq: 11.4290,ans: 9.1676,interp: 9.0312,fusion: -891.0284]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1969253.75    
module.ans_embedding.weight  dot:  1034259.75    
module.lstm.weight_ih_l0  dot:  53290504.0    
module.lstm.weight_hh_l0  dot:  6850398.5    
module.lstm.bias_ih_l0  dot:  3406716.5    
module.lstm.bias_hh_l0  dot:  3406716.5    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  619/6933] Loss: -859.4165 [iq: 7.7015,ans: 7.4376,interp: 8.7624,fusion: -883.3181]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  277752.5625    
module.ans_embedding.weight  dot:  1150484.5    
module.lstm.weight_ih_l0  dot:  5049055.0    
module.lstm.weight_hh_l0  dot:  4579326.0    
module.lstm.bias_ih_l0  dot:  292737.125    
module.lstm.bias_hh_l0  dot:  292737.125    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  623/6933] Loss: -845.9506 [iq: 10.7215,ans: 8.2258,interp: 8.3269,fusion: -873.2248]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2760652.5    
module.ans_embedding.weight  dot:  1193605.875    
module.lstm.weight_ih_l0  dot:  89976496.0    
module.lstm.weight_hh_l0  dot:  8380645.0    
module.lstm.bias_ih_l0  dot:  5878954.0    
module.lstm.bias_hh_l0  dot:  5878954.0    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  627/6933] Loss: -856.4646 [iq: 8.0824,ans: 7.8092,interp: 8.8355,fusion: -881.1917]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  190565.40625    
module.ans_embedding.weight  dot:  1198925.875    
module.lstm.weight_ih_l0  dot:  5060812.5    
module.lstm.weight_hh_l0  dot:  3370996.75    
module.lstm.bias_ih_l0  dot:  349757.8125    
module.lstm.bias_hh_l0  dot:  349757.8125    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  631/6933] Loss: -826.2516 [iq: 10.4905,ans: 8.7056,interp: 9.1366,fusion: -854.5843]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1305784.125    
module.ans_embedding.weight  dot:  1156348.75    
module.lstm.weight_ih_l0  dot:  4339232.0    
module.lstm.weight_hh_l0  dot:  1935836.75    
module.lstm.bias_ih_l0  dot:  132438.09375    
module.lstm.bias_hh_l0  dot:  132438.09375    
module.ans_lstm.weigh


----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  635/6933] Loss: -844.6124 [iq: 7.7366,ans: 7.8243,interp: 7.4904,fusion: -867.6636]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1799343.375    
module.ans_embedding.weight  dot:  1799511.5    
module.lstm.weight_ih_l0  dot:  30288810.0    
module.lstm.weight_hh_l0  dot:  3483498.5    
module.lstm.bias_ih_l0  dot:  1868999.5    
module.lstm.bias_hh_l0  dot:  1868999.5 

[Version hakku][Epoch  1][Step  639/6933] Loss: -837.3693 [iq: 7.6528,ans: 7.4837,interp: 7.5247,fusion: -860.0304]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  394210.9375    
module.ans_embedding.weight  dot:  1086151.125    
module.lstm.weight_ih_l0  dot:  8821157.0    
module.lstm.weight_hh_l0  dot:  5387311.0    
module.lstm.bias_ih_l0  dot:  507563.6875    
module.lstm.bias_hh_l0  dot:  507563.6875    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  643/6933] Loss: -784.4398 [iq: 10.3229,ans: 9.4455,interp: 10.9435,fusion: -815.1517]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  459796.53125    
module.ans_embedding.weight  dot:  2147934.5    
module.lstm.weight_ih_l0  dot:  14044514.0    
module.lstm.weight_hh_l0  dot:  3540893.0    
module.lstm.bias_ih_l0  dot:  886912.375    
module.lstm.bias_hh_l0  dot:  886912.375    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  647/6933] Loss: -819.9430 [iq: 9.4201,ans: 8.0568,interp: 8.1670,fusion: -845.5870]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1260978.75    
module.ans_embedding.weight  dot:  1044033.875    
module.lstm.weight_ih_l0  dot:  11745653.0    
module.lstm.weight_hh_l0  dot:  2188930.25    
module.lstm.bias_ih_l0  dot:  525138.375    
module.lstm.bias_hh_l0  dot:  525138.375    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step  651/6933] Loss: -839.9896 [iq: 9.1403,ans: 8.6764,interp: 8.9500,fusion: -866.7562]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  174028.15625    
module.ans_embedding.weight  dot:  905554.0625    
module.lstm.weight_ih_l0  dot:  2418221.75    
module.lstm.weight_hh_l0  dot:  2250481.25    
module.lstm.bias_ih_l0  dot:  137769.390625    
module.lstm.bias_hh_l0  dot:  137769.390625    
module.ans_lstm.w

[Version hakku][Epoch  1][Step  655/6933] Loss: -818.6575 [iq: 11.0153,ans: 9.6227,interp: 9.6045,fusion: -848.9000]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  323816.53125    
module.ans_embedding.weight  dot:  1378004.0    
module.lstm.weight_ih_l0  dot:  5852925.0    
module.lstm.weight_hh_l0  dot:  1256758.5    
module.lstm.bias_ih_l0  dot:  336335.75    
module.lstm.bias_hh_l0  dot:  336335.75    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  658/6933] Loss: -845.4294 [iq: 9.9422,ans: 8.6178,interp: 9.1211,fusion: -873.1105]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  150224.453125    
module.ans_embedding.weight  dot:  1320001.0    
module.lstm.weight_ih_l0  dot:  3051487.75    
module.lstm.weight_hh_l0  dot:  937941.4375    
module.lstm.bias_ih_l0  dot:  225021.59375    
module.lstm.bias_hh_l0  dot:  225021.59375    
module.ans_lstm.wei

[Version hakku][Epoch  1][Step  662/6933] Loss: -875.8456 [iq: 8.3354,ans: 7.9803,interp: 7.9319,fusion: -900.0932]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2880455.25    
module.ans_embedding.weight  dot:  733872.75    
module.lstm.weight_ih_l0  dot:  47804128.0    
module.lstm.weight_hh_l0  dot:  7556210.5    
module.lstm.bias_ih_l0  dot:  2803494.0    
module.lstm.bias_hh_l0  dot:  2803494.0    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  666/6933] Loss: -846.1697 [iq: 9.8403,ans: 9.0898,interp: 8.8752,fusion: -873.9750]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  440910.75    
module.ans_embedding.weight  dot:  1267657.0    
module.lstm.weight_ih_l0  dot:  3261015.75    
module.lstm.weight_hh_l0  dot:  839812.5    
module.lstm.bias_ih_l0  dot:  156595.78125    
module.lstm.bias_hh_l0  dot:  156595.78125    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  670/6933] Loss: -801.0385 [iq: 11.3425,ans: 9.4796,interp: 10.1818,fusion: -832.0423]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  372865.0625    
module.ans_embedding.weight  dot:  1143513.5    
module.lstm.weight_ih_l0  dot:  5506500.0    
module.lstm.weight_hh_l0  dot:  3159990.5    
module.lstm.bias_ih_l0  dot:  361389.25    
module.lstm.bias_hh_l0  dot:  361389.25    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step  674/6933] Loss: -881.8002 [iq: 10.4847,ans: 9.4396,interp: 9.3605,fusion: -911.0850]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  561101.1875    
module.ans_embedding.weight  dot:  865079.0    
module.lstm.weight_ih_l0  dot:  6246529.0    
module.lstm.weight_hh_l0  dot:  1266560.0    
module.lstm.bias_ih_l0  dot:  246867.640625    
module.lstm.bias_hh_l0  dot:  246867.640625    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  678/6933] Loss: -870.4659 [iq: 9.4467,ans: 8.9246,interp: 8.8882,fusion: -897.7254]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  163621.0    
module.ans_embedding.weight  dot:  1399827.625    
module.lstm.weight_ih_l0  dot:  4213943.0    
module.lstm.weight_hh_l0  dot:  1664206.75    
module.lstm.bias_ih_l0  dot:  263939.65625    
module.lstm.bias_hh_l0  dot:  263939.65625    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  682/6933] Loss: -891.0256 [iq: 8.7214,ans: 7.8035,interp: 8.0925,fusion: -915.6430]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  432603.8125    
module.ans_embedding.weight  dot:  1171520.375    
module.lstm.weight_ih_l0  dot:  5969039.0    
module.lstm.weight_hh_l0  dot:  1653746.25    
module.lstm.bias_ih_l0  dot:  351489.1875    
module.lstm.bias_hh_l0  dot:  351489.1875    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step  686/6933] Loss: -846.5872 [iq: 9.2108,ans: 8.2031,interp: 8.9882,fusion: -872.9893]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  899465.9375    
module.ans_embedding.weight  dot:  848821.375    
module.lstm.weight_ih_l0  dot:  9971755.0    
module.lstm.weight_hh_l0  dot:  1502563.5    
module.lstm.bias_ih_l0  dot:  545021.625    
module.lstm.bias_hh_l0  dot:  545021.625    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  690/6933] Loss: -855.6744 [iq: 11.1339,ans: 8.4175,interp: 8.9915,fusion: -884.2172]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1072565.125    
module.ans_embedding.weight  dot:  1044812.0    
module.lstm.weight_ih_l0  dot:  10178134.0    
module.lstm.weight_hh_l0  dot:  2539187.0    
module.lstm.bias_ih_l0  dot:  496431.65625    
module.lstm.bias_hh_l0  dot:  496431.65625    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  694/6933] Loss: -867.2985 [iq: 8.6845,ans: 7.9050,interp: 7.4139,fusion: -891.3019]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  5850095.0    
module.ans_embedding.weight  dot:  798285.625    
module.lstm.weight_ih_l0  dot:  104660368.0    
module.lstm.weight_hh_l0  dot:  9260833.0    
module.lstm.bias_ih_l0  dot:  5996365.5    
module.lstm.bias_hh_l0  dot:  5996365.5    
module.ans_lstm.weight_ih_l0 

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7ff5d44b7da0>>
Traceback (most recent call last):
  File "/home/slam/dl/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/home/slam/dl/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 909, in _shutdown_workers
    q.close()
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 134, in close
    self._reader.close()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 177, in close
    Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 247, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(h


----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  334255.375    
module.ans_embedding.weight  dot:  694835.125    
module.lstm.weight_ih_l0  dot:  7246703.5    
module.lstm.weight_hh_l0  dot:  3916434.5    
module.lstm.bias_ih_l0  dot:  478191.40625    
module.lstm.bias_hh_l0  dot:  478191.40625    
module.ans_lstm.weight_ih_l0  dot:  24813380.0    
module.ans_lstm.weight_hh_l0  dot:  53702.375    
module.ans_lstm.bias_ih_l0  dot:  1569560.5    


[Version hakku][Epoch  1][Step  700/6933] Loss: -860.0902 [iq: 8.6476,ans: 8.5025,interp: 8.6772,fusion: -885.9174]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  214810.1875    
module.ans_embedding.weight  dot:  864854.4375    
module.lstm.weight_ih_l0  dot:  7667607.5    
module.lstm.weight_hh_l0  dot:  8641042.0    
module.lstm.bias_ih_l0  dot:  528169.875    
module.lstm.bias_hh_l0  dot:  528169.875    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  704/6933] Loss: -867.9968 [iq: 11.4499,ans: 9.4314,interp: 9.2155,fusion: -898.0936]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  439638.15625    
module.ans_embedding.weight  dot:  988190.625    
module.lstm.weight_ih_l0  dot:  4935232.0    
module.lstm.weight_hh_l0  dot:  1995377.75    
module.lstm.bias_ih_l0  dot:  278685.375    
module.lstm.bias_hh_l0  dot:  278685.375    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  708/6933] Loss: -841.3532 [iq: 9.7953,ans: 9.2001,interp: 10.8073,fusion: -871.1559]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  457501.71875    
module.ans_embedding.weight  dot:  603107.375    
module.lstm.weight_ih_l0  dot:  8254188.5    
module.lstm.weight_hh_l0  dot:  5349190.0    
module.lstm.bias_ih_l0  dot:  509056.59375    
module.lstm.bias_hh_l0  dot:  509056.59375    
module.ans_lstm.weigh

[Version hakku][Epoch  1][Step  712/6933] Loss: -885.0441 [iq: 7.4868,ans: 7.5696,interp: 8.0958,fusion: -908.1962]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1635738.375    
module.ans_embedding.weight  dot:  1641823.625    
module.lstm.weight_ih_l0  dot:  30410580.0    
module.lstm.weight_hh_l0  dot:  4103892.5    
module.lstm.bias_ih_l0  dot:  1686069.875    
module.lstm.bias_hh_l0  dot:  1686069.875    
module.ans_lstm.weight_

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452716.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  716/6933] Loss: -864.6670 [iq: 7.2391,ans: 7.3730,interp: 6.9491,fusion: -886.2283]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  599387.0625    
module.ans_embedding.weight  dot:  779594.375    
module.lstm.weight_ih_l0  dot:  6224292.0    
module.lstm.weight_hh_l0  dot:  3091823.5    
module.lstm.bias_ih_l0  dot:  36

module.attflat_ans.linear_merge.bias  dot:  7024037.0    
module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452718.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  720/6933] Loss: -896.5134 [iq: 8.6588,ans: 6.8414,interp: 6.6136,fusion: -918.6271]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  

[Version hakku][Epoch  1][Step  724/6933] Loss: -870.2999 [iq: 7.3285,ans: 7.1983,interp: 8.3728,fusion: -893.1995]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  32363370.0    
module.ans_embedding.weight  dot:  888466.8125    
module.lstm.weight_ih_l0  dot:  387512704.0    
module.lstm.weight_hh_l0  dot:  34928156.0    
module.lstm.bias_ih_l0  dot:  24362152.0    
module.lstm.bias_hh_l0  dot:  24362152.0    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  728/6933] Loss: -885.3501 [iq: 9.1122,ans: 7.3105,interp: 8.2161,fusion: -909.9889]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  441747.875    
module.ans_embedding.weight  dot:  773872.9375    
module.lstm.weight_ih_l0  dot:  7053608.0    
module.lstm.weight_hh_l0  dot:  1138470.5    
module.lstm.bias_ih_l0  dot:  493217.5    
module.lstm.bias_hh_l0  dot:  493217.5    
module.ans_lstm.weight_ih_l0  d

[Version hakku][Epoch  1][Step  732/6933] Loss: -842.7147 [iq: 9.7644,ans: 8.5022,interp: 10.1441,fusion: -871.1254]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  750566.8125    
module.ans_embedding.weight  dot:  1233709.75    
module.lstm.weight_ih_l0  dot:  6614801.0    
module.lstm.weight_hh_l0  dot:  4714286.0    
module.lstm.bias_ih_l0  dot:  156112.15625    
module.lstm.bias_hh_l0  dot:  156112.15625    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  736/6933] Loss: -859.8885 [iq: 10.2578,ans: 8.0731,interp: 8.0527,fusion: -886.2720]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  483687.6875    
module.ans_embedding.weight  dot:  857823.125    
module.lstm.weight_ih_l0  dot:  8521145.0    
module.lstm.weight_hh_l0  dot:  5495264.0    
module.lstm.bias_ih_l0  dot:  515793.375    
module.lstm.bias_hh_l0  dot:  515793.375    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step  740/6933] Loss: -876.2002 [iq: 9.3480,ans: 8.5633,interp: 8.9012,fusion: -903.0127]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  140693.0625    
module.ans_embedding.weight  dot:  1398277.375    
module.lstm.weight_ih_l0  dot:  1169505.0    
module.lstm.weight_hh_l0  dot:  1103255.0    
module.lstm.bias_ih_l0  dot:  51405.53125    
module.lstm.bias_hh_l0  dot:  51405.53125    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step  744/6933] Loss: -876.3409 [iq: 11.5352,ans: 9.3289,interp: 9.9966,fusion: -907.2017]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  4294958.0    
module.ans_embedding.weight  dot:  1148415.0    
module.lstm.weight_ih_l0  dot:  41677672.0    
module.lstm.weight_hh_l0  dot:  5576280.0    
module.lstm.bias_ih_l0  dot:  2462753.0    
module.lstm.bias_hh_l0  dot:  2462753.0    
module.ans_lstm.weight_ih_l0  

module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452731.0

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  748/6933] Loss: -899.5581 [iq: 8.4624,ans: 7.5912,interp: 8.1427,fusion: -923.7544]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2027428.5    
module.ans_embedding.weight  dot:  538230.5    
module.lstm.weight_ih_l0  dot:  857

[Version hakku][Epoch  1][Step  752/6933] Loss: -883.0563 [iq: 9.0606,ans: 8.1563,interp: 8.0487,fusion: -908.3220]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6154615.0    
module.ans_embedding.weight  dot:  914423.375    
module.lstm.weight_ih_l0  dot:  57253824.0    
module.lstm.weight_hh_l0  dot:  8178323.0    
module.lstm.bias_ih_l0  dot:  3287413.0    
module.lstm.bias_hh_l0  dot:  3287413.0    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  756/6933] Loss: -873.6321 [iq: 9.9949,ans: 8.8125,interp: 9.1134,fusion: -901.5529]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1806770.75    
module.ans_embedding.weight  dot:  864156.125    
module.lstm.weight_ih_l0  dot:  7301265.5    
module.lstm.weight_hh_l0  dot:  1013042.5    
module.lstm.bias_ih_l0  dot:  249624.25    
module.lstm.bias_hh_l0  dot:  249624.25    
module.ans_lstm.weight_ih_l0  

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452736.125

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  760/6933] Loss: -887.2744 [iq: 8.9595,ans: 8.3148,interp: 8.7181,fusion: -913.2668]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight 

[Version hakku][Epoch  1][Step  764/6933] Loss: -860.5530 [iq: 10.0019,ans: 8.7272,interp: 8.9997,fusion: -888.2818]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3146841.25    
module.ans_embedding.weight  dot:  901293.0625    
module.lstm.weight_ih_l0  dot:  27958048.0    
module.lstm.weight_hh_l0  dot:  6100940.0    
module.lstm.bias_ih_l0  dot:  1586115.375    
module.lstm.bias_hh_l0  dot:  1586115.375    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step  768/6933] Loss: -889.5228 [iq: 8.1614,ans: 8.3230,interp: 9.1169,fusion: -915.1241]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1411044.625    
module.ans_embedding.weight  dot:  1323495.625    
module.lstm.weight_ih_l0  dot:  4189661.0    
module.lstm.weight_hh_l0  dot:  1767204.125    
module.lstm.bias_ih_l0  dot:  58265.3203125    
module.lstm.bias_hh_l0  dot:  58265.3203125    
module.ans_lstm.we

[Version hakku][Epoch  1][Step  772/6933] Loss: -881.6490 [iq: 9.0661,ans: 7.9206,interp: 7.4999,fusion: -906.1357]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  6643171.0    
module.ans_embedding.weight  dot:  1106016.625    
module.lstm.weight_ih_l0  dot:  70066968.0    
module.lstm.weight_hh_l0  dot:  14430928.0    
module.lstm.bias_ih_l0  dot:  4309225.0    
module.lstm.bias_hh_l0  dot:  4309225.0    
module.ans_lstm.weight_ih_l0

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452744.0

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  776/6933] Loss: -846.8157 [iq: 9.1758,ans: 9.0968,interp: 10.5279,fusion: -875.6163]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  11091568.0    
module.ans_embedding.weight  do

[Version hakku][Epoch  1][Step  780/6933] Loss: -885.7862 [iq: 8.3970,ans: 7.2857,interp: 8.0700,fusion: -909.5389]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  667582.0    
module.ans_embedding.weight  dot:  1290185.5    
module.lstm.weight_ih_l0  dot:  11348455.0    
module.lstm.weight_hh_l0  dot:  1991171.875    
module.lstm.bias_ih_l0  dot:  705611.75    
module.lstm.bias_hh_l0  dot:  705611.75    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  784/6933] Loss: -908.1454 [iq: 7.2701,ans: 6.8473,interp: 6.8736,fusion: -929.1364]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2642213.75    
module.ans_embedding.weight  dot:  579101.625    
module.lstm.weight_ih_l0  dot:  16540118.0    
module.lstm.weight_hh_l0  dot:  2751466.25    
module.lstm.bias_ih_l0  dot:  888082.5    
module.lstm.bias_hh_l0  dot:  888082.5    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  788/6933] Loss: -915.2238 [iq: 7.6054,ans: 7.6248,interp: 8.3729,fusion: -938.8268]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  7836517.0    
module.ans_embedding.weight  dot:  1095338.25    
module.lstm.weight_ih_l0  dot:  65140224.0    
module.lstm.weight_hh_l0  dot:  6107284.0    
module.lstm.bias_ih_l0  dot:  3660773.5    
module.lstm.bias_hh_l0  dot:  3660773.5    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  792/6933] Loss: -877.9110 [iq: 10.6926,ans: 9.5099,interp: 9.5068,fusion: -907.6202]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  19044644.0    
module.ans_embedding.weight  dot:  454673.9375    
module.lstm.weight_ih_l0  dot:  365068416.0    
module.lstm.weight_hh_l0  dot:  32130972.0    
module.lstm.bias_ih_l0  dot:  19633352.0    
module.lstm.bias_hh_l0  dot:  19633352.0    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step  796/6933] Loss: -870.5936 [iq: 9.3004,ans: 8.0102,interp: 9.0391,fusion: -896.9433]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  462846.125    
module.ans_embedding.weight  dot:  458867.25    
module.lstm.weight_ih_l0  dot:  2981738.25    
module.lstm.weight_hh_l0  dot:  959992.25    
module.lstm.bias_ih_l0  dot:  117606.296875    
module.lstm.bias_hh_l0  dot:  117606.296875    
module.ans_lstm.weight

module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452754.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  800/6933] Loss: -903.8906 [iq: 11.0095,ans: 8.4772,interp: 9.2196,fusion: -932.5969]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NO

[Version hakku][Epoch  1][Step  804/6933] Loss: -896.6926 [iq: 7.2886,ans: 7.1529,interp: 8.4083,fusion: -919.5424]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  165662.1875    
module.ans_embedding.weight  dot:  858534.75    
module.lstm.weight_ih_l0  dot:  1389973.0    
module.lstm.weight_hh_l0  dot:  690349.375    
module.lstm.bias_ih_l0  dot:  58978.25390625    
module.lstm.bias_hh_l0  dot:  58978.25390625    
module.ans_lstm.wei

module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452757.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  808/6933] Loss: -859.7308 [iq: 12.4551,ans: 8.5918,interp: 9.9290,fusion: -890.7067]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  584823.375    
module.ans_embedding.weight  dot:  1791263.25    
module.lstm.weight_ih_l0  dot

[Version hakku][Epoch  1][Step  812/6933] Loss: -925.3470 [iq: 6.6488,ans: 6.6362,interp: 9.0504,fusion: -947.6823]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1182190.875    
module.ans_embedding.weight  dot:  1119038.25    
module.lstm.weight_ih_l0  dot:  15760084.0    
module.lstm.weight_hh_l0  dot:  2236805.0    
module.lstm.bias_ih_l0  dot:  871957.0625    
module.lstm.bias_hh_l0  dot:  871957.0625    
module.ans_lstm.weight_i

module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452761.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  816/6933] Loss: -896.2711 [iq: 13.1168,ans: 8.7643,interp: 10.7264,fusion: -928.8786]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias

[Version hakku][Epoch  1][Step  820/6933] Loss: -885.4894 [iq: 9.2441,ans: 8.7772,interp: 8.9086,fusion: -912.4193]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  445253.625    
module.ans_embedding.weight  dot:  2118472.25    
module.lstm.weight_ih_l0  dot:  1588608.375    
module.lstm.weight_hh_l0  dot:  924744.875    
module.lstm.bias_ih_l0  dot:  46237.34375    
module.lstm.bias_hh_l0  dot:  46237.34375    
module.ans_lstm.weight_

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452764.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  824/6933] Loss: -893.2446 [iq: 8.9249,ans: 7.8995,interp: 8.0107,fusion: -918.0798]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight 

[Version hakku][Epoch  1][Step  828/6933] Loss: -906.9736 [iq: 9.3994,ans: 8.7032,interp: 9.3180,fusion: -934.3942]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  364677.28125    
module.ans_embedding.weight  dot:  564018.375    
module.lstm.weight_ih_l0  dot:  6234297.0    
module.lstm.weight_hh_l0  dot:  3367382.5    
module.lstm.bias_ih_l0  dot:  368240.28125    
module.lstm.bias_hh_l0  dot:  368240.28125    
module.ans_lstm.weight

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452767.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  832/6933] Loss: -917.1546 [iq: 8.4222,ans: 8.2680,interp: 9.0696,fusion: -942.9144]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  395962.65625    
module.ans_embedding.weight  

[Version hakku][Epoch  1][Step  836/6933] Loss: -902.4649 [iq: 10.5236,ans: 9.3142,interp: 9.2781,fusion: -931.5807]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  825396.375    
module.ans_embedding.weight  dot:  926729.375    
module.lstm.weight_ih_l0  dot:  20243306.0    
module.lstm.weight_hh_l0  dot:  14352210.0    
module.lstm.bias_ih_l0  dot:  1283225.125    
module.lstm.bias_hh_l0  dot:  1283225.125    
module.ans_lstm.weight_

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452770.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  840/6933] Loss: -913.6280 [iq: 9.5647,ans: 9.2762,interp: 9.6693,fusion: -942.1382]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  

[Version hakku][Epoch  1][Step  844/6933] Loss: -940.0753 [iq: 8.8646,ans: 8.1773,interp: 7.6285,fusion: -964.7457]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1445502.375    
module.ans_embedding.weight  dot:  617399.1875    
module.lstm.weight_ih_l0  dot:  53744528.0    
module.lstm.weight_hh_l0  dot:  41958220.0    
module.lstm.bias_ih_l0  dot:  3624476.5    
module.lstm.bias_hh_l0  dot:  3624476.5    
module.ans_lstm.weight_ih_

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452773.5

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  848/6933] Loss: -881.8984 [iq: 8.0073,ans: 7.8866,interp: 8.4387,fusion: -906.2310]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1078502.75    
module.ans_embedding.weight  dot

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452775.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  852/6933] Loss: -852.8799 [iq: 9.9549,ans: 9.0368,interp: 9.0836,fusion: -880.9553]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  420363.53125    
module.ans_embedding.weight  dot:  1021341.4375    
module.lstm.weight_ih_l0  dot:  5740492.0    
module.lstm.weight_hh_l0  dot:  2837724.5    
module.lstm.bias_ih_l0  dot

[Version hakku][Epoch  1][Step  856/6933] Loss: -887.6413 [iq: 7.8074,ans: 7.7755,interp: 7.9340,fusion: -911.1581]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  62186740.0    
module.ans_embedding.weight  dot:  1079291.25    
module.lstm.weight_ih_l0  dot:  901970368.0    
module.lstm.weight_hh_l0  dot:  75495064.0    
module.lstm.bias_ih_l0  dot:  52889912.0    
module.lstm.bias_hh_l0  dot:  52889912.0    
module.ans_lstm.weight_ih

module.attflat_lang.mlp.fc.linear.bias  dot:  97.52473449707031    
module.attflat_lang.mlp.linear.weight  dot:  169218.5625    
module.attflat_lang.mlp.linear.bias  dot:  2.5067947717616335e-09    
module.attflat_lang.linear_merge.weight  dot:  7147521.0    
module.attflat_lang.linear_merge.bias  dot:  57869.171875    
module.attflat_ans.mlp.fc.linear.weight  dot:  8148.0390625    
module.attflat_ans.mlp.fc.linear.bias  dot:  1675.3182373046875    
module.attflat_ans.mlp.linear.weight  dot:  40323.8515625    
module.attflat_ans.mlp.linear.bias  dot:  1.2825296380469808e-12    
module.attflat_ans.linear_merge.weight  dot:  6036047.5    
module.attflat_ans.linear_merge.bias  dot:  7032064.0    
module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.

module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  276355.40625    
module.ans_embedding.weight  dot:  568622.9375    
module.lstm.weight_ih_l0  dot:  7432136.5    
module.lstm.weight_hh_l0  dot:  9323954.0    
module.lstm.bias_ih_l0  dot:  733691.1875    
module.lstm.bias_hh_l0  dot:  733691.1875    
module.ans_lstm.weight_ih_l0  dot:  7830403.0    
module.ans_lstm.weight_hh_l0  dot:  1065.8994140625    
module.ans_lstm.bias_ih_l0  dot:  187026.515625    
module.ans_lstm.bias_hh_l0  dot:  187026.515625    
module.

[Version hakku][Epoch  1][Step  866/6933] Loss: -946.0249 [iq: 8.1223,ans: 7.7876,interp: 7.8516,fusion: -969.7864]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1162252.125    
module.ans_embedding.weight  dot:  955307.0    
module.lstm.weight_ih_l0  dot:  19207420.0    
module.lstm.weight_hh_l0  dot:  8517253.0    
module.lstm.bias_ih_l0  dot:  1155363.125    
module.lstm.bias_hh_l0  dot:  1155363.125    
module.ans_lstm.weight_ih_

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452781.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  869/6933] Loss: -896.7617 [iq: 7.7810,ans: 7.9053,interp: 8.2400,fusion: -920.6880]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  888405.0    
module.ans_embedding.weight  dot

[Version hakku][Epoch  1][Step  873/6933] Loss: -901.6506 [iq: 10.8868,ans: 8.8025,interp: 8.9917,fusion: -930.3315]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1841258.0    
module.ans_embedding.weight  dot:  349167.25    
module.lstm.weight_ih_l0  dot:  80248904.0    
module.lstm.weight_hh_l0  dot:  59766732.0    
module.lstm.bias_ih_l0  dot:  5004917.0    
module.lstm.bias_hh_l0  dot:  5004917.0    
module.ans_lstm.weight_ih_l0 

[Version hakku][Epoch  1][Step  877/6933] Loss: -913.0778 [iq: 8.4360,ans: 8.5528,interp: 9.1817,fusion: -939.2482]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  236993.984375    
module.ans_embedding.weight  dot:  972271.4375    
module.lstm.weight_ih_l0  dot:  2213831.0    
module.lstm.weight_hh_l0  dot:  1464259.875    
module.lstm.bias_ih_l0  dot:  110442.40625    
module.lstm.bias_hh_l0  dot:  110442.40625    
module.ans_lstm.we

[Version hakku][Epoch  1][Step  881/6933] Loss: -851.4245 [iq: 11.2290,ans: 9.4804,interp: 8.8997,fusion: -881.0336]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2271747.75    
module.ans_embedding.weight  dot:  625673.5    
module.lstm.weight_ih_l0  dot:  42585416.0    
module.lstm.weight_hh_l0  dot:  7769911.5    
module.lstm.bias_ih_l0  dot:  2673204.0    
module.lstm.bias_hh_l0  dot:  2673204.0    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step  885/6933] Loss: -875.0795 [iq: 10.2948,ans: 8.9664,interp: 9.3317,fusion: -903.6724]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2223752.75    
module.ans_embedding.weight  dot:  625513.5    
module.lstm.weight_ih_l0  dot:  66773544.0    
module.lstm.weight_hh_l0  dot:  55181164.0    
module.lstm.bias_ih_l0  dot:  5455128.5    
module.lstm.bias_hh_l0  dot:  5455128.5    
module.ans_lstm.weight_ih_l0 

[Version hakku][Epoch  1][Step  889/6933] Loss: -918.8203 [iq: 8.8512,ans: 8.5070,interp: 8.2102,fusion: -944.3887]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  897956.75    
module.ans_embedding.weight  dot:  898949.5625    
module.lstm.weight_ih_l0  dot:  12609442.0    
module.lstm.weight_hh_l0  dot:  5544607.0    
module.lstm.bias_ih_l0  dot:  798236.375    
module.lstm.bias_hh_l0  dot:  798236.375    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  893/6933] Loss: -937.8989 [iq: 7.7323,ans: 7.7612,interp: 7.8173,fusion: -961.2096]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  194891.015625    
module.ans_embedding.weight  dot:  946295.125    
module.lstm.weight_ih_l0  dot:  3025094.0    
module.lstm.weight_hh_l0  dot:  3210617.5    
module.lstm.bias_ih_l0  dot:  189575.71875    
module.lstm.bias_hh_l0  dot:  189575.71875    
module.ans_lstm.weigh

[Version hakku][Epoch  1][Step  897/6933] Loss: -892.4114 [iq: 9.1386,ans: 7.7895,interp: 7.6673,fusion: -917.0068]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  515263.625    
module.ans_embedding.weight  dot:  626638.875    
module.lstm.weight_ih_l0  dot:  12648418.0    
module.lstm.weight_hh_l0  dot:  7592613.5    
module.lstm.bias_ih_l0  dot:  826609.9375    
module.lstm.bias_hh_l0  dot:  826609.9375    
module.ans_lstm.weight_ih

module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452794.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  901/6933] Loss: -941.5053 [iq: 8.8981,ans: 8.7175,interp: 8.2397,fusion: -967.3606]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias

[Version hakku][Epoch  1][Step  905/6933] Loss: -895.0558 [iq: 11.2065,ans: 9.8590,interp: 10.8437,fusion: -926.9650]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  3437515.75    
module.ans_embedding.weight  dot:  789375.125    
module.lstm.weight_ih_l0  dot:  65814672.0    
module.lstm.weight_hh_l0  dot:  9164218.0    
module.lstm.bias_ih_l0  dot:  4372916.0    
module.lstm.bias_hh_l0  dot:  4372916.0    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  909/6933] Loss: -845.7621 [iq: 8.6659,ans: 8.2571,interp: 9.3787,fusion: -872.0638]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  242637.0    
module.ans_embedding.weight  dot:  870951.25    
module.lstm.weight_ih_l0  dot:  1645777.0    
module.lstm.weight_hh_l0  dot:  612509.4375    
module.lstm.bias_ih_l0  dot:  61034.96875    
module.lstm.bias_hh_l0  dot:  61034.96875    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  912/6933] Loss: -928.6895 [iq: 8.1779,ans: 7.9210,interp: 7.8262,fusion: -952.6144]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  289743.9375    
module.ans_embedding.weight  dot:  768314.8125    
module.lstm.weight_ih_l0  dot:  1985289.875    
module.lstm.weight_hh_l0  dot:  1873429.125    
module.lstm.bias_ih_l0  dot:  101495.3125    
module.lstm.bias_hh_l0  dot:  101495.3125    
module.ans_lstm.weig

module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452800.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  916/6933] Loss: -941.8728 [iq: 7.9831,ans: 7.9533,interp: 9.7664,fusion: -967.5756]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  514592.46875    
module.ans_embedding.weight 

module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452801.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  920/6933] Loss: -874.3728 [iq: 9.4611,ans: 7.9132,interp: 8.1809,fusion: -899.9280]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_

module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452803.125

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  924/6933] Loss: -898.3331 [iq: 9.0792,ans: 8.1050,interp: 9.4383,fusion: -924.9556]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  239575.71875    
module.ans_embedding.weight  dot:  802898.625    
module.lstm.weight_ih_l0  do

module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452805.0

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  928/6933] Loss: -905.7586 [iq: 10.6173,ans: 8.3617,interp: 8.9099,fusion: -933.6476]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
modul

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452806.125

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  932/6933] Loss: -930.9527 [iq: 8.1148,ans: 7.9977,interp: 8.6402,fusion: -955.7054]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight 

module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452807.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  936/6933] Loss: -935.7607 [iq: 8.2470,ans: 8.0674,interp: 7.7804,fusion: -959.8555]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight

module.attflat_lang.linear_merge.weight  dot:  6298489.0    
module.attflat_lang.linear_merge.bias  dot:  52203.171875    
module.attflat_ans.mlp.fc.linear.weight  dot:  91477.359375    
module.attflat_ans.mlp.fc.linear.bias  dot:  19949.3515625    
module.attflat_ans.mlp.linear.weight  dot:  386388.15625    
module.attflat_ans.mlp.linear.bias  dot:  8.881784197001252e-14    
module.attflat_ans.linear_merge.weight  dot:  4739765.0    
module.attflat_ans.linear_merge.bias  dot:  4440823.0    
module.proj_norm.a_2  dot:  0.0    NOT UPDATING
module.proj_norm.b_2  dot:  0.0    NOT UPDATING
module.proj.weight  dot:  0.0    NOT UPDATING
module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452808.875

----------  MAI

[Version hakku][Epoch  1][Step  944/6933] Loss: -901.5093 [iq: 8.3548,ans: 7.6004,interp: 7.4991,fusion: -924.9637]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  848183.0    
module.ans_embedding.weight  dot:  853622.375    
module.lstm.weight_ih_l0  dot:  20673318.0    
module.lstm.weight_hh_l0  dot:  9184339.0    
module.lstm.bias_ih_l0  dot:  1096038.0    
module.lstm.bias_hh_l0  dot:  1096038.0    
module.ans_lstm.weight_ih_l0  d

[Version hakku][Epoch  1][Step  948/6933] Loss: -920.0955 [iq: 7.4422,ans: 7.5799,interp: 7.7879,fusion: -942.9055]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1707723.75    
module.ans_embedding.weight  dot:  1023143.25    
module.lstm.weight_ih_l0  dot:  37394036.0    
module.lstm.weight_hh_l0  dot:  37267808.0    
module.lstm.bias_ih_l0  dot:  1922751.25    
module.lstm.bias_hh_l0  dot:  1922751.25    
module.ans_lstm.weight_ih_

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452814.25

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  952/6933] Loss: -949.1213 [iq: 8.2916,ans: 7.6248,interp: 7.0023,fusion: -972.0400]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  249503.9375    
module.ans_embedding.weight  dot:  352607.5    
module.lstm.weight_ih_l0  dot:  4311518.5    
module.lstm.weight_hh_l0  dot:  3906028.25    
module.lstm.bias_ih_l0  dot:  17

[Version hakku][Epoch  1][Step  956/6933] Loss: -920.4875 [iq: 7.5629,ans: 7.8831,interp: 8.2110,fusion: -944.1445]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  863059.875    
module.ans_embedding.weight  dot:  467750.71875    
module.lstm.weight_ih_l0  dot:  35065140.0    
module.lstm.weight_hh_l0  dot:  19327768.0    
module.lstm.bias_ih_l0  dot:  2261687.5    
module.lstm.bias_hh_l0  dot:  2261687.5    
module.ans_lstm.weight_ih_

module.proj.bias  dot:  0.0    NOT UPDATING
module.ans_proj_norm.a_2  dot:  0.0    NOT UPDATING
module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452816.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  960/6933] Loss: -937.1515 [iq: 8.7767,ans: 8.8409,interp: 7.9314,fusion: -962.7005]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    N

[Version hakku][Epoch  1][Step  964/6933] Loss: -926.0489 [iq: 4.9033,ans: 5.4389,interp: 6.1311,fusion: -942.5222]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  11157674.0    
module.ans_embedding.weight  dot:  963872.5625    
module.lstm.weight_ih_l0  dot:  114704944.0    
module.lstm.weight_hh_l0  dot:  10693654.0    
module.lstm.bias_ih_l0  dot:  6281522.0    
module.lstm.bias_hh_l0  dot:  6281522.0    
module.ans_lstm.weight_ih_

module.ans_proj_norm.b_2  dot:  0.0    NOT UPDATING
module.ans_proj.weight  dot:  0.0    NOT UPDATING
module.ans_proj.bias  dot:  0.0    NOT UPDATING
Gradient not updating in:  16  of total:  46
@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452819.625

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  968/6933] Loss: -861.6207 [iq: 10.1254,ans: 9.7369,interp: 9.9635,fusion: -891.4465]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight

[Version hakku][Epoch  1][Step  972/6933] Loss: -894.3885 [iq: 8.8203,ans: 8.5622,interp: 9.6724,fusion: -921.4434]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  178685.328125    
module.ans_embedding.weight  dot:  757207.3125    
module.lstm.weight_ih_l0  dot:  3106358.75    
module.lstm.weight_hh_l0  dot:  4057411.25    
module.lstm.bias_ih_l0  dot:  200063.40625    
module.lstm.bias_hh_l0  dot:  200063.40625    
module.ans_lstm.we

[Version hakku][Epoch  1][Step  976/6933] Loss: -905.6023 [iq: 8.3820,ans: 8.4507,interp: 8.5984,fusion: -931.0334]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1573505.0    
module.ans_embedding.weight  dot:  388009.375    
module.lstm.weight_ih_l0  dot:  19791836.0    
module.lstm.weight_hh_l0  dot:  3118148.5    
module.lstm.bias_ih_l0  dot:  960373.5    
module.lstm.bias_hh_l0  dot:  960373.5    
module.ans_lstm.weight_ih_l0  do

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452823.875

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step  980/6933] Loss: -909.5347 [iq: 9.3463,ans: 8.3577,interp: 8.3460,fusion: -935.5847]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  289150.9375    
module.ans_embedding.weight  dot:  576551.4375    
module.lstm.weight_ih_l0  dot:  14582586.0    
module.lstm.weight_hh_l0  dot:  9183670.0    
module.lstm.bias_ih_l0  dot:

[Version hakku][Epoch  1][Step  984/6933] Loss: -891.7097 [iq: 8.2795,ans: 8.0925,interp: 8.0849,fusion: -916.1666]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  274931.9375    
module.ans_embedding.weight  dot:  919397.25    
module.lstm.weight_ih_l0  dot:  4315119.0    
module.lstm.weight_hh_l0  dot:  6768836.5    
module.lstm.bias_ih_l0  dot:  243525.640625    
module.lstm.bias_hh_l0  dot:  243525.640625    
module.ans_lstm.weight

[Version hakku][Epoch  1][Step  988/6933] Loss: -933.5080 [iq: 10.0032,ans: 8.5054,interp: 8.5147,fusion: -960.5314]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  289075.875    
module.ans_embedding.weight  dot:  1240790.375    
module.lstm.weight_ih_l0  dot:  2924205.75    
module.lstm.weight_hh_l0  dot:  1743412.875    
module.lstm.bias_ih_l0  dot:  106914.3671875    
module.lstm.bias_hh_l0  dot:  106914.3671875    
module.ans_lstm

[Version hakku][Epoch  1][Step  992/6933] Loss: -930.3712 [iq: 6.4289,ans: 6.8649,interp: 7.0514,fusion: -950.7164]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2286858.5    
module.ans_embedding.weight  dot:  459790.21875    
module.lstm.weight_ih_l0  dot:  50726384.0    
module.lstm.weight_hh_l0  dot:  40253304.0    
module.lstm.bias_ih_l0  dot:  2951904.5    
module.lstm.bias_hh_l0  dot:  2951904.5    
module.ans_lstm.weight_ih_l

[Version hakku][Epoch  1][Step  996/6933] Loss: -921.2007 [iq: 7.3362,ans: 6.5874,interp: 6.4305,fusion: -941.5548]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  90951.3828125    
module.ans_embedding.weight  dot:  1134010.125    
module.lstm.weight_ih_l0  dot:  1098214.625    
module.lstm.weight_hh_l0  dot:  1203472.25    
module.lstm.bias_ih_l0  dot:  107127.0    
module.lstm.bias_hh_l0  dot:  107127.0    
module.ans_lstm.weight_ih

[Version hakku][Epoch  1][Step 1000/6933] Loss: -914.4260 [iq: 10.8398,ans: 9.2364,interp: 9.4660,fusion: -943.9683]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  193574.5    
module.ans_embedding.weight  dot:  643848.375    
module.lstm.weight_ih_l0  dot:  3878010.25    
module.lstm.weight_hh_l0  dot:  6480350.5    
module.lstm.bias_ih_l0  dot:  324510.0    
module.lstm.bias_hh_l0  dot:  324510.0    
module.ans_lstm.weight_ih_l0  do

[Version hakku][Epoch  1][Step 1004/6933] Loss: -936.9852 [iq: 6.2880,ans: 6.4714,interp: 6.3451,fusion: -956.0897]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  192328.0625    
module.ans_embedding.weight  dot:  692635.9375    
module.lstm.weight_ih_l0  dot:  1663312.0    
module.lstm.weight_hh_l0  dot:  1560705.75    
module.lstm.bias_ih_l0  dot:  82628.96875    
module.lstm.bias_hh_l0  dot:  82628.96875    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step 1008/6933] Loss: -949.1602 [iq: 9.7303,ans: 7.5122,interp: 7.3093,fusion: -973.7119]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  56295.19140625    
module.ans_embedding.weight  dot:  712254.0    
module.lstm.weight_ih_l0  dot:  1973402.0    
module.lstm.weight_hh_l0  dot:  1965190.5    
module.lstm.bias_ih_l0  dot:  123171.265625    
module.lstm.bias_hh_l0  dot:  123171.265625    
module.ans_lstm.weig

[Version hakku][Epoch  1][Step 1012/6933] Loss: -949.6901 [iq: 7.0211,ans: 7.5778,interp: 9.1779,fusion: -973.4669]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  279456.5    
module.ans_embedding.weight  dot:  725057.8125    
module.lstm.weight_ih_l0  dot:  4253082.5    
module.lstm.weight_hh_l0  dot:  2828960.75    
module.lstm.bias_ih_l0  dot:  225237.0625    
module.lstm.bias_hh_l0  dot:  225237.0625    
module.ans_lstm.weight_ih_

[Version hakku][Epoch  1][Step 1016/6933] Loss: -941.7927 [iq: 8.3714,ans: 7.9770,interp: 7.7443,fusion: -965.8854]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  688268.0    
module.ans_embedding.weight  dot:  531792.375    
module.lstm.weight_ih_l0  dot:  25060832.0    
module.lstm.weight_hh_l0  dot:  16310874.0    
module.lstm.bias_ih_l0  dot:  1511200.0    
module.lstm.bias_hh_l0  dot:  1511200.0    
module.ans_lstm.weight_ih_l0  

@@@@@@@@@@@@@@@@@@@ Overall dot product:  1452836.375

----------  MAIN LOSS  --------
[Version hakku][Epoch  1][Step 1020/6933] Loss: -942.0566 [iq: 8.1235,ans: 7.8722,interp: 7.7396,fusion: -965.7918]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  162300.375    
module.ans_embedding.weight  dot:  702468.875    
module.lstm.weight_ih_l0  dot:  2124065.5    
module.lstm.weight_hh_l0  dot:  2681574.0    
module.lstm.bias_ih_l0  dot:  1

[Version hakku][Epoch  1][Step 1024/6933] Loss: -912.8046 [iq: 7.7342,ans: 7.6249,interp: 8.3616,fusion: -936.5253]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  681333.75    
module.ans_embedding.weight  dot:  655066.125    
module.lstm.weight_ih_l0  dot:  13435821.0    
module.lstm.weight_hh_l0  dot:  7409469.0    
module.lstm.bias_ih_l0  dot:  398303.25    
module.lstm.bias_hh_l0  dot:  398303.25    
module.ans_lstm.weight_ih_l0  

[Version hakku][Epoch  1][Step 1028/6933] Loss: -927.6677 [iq: 7.3332,ans: 6.6845,interp: 6.8615,fusion: -948.5469]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  178023.453125    
module.ans_embedding.weight  dot:  744475.9375    
module.lstm.weight_ih_l0  dot:  4204976.0    
module.lstm.weight_hh_l0  dot:  714282.3125    
module.lstm.bias_ih_l0  dot:  25794.490234375    
module.lstm.bias_hh_l0  dot:  25794.490234375    
module.ans_l

[Version hakku][Epoch  1][Step 1032/6933] Loss: -903.6978 [iq: 8.3874,ans: 7.7181,interp: 7.5938,fusion: -927.3971]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  167420.25    
module.ans_embedding.weight  dot:  1104889.75    
module.lstm.weight_ih_l0  dot:  3581422.5    
module.lstm.weight_hh_l0  dot:  3024548.5    
module.lstm.bias_ih_l0  dot:  245135.953125    
module.lstm.bias_hh_l0  dot:  245135.953125    
module.ans_lstm.weight_

[Version hakku][Epoch  1][Step 1036/6933] Loss: -899.0144 [iq: 7.8957,ans: 7.5676,interp: 8.1081,fusion: -922.5858]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  322317.625    
module.ans_embedding.weight  dot:  437679.96875    
module.lstm.weight_ih_l0  dot:  4909859.5    
module.lstm.weight_hh_l0  dot:  8552814.0    
module.lstm.bias_ih_l0  dot:  539463.0625    
module.lstm.bias_hh_l0  dot:  539463.0625    
module.ans_lstm.weight_i

[Version hakku][Epoch  1][Step 1040/6933] Loss: -894.9489 [iq: 9.4573,ans: 8.6675,interp: 8.9594,fusion: -922.0330]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  2462854.75    
module.ans_embedding.weight  dot:  1082109.75    
module.lstm.weight_ih_l0  dot:  28792640.0    
module.lstm.weight_hh_l0  dot:  23619810.0    
module.lstm.bias_ih_l0  dot:  2425760.0    
module.lstm.bias_hh_l0  dot:  2425760.0    
module.ans_lstm.weight_ih_l0

[Version hakku][Epoch  1][Step 1044/6933] Loss: -956.4188 [iq: 7.1945,ans: 7.2673,interp: 7.4817,fusion: -978.3622]          
----------  FIRST LOSS  --------

----------  FUSION LOSS  --------
module.decoder_mlp_1.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_1.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.fc.linear.bias  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.weight  dot:  0.0    NOT UPDATING
module.decoder_mlp_2.linear.bias  dot:  0.0    NOT UPDATING
module.embedding.weight  dot:  1032890.375    
module.ans_embedding.weight  dot:  320127.0625    
module.lstm.weight_ih_l0  dot:  28736028.0    
module.lstm.weight_hh_l0  dot:  8562015.0    
module.lstm.bias_ih_l0  dot:  1868976.0    
module.lstm.bias_hh_l0  dot:  1868976.0    
module.ans_lstm.weight_ih_l