In [1]:
import argparse
import os
import gc
import torch
from torch import nn
# from torch import optim
from tensorboardX import SummaryWriter
import torch.nn.functional as F
from time import gmtime, strftime
from MultiHeadRln import EncoderModel
from Utils.utils import Logger
import json
import torch.nn as nn
from utils.vis import visualize

from utils import load_data, convert_examples_to_features
from pytorch_transformers import BertTokenizer, BertModel #*
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# CUDA device indices used by this notebook: main() builds its primary device
# from gpu_list[0] and passes the whole list to nn.DataParallel as device_ids.
gpu_list = [4, 6, 7] # List of GPU cards to run on [4, 6, 7]
# Alternative (disabled): restrict which GPUs are visible to the process instead.
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,4" 


In [None]:
def train(model, x, y, mask, seqlens, mloss, optim, args): 
    """Run one optimisation step on a single batch.

    Args:
        model: Callable as ``model(x, mask, seqlens)`` returning ``(pred, std)``.
        x, mask, seqlens: Forwarded unchanged to ``model``.
        y: Targets handed to ``mloss`` together with the log-probabilities.
        mloss: Loss callable taking ``(log_probs, y)`` (the caller passes
            ``nn.NLLLoss``).
        optim: Optimizer whose ``zero_grad``/``step`` are invoked.
        args: Namespace providing ``std_alpha`` (weight of the summed ``std``
            penalty) and ``clip`` (max gradient norm).

    Returns:
        ``(loss, pred)``: the scalar total loss and the raw model predictions,
        both detached so the caller can accumulate metrics without keeping the
        autograd graph alive.
    """
    optim.zero_grad()
    pred, std = model(x, mask, seqlens)

    # NLLLoss expects the class axis at dim 1, so normalise over it explicitly
    # rather than relying on LogSoftmax's deprecated implicit-dim inference
    # (which picks dim 1 for 2-D input anyway).
    log_probs = nn.LogSoftmax(dim=1)(pred)

    # Task loss plus a penalty on the summed `std` output of the model.
    loss = mloss(log_probs, y) + args.std_alpha * torch.sum(std)

    loss.backward()
    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
    optim.step()

    # The training loop unpacks `batch_loss, out = train(...)`; previously this
    # function fell off the end and returned None.
    return loss.detach(), pred.detach()

In [None]:
def _str2bool(value):
    """Parse a command-line boolean such as 'true'/'false', 'yes'/'no', '1'/'0'."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if text in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def main(argv=None):
    """Parse options, build the model and data loader, and run the training loop.

    Args:
        argv: Optional list of command-line tokens. ``None`` (the default)
            makes argparse read ``sys.argv[1:]`` as before.
    """
    parser = argparse.ArgumentParser()
    # Without a `type`, any string given on the command line (even "False")
    # would be truthy, so boolean switches are parsed explicitly.
    parser.add_argument('--training', default=True, type=_str2bool)
    parser.add_argument('--dropout', default=0.1, type=float)
    parser.add_argument('--std-alpha', default=0.003, type=float)
    parser.add_argument('--test-split', default=0.1, type=float)
    parser.add_argument('--epoch', default=10000, type=int)
    parser.add_argument('--epoch_start', default=0, type=int)
    parser.add_argument('--exp-decay-rate', default=0.99, type=float)
    parser.add_argument('--use_cuda', default=True, type=_str2bool)
    parser.add_argument('--hidden-size', default=100, type=int)
    parser.add_argument('--learning-rate', default=0.05, type=float)
    parser.add_argument('--print-freq', default=250, type=int)
    parser.add_argument('--train-batch-size', default=60, type=int)
    parser.add_argument('--dev-batch-size', default=100, type=int)
    parser.add_argument('--model-config', default='rnn_config.ini')
    parser.add_argument('--config', default='config.json')
    parser.add_argument('--clip', type=float, default=1, help='gradient clipping')
    parser.add_argument('--model-name', default='MultiHeadRln')
    parser.add_argument('--word-dim', default=100, type=int)
    parser.add_argument('--resume', default='MultiHeadRln_18 _300.model', type=str, metavar='PATH', help='path saved params')
    parser.add_argument("--output_dir", default='./checkpoints/', type=str, 
                        help="The output directory where the model checkpoints will be written.")
    # [/mnt/disk/dagi/BiDAF_multiHead/checkpoints/], [./checkpoints/]
    # `args.task` is read below but was never declared, which raised
    # AttributeError. NOTE(review): the default task id is a placeholder; confirm.
    parser.add_argument('--task', default='1', type=str,
                        help="task id used to build the dataset name 'Task<id>_text_10K'")

    # parse_known_args tolerates the `-f <connection file>` argument that a
    # Jupyter kernel injects into sys.argv; plain parse_args() would exit.
    args, _unknown = parser.parse_known_args(argv)
    device = torch.device(f"cuda:{str(gpu_list[0])}" if args.use_cuda and torch.cuda.is_available() else "cpu")

    print('loading BABI data...')
    args.dataset = 'Task' + args.task + '_text_10K'
    args.device = device
    args.model_time = strftime('%H_%M_%S', gmtime())

    # NOTE(review): tokenizer, bert_model and data are not used anywhere below;
    # presumably they are meant to feed convert_examples_to_features. Confirm.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    data = load_data()

    with open(args.config) as config_file: 
        hyp = json.load(config_file)['hyperparams']  

    model = EncoderModel(args, hyp)
    # DataParallel requires the module's parameters to live on device_ids[0]
    # before the first forward pass; on a CPU-only host this is a no-op.
    model.to(device)
    model = nn.DataParallel(model, device_ids=gpu_list)

    model_loss = nn.NLLLoss()
    optimizer  = torch.optim.SGD(model.parameters(), lr=args.learning_rate)

    # NOTE(review): `features` is never assigned in this notebook, so the next
    # line raises NameError. It presumably should come from
    # convert_examples_to_features(...), whose signature is not visible here.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_start_pos = torch.tensor([f.start_position for f in features], dtype=torch.long)
    all_end_pos = torch.tensor([f.end_position for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_start_pos, all_end_pos, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.train_batch_size)

    if os.path.exists(f'model/{args.resume}'):
        print(f'loading ..........model/{args.resume}')
        model.load_state_dict(torch.load(f'model/{args.resume}', map_location=args.device)) #  map_location

    for e in range(args.epoch_start, args.epoch):        
        loss, acc, total = 0.0, 0.0, 0.0
        for i, batch in enumerate(eval_dataloader): 
            # NOTE(review): a TensorDataset batch is a plain list of tensors, so
            # `batch.answer` and `batch.dialog[1]` raise AttributeError. Which of
            # the dataset tensors holds the label / sequence lengths cannot be
            # determined from this file; replace them with the right indices.
            batch_loss, out = train(model, batch[0], batch.answer, batch[1], batch.dialog[1], model_loss, optimizer, args)            
            batch_pred = torch.argmax(out, dim=1)
            batch_acc = (batch_pred == batch.answer).sum()
            loss += float(batch_loss)
            acc += batch_acc.data.item()
            # Count the real number of examples: the final batch may be smaller
            # than train_batch_size.
            total += batch[0].size(0)

        # Report the per-epoch metrics that were previously accumulated and dropped.
        if total:
            print(f'epoch {e}: summed batch loss={loss:.4f} acc={acc / total:.4f}')

In [None]:
# Start training when this cell/script runs as the main program.
if __name__ == '__main__':
    main()