In [1]:
import numpy as np
import os, gc, json
import util.trainer as tr
import torch.nn
from torch.utils.data import DataLoader
from util.input_data import Dataset
from util.AdaBound import AdaBound
from torch.utils.tensorboard import SummaryWriter

def _allocate_output_dir(root, scale, comment='', max_runs=99):
    """Create and return the first unused run directory under ``root``.

    Run directories are named ``<scale>_<NN>`` (``NN`` = 00 .. max_runs-1),
    optionally followed by ``_<comment>``. A slot counts as taken when any
    existing entry in ``root`` starts with ``<scale>_<NN>``.

    Raises:
        RuntimeError: if every slot is already taken.
    """
    os.makedirs(root, exist_ok=True)
    existing = os.listdir(root)
    for i in range(max_runs):
        run_name = '{}_{:02d}'.format(scale, i)
        # Prefix match (not substring) so that e.g. 'nonmetal_TTT_00' does not
        # block 'metal_TTT_00', while 'metal_TTT_00_<comment>' still does.
        if any(entry.startswith(run_name) for entry in existing):
            continue
        output_root = os.path.join(root, run_name)
        if len(comment) > 0:
            output_root += f'_{comment}'
        os.makedirs(output_root)
        return output_root
    raise RuntimeError(
        'No free output directory for scale {!r} under {!r} ({} slots tried)'
        .format(scale, root, max_runs))


def exec_model(
    scale,
    model_type,
    comment='',
    lr = 1e-5,
    wd = 1e-7,
    tries = 1,
    root_model = 'd:/MODELS/202204/nmm',
    root_data  = 'c:/WORKSPACE_KRICT/DATA/data_snu',
    num_epochs = 300,
    batch_size = 128,
    train_ratio = 0.7,
    valid_ratio = 0.2,
    metal_ratio = 1,
):
    """Train a ``DistNN`` model ``tries`` times and log each run to its own directory.

    NOTE: ``DistNN`` and ``tr`` are looked up as notebook globals when this
    function runs, so whichever model class / trainer module was imported most
    recently is the one that gets used.

    Args:
        scale: Dataset tag; the data is read from
            ``<root_data>/inputdata_<scale>.pickle`` and the tag is also used
            as the prefix of the run directory name.
        model_type: Sub-directory of ``root_model`` that holds the runs.
        comment: Optional suffix appended to the run directory name.
        lr: Learning rate passed to ``AdaBound``.
        wd: Weight decay passed to ``AdaBound``.
        tries: Number of independent runs; run ``n`` uses random seed ``35 + n``
            for the train/valid/test split.
        root_model: Root directory for model outputs.
        root_data: Directory containing the pickled input data.
        num_epochs: Maximum number of training epochs per run.
        batch_size: Batch size for all three data loaders.
        train_ratio: Fraction passed to ``dataset.train_test_split``.
        valid_ratio: Fraction passed to ``dataset.train_test_split``.
        metal_ratio: Currently unused; kept for backward compatibility.

    Side effects:
        For every run, creates ``<root_model>/<model_type>/<scale>_<NN>[_<comment>]``
        containing ``params.json``, TensorBoard event files, and (every 20
        epochs) ``model.<epoch>.pt`` plus ``pred.<epoch>.txt``.

    Raises:
        RuntimeError: if no free run directory slot is available.
    """
    gc.collect()
    torch.cuda.empty_cache()

    dataset = Dataset()
    dataset.load_dataset(os.path.join(root_data, f'inputdata_{scale}.pickle'), silent=True)

    root = os.path.join(root_model, model_type)

    for n in range(0, tries):
        rseed = 35 + n
        train_data, valid_data, test_data = dataset.train_test_split(
            train_ratio=train_ratio, valid_ratio=valid_ratio, rseed=rseed)
        train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                                       collate_fn=tr.collate_fn)
        val_data_loader = DataLoader(valid_data, batch_size=batch_size, collate_fn=tr.collate_fn)
        test_data_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=tr.collate_fn)

        model = DistNN(dataset.n_atom_feats, dataset.n_rdf_feature, dataset.n_bdf_feature).cuda()
        optimizer = AdaBound(model.parameters(), lr=lr, weight_decay=wd)
        criterion = torch.nn.L1Loss()

        # Fails loudly instead of reusing a previous run's directory when all
        # slots are taken.
        output_root = _allocate_output_dir(root, scale, comment)
        print(output_root)
        with open(os.path.join(output_root, 'params.json'),'w') as f:
            json.dump(dict(random_seed=rseed, learning_rate=lr, weight_decay=wd,
                train_ratio=train_ratio, valid_ratio=valid_ratio, batch_size=batch_size),
                f, indent=4)

        writer = SummaryWriter(output_root)
        try:
            for epoch in range(1, num_epochs+1):
                train_loss, train_mae, train_f1 = tr.train(model, optimizer, train_data_loader, criterion)
                val_loss, val_mae, val_f1, _, _, _ = tr.test(model, val_data_loader, criterion)
                # Give up on runs whose training loss is still above 1 after
                # 20 epochs; the offending epoch is neither printed nor logged.
                if epoch > 20 and train_loss > 1:
                    break
                print('Epoch [{}/{}]\tTrain loss: {:.4f}\tVal loss: {:.4f} ({:.4f})'
                        .format(epoch, num_epochs, train_loss, val_loss, val_f1))

                writer.add_scalar('train/loss', train_loss, epoch)
                writer.add_scalar('train/MAE', train_mae, epoch)
                writer.add_scalar('train/F1', train_f1, epoch)
                writer.add_scalar('valid/loss', val_loss, epoch)
                writer.add_scalar('valid/MAE', val_mae, epoch)
                writer.add_scalar('valid/F1', val_f1, epoch)

                # Checkpoint the weights and dump test-set predictions every 20 epochs.
                if epoch%20 == 0:
                    torch.save(model.state_dict(),
                               os.path.join(output_root, 'model.{:05d}.pt'.format(epoch)))
                    _, _, _, idxs, targets, preds = tr.test(model, test_data_loader, criterion)
                    # assumes idxs/targets/preds are column-shaped arrays that
                    # hstack into one row per sample — TODO confirm against tr.test
                    np.savetxt(os.path.join(output_root, 'pred.{:05d}.txt'.format(epoch)),
                               np.hstack([idxs, targets, preds]), delimiter=',')
        finally:
            # Flush pending events and stop the writer's background thread,
            # even if training is interrupted or raises.
            writer.close()

In [2]:
# DistNN is used inside exec_model but is not imported in the cell that defines it,
# so this import has to run before the call below.
from model.model_02 import DistNN
# Rebinds the global `tr` (previously util.trainer). exec_model resolves `tr` when it
# runs, so this call uses util.trainer_clas for collate_fn / train / test.
import util.trainer_clas as tr

# Train on the 'metal_TTT' input data; run outputs are written under <root_model>/M02/.
exec_model(scale='metal_TTT', model_type='M02', comment='L1_2L1')

d:/MODELS/202204/nmm\M02\metal_TTT_02_L1_2L1


	add(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1050.)
  grad = grad.add(group['weight_decay'], p.data)


Epoch [1/300]	Train loss: 2.3878	Val loss: 1.7253 (0.6781)
Epoch [2/300]	Train loss: 1.2495	Val loss: 1.0158 (0.8651)
Epoch [3/300]	Train loss: 0.9673	Val loss: 0.9726 (0.8641)
Epoch [4/300]	Train loss: 0.9060	Val loss: 0.9453 (0.8861)
Epoch [5/300]	Train loss: 0.8897	Val loss: 0.8940 (0.8838)
Epoch [6/300]	Train loss: 0.8605	Val loss: 0.8794 (0.8913)
Epoch [7/300]	Train loss: 0.8379	Val loss: 1.4117 (0.8604)
Epoch [8/300]	Train loss: 0.8154	Val loss: 0.9827 (0.8946)
Epoch [9/300]	Train loss: 0.7684	Val loss: 0.8269 (0.8987)
Epoch [10/300]	Train loss: 0.7469	Val loss: 0.9546 (0.8680)
Epoch [11/300]	Train loss: 0.7615	Val loss: 0.7964 (0.9032)
Epoch [12/300]	Train loss: 0.7251	Val loss: 0.8783 (0.8788)
Epoch [13/300]	Train loss: 0.7116	Val loss: 0.8570 (0.8882)
Epoch [14/300]	Train loss: 0.7096	Val loss: 0.7383 (0.9031)
Epoch [15/300]	Train loss: 0.6768	Val loss: 0.8938 (0.8752)
Epoch [16/300]	Train loss: 0.6700	Val loss: 0.7012 (0.9175)
Epoch [17/300]	Train loss: 0.6552	Val loss: 0.704

Exception in thread Thread-3:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\site-packages\tensorboard\summary\writer\event_file_writer.py", line 233, in run
    self._record_writer.write(data)
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\site-packages\tensorboard\summary\writer\record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\site-packages\tensorboard\compat\tensorflow_stub\io\gfile.py", line 519, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\site-packages\tensorboard\compat\tensorflow_stub\io\gfile.py", line 150, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "C:\ProgramData\Anaconda3\envs\ex01\lib\site-packages\tensorboard\compat\

Epoch [24/300]	Train loss: 0.5649	Val loss: 0.9589 (0.8837)
Epoch [25/300]	Train loss: 0.5727	Val loss: 0.7147 (0.9100)
