In [1]:
import math
import time

import torch.utils.data
from erutils.loggers import fprint

from modules.models import PGT, CC_PGT
from utils.utils import DatasetPGT, make2d, save_model, get_config_by_name

# Switch on the cuDNN benchmark flag globally, before any model is built.
# NOTE(review): per the PyTorch docs this flag controls cuDNN algorithm
# auto-tuning; confirm it actually benefits this model's layers.
torch.backends.cudnn.benchmark = True


In [2]:

# Batch size shared by the dataset, the config and the DataLoader below.
batch = 2

# Report which CUDA device is in use and how much memory it exposes.
prp = torch.cuda.get_device_properties("cuda")
device_banner = (
    f'DEVICES : {torch.cuda.get_device_name()} | {prp.name} |'
    f' {prp.total_memory / 1e9} GB Memory'
)
fprint(device_banner)

# Data shards handed to the dataset wrapper.
data_path = ['data/Data-part-1.pt', 'data/Data-part-2.pt']
dataset = DatasetPGT(
    src=data_path,
    pt_data=True,
    batch_size=batch,
)

# Fetch the 'PGT-Cs' preset (sized by the dataset vocabulary) and override
# the run-specific switches: start from scratch and train.
Config = get_config_by_name('PGT-Cs', dataset.vocab_size)
Config.load = False
Config.train = True
Config.data_path = data_path

# Keep the dataset's chunk setting in step with the model config, then keep
# only the first 100 entries of the data for this run.
dataset.chunk = Config.chunk
dataset.data = dataset.data[:100]

Config.batch_size = batch
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=Config.batch_size,
    num_workers=3,
    pin_memory=True,
)


[1;36mDEVICES : NVIDIA GeForce GTX 1050 | NVIDIA GeForce GTX 1050 | 2.147221504 GB Memory


In [3]:

# Build the model, loss and optimizer, optionally restoring them from
# 'model.pt'. Both the fresh and the resumed path share one flow so they
# cannot drift apart; only the checkpoint-specific steps are conditional.
fprint('Loading Model ...' if Config.load else 'Creating Model ...')

# Instantiate on the CPU first so checkpoint weights can be restored before
# anything is moved to the training device.
model = PGT(config=Config).to('cpu')

if Config.load:
    # `loaded` stays at notebook scope: the training cell reads loaded['epoch'].
    loaded = torch.load('model.pt', map_location='cpu')

    # Checkpoints written from a torch.compile()-wrapped model carry an
    # '_orig_mod.' prefix on every key, which a plain PGT instance rejects.
    # Strip it when present; un-prefixed checkpoints pass through unchanged.
    compiled_prefix = '_orig_mod.'
    model_state = {
        (key[len(compiled_prefix):] if key.startswith(compiled_prefix) else key): value
        for key, value in loaded['model'].items()
    }
    model.load_state_dict(model_state)

model = model.to(Config.device)

million_params = sum(p.numel() for p in model.parameters()) / 1e6
if Config.load:
    fprint(f'Model Loaded With {million_params} Million Parameters')
else:
    fprint(f'Model Created With {million_params} Million Parameters')

# Targets equal to -1 are excluded from the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

# The optimizer is configured by the model itself; its state is restored
# only when resuming.
optimizer = model.configure_optimizer(Config)
if Config.load:
    optimizer.load_state_dict(loaded['optimizer'])

# Compile last, after weights and optimizer are in place.
model = torch.compile(model)


[1;36mCreating Model ...
[1;36mModel Created With 34.532202 Million Parameters


In [4]:

# Iteration counts per epoch: the floor variant and the ceiling variant
# (the latter is the one shown in the training progress line).
total_iterations = len(dataset) // Config.batch_size
mxl = math.ceil(len(dataset) / Config.batch_size)

# Fixed prompt, encoded once and kept on the training device, used to sample
# from the model after checkpoints.
encoded_prompt = dataset.encode('hello how are you ?').to(Config.device)
question = encoded_prompt['input_ids'].to(Config.device)


In [None]:

# Training loop. A resumed run (Config.load) continues from the epoch stored
# in the checkpoint and writes to 'modified_model.pt'; a fresh run starts at
# epoch 0 and writes to 'model.pt'. Everything else is shared.
if Config.train:
    start_epoch = loaded['epoch'] if Config.load else 0
    checkpoint_name = 'modified_model.pt' if Config.load else 'model.pt'

    # torch.compile() wraps the network, and the wrapper's state_dict keys are
    # prefixed with '_orig_mod.'. Save from the underlying module so the
    # checkpoint can be loaded back into a plain PGT instance.
    raw_model = getattr(model, '_orig_mod', model)

    for epoch in range(start_epoch, Config.epochs):
        loss_avg = 0
        st = time.time()
        for i, (inp, label) in enumerate(dataloader):
            inp = make2d(inp.type(torch.long)).to(Config.device)
            label = make2d(label.type(torch.long)).to(Config.device)

            predict = model(inputs=inp)
            optimizer.zero_grad(set_to_none=True)
            # The class dimension of `predict` is moved to position 1, which
            # is where CrossEntropyLoss expects it.
            loss = criterion(predict.permute(0, 2, 1), label.view(-1, label.size(-1)))
            # Read the scalar once: every .item() forces a device-to-host sync.
            loss_value = loss.item()
            loss_avg += loss_value
            loss.backward()
            optimizer.step()

            # NOTE(review): the criterion is built with the default reduction,
            # so dividing by batch_size here likely under-reports the loss.
            # Kept as-is to stay comparable with earlier logs - confirm intent.
            fprint(
                f'\rEPOCH : [{epoch + 1}/{Config.epochs}]'
                f' | LOSS : {loss_value / Config.batch_size}'
                f' | EPOCH LOSS AVG : {(loss_avg / (i + 1)) / Config.batch_size}'
                f' | ITER : {i + 1}/{mxl}'
                f' | DEVICE : {Config.device}'
                f' | EPOCH TIME {int(time.time() - st)} SEC',
                end='')

        print()
        # Every fifth epoch: checkpoint, then sample from the probe prompt.
        if (epoch + 1) % 5 == 0:
            print()
            save_model(model=raw_model.state_dict(), optimizer=optimizer.state_dict(),
                       epochs=Config.epochs,
                       epoch=epoch + 1,
                       name=checkpoint_name)
            fprint('==> MODEL SAVED SUCCESSFULLY')
            # Sampling needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                predictions = model.generate(idx=question, eos=dataset.tokenizer.eos_token_id,
                                             generate=256)
            fprint(f'QUESTION : {dataset.decode(question)}')
            fprint(f'PREDICTION : {dataset.decode(predictions)}')


EPOCH : [1/1000] | LOSS : 2.9569525718688965 | EPOCH LOSS AVG : 3.8026510000228884 | ITER : 50/50 | DEVICE : cuda | EPOCH TIME 32 SEC[1;36m1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m
EPOCH : [2/1000] | LOSS : 2.6547913551330566 | EPOCH LOSS AVG : 2.7587716579437256 | ITER : 50/50 | DEVICE : cuda | EPOCH TIME 32 SEC1;36mmm[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m
EPOCH : [3/1000] | LOSS : 2.511430263519287 | EPOCH LOSS AVG : 2.600987687110901 | ITER : 50/50 | DEVICE : cuda | EPOCH TIME 33 SECEC[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m[1;36m
EPOCH : [4/1000] | LOSS : 2.395263671875 | EPOCH LOSS AVG : 2.514313554763794 | ITER : 50/50 | DEVICE : cuda | E

In [9]:
# Peek at the first batch only: print the last five values of the first
# input row next to the last five values of the first label row.
first_batch = next(enumerate(dataloader), None)
if first_batch is not None:
    i, (inp, label) = first_batch
    print(inp[0][0, -5:], label[0][0, -5:])

tensor([1029.,  102., 2748., 1010., 1045.]) tensor([ 102., 2748., 1010., 1045., 2228.])
