In [1]:
# Environment setup (kept commented out): uncomment and run once on a fresh
# kernel to install the packages this notebook imports.
# !pip install pytorch_lightning==0.7.5
# !pip install transformers==2.9.0 
# !pip install sentencepiece
# !pip install datasets

In [2]:
import argparse
import glob
import os
import random
import shutil

import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup

from DanMuDataset import DanMuDataset
from T5Finetunner import T5FineTuner, LoggingCallback
from T5Finetunner import ImdbDataset

# Root directory of the data files used by this notebook. The default is the
# original EC2 location; set the DANMU_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
data_path = os.environ.get("DANMU_DATA_DIR", "/home/ec2-user/data/Danmu_byt5/pkl")


In [3]:
# Record the installed PyTorch build so the run can be reproduced later.
print(torch.__version__)

1.10.2+cu113


In [4]:
# Show the CUDA architectures reported by this PyTorch build.
torch.cuda.get_arch_list()


['sm_37', 'sm_50', 'sm_60', 'sm_70', 'sm_75', 'sm_80', 'sm_86']

In [5]:
# Single source of configuration for the fine-tuning run. The dict is later
# expanded into an argparse.Namespace and handed to the Lightning module.
args_dict = {
    # Locations
    "data_dir": data_path,  # path for data files
    "output_dir": "checkpoints",  # path to save the checkpoints
    # Model / tokenizer
    "model_name_or_path": "t5-base",
    "tokenizer_name_or_path": "t5-base",
    "max_seq_length": 512,
    # Optimizer and schedule
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    # Batching and training length
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    # Hardware / trainer behaviour
    "n_gpu": 1,
    "early_stop_callback": False,
    # To enable 16-bit training, install apex and set fp_16 to True.
    "fp_16": False,
    # Apex optimisation level; see
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": "O1",
    # With 16-bit training enabled, set this to a sensible value
    # (0.5 is a good default).
    "max_grad_norm": 1.0,
    # Reproducibility and data loading
    "seed": 42,
    "num_workers": 1,
}

In [6]:

# Collect the positive / negative training text files and shuffle them.
# glob returns paths in filesystem-dependent order, so the lists are sorted
# first and then shuffled with a dedicated RNG seeded from the configured
# seed; this makes the resulting order identical across runs and machines.
# NOTE(review): no later cell visible in this notebook reads these two lists;
# confirm whether this cell is still needed.
_shuffle_rng = random.Random(args_dict["seed"])

train_pos_files = sorted(glob.glob(data_path + '/train/pos/*.txt'))
train_neg_files = sorted(glob.glob(data_path + '/train/neg/*.txt'))

_shuffle_rng.shuffle(train_pos_files)
_shuffle_rng.shuffle(train_neg_files)


In [7]:
# Sanity check: load the tokenizer and the 'val' split and report its size.
# The tokenizer name and maximum length are read from args_dict (instead of
# being repeated as literals) so this check always matches the training config.
tokenizer = T5Tokenizer.from_pretrained(args_dict["tokenizer_name_or_path"])
dataset = DanMuDataset(tokenizer, data_path, 'val', max_len=args_dict["max_seq_length"])
print('dataset:')
print(len(dataset))

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f


dataset:
33740


In [8]:
#args_dict.update({'output_dir': 'byt5_danmu', 'num_train_epochs':1})
# Expose the configuration with attribute access (args.output_dir, ...).
args = argparse.Namespace(**args_dict)

# Create the checkpoint directory before handing it to ModelCheckpoint, so a
# fresh machine does not depend on the folder already existing and `filepath`
# unambiguously names a directory.
# NOTE(review): how `filepath` is interpreted depends on the installed
# pytorch_lightning version; confirm against its ModelCheckpoint docs.
os.makedirs(args.output_dir, exist_ok=True)

# Keep the 5 checkpoints with the lowest monitored "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)


In [9]:
# Keyword arguments for pl.Trainer, all derived from `args` so that the
# configuration cell stays the single place to tune the run.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # Read from the config like every other entry (previously hard-coded to
    # False, which silently ignored args.early_stop_callback).
    early_stop_callback=args.early_stop_callback,
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)


In [10]:
def get_dataset(tokenizer, type_path, args):
    """Build the DanMuDataset for one split.

    Args:
        tokenizer: tokenizer forwarded unchanged to the dataset.
        type_path: split identifier forwarded as ``type_path``.
        args: object exposing ``data_dir`` and ``max_seq_length`` attributes.

    Returns:
        The DanMuDataset constructed from the arguments above.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_dir": args.data_dir,
        "type_path": type_path,
        "max_len": args.max_seq_length,
    }
    return DanMuDataset(**dataset_kwargs)


In [11]:
# Instantiate the fine-tuning module (imported from T5Finetunner) from the
# configured arguments.
model = T5FineTuner(args)
#print(model.val_dataloader())

# Build the Lightning trainer from the keyword arguments in `train_params`.
trainer = pl.Trainer(**train_params)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
   

In [12]:
# If training fails with a "cannot allocate memory" error, add swap memory
# as described at:
# https://aws.amazon.com/premiumsupport/knowledge-center/ec2-memory-swap-file/
trainer.fit(model)

INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

Validation sanity check: 0it [00:00, ?it/s]

train:
269913


Training: 0it [00:00, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1050.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


Validating: 0it [00:00, ?it/s]

INFO:T5Finetunner:***** Validation results *****
INFO:T5Finetunner:avg_val_loss = tensor(2.1262e-06, device='cuda:0')

INFO:T5Finetunner:loss = tensor(5.5134e-07, device='cuda:0')

INFO:T5Finetunner:train_loss = tensor(5.5134e-07, device='cuda:0')

INFO:T5Finetunner:val_loss = tensor(2.1262e-06, device='cuda:0')



RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 128640 vs 128532

In [None]:
# Export the wrapped T5ForConditionalGeneration (the `model` attribute of the
# Lightning module) to a local directory.
# NOTE(review): the recorded output of the training cell ends with a
# RuntimeError and this cell has no execution count; confirm that training
# finished before relying on the exported weights.
model.model.save_pretrained('t5_base_danmu_classify_subCate')