In [1]:
import argparse
import glob
import os
import random
import shutil

import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
)

from DanMuDataset import DanMuDataset
from T5Finetunner import ImdbDataset, LoggingCallback, T5FineTuner

# Root directory of the DanMu data used by this notebook.
# Set the DANMU_DATA_DIR environment variable to point at a different location;
# when it is unset the original author's local path is used unchanged.
data_path = os.environ.get(
    "DANMU_DATA_DIR",
    "/media/zihao/New Volume1/UMASS/685_e/github/Zihao_branch/data/Danmu_byt5/pkl",
)


In [2]:
# Hyper-parameters and paths for the fine-tuning run, kept as a plain dict so
# individual entries can be overridden before being turned into a Namespace.
args_dict = {
    # --- data / output locations ---
    "data_dir": data_path,            # where the data files live
    "output_dir": "checkpoints",      # where checkpoints are written
    # --- model / tokenizer (previously 't5-base') ---
    "model_name_or_path": "google/byt5-small",
    "tokenizer_name_or_path": "google/byt5-small",
    "max_seq_length": 512,
    # --- optimisation ---
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 1,
    "eval_batch_size": 1,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    # --- hardware / trainer switches ---
    "n_gpu": 1,
    "early_stop_callback": False,
    "fp_16": False,                   # 16-bit training: install apex first, then set to True
    "opt_level": "O1",                # apex opt levels: https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,             # with 16-bit training pick a sensible value; 0.5 is a good default
    "seed": 42,
}

In [3]:

# Collect the positive / negative training text files under data_path.
# glob returns paths in arbitrary, filesystem-dependent order, so sort them
# first and then shuffle with an RNG seeded from the configured seed: the
# resulting order is the same on every fresh run.
# NOTE(review): neither list is referenced by the later cells visible in this
# notebook — confirm whether this cell is still needed.
_file_rng = random.Random(args_dict["seed"])

train_pos_files = sorted(glob.glob(data_path + '/train/pos/*.txt'))
train_neg_files = sorted(glob.glob(data_path + '/train/neg/*.txt'))

_file_rng.shuffle(train_pos_files)
_file_rng.shuffle(train_neg_files)


In [4]:
# Sanity check: build the 'val' split and report how many examples it contains.
# The tokenizer is loaded from the configured tokenizer_name_or_path so it matches
# the checkpoint being fine-tuned (this cell previously loaded 't5-base'), and the
# maximum length is taken from the same config instead of a repeated literal.
tokenizer = AutoTokenizer.from_pretrained(args_dict['tokenizer_name_or_path'])
dataset = DanMuDataset(tokenizer, data_path, 'val', max_len=args_dict['max_seq_length'])
print('dataset:')
print(len(dataset))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


dataset:
33740


In [5]:
# Override the output directory and epoch count for this run, then expose the
# whole configuration as attribute-style arguments.
args_dict['output_dir'] = 't5_imdb_sentiment'
args_dict['num_train_epochs'] = 2
args = argparse.Namespace(**args_dict)

# Checkpointing is driven by "val_loss" (mode "min") and keeps the top 5;
# files are written using args.output_dir with the "checkpoint" prefix.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)




In [6]:
# Keyword arguments for pl.Trainer, all derived from `args` so the config cell
# stays the single place to tune a run.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    # Read from the config instead of a hard-coded False so the
    # early_stop_callback entry in args is actually honoured.
    "early_stop_callback": args.early_stop_callback,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}


In [7]:
def get_dataset(tokenizer, type_path, args):
    """Build the DanMuDataset for one split.

    Args:
        tokenizer: tokenizer instance handed to the dataset.
        type_path: split identifier, forwarded unchanged as ``type_path``.
        args: namespace providing ``data_dir`` and ``max_seq_length``.

    Returns:
        A DanMuDataset constructed from the arguments above.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_dir": args.data_dir,
        "type_path": type_path,
        "max_len": args.max_seq_length,
    }
    return DanMuDataset(**dataset_kwargs)


In [8]:
# Instantiate the fine-tuning module from the config namespace.
model = T5FineTuner(args)
# Build the trainer from the keyword arguments assembled in train_params.

trainer = pl.Trainer(**train_params)

Downloading: 100%|██████████| 2.53k/2.53k [00:00<00:00, 596kB/s]
Downloading: 100%|██████████| 2.44k/2.44k [00:00<00:00, 677kB/s]
INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [9]:
# Start training `model` with the trainer built above.
# NOTE(review): the captured progress bar shows loss=0.000 partway through
# epoch 1 — confirm this is display rounding and not a degenerate objective.
trainer.fit(model)

INFO:lightning:
    | Name                                                                | Type                       | Params
---------------------------------------------------------------------------------------------------------------
0   | model                                                               | T5ForConditionalGeneration | 299 M 
1   | model.shared                                                        | Embedding                  | 565 K 
2   | model.encoder                                                       | T5Stack                    | 217 M 
3   | model.encoder.block                                                 | ModuleList                 | 217 M 
4   | model.encoder.block.0                                               | T5Block                    | 18 M  
5   | model.encoder.block.0.layer                                         | ModuleList                 | 18 M  
6   | model.encoder.block.0.layer.0                                       | T5LayerSelfA

train:                                                                
269913
Epoch 1:  34%|███▍      | 103317/303653 [6:19:02<12:14:59,  4.54it/s, loss=0.000, v_num=17]

In [None]:
# Write the wrapped model (model.model) to this directory via save_pretrained.
# NOTE(review): only the model is saved by this cell; no tokenizer is saved here.
model.model.save_pretrained('t5_small_danmu_classify_subCate')