# Setup Enviroment

In [None]:
import os

# Get the name of the current directory
current_dir = os.path.basename(os.getcwd())

# Check for the two possible names
if current_dir.lower() in {"notebooks", "notebook"}:
    # Move one level up
    os.chdir("..")

# Install Dependencies

In [1]:
!pip install -q -U pip
!pip install -q coqui-tts

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m129.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m126.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.p

# Import Libraries

In [3]:
import os
import glob
import json
import pandas as pd
import soundfile as sf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
# TTS specific imports
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from trainer import Trainer, TrainerArgs

2025-05-06 04:27:33.974663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746505654.152001      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746505654.204447      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Python 3.11.11

# numpy==1.26.4
# json: 2.0.9
# pandas: 2.2.3
# soundfile: 0.13.1
# datasets: 3.5.0
# TTS: 0.26.0
# trainer: 0.2.3


# Dataset 

In [None]:
!wget -q https://huggingface.co/datasets/ntt123/viet-tts-dataset/resolve/main/viet-tts.tar.gz -O viet-tts.tar.gz
!mkdir data
!tar -C data -xzf viet-tts.tar.gz
!rm -rf viet-tts.tar.gz

In [None]:
df = pd.read_csv("data/meta_data.tsv", delimiter="\t")
df

Unnamed: 0,wav/000000.wav,Ai đây tức là một kẻ ăn mày vậy. Anh ta chưa kịp quay đi thì đã thấy mấy con chó vàng chạy xồng xộc ra cứ nhảy xổ vào chân anh.
0,wav/000001.wav,Anh mau cứu tôi! những lời nói run sợ một cách...
1,wav/000002.wav,"Anh đi đường cái quan đi ba bước rồi dừng lại,..."
2,wav/000003.wav,Anh đi đường cái quan đáo để ấy mỉm cười đắc c...
3,wav/000004.wav,Biết thế thì thịt quách gà đi lại được lợi cái...
4,wav/000005.wav,Cho vui! Phúc bĩu môi nghĩ thầm. Anh muốn đuổi...
...,...,...
22878,wav/022879.wav,"Ừ, trả thù, và trả thù!"
22879,wav/022880.wav,"Ừ, tôi vẫn biết là mợ đã chán tôi lắm, mợ còn ..."
22880,wav/022881.wav,"Ừ, việc ấy là như thế, thì nào Dung có làm gì ..."
22881,wav/022882.wav,"Ừ, được rồi!..."


In [None]:
# Define paths
output_dataset = "data"

# Define the custom formatter
def my_custom_format(dataset_path, meta_file, **kwargs):
    samples = []
    metadata_path = os.path.join(dataset_path, meta_file)
    
    with open(metadata_path, "r", encoding="utf-8") as f:
        for line in f:
            file_name, transcription = line.strip().split("\t")
            audio_path = os.path.join(dataset_path, file_name)
            samples.append({
                "audio_file": audio_path,
                "text": transcription,
                "speaker_name": None,
                "language": "vi",
                "audio_unique_name": file_name
            })
    return samples

# Define dataset config
dataset_config = BaseDatasetConfig(
    formatter="my_custom_format",
    meta_file_train="meta_data.tsv",
    path=output_dataset
)

In [7]:
def get_vietnamese_chars():
    return (
        "aàáảãạăằắẳẵặâầấẩẫậbcdfđeèéẻẽẹêềếểễệghiìíỉĩịjklmnoòóỏõọôồốổỗộơờớởỡợpqrstuùúủũụưừứửữựvwxyỳýỷỹỵz"
        + "AÀÁẢÃẠĂẰẮẲẴẶÂẦẤẨẪẬBCDFĐEÈÉẺẼẸÊỀẾỂỄỆGHIÌÍỈĨỊJKLMNOÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢPQRSTUÙÚỨỬỮỰVWXYỲÝỶỸỴZ"
        + "0123456789"
    )

def get_characters_config():
    return CharactersConfig(
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank=None,
        characters=get_vietnamese_chars(),
        punctuations=".,!? ",
        phonemes=None,
        characters_class="TTS.tts.models.vits.VitsCharacters"
    )

def get_glow_tts_config(output_path="./ckpts", config_path=None):
    characters_config = get_characters_config()
    
    if config_path and os.path.exists(config_path):
        print("Load Config from existed directory")
        with open(config_path, "r") as f:
            config_dict = json.load(f)
        valid_keys = GlowTTSConfig.__init__.__code__.co_varnames
        filtered_config_dict = {k: v for k, v in config_dict.items() if k in valid_keys}
        config = GlowTTSConfig(**filtered_config_dict)
        config.characters = characters_config
        
        config.output_path = output_path
    else:
        print("Create new Config")
        config = GlowTTSConfig(
            batch_size=64,
            eval_batch_size=64,
            num_loader_workers=4,
            num_eval_loader_workers=4,
            run_eval=True,
            test_delay_epochs=-1,
            epochs=100,
            text_cleaner="multilingual_cleaners",
            use_phonemes=False,
            phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
            print_step=300,
            print_eval=False,
            mixed_precision=True,
            output_path=output_path,
            save_step=1000,
            eval_split_max_size=256,
            eval_split_size=0.1,
            characters=characters_config,
            save_n_checkpoints=1,
            test_sentences = ["Tôi đã mất khá nhiều thời gian để phát triển một giọng nói, và giờ đây khi đã có nó, tôi sẽ không im lặng.",
                       "Hãy là một giọng nói, không phải tiếng vọng.",
                       "Xin lỗi Dave. Tôi e là tôi không thể làm điều đó.",
                       "Chiếc bánh này tuyệt vời. Nó thật ngon và ẩm.",
                       "Trước ngày hai mươi hai tháng mười một, năm một nghìn chín trăm sáu mươi ba."]
        )
    
    return config

# Usage
output_path = "./ckpts"
run_folder_path = "run-May-02-2025_07+04PM-0000000"
config_path = os.path.join(output_path, run_folder_path, "config.json")
config = get_glow_tts_config(output_path, config_path)

Load Config from existed directory


In [8]:
# Initialize AudioProcessor and Tokenizer
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

In [9]:
# Load dataset samples
def load_custom_tts_samples(dataset_config):
    samples = my_custom_format(
        dataset_path=dataset_config.path,
        meta_file=dataset_config.meta_file_train
    )
    eval_split_size = config.eval_split_size
    
    # Use train_test_split to split samples
    train_samples, eval_samples = train_test_split(
        samples,
        test_size=eval_split_size,
        random_state=42
    )
    
    return train_samples, eval_samples

train_samples, eval_samples = load_custom_tts_samples(dataset_config)

In [10]:
from IPython.display import Audio

for dataset, name in [(train_samples, "train sample"), (eval_samples, "eval sample")]:
    sample = dataset[99]
    print(f"{name.capitalize()}:")
    print(sample['text'])
    display(Audio(sample['audio_file']))

Train sample:
- Trong lưng giắt đến ba, bốn chục bạc từ Phủ Lý ra đây định cân sâm cho con giai đang ốm thập tử nhất sinh đấy...


Eval sample:
Bọn này thuộc vào lớp người mà do ai, xã hội được hoạt động, trông vào đâu, cũng thấy danh lợi, sống lên trên những cuộc tai họa hoặc những sự may mắn của người đời.


In [10]:
# Defind Model
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [11]:
# Find the latest checkpoint
checkpoint_dir = os.path.join(output_path, run_folder_path)
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "checkpoint_*.pth"))
if checkpoint_files:
    latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
    model.load_checkpoint(config, latest_checkpoint, eval=False)
else:
    print("No checkpoint found, starting training from scratch.")


Resuming training from checkpoint: ./ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_32000.pth


In [12]:
# Initialize Trainer
trainer = Trainer(
    TrainerArgs(
        continue_path=checkpoint_dir if checkpoint_files else None,  # Resume from checkpoint directory
        restore_path=latest_checkpoint if checkpoint_files else None  # Restore model state
    ),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples
)

fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 2
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=./ckpts/run-May-02-2025_07+04PM-0000000
 > Restoring from checkpoint_32000.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Restoring Scaler...
 > Model restored from step 32000

 > Model has 28623121 parameters


In [None]:
# --> EVAL PERFORMANCE
#      | > avg_loader_time: 0.6213016510009766 (+0.03184342384338379)
#      | > avg_loss: -0.9545338119779314 (-0.010085366453443267)
#      | > avg_log_mle: -1.0004474197115216 (-0.010152942793709818)
#      | > avg_loss_dur: 0.045913600495883396 (+6.757155060767989e-05)

In [13]:
trainer.fit()

 > Restoring best loss from best_model_31556.pth ...
 > Starting with loaded last best loss {'train_loss': -0.9507075430916958, 'eval_loss': None}

[4m[1m > EPOCH: 0/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 04:31:55) [0m

[1m   --> TIME: 2025-05-06 04:32:47 -- STEP: 99/322 -- GLOBAL_STEP: 32100[0m
     | > loss: -1.0038946866989136  (-0.9749497127051305)
     | > log_mle: -1.0516550540924072  (-1.0211305871154326)
     | > loss_dur: 0.047760359942913055  (0.046180873055650745)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(20.8182, device='cuda:0')  (tensor(39.7450, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.4918  (0.4471461628422593)
     | > loader_time: 0.0138  (0.007111368757305723)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.6372686181749616 [0m(+0.0)
     | > avg_loss: -0.9277584331376212 [0m(+0.0)
     | > avg_log_mle: -0.973495364189148 [0m(+0.0)
     | > avg_loss_dur: 0.04573693158371108 [0m(+0.0)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_32323.pth

[4m[1m > EPOCH: 1/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 04:38:31) [0m

[1m   --> TIME: 2025-05-06 04:39:05 -- STEP: 77/322 -- GLOBAL_STEP: 32400[0m
     | > loss: -0.9373217821121216  (-0.8647424594148413)
     | > log_mle: -0.9850516319274902  (-0.9124196210464874)
     | > loss_dur: 0.04772982373833656  (0.04767716463123049)
     | > amp_scaler: 512.0  (518.6493506493506)
     | > grad_norm: tensor(135.6389, device='cuda:0')  (tensor(224.8216, device='cuda:0'))
     | > current_lr: 2.4999999999999998e-05 
     | > step_time: 0.5019  (0.4130130371490082)
     | > loader_time: 0.009  (0.006780060854825107)


[1m > EVALUATIO




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6550652980804443 [0m(+0.017796679905482726)
     | > avg_loss:[92m -0.9417918017932347 [0m(-0.014033368655613554)
     | > avg_log_mle:[92m -0.986799168586731 [0m(-0.013303804397583052)
     | > avg_loss_dur:[92m 0.04500737264752388 [0m(-0.0007295589361872024)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_32645.pth

[4m[1m > EPOCH: 2/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 04:44:53) [0m

[1m   --> TIME: 2025-05-06 04:45:16 -- STEP: 55/322 -- GLOBAL_STEP: 32700[0m
     | > loss: -0.9589123129844666  (-0.9527878046035767)
     | > log_mle: -1.003960371017456  (-0.9981031591242009)
     | > loss_dur: 0.04504803940653801  (0.04531534903428771)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(166.9768, device='cuda:0')  (tensor(174.1974, device='cuda:0'))
     | > current_lr: 2.525e-05 
     | > step_time: 0.4093  (0.3745477329600941)
     | 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6327776772635324 [0m(-0.022287620816911935)
     | > avg_loss:[91m -0.9240378788539342 [0m(+0.01775392293930056)
     | > avg_log_mle:[91m -0.9687587193080357 [0m(+0.018040449278695303)
     | > avg_loss_dur:[92m 0.044720837793179924 [0m(-0.0002865348543439547)


[4m[1m > EPOCH: 3/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 04:51:09) [0m

[1m   --> TIME: 2025-05-06 04:51:22 -- STEP: 33/322 -- GLOBAL_STEP: 33000[0m
     | > loss: -0.9259595274925232  (-0.9608791055101337)
     | > log_mle: -0.9720196723937988  (-1.0058159394697705)
     | > loss_dur: 0.04606013372540474  (0.0449368337338621)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(124.9697, device='cuda:0')  (tensor(161.7702, device='cuda:0'))
     | > current_lr: 2.55e-05 
     | > step_time: 0.3543  (0.33402336727489124)
     | > loader_time: 0.0053  (0.004969676335652669)


 > CHECKPOINT : ckpts/run-




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5849113668714251 [0m(-0.047866310392107314)
     | > avg_loss:[92m -0.9392876642090934 [0m(-0.015249785355159196)
     | > avg_log_mle:[92m -0.9847302675247193 [0m(-0.015971548216683562)
     | > avg_loss_dur:[91m 0.04544260310275214 [0m(+0.0007217653095722143)


[4m[1m > EPOCH: 4/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 04:57:10) [0m

[1m   --> TIME: 2025-05-06 04:57:15 -- STEP: 11/322 -- GLOBAL_STEP: 33300[0m
     | > loss: -0.9070702791213989  (-0.9818167957392606)
     | > log_mle: -0.9608982801437378  (-1.0272294933145696)
     | > loss_dur: 0.05382798612117767  (0.04541269317269325)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(178.0345, device='cuda:0')  (tensor(203.4620, device='cuda:0'))
     | > current_lr: 2.575e-05 
     | > step_time: 0.3057  (0.2904989936135032)
     | > loader_time: 0.0038  (0.0039279894395308065)


[1m   --> TIME: 2025-05




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6000159467969622 [0m(+0.015104579925537154)
     | > avg_loss:[92m -0.958480099269322 [0m(-0.019192435060228608)
     | > avg_log_mle:[92m -1.0035918269838608 [0m(-0.018861559459141586)
     | > avg_loss_dur:[92m 0.04511172292487962 [0m(-0.00033088017787252094)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_33611.pth

[4m[1m > EPOCH: 5/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:03:09) [0m

[1m   --> TIME: 2025-05-06 05:07:21 -- STEP: 289/322 -- GLOBAL_STEP: 33900[0m
     | > loss: -0.9747256636619568  (-0.959475728673506)
     | > log_mle: -1.025937795639038  (-1.0079158247548408)
     | > loss_dur: 0.051212139427661896  (0.048440095063002474)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(156.6481, device='cuda:0')  (tensor(172.5272, device='cuda:0'))
     | > current_lr: 2.6e-05 
     | > step_time: 1.3102  (0.7261004918174347)
     |




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6167679446084159 [0m(+0.016751997811453645)
     | > avg_loss:[92m -0.96893287726811 [0m(-0.010452777998787965)
     | > avg_log_mle:[92m -1.0144047703061783 [0m(-0.010812943322317414)
     | > avg_loss_dur:[91m 0.0454718967633588 [0m(+0.0003601738384791803)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_33933.pth

[4m[1m > EPOCH: 6/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:09:03) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_34000.pth





[1m   --> TIME: 2025-05-06 05:12:42 -- STEP: 267/322 -- GLOBAL_STEP: 34200[0m
     | > loss: -0.951501727104187  (-0.9660786347889275)
     | > log_mle: -1.0031169652938843  (-1.0142173387584614)
     | > loss_dur: 0.05161523073911667  (0.04813870302077091)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(244.1676, device='cuda:0')  (tensor(171.9250, device='cuda:0'))
     | > current_lr: 2.625e-05 
     | > step_time: 1.2414  (0.6666917184765417)
     | > loader_time: 0.0277  (0.12879879376415013)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5902434280940465 [0m(-0.02652451651436938)
     | > avg_loss:[92m -0.9786748988287789 [0m(-0.009742021560668945)
     | > avg_log_mle:[92m -1.0257870640073508 [0m(-0.011382293701172497)
     | > avg_loss_dur:[91m 0.04711216326270785 [0m(+0.0016402664993490496)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_34255.pth

[4m[1m > EPOCH: 7/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:15:00) [0m

[1m   --> TIME: 2025-05-06 05:18:02 -- STEP: 245/322 -- GLOBAL_STEP: 34500[0m
     | > loss: -0.993070662021637  (-0.9680676148862255)
     | > log_mle: -1.0402815341949463  (-1.0159215348107469)
     | > loss_dur: 0.047210853546857834  (0.04785391788701622)
     | > amp_scaler: 1024.0  (879.804081632653)
     | > grad_norm: tensor(189.9953, device='cuda:0')  (tensor(171.0933, device='cuda:0'))
     | > current_lr: 2.65e-05 
     | > step_time: 1.1714  (0.626288735136




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.567634310041155 [0m(-0.022609118052891453)
     | > avg_loss:[91m -0.9628212094306946 [0m(+0.015853689398084292)
     | > avg_log_mle:[91m -1.0092089414596557 [0m(+0.016578122547695084)
     | > avg_loss_dur:[92m 0.04638773415769849 [0m(-0.000724429105009354)


[4m[1m > EPOCH: 8/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:20:47) [0m

[1m   --> TIME: 2025-05-06 05:23:19 -- STEP: 223/322 -- GLOBAL_STEP: 34800[0m
     | > loss: -0.9653468132019043  (-0.9690840599782798)
     | > log_mle: -1.019024133682251  (-1.0165733553368959)
     | > loss_dur: 0.05367730185389519  (0.04748929686209547)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(165.9544, device='cuda:0')  (tensor(170.8052, device='cuda:0'))
     | > current_lr: 2.675e-05 
     | > step_time: 1.0454  (0.580772767687058)
     | > loader_time: 0.0345  (0.09245023919862486)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6091796806880405 [0m(+0.04154537064688546)
     | > avg_loss:[92m -0.9707649145807539 [0m(-0.007943705150059266)
     | > avg_log_mle:[92m -1.0155244248253956 [0m(-0.006315483365739949)
     | > avg_loss_dur:[92m 0.04475950928671019 [0m(-0.0016282248709882999)


[4m[1m > EPOCH: 9/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:26:39) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_35000.pth





[1m   --> TIME: 2025-05-06 05:28:49 -- STEP: 201/322 -- GLOBAL_STEP: 35100[0m
     | > loss: -0.9687889814376831  (-0.9715885095928439)
     | > log_mle: -1.0189718008041382  (-1.0188248021092587)
     | > loss_dur: 0.05018283426761627  (0.04723629409178572)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(270.5827, device='cuda:0')  (tensor(163.0482, device='cuda:0'))
     | > current_lr: 2.7e-05 
     | > step_time: 0.8388  (0.5474547533253532)
     | > loader_time: 0.0574  (0.07647585275754405)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5801145349230085 [0m(-0.029065145765032052)
     | > avg_loss:[91m -0.9489833882876805 [0m(+0.021781526293073394)
     | > avg_log_mle:[91m -0.99415032182421 [0m(+0.021374103001185585)
     | > avg_loss_dur:[91m 0.045166935345956255 [0m(+0.00040742605924606184)


[4m[1m > EPOCH: 10/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:32:32) [0m

[1m   --> TIME: 2025-05-06 05:34:14 -- STEP: 179/322 -- GLOBAL_STEP: 35400[0m
     | > loss: -0.9830242991447449  (-0.9753451340691337)
     | > log_mle: -1.0307117700576782  (-1.0222964593152088)
     | > loss_dur: 0.04768747836351395  (0.04695132430955019)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(172.1131, device='cuda:0')  (tensor(164.4634, device='cuda:0'))
     | > current_lr: 2.725e-05 
     | > step_time: 0.7248  (0.509201160356319)
     | > loader_time: 0.7081  (0.0502781561633062)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5912406240190778 [0m(+0.01112608909606938)
     | > avg_loss:[92m -0.9538388405527387 [0m(-0.0048554522650582355)
     | > avg_log_mle:[92m -1.0022466012409754 [0m(-0.008096279416765384)
     | > avg_loss_dur:[91m 0.0484077587723732 [0m(+0.003240823426416947)


[4m[1m > EPOCH: 11/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:38:19) [0m

[1m   --> TIME: 2025-05-06 05:39:41 -- STEP: 157/322 -- GLOBAL_STEP: 35700[0m
     | > loss: -0.9629167914390564  (-0.9722048856650188)
     | > log_mle: -1.0122361183166504  (-1.0185182451442554)
     | > loss_dur: 0.0493193157017231  (0.046313362540143295)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(188.7999, device='cuda:0')  (tensor(166.1230, device='cuda:0'))
     | > current_lr: 2.75e-05 
     | > step_time: 0.6949  (0.47753461303224987)
     | > loader_time: 0.0215  (0.03133187020660207)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5728110926491874 [0m(-0.01842953136989045)
     | > avg_loss:[92m -0.9648025631904602 [0m(-0.010963722637721496)
     | > avg_log_mle:[92m -1.0099201168332779 [0m(-0.007673515592302449)
     | > avg_loss_dur:[92m 0.04511755907109805 [0m(-0.0032901997012751497)


[4m[1m > EPOCH: 12/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:44:08) [0m

[1m   --> TIME: 2025-05-06 05:45:12 -- STEP: 135/322 -- GLOBAL_STEP: 36000[0m
     | > loss: -0.967814564704895  (-0.9791331617920487)
     | > log_mle: -1.0143914222717285  (-1.0246309880857116)
     | > loss_dur: 0.0465768463909626  (0.045497826597204916)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(182.8718, device='cuda:0')  (tensor(159.6554, device='cuda:0'))
     | > current_lr: 2.775e-05 
     | > step_time: 0.582  (0.4497115311799226)
     | > loader_time: 0.0169  (0.009601670724374277)


 > CHECKPOINT : ckpts/ru




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5914766788482666 [0m(+0.01866558619907921)
     | > avg_loss:[91m -0.9531483820506504 [0m(+0.01165418113980976)
     | > avg_log_mle:[91m -0.9981017351150513 [0m(+0.011818381718226578)
     | > avg_loss_dur:[92m 0.044953351787158424 [0m(-0.0001642072839396283)


[4m[1m > EPOCH: 13/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:49:59) [0m

[1m   --> TIME: 2025-05-06 05:50:50 -- STEP: 113/322 -- GLOBAL_STEP: 36300[0m
     | > loss: -0.9640583395957947  (-0.9800501608215602)
     | > log_mle: -1.0150246620178223  (-1.0251829909012384)
     | > loss_dur: 0.0509662963449955  (0.04513282816758196)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(191.2452, device='cuda:0')  (tensor(160.1422, device='cuda:0'))
     | > current_lr: 2.8e-05 
     | > step_time: 0.5611  (0.4296049375449661)
     | > loader_time: 0.0071  (0.007639273078040739)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6313711506979806 [0m(+0.03989447184971395)
     | > avg_loss:[92m -0.9795999526977539 [0m(-0.02645157064710346)
     | > avg_log_mle:[92m -1.0248241118022379 [0m(-0.026722376687186578)
     | > avg_loss_dur:[91m 0.045224160594599584 [0m(+0.0002708088074411599)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_36509.pth

[4m[1m > EPOCH: 14/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 05:56:00) [0m

[1m   --> TIME: 2025-05-06 05:56:40 -- STEP: 91/322 -- GLOBAL_STEP: 36600[0m
     | > loss: -0.9823978543281555  (-0.9824233140264239)
     | > log_mle: -1.028266429901123  (-1.0265447582517349)
     | > loss_dur: 0.045868560671806335  (0.04412144512593089)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(143.7839, device='cuda:0')  (tensor(155.0438, device='cuda:0'))
     | > current_lr: 2.8250000000000002e-05 
     | > step_time: 0.6191  (0.41389428




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5755949497222901 [0m(-0.05577620097569047)
     | > avg_loss:[91m -0.9749730501856123 [0m(+0.004626902512141595)
     | > avg_log_mle:[91m -1.0190595729010445 [0m(+0.005764538901193328)
     | > avg_loss_dur:[92m 0.044086520693131855 [0m(-0.0011376399014677296)


[4m[1m > EPOCH: 15/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:01:54) [0m

[1m   --> TIME: 2025-05-06 06:02:24 -- STEP: 69/322 -- GLOBAL_STEP: 36900[0m
     | > loss: -0.9815012216567993  (-0.9871576773947566)
     | > log_mle: -1.0250345468521118  (-1.030989728112152)
     | > loss_dur: 0.0435333177447319  (0.04383204909770385)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(148.2908, device='cuda:0')  (tensor(147.2876, device='cuda:0'))
     | > current_lr: 2.85e-05 
     | > step_time: 0.4466  (0.3733528869739477)
     | > loader_time: 0.006  (0.005338475324105525)


 > CHECKPOINT : ckpts/run-




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6366401127406529 [0m(+0.061045163018362825)
     | > avg_loss:[92m -1.0034948910985675 [0m(-0.028521840912955176)
     | > avg_log_mle:[92m -1.0468901055199755 [0m(-0.02783053261893098)
     | > avg_loss_dur:[92m 0.04339521537934031 [0m(-0.0006913053137915445)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_37153.pth

[4m[1m > EPOCH: 16/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:07:58) [0m

[1m   --> TIME: 2025-05-06 06:08:17 -- STEP: 47/322 -- GLOBAL_STEP: 37200[0m
     | > loss: -0.9425679445266724  (-0.9879967418122797)
     | > log_mle: -0.9821349382400513  (-1.0311542064585584)
     | > loss_dur: 0.0395670086145401  (0.04315746845083035)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(110.1700, device='cuda:0')  (tensor(159.0287, device='cuda:0'))
     | > current_lr: 2.875e-05 
     | > step_time: 0.3972  (0.35584723695795595)
   




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6454713344573975 [0m(+0.008831221716744553)
     | > avg_loss:[91m -0.9105643170220511 [0m(+0.09293057407651639)
     | > avg_log_mle:[91m -0.9542909179415021 [0m(+0.09259918757847341)
     | > avg_loss_dur:[91m 0.04372659889715058 [0m(+0.0003313835178102714)


[4m[1m > EPOCH: 17/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:14:05) [0m

[1m   --> TIME: 2025-05-06 06:14:15 -- STEP: 25/322 -- GLOBAL_STEP: 37500[0m
     | > loss: -1.003465175628662  (-1.0038441848754882)
     | > log_mle: -1.0519670248031616  (-1.0464208650588989)
     | > loss_dur: 0.04850184544920921  (0.04257667377591133)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(149.1936, device='cuda:0')  (tensor(168.8127, device='cuda:0'))
     | > current_lr: 2.9e-05 
     | > step_time: 0.3548  (0.3378714179992677)
     | > loader_time: 0.0049  (0.004722518920898437)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.596295622416905 [0m(-0.049175712040492425)
     | > avg_loss:[92m -0.972083944933755 [0m(-0.061519627911703956)
     | > avg_log_mle:[92m -1.0161365543092997 [0m(-0.06184563636779761)
     | > avg_loss_dur:[91m 0.044052605437380925 [0m(+0.0003260065402303433)


[4m[1m > EPOCH: 18/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:20:02) [0m

[1m   --> TIME: 2025-05-06 06:20:06 -- STEP: 3/322 -- GLOBAL_STEP: 37800[0m
     | > loss: -1.0792006254196167  (-1.0800259908040364)
     | > log_mle: -1.1146520376205444  (-1.1214791138966878)
     | > loss_dur: 0.03545144572854042  (0.04145315165321032)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(166.9455, device='cuda:0')  (tensor(170.6312, device='cuda:0'))
     | > current_lr: 2.9250000000000003e-05 
     | > step_time: 0.3318  (0.31404980023701984)
     | > loader_time: 0.003  (0.0034596125284830728)


 > CHECKPOI




[1m   --> TIME: 2025-05-06 06:24:47 -- STEP: 303/322 -- GLOBAL_STEP: 38100[0m
     | > loss: -0.9984995722770691  (-0.9965396273647598)
     | > log_mle: -1.0507341623306274  (-1.0431448335301348)
     | > loss_dur: 0.05223459750413895  (0.046605207013710105)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(125.3842, device='cuda:0')  (tensor(153.1472, device='cuda:0'))
     | > current_lr: 2.9250000000000003e-05 
     | > step_time: 1.5068  (0.7540028465069558)
     | > loader_time: 0.9233  (0.1427461243305269)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6011088575635637 [0m(+0.0048132351466586565)
     | > avg_loss:[91m -0.9614039148603167 [0m(+0.010680030073438318)
     | > avg_log_mle:[91m -1.0088959455490112 [0m(+0.007240608760288536)
     | > avg_loss_dur:[91m 0.047492033349616186 [0m(+0.0034394279122352614)


[4m[1m > EPOCH: 19/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:26:06) [0m

[1m   --> TIME: 2025-05-06 06:30:12 -- STEP: 281/322 -- GLOBAL_STEP: 38400[0m
     | > loss: -0.9997125864028931  (-1.0015711884057374)
     | > log_mle: -1.0473567247390747  (-1.047806136548732)
     | > loss_dur: 0.04764414206147194  (0.046234946883552036)
     | > amp_scaler: 2048.0  (1166.1209964412812)
     | > grad_norm: tensor(129.2723, device='cuda:0')  (tensor(156.0141, device='cuda:0'))
     | > current_lr: 2.95e-05 
     | > step_time: 1.2384  (0.6989561382986054)
     | > loader_time: 0.0374  (0.16836591717187194)


[1m > EV




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5834038189479281 [0m(-0.01770503861563555)
     | > avg_loss:[92m -0.9977692212377276 [0m(-0.036365306377410866)
     | > avg_log_mle:[92m -1.0421266385487147 [0m(-0.03323069299970349)
     | > avg_loss_dur:[92m 0.04435742146202496 [0m(-0.003134611887591225)


[4m[1m > EPOCH: 20/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:32:08) [0m

[1m   --> TIME: 2025-05-06 06:35:35 -- STEP: 259/322 -- GLOBAL_STEP: 38700[0m
     | > loss: -0.99736487865448  (-1.001320513979348)
     | > log_mle: -1.043281078338623  (-1.0471722936998462)
     | > loss_dur: 0.04591618478298187  (0.045851779116396735)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(155.0369, device='cuda:0')  (tensor(152.7077, device='cuda:0'))
     | > current_lr: 2.9749999999999998e-05 
     | > step_time: 1.1305  (0.6669858836759469)
     | > loader_time: 0.0267  (0.12349169410793938)


[1m > EVALUATI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6014406681060791 [0m(+0.018036849158150958)
     | > avg_loss:[91m -0.9905075192451477 [0m(+0.0072617019925799164)
     | > avg_log_mle:[91m -1.0340278012411934 [0m(+0.008098837307521256)
     | > avg_loss_dur:[92m 0.043520278696502955 [0m(-0.000837142765522006)


[4m[1m > EPOCH: 21/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:38:06) [0m

[1m   --> TIME: 2025-05-06 06:41:00 -- STEP: 237/322 -- GLOBAL_STEP: 39000[0m
     | > loss: -0.9898934960365295  (-1.0017416904747227)
     | > log_mle: -1.0378607511520386  (-1.047148282014871)
     | > loss_dur: 0.04796723276376724  (0.04540659234179224)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(176.7030, device='cuda:0')  (tensor(156.8638, device='cuda:0'))
     | > current_lr: 2.9999999999999997e-05 
     | > step_time: 1.01  (0.6156430254505654)
     | > loader_time: 0.0383  (0.10961386825464949)


 > CHECKPOI




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6051630905696324 [0m(+0.0037224224635532543)
     | > avg_loss:[91m -0.9893792986869812 [0m(+0.0011282205581665261)
     | > avg_log_mle:[92m -1.034428770201547 [0m(-0.000400968960353687)
     | > avg_loss_dur:[91m 0.045049475559166496 [0m(+0.0015291968626635413)


[4m[1m > EPOCH: 22/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:44:06) [0m

[1m   --> TIME: 2025-05-06 06:46:31 -- STEP: 215/322 -- GLOBAL_STEP: 39300[0m
     | > loss: -1.0452378988265991  (-1.0078387055286147)
     | > log_mle: -1.0917659997940063  (-1.053094407569529)
     | > loss_dur: 0.046528130769729614  (0.04525570261270502)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(142.2718, device='cuda:0')  (tensor(150.6890, device='cuda:0'))
     | > current_lr: 3.025e-05 
     | > step_time: 0.9064  (0.5749785645063535)
     | > loader_time: 0.0116  (0.09193371617516803)


[1m > EVALUATION [




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6058622428349086 [0m(+0.0006991522652762816)
     | > avg_loss:[91m -0.9612871050834656 [0m(+0.02809219360351556)
     | > avg_log_mle:[91m -1.0088449886866981 [0m(+0.02558378151484897)
     | > avg_loss_dur:[91m 0.04755788296461105 [0m(+0.002508407405444557)


[4m[1m > EPOCH: 23/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:50:04) [0m

[1m   --> TIME: 2025-05-06 06:52:03 -- STEP: 193/322 -- GLOBAL_STEP: 39600[0m
     | > loss: -1.007860541343689  (-1.008706286781193)
     | > log_mle: -1.052725076675415  (-1.0535150278417549)
     | > loss_dur: 0.044864535331726074  (0.04480873801084382)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(153.4381, device='cuda:0')  (tensor(153.2207, device='cuda:0'))
     | > current_lr: 3.05e-05 
     | > step_time: 0.8557  (0.5348409259875206)
     | > loader_time: 0.7739  (0.07340804776997147)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5890048980712891 [0m(-0.016857344763619553)
     | > avg_loss:[92m -1.0025185142244613 [0m(-0.041231409140995656)
     | > avg_log_mle:[92m -1.0455388001033235 [0m(-0.03669381141662531)
     | > avg_loss_dur:[92m 0.043020276299544746 [0m(-0.004537606665066307)


[4m[1m > EPOCH: 24/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 06:55:57) [0m

[1m   --> TIME: 2025-05-06 06:57:39 -- STEP: 171/322 -- GLOBAL_STEP: 39900[0m
     | > loss: -1.030822515487671  (-1.008636740216037)
     | > log_mle: -1.08360755443573  (-1.0531800713455457)
     | > loss_dur: 0.05278507620096207  (0.044543332610911104)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(125.5842, device='cuda:0')  (tensor(146.8851, device='cuda:0'))
     | > current_lr: 3.075e-05 
     | > step_time: 0.739  (0.5185180560887211)
     | > loader_time: 0.9541  (0.06712729052493446)


 > CHECKPOINT : ckpts/run-M




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6002407959529332 [0m(+0.011235897881644141)
     | > avg_loss:[92m -1.0040731804711478 [0m(-0.0015546662466865424)
     | > avg_log_mle:[92m -1.047082015446254 [0m(-0.0015432153429304485)
     | > avg_loss_dur:[92m 0.043008845299482346 [0m(-1.143100006240072e-05)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_40051.pth

[4m[1m > EPOCH: 25/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:02:06) [0m

[1m   --> TIME: 2025-05-06 07:03:24 -- STEP: 149/322 -- GLOBAL_STEP: 40200[0m
     | > loss: -1.0012046098709106  (-1.0144472302206418)
     | > log_mle: -1.0482852458953857  (-1.0583316079722151)
     | > loss_dur: 0.04708067700266838  (0.04388437400128217)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(135.5925, device='cuda:0')  (tensor(143.3726, device='cuda:0'))
     | > current_lr: 3.1e-05 
     | > step_time: 0.6636  (0.4754977434273534)
 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5973481927599226 [0m(-0.0028926031930106566)
     | > avg_loss:[92m -1.026806744507381 [0m(-0.022733564036233167)
     | > avg_log_mle:[92m -1.0692266702651978 [0m(-0.0221446548189439)
     | > avg_loss_dur:[92m 0.04241992150034223 [0m(-0.0005889237991401133)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_40373.pth

[4m[1m > EPOCH: 26/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:08:06) [0m

[1m   --> TIME: 2025-05-06 07:09:06 -- STEP: 127/322 -- GLOBAL_STEP: 40500[0m
     | > loss: -1.016727328300476  (-1.0218442378081674)
     | > log_mle: -1.0620124340057373  (-1.0650414003161943)
     | > loss_dur: 0.04528513923287392  (0.043197161804034025)
     | > amp_scaler: 2048.0  (1685.1653543307084)
     | > grad_norm: tensor(184.9625, device='cuda:0')  (tensor(139.5849, device='cuda:0'))
     | > current_lr: 3.125e-05 
     | > step_time: 0.5921  (0.445973796




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6192492757524762 [0m(+0.021901082992553667)
     | > avg_loss:[91m -0.9993186014039176 [0m(+0.02748814310346337)
     | > avg_log_mle:[91m -1.0423580850873675 [0m(+0.026868585177830306)
     | > avg_loss_dur:[91m 0.04303948836667197 [0m(+0.0006195668663297391)


[4m[1m > EPOCH: 27/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:14:03) [0m

[1m   --> TIME: 2025-05-06 07:14:50 -- STEP: 105/322 -- GLOBAL_STEP: 40800[0m
     | > loss: -1.0037068128585815  (-1.017401588530768)
     | > log_mle: -1.0501872301101685  (-1.0600293545495896)
     | > loss_dur: 0.04648040607571602  (0.0426277630740688)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(146.7933, device='cuda:0')  (tensor(152.6682, device='cuda:0'))
     | > current_lr: 3.15e-05 
     | > step_time: 0.5326  (0.42504232497442335)
     | > loader_time: 0.0081  (0.008372034345354353)


 > CHECKPOINT : ckpts/ru




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6476864269801549 [0m(+0.028437151227678648)
     | > avg_loss:[92m -1.009668266773224 [0m(-0.010349665369306371)
     | > avg_log_mle:[92m -1.0535504238946098 [0m(-0.011192338807242352)
     | > avg_loss_dur:[91m 0.04388215435402734 [0m(+0.0008426659873553705)


[4m[1m > EPOCH: 28/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:20:12) [0m

[1m   --> TIME: 2025-05-06 07:20:48 -- STEP: 83/322 -- GLOBAL_STEP: 41100[0m
     | > loss: -1.008068561553955  (-1.0128150445869164)
     | > log_mle: -1.0489426851272583  (-1.0548507722027336)
     | > loss_dur: 0.040874116122722626  (0.0420357200305864)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(109.3232, device='cuda:0')  (tensor(148.8425, device='cuda:0'))
     | > current_lr: 3.1750000000000006e-05 
     | > step_time: 0.4859  (0.4021341254912227)
     | > loader_time: 0.0067  (0.007151753069406532)


[1m > EVAL




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6376588412693569 [0m(-0.010027585710797982)
     | > avg_loss:[91m -0.9580120120729719 [0m(+0.051656254700252036)
     | > avg_log_mle:[91m -1.009980205127171 [0m(+0.04357021876743894)
     | > avg_loss_dur:[91m 0.051968197098800115 [0m(+0.008086042744772773)


[4m[1m > EPOCH: 29/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:26:20) [0m

[1m   --> TIME: 2025-05-06 07:26:44 -- STEP: 61/322 -- GLOBAL_STEP: 41400[0m
     | > loss: -1.0203582048416138  (-1.0234477754499094)
     | > log_mle: -1.0648863315582275  (-1.064449292714479)
     | > loss_dur: 0.044528163969516754  (0.04100151952417172)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(253.5950, device='cuda:0')  (tensor(139.0168, device='cuda:0'))
     | > current_lr: 3.2e-05 
     | > step_time: 0.4301  (0.36729950592166083)
     | > loader_time: 0.0116  (0.005804077523653624)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6300395556858608 [0m(-0.007619285583496116)
     | > avg_loss:[92m -1.0083883030073983 [0m(-0.050376290934426415)
     | > avg_log_mle:[92m -1.0521040541785105 [0m(-0.04212384905133959)
     | > avg_loss_dur:[92m 0.04371574712651116 [0m(-0.008252449972288953)


[4m[1m > EPOCH: 30/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:32:28) [0m

[1m   --> TIME: 2025-05-06 07:32:46 -- STEP: 39/322 -- GLOBAL_STEP: 41700[0m
     | > loss: -0.972554087638855  (-1.0252757118298454)
     | > log_mle: -1.0145198106765747  (-1.0651723910600712)
     | > loss_dur: 0.041965752840042114  (0.03989667870486394)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(111.6132, device='cuda:0')  (tensor(128.9956, device='cuda:0'))
     | > current_lr: 3.225e-05 
     | > step_time: 0.3808  (0.3677354592543381)
     | > loader_time: 0.0059  (0.005046172019762871)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6330628395080566 [0m(+0.0030232838221958547)
     | > avg_loss:[92m -1.0114376817430768 [0m(-0.003049378735678454)
     | > avg_log_mle:[92m -1.056348122869219 [0m(-0.004244068690708591)
     | > avg_loss_dur:[91m 0.04491043814590999 [0m(+0.0011946910193988275)


[4m[1m > EPOCH: 31/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:38:38) [0m

[1m   --> TIME: 2025-05-06 07:38:45 -- STEP: 17/322 -- GLOBAL_STEP: 42000[0m
     | > loss: -1.0197540521621704  (-1.0507880940156824)
     | > log_mle: -1.0545108318328857  (-1.0882570322822123)
     | > loss_dur: 0.03475679084658623  (0.03746893519864363)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(119.0277, device='cuda:0')  (tensor(145.7948, device='cuda:0'))
     | > current_lr: 3.25e-05 
     | > step_time: 0.3122  (0.30355390380410585)
     | > loader_time: 0.0044  (0.004819743773516487)


 > CHECKPOINT : ckpts/




[1m   --> TIME: 2025-05-06 07:43:55 -- STEP: 317/322 -- GLOBAL_STEP: 42300[0m
     | > loss: -1.0197763442993164  (-1.0292269467179327)
     | > log_mle: -1.070609450340271  (-1.0744181363740555)
     | > loss_dur: 0.050833139568567276  (0.04519119029071431)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(159.4624, device='cuda:0')  (tensor(138.1849, device='cuda:0'))
     | > current_lr: 3.25e-05 
     | > step_time: 1.7971  (0.7966570658638655)
     | > loader_time: 1.4651  (0.18825816506467027)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6380005632128034 [0m(+0.004937723704746766)
     | > avg_loss:[92m -1.025084662437439 [0m(-0.01364698069436221)
     | > avg_log_mle:[92m -1.0690096991402767 [0m(-0.012661576271057573)
     | > avg_loss_dur:[92m 0.043925032658236364 [0m(-0.0009854054876736254)


[4m[1m > EPOCH: 32/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:44:46) [0m

[1m   --> TIME: 2025-05-06 07:49:20 -- STEP: 295/322 -- GLOBAL_STEP: 42600[0m
     | > loss: -1.0405389070510864  (-1.0287637995461283)
     | > log_mle: -1.0897237062454224  (-1.0735353651693318)
     | > loss_dur: 0.04918478801846504  (0.0447715652317314)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(123.4695, device='cuda:0')  (tensor(138.0986, device='cuda:0'))
     | > current_lr: 3.2749999999999996e-05 
     | > step_time: 1.3417  (0.7408302751638122)
     | > loader_time: 1.687  (0.17963821847560046)


[1m > EVALU




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6151209354400635 [0m(-0.022879627772739886)
     | > avg_loss:[91m -1.0136629905019483 [0m(+0.011421671935490707)
     | > avg_log_mle:[91m -1.0575716154915942 [0m(+0.011438083648682484)
     | > avg_loss_dur:[92m 0.043908627757004326 [0m(-1.6404901232038027e-05)


[4m[1m > EPOCH: 33/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:50:53) [0m

[1m   --> TIME: 2025-05-06 07:54:47 -- STEP: 273/322 -- GLOBAL_STEP: 42900[0m
     | > loss: -1.027096152305603  (-1.0319996602805976)
     | > log_mle: -1.0758121013641357  (-1.0764012511396583)
     | > loss_dur: 0.048715993762016296  (0.044401592087178005)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(152.3199, device='cuda:0')  (tensor(137.5931, device='cuda:0'))
     | > current_lr: 3.3e-05 
     | > step_time: 1.1692  (0.6858684064704422)
     | > loader_time: 0.0374  (0.15307842680822795)


[1m > EVALUATION [0




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6286719594682966 [0m(+0.01355102402823305)
     | > avg_loss:[92m -1.0285569684846059 [0m(-0.014893977982657569)
     | > avg_log_mle:[92m -1.0713461058480398 [0m(-0.013774490356445623)
     | > avg_loss_dur:[92m 0.042789140130792346 [0m(-0.0011194876262119804)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_42949.pth

[4m[1m > EPOCH: 34/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 07:56:58) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_43000.pth





[1m   --> TIME: 2025-05-06 08:00:20 -- STEP: 251/322 -- GLOBAL_STEP: 43200[0m
     | > loss: -1.0422900915145874  (-1.0378607388036665)
     | > log_mle: -1.088077187538147  (-1.081906712387662)
     | > loss_dur: 0.04578714072704315  (0.04404597598836717)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(139.8155, device='cuda:0')  (tensor(142.0648, device='cuda:0'))
     | > current_lr: 3.3249999999999995e-05 
     | > step_time: 1.1532  (0.6478045936599669)
     | > loader_time: 0.0324  (0.1383605677768054)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6228593621935163 [0m(-0.005812597274780229)
     | > avg_loss:[92m -1.0342811635562352 [0m(-0.005724195071629357)
     | > avg_log_mle:[92m -1.0766949585505894 [0m(-0.005348852702549589)
     | > avg_loss_dur:[92m 0.04241379286561694 [0m(-0.0003753472651754075)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_43271.pth

[4m[1m > EPOCH: 35/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:03:10) [0m

[1m   --> TIME: 2025-05-06 08:06:00 -- STEP: 229/322 -- GLOBAL_STEP: 43500[0m
     | > loss: -1.0230700969696045  (-1.0358264035012528)
     | > log_mle: -1.0685228109359741  (-1.0795440491630521)
     | > loss_dur: 0.04545266553759575  (0.043717645922081945)
     | > amp_scaler: 1024.0  (1265.4672489082973)
     | > grad_norm: tensor(149.1489, device='cuda:0')  (tensor(136.7155, device='cuda:0'))
     | > current_lr: 3.35e-05 
     | > step_time: 1.3319  (0.6118406




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5955920560019357 [0m(-0.027267306191580687)
     | > avg_loss:[92m -1.0637979848044263 [0m(-0.029516821248191105)
     | > avg_log_mle:[92m -1.1055349384035387 [0m(-0.028839979852949282)
     | > avg_loss_dur:[92m 0.04173695253474371 [0m(-0.0006768403308732296)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_43593.pth

[4m[1m > EPOCH: 36/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:09:17) [0m

[1m   --> TIME: 2025-05-06 08:11:38 -- STEP: 207/322 -- GLOBAL_STEP: 43800[0m
     | > loss: -1.0403928756713867  (-1.0435315327367924)
     | > log_mle: -1.085485816001892  (-1.0867804467390116)
     | > loss_dur: 0.045092906802892685  (0.04324891254449814)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(145.4435, device='cuda:0')  (tensor(136.4507, device='cuda:0'))
     | > current_lr: 3.375e-05 
     | > step_time: 0.8935  (0.5668096657536458)
 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5938559600285122 [0m(-0.0017360959734235015)
     | > avg_loss:[91m -1.0292633141790117 [0m(+0.0345346706254146)
     | > avg_log_mle:[91m -1.0733473811830794 [0m(+0.032187557220459295)
     | > avg_loss_dur:[91m 0.04408406370452472 [0m(+0.0023471111697810146)


[4m[1m > EPOCH: 37/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:15:23) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_44000.pth





[1m   --> TIME: 2025-05-06 08:17:17 -- STEP: 185/322 -- GLOBAL_STEP: 44100[0m
     | > loss: -1.0394340753555298  (-1.043847494834178)
     | > log_mle: -1.0839675664901733  (-1.0868435550380402)
     | > loss_dur: 0.04453344643115997  (0.042996061734251086)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(154.3829, device='cuda:0')  (tensor(144.6512, device='cuda:0'))
     | > current_lr: 3.4e-05 
     | > step_time: 0.8886  (0.5233949880342227)
     | > loader_time: 0.016  (0.06339228217666212)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6171491691044398 [0m(+0.02329320907592769)
     | > avg_loss:[91m -1.0011484793254308 [0m(+0.028114834853580906)
     | > avg_log_mle:[91m -1.0450488056455345 [0m(+0.02829857553754489)
     | > avg_loss_dur:[92m 0.04390032717159816 [0m(-0.00018373653292656084)


[4m[1m > EPOCH: 38/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:21:24) [0m

[1m   --> TIME: 2025-05-06 08:22:55 -- STEP: 163/322 -- GLOBAL_STEP: 44400[0m
     | > loss: -1.0478625297546387  (-1.0430537912011875)
     | > log_mle: -1.096051812171936  (-1.0856145357061746)
     | > loss_dur: 0.04818931967020035  (0.04256074896162274)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(137.2192, device='cuda:0')  (tensor(134.0686, device='cuda:0'))
     | > current_lr: 3.425e-05 
     | > step_time: 0.7342  (0.4955511780604263)
     | > loader_time: 0.767  (0.049656177590961116)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6098441464560372 [0m(-0.007305022648402604)
     | > avg_loss:[92m -1.013786472593035 [0m(-0.012637993267604175)
     | > avg_log_mle:[92m -1.0568238905497964 [0m(-0.011775084904261934)
     | > avg_loss_dur:[92m 0.04303741699882916 [0m(-0.000862910172769002)


[4m[1m > EPOCH: 39/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:27:25) [0m

[1m   --> TIME: 2025-05-06 08:28:38 -- STEP: 141/322 -- GLOBAL_STEP: 44700[0m
     | > loss: -1.0730135440826416  (-1.0481118678201162)
     | > log_mle: -1.1152468919754028  (-1.0900256676031346)
     | > loss_dur: 0.04223333299160004  (0.04191380408955805)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(117.2050, device='cuda:0')  (tensor(135.2967, device='cuda:0'))
     | > current_lr: 3.45e-05 
     | > step_time: 0.6676  (0.47465431774761657)
     | > loader_time: 0.1078  (0.024219090211475995)


[1m > EVALUATION [0m





  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6350268772670202 [0m(+0.02518273081098299)
     | > avg_loss:[92m -1.0279413921492444 [0m(-0.0141549195562094)
     | > avg_log_mle:[92m -1.0697662251336235 [0m(-0.012942334583827098)
     | > avg_loss_dur:[92m 0.041824825533798765 [0m(-0.0012125914650303951)


[4m[1m > EPOCH: 40/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:33:30) [0m

[1m   --> TIME: 2025-05-06 08:34:25 -- STEP: 119/322 -- GLOBAL_STEP: 45000[0m
     | > loss: -1.065107822418213  (-1.043782439051556)
     | > log_mle: -1.1077440977096558  (-1.0850716278332626)
     | > loss_dur: 0.04263633117079735  (0.04128918668427387)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(125.6784, device='cuda:0')  (tensor(128.1783, device='cuda:0'))
     | > current_lr: 3.475e-05 
     | > step_time: 0.5384  (0.43191233001837204)
     | > loader_time: 0.008  (0.011723784839405733)


 > CHECKPOINT : ckpts/run




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6183037212916783 [0m(-0.016723155975341908)
     | > avg_loss:[91m -1.0087585227830065 [0m(+0.01918286936623792)
     | > avg_log_mle:[91m -1.0513944932392665 [0m(+0.018371731894357035)
     | > avg_loss_dur:[91m 0.042635983122246604 [0m(+0.0008111575884478389)


[4m[1m > EPOCH: 41/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:39:30) [0m

[1m   --> TIME: 2025-05-06 08:40:12 -- STEP: 97/322 -- GLOBAL_STEP: 45300[0m
     | > loss: -1.0437482595443726  (-1.0516315041129118)
     | > log_mle: -1.089227557182312  (-1.0916714508508902)
     | > loss_dur: 0.04547928273677826  (0.040039943550358115)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(129.9419, device='cuda:0')  (tensor(126.2181, device='cuda:0'))
     | > current_lr: 3.5000000000000004e-05 
     | > step_time: 0.5232  (0.4051712272093468)
     | > loader_time: 0.0069  (0.006861443372116876)


[1m > EV




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5828457696097238 [0m(-0.035457951681954536)
     | > avg_loss:[92m -1.0214751924787249 [0m(-0.012716669695718386)
     | > avg_log_mle:[92m -1.0634264026369362 [0m(-0.012031909397669738)
     | > avg_loss_dur:[92m 0.041951218992471694 [0m(-0.0006847641297749105)


[4m[1m > EPOCH: 42/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:45:24) [0m

[1m   --> TIME: 2025-05-06 08:45:55 -- STEP: 75/322 -- GLOBAL_STEP: 45600[0m
     | > loss: -1.0705121755599976  (-1.0549344333012898)
     | > log_mle: -1.1116236448287964  (-1.0944011863072705)
     | > loss_dur: 0.04111141711473465  (0.03946675275762876)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(106.5563, device='cuda:0')  (tensor(131.6470, device='cuda:0'))
     | > current_lr: 3.5249999999999996e-05 
     | > step_time: 0.4462  (0.376382131576538)
     | > loader_time: 0.0072  (0.006182200113932293)


[1m > EV




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5980112552642822 [0m(+0.015165485654558442)
     | > avg_loss:[91m -0.9956679974283491 [0m(+0.025807195050375764)
     | > avg_log_mle:[91m -1.0387368815285818 [0m(+0.02468952110835443)
     | > avg_loss_dur:[91m 0.04306888399379594 [0m(+0.0011176650013242431)


[4m[1m > EPOCH: 43/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:51:19) [0m

[1m   --> TIME: 2025-05-06 08:51:40 -- STEP: 53/322 -- GLOBAL_STEP: 45900[0m
     | > loss: -1.014993667602539  (-1.0508641915501291)
     | > log_mle: -1.0582340955734253  (-1.089979565368508)
     | > loss_dur: 0.043240487575531006  (0.039115374661841486)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(138.4046, device='cuda:0')  (tensor(135.7585, device='cuda:0'))
     | > current_lr: 3.55e-05 
     | > step_time: 0.4459  (0.3576777116307672)
     | > loader_time: 0.0082  (0.005105284025084299)


 > CHECKPOINT : ckpts/ru




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6392598220280239 [0m(+0.04124856676374167)
     | > avg_loss:[92m -1.063704316956656 [0m(-0.06803631952830691)
     | > avg_log_mle:[92m -1.105921500069754 [0m(-0.06718461854117219)
     | > avg_loss_dur:[92m 0.04221718098436083 [0m(-0.0008517030094351077)


[4m[1m > EPOCH: 44/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 08:57:17) [0m

[1m   --> TIME: 2025-05-06 08:57:29 -- STEP: 31/322 -- GLOBAL_STEP: 46200[0m
     | > loss: -1.026763677597046  (-1.0732929629664267)
     | > log_mle: -1.0633937120437622  (-1.1106619373444588)
     | > loss_dur: 0.03662998974323273  (0.03736897600033591)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(123.9080, device='cuda:0')  (tensor(128.4274, device='cuda:0'))
     | > current_lr: 3.575e-05 
     | > step_time: 0.3726  (0.3319381052447904)
     | > loader_time: 0.0047  (0.004965605274323494)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5955222947256905 [0m(-0.04373752730233338)
     | > avg_loss:[91m -1.0479259422847202 [0m(+0.01577837467193577)
     | > avg_log_mle:[91m -1.0906705958502634 [0m(+0.015250904219490602)
     | > avg_loss_dur:[91m 0.042744652394737515 [0m(+0.0005274714103766856)


[4m[1m > EPOCH: 45/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:03:14) [0m

[1m   --> TIME: 2025-05-06 09:03:18 -- STEP: 9/322 -- GLOBAL_STEP: 46500[0m
     | > loss: -1.042902946472168  (-1.1183872487809923)
     | > log_mle: -1.0790836811065674  (-1.1551800303988986)
     | > loss_dur: 0.036180756986141205  (0.03679279610514641)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(130.4914, device='cuda:0')  (tensor(134.0693, device='cuda:0'))
     | > current_lr: 3.6e-05 
     | > step_time: 0.288  (0.28401954968770343)
     | > loader_time: 0.0036  (0.003908978568183051)


[1m   --> TIME: 2025-05-0




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6394161633082799 [0m(+0.04389386858258937)
     | > avg_loss:[91m -1.0440756593431746 [0m(+0.0038502829415456574)
     | > avg_log_mle:[91m -1.0873120273862567 [0m(+0.003358568464006728)
     | > avg_loss_dur:[91m 0.04323636538216046 [0m(+0.0004917129874229473)


[4m[1m > EPOCH: 46/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:09:26) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_47000.pth





[1m   --> TIME: 2025-05-06 09:13:36 -- STEP: 287/322 -- GLOBAL_STEP: 47100[0m
     | > loss: -1.0658223628997803  (-1.0624191067360007)
     | > log_mle: -1.1150528192520142  (-1.1057116150440653)
     | > loss_dur: 0.04923040419816971  (0.043292508282104826)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(174.8108, device='cuda:0')  (tensor(132.6639, device='cuda:0'))
     | > current_lr: 3.625e-05 
     | > step_time: 1.3187  (0.7153889293870027)
     | > loader_time: 0.0423  (0.12376718404816418)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6061391966683524 [0m(-0.03327696663992752)
     | > avg_loss:[92m -1.0709047283445086 [0m(-0.02682906900133397)
     | > avg_log_mle:[92m -1.1137011391775948 [0m(-0.026389111791338182)
     | > avg_loss_dur:[92m 0.04279641764504569 [0m(-0.00043994773711477503)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_47135.pth

[4m[1m > EPOCH: 47/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:15:23) [0m

[1m   --> TIME: 2025-05-06 09:18:59 -- STEP: 265/322 -- GLOBAL_STEP: 47400[0m
     | > loss: -1.0657967329025269  (-1.059529085879057)
     | > log_mle: -1.1107724905014038  (-1.1025373584819298)
     | > loss_dur: 0.044975750148296356  (0.04300827401567182)
     | > amp_scaler: 2048.0  (1051.0490566037736)
     | > grad_norm: tensor(146.9283, device='cuda:0')  (tensor(129.3206, device='cuda:0'))
     | > current_lr: 3.65e-05 
     | > step_time: 1.2342  (0.671151692




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6667418003082275 [0m(+0.06060260363987513)
     | > avg_loss:[91m -1.062495912824358 [0m(+0.008408815520150492)
     | > avg_log_mle:[91m -1.1062294346945627 [0m(+0.007471704483032138)
     | > avg_loss_dur:[91m 0.04373351835778781 [0m(+0.0009371007127421255)


[4m[1m > EPOCH: 48/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:21:31) [0m

[1m   --> TIME: 2025-05-06 09:24:38 -- STEP: 243/322 -- GLOBAL_STEP: 47700[0m
     | > loss: -1.0616114139556885  (-1.0598877035541299)
     | > log_mle: -1.108458161354065  (-1.1025470166539943)
     | > loss_dur: 0.04684676602482796  (0.0426593139813638)
     | > amp_scaler: 1024.0  (1108.2798353909463)
     | > grad_norm: tensor(122.7043, device='cuda:0')  (tensor(125.3865, device='cuda:0'))
     | > current_lr: 3.675e-05 
     | > step_time: 1.0773  (0.6261768861072053)
     | > loader_time: 0.064  (0.13105923943068262)


[1m > EVALUATI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5827455793108259 [0m(-0.08399622099740156)
     | > avg_loss:[92m -1.0632025888987953 [0m(-0.0007066760744371958)
     | > avg_log_mle:[91m -1.1062041214534215 [0m(+2.531324114118938e-05)
     | > avg_loss_dur:[92m 0.04300153734428542 [0m(-0.0007319810135023933)


[4m[1m > EPOCH: 49/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:27:40) [0m

[1m   --> TIME: 2025-05-06 09:30:09 -- STEP: 221/322 -- GLOBAL_STEP: 48000[0m
     | > loss: -1.0699588060379028  (-1.0647790086754856)
     | > log_mle: -1.114678978919983  (-1.1069641798330114)
     | > loss_dur: 0.0447201281785965  (0.04218516918531372)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(182.0730, device='cuda:0')  (tensor(124.1863, device='cuda:0'))
     | > current_lr: 3.7000000000000005e-05 
     | > step_time: 0.9479  (0.5772331904501935)
     | > loader_time: 1.0229  (0.09029328121858482)


 > CHECKPO




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5968905108315603 [0m(+0.014144931520734372)
     | > avg_loss:[92m -1.0722262212208338 [0m(-0.00902363232203851)
     | > avg_log_mle:[92m -1.115053388050624 [0m(-0.008849266597202421)
     | > avg_loss_dur:[92m 0.04282716704266412 [0m(-0.00017437030162129746)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_48101.pth

[4m[1m > EPOCH: 50/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:33:34) [0m

[1m   --> TIME: 2025-05-06 09:35:38 -- STEP: 199/322 -- GLOBAL_STEP: 48300[0m
     | > loss: -1.0609283447265625  (-1.068318541924558)
     | > log_mle: -1.1050413846969604  (-1.110062975380288)
     | > loss_dur: 0.04411299154162407  (0.04174442959004013)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(149.0410, device='cuda:0')  (tensor(123.2538, device='cuda:0'))
     | > current_lr: 3.725e-05 
     | > step_time: 0.8743  (0.5433925851505604)
    




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5914322853088378 [0m(-0.00545822552272246)
     | > avg_loss:[91m -1.0623355048043386 [0m(+0.009890716416495193)
     | > avg_log_mle:[91m -1.105226046698434 [0m(+0.009827341352189878)
     | > avg_loss_dur:[91m 0.042890541681221556 [0m(+6.337463855743408e-05)


[4m[1m > EPOCH: 51/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:39:27) [0m

[1m   --> TIME: 2025-05-06 09:41:09 -- STEP: 177/322 -- GLOBAL_STEP: 48600[0m
     | > loss: -1.06150221824646  (-1.0699356666392517)
     | > log_mle: -1.1040645837783813  (-1.111296757466376)
     | > loss_dur: 0.042562372982501984  (0.04136109009048358)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(152.7113, device='cuda:0')  (tensor(129.1440, device='cuda:0'))
     | > current_lr: 3.75e-05 
     | > step_time: 0.797  (0.5099016933117879)
     | > loader_time: 0.019  (0.05581854965727209)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.637472677230835 [0m(+0.04604039192199716)
     | > avg_loss:[92m -1.0927999223981584 [0m(-0.030464417593819793)
     | > avg_log_mle:[92m -1.1350152117865429 [0m(-0.029789165088108804)
     | > avg_loss_dur:[92m 0.04221529194286892 [0m(-0.0006752497383526387)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_48745.pth

[4m[1m > EPOCH: 52/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:45:28) [0m

[1m   --> TIME: 2025-05-06 09:46:53 -- STEP: 155/322 -- GLOBAL_STEP: 48900[0m
     | > loss: -1.0487204790115356  (-1.069194117669137)
     | > log_mle: -1.0941165685653687  (-1.1101679248194538)
     | > loss_dur: 0.045396123081445694  (0.040973807703102785)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(152.2958, device='cuda:0')  (tensor(122.4945, device='cuda:0'))
     | > current_lr: 3.775e-05 
     | > step_time: 0.6366  (0.48873253791562965)
 




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6313125882829939 [0m(-0.006160088947841147)
     | > avg_loss:[91m -1.0633009467806136 [0m(+0.029498975617544776)
     | > avg_log_mle:[91m -1.1053947176252092 [0m(+0.02962049416133361)
     | > avg_loss_dur:[92m 0.042093775527817866 [0m(-0.00012151641505105115)


[4m[1m > EPOCH: 53/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:51:34) [0m

[1m   --> TIME: 2025-05-06 09:52:39 -- STEP: 133/322 -- GLOBAL_STEP: 49200[0m
     | > loss: -1.0565142631530762  (-1.0712257050033802)
     | > log_mle: -1.0973690748214722  (-1.111244610377721)
     | > loss_dur: 0.040854860097169876  (0.040018910065965536)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(194.7918, device='cuda:0')  (tensor(128.7751, device='cuda:0'))
     | > current_lr: 3.7999999999999995e-05 
     | > step_time: 0.6174  (0.4611404838418602)
     | > loader_time: 0.1846  (0.01738378876133969)


[1m > 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6422503335135323 [0m(+0.010937745230538476)
     | > avg_loss:[92m -1.076394064085824 [0m(-0.013093117305210455)
     | > avg_log_mle:[92m -1.1190130233764648 [0m(-0.013618305751255555)
     | > avg_loss_dur:[91m 0.04261896131294114 [0m(+0.0005251857851232763)


[4m[1m > EPOCH: 54/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 09:57:43) [0m

[1m   --> TIME: 2025-05-06 09:58:35 -- STEP: 111/322 -- GLOBAL_STEP: 49500[0m
     | > loss: -1.0748705863952637  (-1.0711315129254317)
     | > log_mle: -1.1171811819076538  (-1.1105544824857971)
     | > loss_dur: 0.04231061413884163  (0.03942296954358483)
     | > amp_scaler: 2048.0  (1236.1801801801803)
     | > grad_norm: tensor(135.9606, device='cuda:0')  (tensor(123.6997, device='cuda:0'))
     | > current_lr: 3.825e-05 
     | > step_time: 0.6025  (0.4457936780946748)
     | > loader_time: 0.0116  (0.008625333373611042)


[1m > EVA




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6230518068586077 [0m(-0.019198526654924608)
     | > avg_loss:[92m -1.099155354499817 [0m(-0.022761290413992885)
     | > avg_log_mle:[92m -1.141017018045698 [0m(-0.022003994669233196)
     | > avg_loss_dur:[92m 0.04186165811760085 [0m(-0.0007573031953402934)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_49711.pth

[4m[1m > EPOCH: 55/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:03:49) [0m

[1m   --> TIME: 2025-05-06 10:04:26 -- STEP: 89/322 -- GLOBAL_STEP: 49800[0m
     | > loss: -1.0992063283920288  (-1.076548096838961)
     | > log_mle: -1.1404343843460083  (-1.1150613983025708)
     | > loss_dur: 0.04122806340456009  (0.03851330221703883)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(124.2107, device='cuda:0')  (tensor(116.7851, device='cuda:0'))
     | > current_lr: 3.85e-05 
     | > step_time: 0.5105  (0.39764608158154424)
     




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6275741645268031 [0m(+0.00452235766819542)
     | > avg_loss:[91m -1.0853404011045182 [0m(+0.013814953395298701)
     | > avg_log_mle:[91m -1.1271849870681765 [0m(+0.013832030977521503)
     | > avg_loss_dur:[92m 0.041844592349869855 [0m(-1.7065767730993497e-05)


[4m[1m > EPOCH: 56/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:09:48) [0m

[1m   --> TIME: 2025-05-06 10:10:15 -- STEP: 67/322 -- GLOBAL_STEP: 50100[0m
     | > loss: -1.071141004562378  (-1.0814180961295745)
     | > log_mle: -1.1121243238449097  (-1.1193079681538824)
     | > loss_dur: 0.04098327085375786  (0.037889873581146136)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(100.3356, device='cuda:0')  (tensor(122.6153, device='cuda:0'))
     | > current_lr: 3.875e-05 
     | > step_time: 0.4338  (0.37201719141718165)
     | > loader_time: 0.0109  (0.005732853021194685)


[1m > EVALUATION [




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6165301050458636 [0m(-0.011044059480939561)
     | > avg_loss:[91m -1.04020288671766 [0m(+0.0451375143868582)
     | > avg_log_mle:[91m -1.0820493289402555 [0m(+0.045135658127920975)
     | > avg_loss_dur:[91m 0.041846439561673575 [0m(+1.8472118037196616e-06)


[4m[1m > EPOCH: 57/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:15:51) [0m

[1m   --> TIME: 2025-05-06 10:16:09 -- STEP: 45/322 -- GLOBAL_STEP: 50400[0m
     | > loss: -1.1122959852218628  (-1.0926544745763145)
     | > log_mle: -1.1517733335494995  (-1.1292256858613752)
     | > loss_dur: 0.03947738930583  (0.03657121149202187)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(136.6062, device='cuda:0')  (tensor(126.4295, device='cuda:0'))
     | > current_lr: 3.9e-05 
     | > step_time: 0.3795  (0.35491112073262543)
     | > loader_time: 0.0044  (0.004958152770996094)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.697952379499163 [0m(+0.08142227445329941)
     | > avg_loss:[92m -1.0780726773398264 [0m(-0.03786979062216633)
     | > avg_log_mle:[92m -1.1200579983847483 [0m(-0.03800866944449277)
     | > avg_loss_dur:[91m 0.04198530720812934 [0m(+0.00013886764645576477)


[4m[1m > EPOCH: 58/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:22:04) [0m

[1m   --> TIME: 2025-05-06 10:22:13 -- STEP: 23/322 -- GLOBAL_STEP: 50700[0m
     | > loss: -1.105982780456543  (-1.1125377468440842)
     | > log_mle: -1.1427205801010132  (-1.1469972600107605)
     | > loss_dur: 0.03673774003982544  (0.03445951106107753)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(150.0690, device='cuda:0')  (tensor(116.7815, device='cuda:0'))
     | > current_lr: 3.925e-05 
     | > step_time: 0.3391  (0.32993669095246686)
     | > loader_time: 0.0051  (0.005061491675998854)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6394078186580113 [0m(-0.05854456084115167)
     | > avg_loss:[91m -1.0618929965155466 [0m(+0.016179680824279785)
     | > avg_log_mle:[91m -1.1070801087788176 [0m(+0.012977889605930715)
     | > avg_loss_dur:[91m 0.0451871038547584 [0m(+0.0032017966466290626)


[4m[1m > EPOCH: 59/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:28:14) [0m

[1m   --> TIME: 2025-05-06 10:28:16 -- STEP: 1/322 -- GLOBAL_STEP: 51000[0m
     | > loss: -1.1158744096755981  (-1.1158744096755981)
     | > log_mle: -1.15241277217865  (-1.15241277217865)
     | > loss_dur: 0.036538392305374146  (0.036538392305374146)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(133.8963, device='cuda:0')  (tensor(133.8963, device='cuda:0'))
     | > current_lr: 3.95e-05 
     | > step_time: 0.4292  (0.42922496795654297)
     | > loader_time: 0.0037  (0.003723621368408203)


 > CHECKPOINT : ckpts/run-M




[1m   --> TIME: 2025-05-06 10:32:50 -- STEP: 301/322 -- GLOBAL_STEP: 51300[0m
     | > loss: -1.0972596406936646  (-1.0853817383712308)
     | > log_mle: -1.1463168859481812  (-1.127860145315379)
     | > loss_dur: 0.049057211726903915  (0.04247840507531484)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(158.9152, device='cuda:0')  (tensor(125.6730, device='cuda:0'))
     | > current_lr: 3.95e-05 
     | > step_time: 1.3141  (0.7349999251951806)
     | > loader_time: 0.0253  (0.1676881202431612)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6165529796055385 [0m(-0.022854839052472875)
     | > avg_loss:[92m -1.0692514794213428 [0m(-0.007358482905796215)
     | > avg_log_mle:[92m -1.112366713796343 [0m(-0.005286605017525536)
     | > avg_loss_dur:[92m 0.04311523160764149 [0m(-0.0020718722471169126)


[4m[1m > EPOCH: 60/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:34:13) [0m

[1m   --> TIME: 2025-05-06 10:38:12 -- STEP: 279/322 -- GLOBAL_STEP: 51600[0m
     | > loss: -1.0626298189163208  (-1.086202688541891)
     | > log_mle: -1.1099623441696167  (-1.1284165450749013)
     | > loss_dur: 0.04733256623148918  (0.04221385380246519)
     | > amp_scaler: 1024.0  (1027.6702508960577)
     | > grad_norm: tensor(144.4296, device='cuda:0')  (tensor(123.3918, device='cuda:0'))
     | > current_lr: 3.9750000000000004e-05 
     | > step_time: 1.3148  (0.7039928624279608)
     | > loader_time: 0.0428  (0.1446990847160312)







  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6028315544128418 [0m(-0.013721425192696701)
     | > avg_loss:[92m -1.0823661702019824 [0m(-0.013114690780639648)
     | > avg_log_mle:[92m -1.1259928771427699 [0m(-0.013626163346426745)
     | > avg_loss_dur:[91m 0.04362670832446643 [0m(+0.0005114767168249434)


[4m[1m > EPOCH: 61/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:40:14) [0m

[1m   --> TIME: 2025-05-06 10:43:38 -- STEP: 257/322 -- GLOBAL_STEP: 51900[0m
     | > loss: -1.0696016550064087  (-1.0841345021696853)
     | > log_mle: -1.1154638528823853  (-1.1258789133932807)
     | > loss_dur: 0.04586216062307358  (0.04174441031038994)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(112.7826, device='cuda:0')  (tensor(117.7658, device='cuda:0'))
     | > current_lr: 3.9999999999999996e-05 
     | > step_time: 1.0319  (0.6463654959712046)
     | > loader_time: 0.5119  (0.13885139398537727)


[1m > EV




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6057968684605191 [0m(+0.0029653140476773032)
     | > avg_loss:[91m -1.071728423663548 [0m(+0.010637746538434412)
     | > avg_log_mle:[91m -1.1158278839928766 [0m(+0.010164993149893276)
     | > avg_loss_dur:[91m 0.0440994527723108 [0m(+0.00047274444784436986)


[4m[1m > EPOCH: 62/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:46:11) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_52000.pth





[1m   --> TIME: 2025-05-06 10:49:04 -- STEP: 235/322 -- GLOBAL_STEP: 52200[0m
     | > loss: -1.088477373123169  (-1.0872002535677976)
     | > log_mle: -1.132771611213684  (-1.1286379428620035)
     | > loss_dur: 0.04429427161812782  (0.04143768916738796)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(110.4280, device='cuda:0')  (tensor(122.8275, device='cuda:0'))
     | > current_lr: 4.025e-05 
     | > step_time: 1.2007  (0.6064868845838182)
     | > loader_time: 0.044  (0.11091243256913859)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6245554379054477 [0m(+0.018758569444928597)
     | > avg_loss:[92m -1.0732501098087857 [0m(-0.0015216861452376218)
     | > avg_log_mle:[91m -1.1155794961111887 [0m(+0.00024838788168790593)
     | > avg_loss_dur:[92m 0.04232938140630722 [0m(-0.0017700713660035797)


[4m[1m > EPOCH: 63/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:52:15) [0m

[1m   --> TIME: 2025-05-06 10:54:41 -- STEP: 213/322 -- GLOBAL_STEP: 52500[0m
     | > loss: -1.0863726139068604  (-1.0922273632506263)
     | > log_mle: -1.132400631904602  (-1.1331291853542063)
     | > loss_dur: 0.046028055250644684  (0.040901820844327884)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(110.5674, device='cuda:0')  (tensor(122.3614, device='cuda:0'))
     | > current_lr: 4.05e-05 
     | > step_time: 1.0067  (0.5718164455163091)
     | > loader_time: 0.029  (0.10535821556485316)


[1m > EVALUATION [




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6265012672969273 [0m(+0.001945829391479692)
     | > avg_loss:[91m -1.0636317219052995 [0m(+0.009618387903486125)
     | > avg_log_mle:[91m -1.106570087160383 [0m(+0.009009408950805708)
     | > avg_loss_dur:[91m 0.042938365042209624 [0m(+0.0006089836359024006)


[4m[1m > EPOCH: 64/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 10:58:23) [0m

[1m   --> TIME: 2025-05-06 11:00:26 -- STEP: 191/322 -- GLOBAL_STEP: 52800[0m
     | > loss: -1.1008211374282837  (-1.0895567963884754)
     | > log_mle: -1.1438850164413452  (-1.1302867603551663)
     | > loss_dur: 0.04306383058428764  (0.04072996655098742)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(112.9551, device='cuda:0')  (tensor(125.5963, device='cuda:0'))
     | > current_lr: 4.075e-05 
     | > step_time: 0.9155  (0.5357719616116025)
     | > loader_time: 0.0171  (0.09443882622643915)


[1m > EVALUATION [0m




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6269641944340297 [0m(+0.00046292713710238687)
     | > avg_loss:[92m -1.07088714327131 [0m(-0.007255421366010539)
     | > avg_log_mle:[92m -1.1139776127679006 [0m(-0.007407525607517673)
     | > avg_loss_dur:[91m 0.0430904677936009 [0m(+0.00015210275139127677)


[4m[1m > EPOCH: 65/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:04:32) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_53000.pth





[1m   --> TIME: 2025-05-06 11:06:12 -- STEP: 169/322 -- GLOBAL_STEP: 53100[0m
     | > loss: -1.0790461301803589  (-1.0974763977457087)
     | > log_mle: -1.1232141256332397  (-1.1372867025567233)
     | > loss_dur: 0.04416802152991295  (0.03981030980378567)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(140.8308, device='cuda:0')  (tensor(123.7828, device='cuda:0'))
     | > current_lr: 4.1e-05 
     | > step_time: 0.8063  (0.5039707584493965)
     | > loader_time: 1.06  (0.06563067012990016)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6418459074837821 [0m(+0.014881713049752388)
     | > avg_loss:[92m -1.0840405532291955 [0m(-0.013153409957885387)
     | > avg_log_mle:[92m -1.126728994505746 [0m(-0.012751381737845291)
     | > avg_loss_dur:[92m 0.04268843180366925 [0m(-0.0004020359899316525)


[4m[1m > EPOCH: 66/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:10:43) [0m

[1m   --> TIME: 2025-05-06 11:12:00 -- STEP: 147/322 -- GLOBAL_STEP: 53400[0m
     | > loss: -1.0895105600357056  (-1.09961469643781)
     | > log_mle: -1.1331263780593872  (-1.1390341650061062)
     | > loss_dur: 0.04361581429839134  (0.03941947285111259)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(129.4985, device='cuda:0')  (tensor(123.5092, device='cuda:0'))
     | > current_lr: 4.125e-05 
     | > step_time: 0.6574  (0.47864595724611864)
     | > loader_time: 0.0144  (0.03682530980531861)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.630044378553118 [0m(-0.011801528930664107)
     | > avg_loss:[91m -1.022838916097368 [0m(+0.06120163713182736)
     | > avg_log_mle:[91m -1.0666827269962855 [0m(+0.060046267509460405)
     | > avg_loss_dur:[91m 0.04384380547063691 [0m(+0.0011553736669676629)


[4m[1m > EPOCH: 67/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:16:54) [0m

[1m   --> TIME: 2025-05-06 11:17:55 -- STEP: 125/322 -- GLOBAL_STEP: 53700[0m
     | > loss: -1.1045000553131104  (-1.099689873695374)
     | > log_mle: -1.1484676599502563  (-1.1383190813064572)
     | > loss_dur: 0.043967556208372116  (0.03862920854985715)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(100.5119, device='cuda:0')  (tensor(122.8331, device='cuda:0'))
     | > current_lr: 4.15e-05 
     | > step_time: 0.5868  (0.44999342918396)
     | > loader_time: 0.0147  (0.021693756103515625)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6003529412405831 [0m(-0.029691437312534874)
     | > avg_loss:[92m -1.0780864136559625 [0m(-0.05524749755859437)
     | > avg_log_mle:[92m -1.1198863574436733 [0m(-0.05320363044738774)
     | > avg_loss_dur:[92m 0.041799956134387424 [0m(-0.002043849336249487)


[4m[1m > EPOCH: 68/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:22:56) [0m

[1m   --> TIME: 2025-05-06 11:23:40 -- STEP: 103/322 -- GLOBAL_STEP: 54000[0m
     | > loss: -1.1096670627593994  (-1.100228845494464)
     | > log_mle: -1.153192400932312  (-1.1379921112245728)
     | > loss_dur: 0.04352528974413872  (0.03776326686939567)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(143.7167, device='cuda:0')  (tensor(112.6664, device='cuda:0'))
     | > current_lr: 4.1750000000000005e-05 
     | > step_time: 0.4958  (0.4112399133663733)
     | > loader_time: 0.0079  (0.007218393307287716)


 > CHECKPOIN




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5836386340005058 [0m(-0.016714307240077386)
     | > avg_loss:[91m -1.0766549893787927 [0m(+0.0014314242771698016)
     | > avg_log_mle:[92m -1.1204870734895982 [0m(-0.0006007160459249317)
     | > avg_loss_dur:[91m 0.04383208208850452 [0m(+0.0020321259541170936)


[4m[1m > EPOCH: 69/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:28:48) [0m

[1m   --> TIME: 2025-05-06 11:29:20 -- STEP: 81/322 -- GLOBAL_STEP: 54300[0m
     | > loss: -1.0789222717285156  (-1.103093466640991)
     | > log_mle: -1.1212810277938843  (-1.1401036153604955)
     | > loss_dur: 0.04235881567001343  (0.03701015290470771)
     | > amp_scaler: 512.0  (884.9382716049382)
     | > grad_norm: tensor(148.1113, device='cuda:0')  (tensor(122.8300, device='cuda:0'))
     | > current_lr: 4.2e-05 
     | > step_time: 0.4593  (0.38197098837958443)
     | > loader_time: 0.0094  (0.006349360501324689)


[1m > EVALU




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5798771449497768 [0m(-0.003761489050728928)
     | > avg_loss:[91m -1.0503654411860877 [0m(+0.026289548192705015)
     | > avg_log_mle:[91m -1.0934503044400896 [0m(+0.02703676904950858)
     | > avg_loss_dur:[92m 0.043084862189633505 [0m(-0.0007472198988710127)


[4m[1m > EPOCH: 70/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:34:41) [0m

[1m   --> TIME: 2025-05-06 11:35:04 -- STEP: 59/322 -- GLOBAL_STEP: 54600[0m
     | > loss: -1.0647757053375244  (-1.113022379956003)
     | > log_mle: -1.1074014902114868  (-1.1492698212801402)
     | > loss_dur: 0.042625755071640015  (0.03624743649388774)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(79.6239, device='cuda:0')  (tensor(116.8798, device='cuda:0'))
     | > current_lr: 4.2250000000000004e-05 
     | > step_time: 0.4424  (0.3559401641457767)
     | > loader_time: 0.0057  (0.005379911196433892)


[1m > EVALU




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.584014333997454 [0m(+0.004137189047677126)
     | > avg_loss:[92m -1.078265254838126 [0m(-0.027899813652038308)
     | > avg_log_mle:[92m -1.122255035809108 [0m(-0.028804731369018288)
     | > avg_loss_dur:[91m 0.043989781396729606 [0m(+0.0009049192070961012)


[4m[1m > EPOCH: 71/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:40:30) [0m

[1m   --> TIME: 2025-05-06 11:40:44 -- STEP: 37/322 -- GLOBAL_STEP: 54900[0m
     | > loss: -1.1093398332595825  (-1.1175846441372022)
     | > log_mle: -1.1465216875076294  (-1.1522973840301098)
     | > loss_dur: 0.03718182072043419  (0.03471274502776765)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(145.5528, device='cuda:0')  (tensor(122.2065, device='cuda:0'))
     | > current_lr: 4.25e-05 
     | > step_time: 0.3651  (0.3315392184901882)
     | > loader_time: 0.0061  (0.0050864348540434964)


 > CHECKPOINT : ckpts/run-




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6204032216753278 [0m(+0.03638888767787385)
     | > avg_loss:[92m -1.0877026081085206 [0m(-0.009437353270394633)
     | > avg_log_mle:[92m -1.1315066746303013 [0m(-0.009251638821193398)
     | > avg_loss_dur:[92m 0.043804071205002924 [0m(-0.0001857101917266818)


[4m[1m > EPOCH: 72/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:46:23) [0m

[1m   --> TIME: 2025-05-06 11:46:29 -- STEP: 15/322 -- GLOBAL_STEP: 55200[0m
     | > loss: -1.146506428718567  (-1.144758447011312)
     | > log_mle: -1.178824782371521  (-1.1774543364842733)
     | > loss_dur: 0.03231839835643768  (0.03269587258497873)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(148.8758, device='cuda:0')  (tensor(128.4962, device='cuda:0'))
     | > current_lr: 4.2749999999999996e-05 
     | > step_time: 0.2877  (0.29558027585347496)
     | > loader_time: 0.0058  (0.004355891545613607)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6180877889905657 [0m(-0.0023154326847620865)
     | > avg_loss:[92m -1.1113810777664181 [0m(-0.02367846965789755)
     | > avg_log_mle:[92m -1.1554494891847884 [0m(-0.023942814554487102)
     | > avg_loss_dur:[91m 0.0440684155694076 [0m(+0.0002643443644046728)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_55507.pth

[4m[1m > EPOCH: 73/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:52:23) [0m

[1m   --> TIME: 2025-05-06 11:56:42 -- STEP: 293/322 -- GLOBAL_STEP: 55800[0m
     | > loss: -1.0991705656051636  (-1.102348075791838)
     | > log_mle: -1.1455157995224  (-1.1439431922020773)
     | > loss_dur: 0.04634522274136543  (0.04159511381652167)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(102.5238, device='cuda:0')  (tensor(123.3335, device='cuda:0'))
     | > current_lr: 4.3e-05 
     | > step_time: 1.3457  (0.7252846415132383)
     | > l




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.591800492150443 [0m(-0.026287296840122765)
     | > avg_loss:[92m -1.1161718811307635 [0m(-0.004790803364345342)
     | > avg_log_mle:[92m -1.1582050153187342 [0m(-0.002755526133945807)
     | > avg_loss_dur:[92m 0.04203313961625099 [0m(-0.0020352759531566053)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_55829.pth

[4m[1m > EPOCH: 74/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 11:58:19) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_56000.pth





[1m   --> TIME: 2025-05-06 12:02:08 -- STEP: 271/322 -- GLOBAL_STEP: 56100[0m
     | > loss: -1.0986944437026978  (-1.1047991668166266)
     | > log_mle: -1.1431185007095337  (-1.1460229079221889)
     | > loss_dur: 0.044424016028642654  (0.04122373995773248)
     | > amp_scaler: 512.0  (512.0)
     | > grad_norm: tensor(130.0506, device='cuda:0')  (tensor(117.1445, device='cuda:0'))
     | > current_lr: 4.3249999999999994e-05 
     | > step_time: 1.287  (0.6843661091864327)
     | > loader_time: 1.0692  (0.12280986493803918)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.594234595979963 [0m(+0.002434103829520029)
     | > avg_loss:[91m -1.1046998364584786 [0m(+0.011472044672284909)
     | > avg_log_mle:[91m -1.147123326574053 [0m(+0.011081688744681184)
     | > avg_loss_dur:[91m 0.04242348660315786 [0m(+0.0003903469869068685)


[4m[1m > EPOCH: 75/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:04:19) [0m

[1m   --> TIME: 2025-05-06 12:07:30 -- STEP: 249/322 -- GLOBAL_STEP: 56400[0m
     | > loss: -1.1036208868026733  (-1.1027668020332682)
     | > log_mle: -1.1507784128189087  (-1.1434719940744738)
     | > loss_dur: 0.04715757817029953  (0.040705192976268435)
     | > amp_scaler: 1024.0  (762.8594377510041)
     | > grad_norm: tensor(102.4069, device='cuda:0')  (tensor(117.9642, device='cuda:0'))
     | > current_lr: 4.35e-05 
     | > step_time: 1.1683  (0.6568610218155335)
     | > loader_time: 0.0339  (0.10238298546358282)


[1m > EVALUA




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5872718470437186 [0m(-0.006962748936244378)
     | > avg_loss:[91m -1.0866771391459873 [0m(+0.018022697312491287)
     | > avg_log_mle:[91m -1.1295783690043857 [0m(+0.01754495756966734)
     | > avg_loss_dur:[91m 0.04290121591516904 [0m(+0.00047772931201117697)


[4m[1m > EPOCH: 76/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:10:15) [0m

[1m   --> TIME: 2025-05-06 12:12:55 -- STEP: 227/322 -- GLOBAL_STEP: 56700[0m
     | > loss: -1.1100082397460938  (-1.1048523602506657)
     | > log_mle: -1.153229832649231  (-1.1455684406641817)
     | > loss_dur: 0.043221596628427505  (0.040716079757077035)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(136.1141, device='cuda:0')  (tensor(118.3999, device='cuda:0'))
     | > current_lr: 4.375e-05 
     | > step_time: 0.9778  (0.5904057183454738)
     | > loader_time: 0.9356  (0.10267535600368148)


[1m > EVALUATION [0




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5902826990400042 [0m(+0.0030108519962855906)
     | > avg_loss:[92m -1.0971928460257394 [0m(-0.010515706879752118)
     | > avg_log_mle:[92m -1.1395956107548302 [0m(-0.010017241750444494)
     | > avg_loss_dur:[92m 0.042402767496449605 [0m(-0.0004984484187194327)


[4m[1m > EPOCH: 77/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:16:08) [0m

[1m   --> TIME: 2025-05-06 12:18:20 -- STEP: 205/322 -- GLOBAL_STEP: 57000[0m
     | > loss: -1.111443042755127  (-1.109524418668049)
     | > log_mle: -1.1545387506484985  (-1.1495349703765496)
     | > loss_dur: 0.04309576004743576  (0.04001055059999954)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(107.4548, device='cuda:0')  (tensor(114.1700, device='cuda:0'))
     | > current_lr: 4.4e-05 
     | > step_time: 0.8867  (0.5526719500378866)
     | > loader_time: 0.521  (0.07979205294353206)


 > CHECKPOINT : ckpts/run




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5967695440564836 [0m(+0.006486845016479448)
     | > avg_loss:[91m -1.071994464738028 [0m(+0.025198381287711458)
     | > avg_log_mle:[91m -1.113514058930533 [0m(+0.026081551824297167)
     | > avg_loss_dur:[92m 0.041519594298941746 [0m(-0.0008831731975078583)


[4m[1m > EPOCH: 78/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:22:04) [0m

[1m   --> TIME: 2025-05-06 12:23:51 -- STEP: 183/322 -- GLOBAL_STEP: 57300[0m
     | > loss: -1.1247351169586182  (-1.1152211069409304)
     | > log_mle: -1.1694897413253784  (-1.1547307505633664)
     | > loss_dur: 0.04475466534495354  (0.03950964196335749)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(95.4904, device='cuda:0')  (tensor(113.3559, device='cuda:0'))
     | > current_lr: 4.425e-05 
     | > step_time: 0.8294  (0.5186494321771008)
     | > loader_time: 0.3908  (0.05686140842125064)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6064684391021729 [0m(+0.009698895045689215)
     | > avg_loss:[92m -1.0868579455784388 [0m(-0.014863480840410892)
     | > avg_log_mle:[92m -1.1287394114903042 [0m(-0.0152253525597712)
     | > avg_loss_dur:[91m 0.04188146644404957 [0m(+0.0003618721451078208)


[4m[1m > EPOCH: 79/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:28:01) [0m

[1m   --> TIME: 2025-05-06 12:29:28 -- STEP: 161/322 -- GLOBAL_STEP: 57600[0m
     | > loss: -1.099107265472412  (-1.118526155163783)
     | > log_mle: -1.1429709196090698  (-1.1573900384192144)
     | > loss_dur: 0.043863605707883835  (0.03886388143906312)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(101.1559, device='cuda:0')  (tensor(112.0041, device='cuda:0'))
     | > current_lr: 4.4500000000000004e-05 
     | > step_time: 0.7006  (0.4923856524947267)
     | > loader_time: 0.0098  (0.03919343178316672)


[1m > EVALU




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5949258872440882 [0m(-0.011542551858084615)
     | > avg_loss:[92m -1.1236983639853342 [0m(-0.03684041840689534)
     | > avg_log_mle:[92m -1.1664880241666522 [0m(-0.03774861267634799)
     | > avg_loss_dur:[91m 0.042789652411426814 [0m(+0.0009081859673772466)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_57761.pth

[4m[1m > EPOCH: 80/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:33:58) [0m

[1m   --> TIME: 2025-05-06 12:35:07 -- STEP: 139/322 -- GLOBAL_STEP: 57900[0m
     | > loss: -1.1056580543518066  (-1.1174079008239632)
     | > log_mle: -1.1460002660751343  (-1.1555838542018866)
     | > loss_dur: 0.040342241525650024  (0.038175951461676204)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(110.2596, device='cuda:0')  (tensor(113.8764, device='cuda:0'))
     | > current_lr: 4.475e-05 
     | > step_time: 0.6564  (0.46340710139103075)




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5999999250684466 [0m(+0.005074037824358335)
     | > avg_loss:[92m -1.1243788616997854 [0m(-0.0006804977144512225)
     | > avg_log_mle:[91m -1.1661171197891236 [0m(+0.0003709043775286247)
     | > avg_loss_dur:[92m 0.0417382547897952 [0m(-0.0010513976216316168)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_58083.pth

[4m[1m > EPOCH: 81/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:39:55) [0m

[1m   --> TIME: 2025-05-06 12:40:49 -- STEP: 117/322 -- GLOBAL_STEP: 58200[0m
     | > loss: -1.1241395473480225  (-1.124107541182103)
     | > log_mle: -1.1663917303085327  (-1.1618358589645124)
     | > loss_dur: 0.04225224256515503  (0.037728316524726704)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(123.2231, device='cuda:0')  (tensor(110.3900, device='cuda:0'))
     | > current_lr: 4.5e-05 
     | > step_time: 0.5317  (0.4367149018833779)
  




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6066508974347796 [0m(+0.006650972366333052)
     | > avg_loss:[91m -1.112109913144793 [0m(+0.012268948554992498)
     | > avg_log_mle:[91m -1.1555938482284547 [0m(+0.010523271560668901)
     | > avg_loss_dur:[91m 0.04348394200205803 [0m(+0.001745687212262835)


[4m[1m > EPOCH: 82/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:45:54) [0m

[1m   --> TIME: 2025-05-06 12:46:35 -- STEP: 95/322 -- GLOBAL_STEP: 58500[0m
     | > loss: -1.1392955780029297  (-1.1223487590488623)
     | > log_mle: -1.181235671043396  (-1.1588920028586134)
     | > loss_dur: 0.041940100491046906  (0.03654324573121572)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(116.8030, device='cuda:0')  (tensor(113.3527, device='cuda:0'))
     | > current_lr: 4.5249999999999995e-05 
     | > step_time: 0.5459  (0.4071006373355263)
     | > loader_time: 0.0079  (0.007015471709401985)


[1m > EVAL




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6241302626473563 [0m(+0.017479365212576647)
     | > avg_loss:[92m -1.121459994997297 [0m(-0.009350081852504077)
     | > avg_log_mle:[92m -1.1637148550578524 [0m(-0.008121006829397759)
     | > avg_loss_dur:[92m 0.042254866872515 [0m(-0.0012290751295430322)


[4m[1m > EPOCH: 83/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:51:54) [0m

[1m   --> TIME: 2025-05-06 12:52:25 -- STEP: 73/322 -- GLOBAL_STEP: 58800[0m
     | > loss: -1.1002382040023804  (-1.1273127516655075)
     | > log_mle: -1.1394292116165161  (-1.1627657821733655)
     | > loss_dur: 0.039190974086523056  (0.03545302731839761)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(123.0221, device='cuda:0')  (tensor(102.0712, device='cuda:0'))
     | > current_lr: 4.55e-05 
     | > step_time: 0.4401  (0.38839773609213635)
     | > loader_time: 0.0069  (0.006347065102564146)


 > CHECKPOINT : ckpts/ru




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6356167180197579 [0m(+0.011486455372401672)
     | > avg_loss:[92m -1.1246436085019793 [0m(-0.0031836135046823255)
     | > avg_log_mle:[92m -1.1674860852105278 [0m(-0.003771230152675331)
     | > avg_loss_dur:[91m 0.04284247638923781 [0m(+0.0005876095167228132)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_59049.pth

[4m[1m > EPOCH: 84/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 12:58:08) [0m

[1m   --> TIME: 2025-05-06 12:58:29 -- STEP: 51/322 -- GLOBAL_STEP: 59100[0m
     | > loss: -1.1475712060928345  (-1.1310321653590483)
     | > log_mle: -1.185583233833313  (-1.1656007790098009)
     | > loss_dur: 0.03801202028989792  (0.034568612555078426)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(108.6127, device='cuda:0')  (tensor(104.8169, device='cuda:0'))
     | > current_lr: 4.575e-05 
     | > step_time: 0.4245  (0.36463508886449475)





  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6121072428567069 [0m(-0.02350947516305102)
     | > avg_loss:[91m -1.0897356407982963 [0m(+0.034907967703682985)
     | > avg_log_mle:[91m -1.132781250136239 [0m(+0.034704835074288676)
     | > avg_loss_dur:[91m 0.04304561104093279 [0m(+0.0002031346516949764)


[4m[1m > EPOCH: 85/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:04:12) [0m

[1m   --> TIME: 2025-05-06 13:04:23 -- STEP: 29/322 -- GLOBAL_STEP: 59400[0m
     | > loss: -1.0946427583694458  (-1.148696282814289)
     | > log_mle: -1.124940037727356  (-1.181136332709214)
     | > loss_dur: 0.03029731474816799  (0.03244005451942312)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(120.8751, device='cuda:0')  (tensor(122.8714, device='cuda:0'))
     | > current_lr: 4.6e-05 
     | > step_time: 0.3486  (0.32567233874880036)
     | > loader_time: 0.005  (0.005373806789003569)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6498023101261684 [0m(+0.03769506726946148)
     | > avg_loss:[91m -1.0894684042249407 [0m(+0.00026723657335558926)
     | > avg_log_mle:[92m -1.132841781207493 [0m(-6.0531071253810964e-05)
     | > avg_loss_dur:[91m 0.043373379643474305 [0m(+0.0003277686025415158)


[4m[1m > EPOCH: 86/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:10:17) [0m

[1m   --> TIME: 2025-05-06 13:10:20 -- STEP: 7/322 -- GLOBAL_STEP: 59700[0m
     | > loss: -1.1806881427764893  (-1.2034115621021815)
     | > log_mle: -1.2105516195297241  (-1.2345741306032454)
     | > loss_dur: 0.029863445088267326  (0.031162583668317114)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(100.0680, device='cuda:0')  (tensor(157.9827, device='cuda:0'))
     | > current_lr: 4.625e-05 
     | > step_time: 0.2612  (0.30645247868129183)
     | > loader_time: 0.0039  (0.003998347691127232)


[1m   --> TIME: 




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6101945195879255 [0m(-0.039607790538242926)
     | > avg_loss:[92m -1.0957775047847202 [0m(-0.0063091005597795125)
     | > avg_log_mle:[92m -1.137380725996835 [0m(-0.004538944789342025)
     | > avg_loss_dur:[92m 0.04160322376659938 [0m(-0.0017701558768749223)


[4m[1m > EPOCH: 87/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:16:18) [0m

[1m   --> TIME: 2025-05-06 13:20:25 -- STEP: 285/322 -- GLOBAL_STEP: 60300[0m
     | > loss: -1.1292680501937866  (-1.1232645921539837)
     | > log_mle: -1.1762405633926392  (-1.1640397067655606)
     | > loss_dur: 0.04697248712182045  (0.04077511549388109)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(89.0863, device='cuda:0')  (tensor(113.9771, device='cuda:0'))
     | > current_lr: 4.6500000000000005e-05 
     | > step_time: 1.3107  (0.7143426568884599)
     | > loader_time: 0.0154  (0.14434856866535392)


[1m > EVA




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6015753405434745 [0m(-0.008619179044450975)
     | > avg_loss:[91m -1.0816553694861275 [0m(+0.014122135298592742)
     | > avg_log_mle:[91m -1.1239177124840873 [0m(+0.01346301351274759)
     | > avg_loss_dur:[91m 0.042262344700949535 [0m(+0.000659120934350152)


[4m[1m > EPOCH: 88/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:22:18) [0m

[1m   --> TIME: 2025-05-06 13:26:00 -- STEP: 263/322 -- GLOBAL_STEP: 60600[0m
     | > loss: -1.1474634408950806  (-1.1290731955843734)
     | > log_mle: -1.1922794580459595  (-1.169393181800843)
     | > loss_dur: 0.044815998524427414  (0.040319986917616735)
     | > amp_scaler: 1024.0  (1366.6311787072236)
     | > grad_norm: tensor(115.4266, device='cuda:0')  (tensor(112.5159, device='cuda:0'))
     | > current_lr: 4.675e-05 
     | > step_time: 1.2212  (0.6714006675966794)
     | > loader_time: 0.0452  (0.16263560378506153)


[1m > EVA




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6422649792262485 [0m(+0.040689638682773976)
     | > avg_loss:[92m -1.0883414711271013 [0m(-0.006686101640973829)
     | > avg_log_mle:[92m -1.1316171748297552 [0m(-0.0076994623456678735)
     | > avg_loss_dur:[91m 0.04327570700219699 [0m(+0.0010133623012474544)


[4m[1m > EPOCH: 89/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:28:31) [0m

[1m   --> TIME: 2025-05-06 13:31:38 -- STEP: 241/322 -- GLOBAL_STEP: 60900[0m
     | > loss: -1.1403154134750366  (-1.1268420986120125)
     | > log_mle: -1.1839786767959595  (-1.1668556354847193)
     | > loss_dur: 0.043663233518600464  (0.04001353548151823)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(92.7035, device='cuda:0')  (tensor(112.9371, device='cuda:0'))
     | > current_lr: 4.7000000000000004e-05 
     | > step_time: 1.1363  (0.6369858767481757)
     | > loader_time: 0.0667  (0.1303171775153069)


[1m > EV




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.5876756872449602 [0m(-0.05458929198128826)
     | > avg_loss:[92m -1.1434722866330826 [0m(-0.05513081550598131)
     | > avg_log_mle:[92m -1.1865766899926322 [0m(-0.054959515162877004)
     | > avg_loss_dur:[92m 0.04310439931494849 [0m(-0.0001713076872484967)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_60981.pth

[4m[1m > EPOCH: 90/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:34:39) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_61000.pth





[1m   --> TIME: 2025-05-06 13:37:09 -- STEP: 219/322 -- GLOBAL_STEP: 61200[0m
     | > loss: -1.1272518634796143  (-1.1332079188464435)
     | > log_mle: -1.1698960065841675  (-1.1727022172109185)
     | > loss_dur: 0.042644161731004715  (0.03949429790519142)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(101.5290, device='cuda:0')  (tensor(112.5002, device='cuda:0'))
     | > current_lr: 4.7249999999999997e-05 
     | > step_time: 0.9591  (0.5788964582904834)
     | > loader_time: 0.0147  (0.08529602882524605)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.5984415735517229 [0m(+0.010765886306762673)
     | > avg_loss:[91m -1.1053328037261962 [0m(+0.0381394829068864)
     | > avg_log_mle:[91m -1.1482873746326994 [0m(+0.03828931535993285)
     | > avg_loss_dur:[92m 0.04295455749545778 [0m(-0.00014984181949070918)


[4m[1m > EPOCH: 91/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:40:32) [0m

[1m   --> TIME: 2025-05-06 13:42:35 -- STEP: 197/322 -- GLOBAL_STEP: 61500[0m
     | > loss: -1.1392103433609009  (-1.1301552287213081)
     | > log_mle: -1.181235909461975  (-1.169283251471931)
     | > loss_dur: 0.0420256182551384  (0.039128017597619046)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(79.5915, device='cuda:0')  (tensor(114.0710, device='cuda:0'))
     | > current_lr: 4.75e-05 
     | > step_time: 0.7919  (0.5406703622208024)
     | > loader_time: 0.0238  (0.07433992473002013)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6821115016937255 [0m(+0.08366992814200258)
     | > avg_loss:[91m -1.1040393420628136 [0m(+0.0012934616633826224)
     | > avg_log_mle:[91m -1.1481092282703944 [0m(+0.00017814636230495395)
     | > avg_loss_dur:[91m 0.04406989408390863 [0m(+0.001115336588450845)


[4m[1m > EPOCH: 92/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:46:37) [0m

[1m   --> TIME: 2025-05-06 13:48:18 -- STEP: 175/322 -- GLOBAL_STEP: 61800[0m
     | > loss: -1.1591256856918335  (-1.1340029403141563)
     | > log_mle: -1.2012580633163452  (-1.1726143244334624)
     | > loss_dur: 0.04213234409689903  (0.03861138475792746)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(117.9311, device='cuda:0')  (tensor(103.7452, device='cuda:0'))
     | > current_lr: 4.775e-05 
     | > step_time: 0.7679  (0.5114135156358993)
     | > loader_time: 0.0182  (0.05451223509652274)


[1m > EVALUATION [0




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6362210886819022 [0m(-0.04589041301182328)
     | > avg_loss:[92m -1.105052467754909 [0m(-0.001013125692095329)
     | > avg_log_mle:[92m -1.1491398709160945 [0m(-0.001030642645700075)
     | > avg_loss_dur:[91m 0.044087404651301246 [0m(+1.751056739261736e-05)


[4m[1m > EPOCH: 93/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:52:38) [0m

 > CHECKPOINT : ckpts/run-May-02-2025_07+04PM-0000000/checkpoint_62000.pth





[1m   --> TIME: 2025-05-06 13:54:01 -- STEP: 153/322 -- GLOBAL_STEP: 62100[0m
     | > loss: -1.1355855464935303  (-1.1427445925918283)
     | > log_mle: -1.1759289503097534  (-1.180484069718255)
     | > loss_dur: 0.04034344479441643  (0.03773948091258802)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(110.5282, device='cuda:0')  (tensor(106.0709, device='cuda:0'))
     | > current_lr: 4.8e-05 
     | > step_time: 0.6531  (0.4856935684977014)
     | > loader_time: 0.0091  (0.03578511250564475)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.623137698854719 [0m(-0.013083389827183156)
     | > avg_loss:[92m -1.1098321982792445 [0m(-0.004779730524335557)
     | > avg_log_mle:[92m -1.152946298463004 [0m(-0.0038064275469094078)
     | > avg_loss_dur:[92m 0.0431141058249133 [0m(-0.0009732988263879486)


[4m[1m > EPOCH: 94/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 13:58:49) [0m

[1m   --> TIME: 2025-05-06 13:59:53 -- STEP: 131/322 -- GLOBAL_STEP: 62400[0m
     | > loss: -1.1256213188171387  (-1.1439779214276617)
     | > log_mle: -1.1661452054977417  (-1.181025425896389)
     | > loss_dur: 0.040523894131183624  (0.037047505535128464)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(109.2576, device='cuda:0')  (tensor(113.3321, device='cuda:0'))
     | > current_lr: 4.825e-05 
     | > step_time: 0.5562  (0.4575151268762487)
     | > loader_time: 0.0084  (0.012117991920645914)


[1m > EVALUATION [0m




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6535340649741036 [0m(+0.030396366119384566)
     | > avg_loss:[92m -1.1247375420161656 [0m(-0.014905343736921095)
     | > avg_log_mle:[92m -1.1678810153688706 [0m(-0.014934716905866674)
     | > avg_loss_dur:[91m 0.043143475907189505 [0m(+2.9370082276207465e-05)


[4m[1m > EPOCH: 95/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 14:04:53) [0m

[1m   --> TIME: 2025-05-06 14:05:43 -- STEP: 109/322 -- GLOBAL_STEP: 62700[0m
     | > loss: -1.1401246786117554  (-1.138508925744153)
     | > log_mle: -1.179003119468689  (-1.1750868427644081)
     | > loss_dur: 0.03887838497757912  (0.03657791676392796)
     | > amp_scaler: 1024.0  (1822.532110091743)
     | > grad_norm: tensor(95.7008, device='cuda:0')  (tensor(109.4592, device='cuda:0'))
     | > current_lr: 4.85e-05 
     | > step_time: 0.5143  (0.4289920067568438)
     | > loader_time: 0.015  (0.01199095839754157)


[1m > EVALUAT




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.601610210963658 [0m(-0.051923854010445636)
     | > avg_loss:[91m -1.1049222230911255 [0m(+0.01981531892504007)
     | > avg_log_mle:[91m -1.1484807116644724 [0m(+0.01940030370439816)
     | > avg_loss_dur:[91m 0.04355849219220025 [0m(+0.00041501628501074833)


[4m[1m > EPOCH: 96/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 14:10:59) [0m

[1m   --> TIME: 2025-05-06 14:11:36 -- STEP: 87/322 -- GLOBAL_STEP: 63000[0m
     | > loss: -1.1718652248382568  (-1.1491154226763491)
     | > log_mle: -1.2152196168899536  (-1.1840861816515866)
     | > loss_dur: 0.043354425579309464  (0.03497074927663665)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(91.8763, device='cuda:0')  (tensor(101.2807, device='cuda:0'))
     | > current_lr: 4.875e-05 
     | > step_time: 0.5154  (0.39990969361930057)
     | > loader_time: 0.0113  (0.006311970195551029)


 > CHECKPOINT : ckpts/r




[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.6289747646876744 [0m(+0.02736455372401647)
     | > avg_loss:[91m -1.097258196558271 [0m(+0.007664026532854473)
     | > avg_log_mle:[91m -1.1411592040743144 [0m(+0.007321507590158038)
     | > avg_loss_dur:[91m 0.04390099335994039 [0m(+0.0003425011677401363)


[4m[1m > EPOCH: 97/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 14:17:09) [0m

[1m   --> TIME: 2025-05-06 14:17:36 -- STEP: 65/322 -- GLOBAL_STEP: 63300[0m
     | > loss: -1.1423840522766113  (-1.1558613758820755)
     | > log_mle: -1.178001046180725  (-1.190297792508052)
     | > loss_dur: 0.03561696037650108  (0.03443641370305647)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(100.1049, device='cuda:0')  (tensor(100.9727, device='cuda:0'))
     | > current_lr: 4.9e-05 
     | > step_time: 0.5211  (0.38863581877488357)
     | > loader_time: 0.0074  (0.006096076965332031)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6250875200544086 [0m(-0.0038872446332658406)
     | > avg_loss:[91m -1.0883476461683002 [0m(+0.008910550389970906)
     | > avg_log_mle:[91m -1.130781926427569 [0m(+0.010377277646745453)
     | > avg_loss_dur:[92m 0.04243428143007415 [0m(-0.001466711929866242)


[4m[1m > EPOCH: 98/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 14:23:17) [0m

[1m   --> TIME: 2025-05-06 14:23:33 -- STEP: 43/322 -- GLOBAL_STEP: 63600[0m
     | > loss: -1.1114286184310913  (-1.1603197025698284)
     | > log_mle: -1.1410106420516968  (-1.1932504260262775)
     | > loss_dur: 0.029582081362605095  (0.03293072640202767)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(85.7976, device='cuda:0')  (tensor(114.7148, device='cuda:0'))
     | > current_lr: 4.925e-05 
     | > step_time: 0.3796  (0.347862859104955)
     | > loader_time: 0.0053  (0.005383474882258925)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6160542283739362 [0m(-0.0090332916804724)
     | > avg_loss:[92m -1.1219335181372507 [0m(-0.033585871968950576)
     | > avg_log_mle:[92m -1.1645064932959421 [0m(-0.03372456686837322)
     | > avg_loss_dur:[91m 0.04257296760167394 [0m(+0.00013868617159979585)


[4m[1m > EPOCH: 99/100[0m
 --> ckpts/run-May-02-2025_07+04PM-0000000

[1m > TRAINING (2025-05-06 14:29:27) [0m

[1m   --> TIME: 2025-05-06 14:29:35 -- STEP: 21/322 -- GLOBAL_STEP: 63900[0m
     | > loss: -1.2160719633102417  (-1.1870399940581549)
     | > log_mle: -1.2480295896530151  (-1.2181699219204134)
     | > loss_dur: 0.03195766359567642  (0.031129922274322736)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(135.0472, device='cuda:0')  (tensor(120.0423, device='cuda:0'))
     | > current_lr: 4.95e-05 
     | > step_time: 0.3248  (0.31408123742966426)
     | > loader_time: 0.0044  (0.004940714154924665)


 > CHECKPOINT : ckpts/r




[1m   --> TIME: 2025-05-06 14:34:50 -- STEP: 321/322 -- GLOBAL_STEP: 64200[0m
     | > loss: -1.139192819595337  (-1.1461634561651597)
     | > log_mle: -1.1896342039108276  (-1.1868862626708554)
     | > loss_dur: 0.05044136941432953  (0.040722804967998696)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(93.2843, device='cuda:0')  (tensor(114.0992, device='cuda:0'))
     | > current_lr: 4.95e-05 
     | > step_time: 1.5738  (0.7975553745792661)
     | > loader_time: 0.0148  (0.18739397280684145)


[1m > EVALUATION [0m






  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.6140463147844587 [0m(-0.0020079135894774947)
     | > avg_loss:[92m -1.1667222261428833 [0m(-0.044788708005632616)
     | > avg_log_mle:[92m -1.2084908519472395 [0m(-0.04398435865129735)
     | > avg_loss_dur:[92m 0.04176862952964647 [0m(-0.0008043380720274759)

 > BEST MODEL : ckpts/run-May-02-2025_07+04PM-0000000/best_model_64201.pth


In [14]:
!echo abc

abc


# Save and Load Model

## Save Model

In [35]:
import os
import torch
import json
from pathlib import Path
from TTS.tts.configs.shared_configs import CharactersConfig

def save_model_elements(model, config, tokenizer, ap, save_dir):
    """
    Save all elements of the GlowTTS model, including state, config, tokenizer, and audio processor.
    
    Args:
        model: The GlowTTS model instance.
        config: The GlowTTSConfig configuration object.
        tokenizer: The TTSTokenizer instance.
        ap: The AudioProcessor instance.
        save_dir: Directory to save the model elements.
    """
    # Create save directory if it doesn't exist
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    
    # 1. Save model state dictionary
    model_path = save_dir / "model_state.pth"
    checkpoint = {
        "model": model.state_dict(),  # Wrap state_dict in 'model' key for TTS.api
        "step": getattr(model, "global_step", 32000),  # From training log
    }
    torch.save(checkpoint, model_path)
    print(f"Model state saved to {model_path}")
    
    # 2. Save configuration
    config_path = save_dir / "config.json"
    config_dict = {
        "batch_size": config.batch_size,
        "eval_batch_size": config.eval_batch_size,
        "num_loader_workers": config.num_loader_workers,
        "num_eval_loader_workers": config.num_eval_loader_workers,
        "run_eval": config.run_eval,
        "test_delay_epochs": config.test_delay_epochs,
        "epochs": config.epochs,
        "text_cleaner": config.text_cleaner,
        "use_phonemes": config.use_phonemes,
        "phoneme_cache_path": config.phoneme_cache_path,
        "print_step": config.print_step,
        "print_eval": config.print_eval,
        "mixed_precision": config.mixed_precision,
        "output_path": config.output_path,
        "save_step": config.save_step,
        "eval_split_max_size": config.eval_split_max_size,
        "eval_split_size": config.eval_split_size,
        "save_n_checkpoints": config.save_n_checkpoints,
        "test_sentences": config.test_sentences,
        "characters": {
            "pad": config.characters.pad,
            "eos": config.characters.eos,
            "bos": config.characters.bos,
            "blank": config.characters.blank,
            "characters": config.characters.characters,
            "punctuations": config.characters.punctuations,
            "phonemes": getattr(config.characters, "phonemes", None),
            "characters_class": config.characters.characters_class
        }
    }
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config_dict, f, ensure_ascii=False, indent=2)
    print(f"Configuration saved to {config_path}")
    
    # 3. Save tokenizer
    tokenizer_path = save_dir / "tokenizer.json"
    tokenizer_config = {
        "characters": {
            "pad": tokenizer.characters.pad,
            "eos": tokenizer.characters.eos,
            "bos": tokenizer.characters.bos,
            "blank": tokenizer.characters.blank,
            "characters": tokenizer.characters.characters,
            "punctuations": tokenizer.characters.punctuations,
            "phonemes": getattr(tokenizer.characters, "phonemes", None)
        },
        "pad_token_id": getattr(tokenizer, "pad_token_id", None),
        "eos_token_id": getattr(tokenizer, "eos_token_id", None),
        "bos_token_id": getattr(tokenizer, "bos_token_id", None)
    }
    with open(tokenizer_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
    print(f"Tokenizer saved to {tokenizer_path}")
    
    # 4. Save audio processor settings
    ap_path = save_dir / "audio_processor.json"
    ap_config = {
        "sample_rate": ap.sample_rate,
        "num_mels": ap.num_mels,
        "min_level_db": getattr(ap, "min_level_db", -100),
        "frame_length_ms": getattr(ap, "frame_length_ms", 50),
        "frame_shift_ms": getattr(ap, "frame_shift_ms", 12.5),
        "preemphasis": getattr(ap, "preemphasis", 0.97),
        "ref_level_db": getattr(ap, "ref_level_db", 20),
        "power": getattr(ap, "power", 1.5),
        "griffin_lim_iters": getattr(ap, "griffin_lim_iters", 60),
        "n_fft": getattr(ap, "n_fft", 1024),
        "hop_length": getattr(ap, "hop_length", 256),
        "win_length": getattr(ap, "win_length", 1024)
    }
    with open(ap_path, "w", encoding="utf-8") as f:
        json.dump(ap_config, f, ensure_ascii=False, indent=2)
    print(f"Audio processor settings saved to {ap_path}")

# Usage
output_path = "./ckpts"
run_folder_path = "run-May-02-2025_07+04PM-0000000"
save_directory = os.path.join(output_path, run_folder_path, "model_elements")
save_model_elements(model, config, tokenizer, ap, save_directory)

Model state saved to viet-fine-tune-glow-tts/run-May-02-2025_07+04PM-0000000/model_elements/model_state.pth
Configuration saved to ckpts/run-May-02-2025_07+04PM-0000000/model_elements/config.json
Tokenizer saved to ckpts/run-May-02-2025_07+04PM-0000000/model_elements/tokenizer.json
Audio processor settings saved to ckpts/run-May-02-2025_07+04PM-0000000/model_elements/audio_processor.json


## Load Model

In [36]:
import os
import torch
import json
from pathlib import Path
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.configs.shared_configs import CharactersConfig

def load_model_elements(load_dir, device="cpu"):
    """
    Load all elements of the GlowTTS model, including state, config, tokenizer, and audio processor.
    
    Args:
        load_dir: Directory where model elements are saved.
        device: Device to load the model onto ('cpu' or 'cuda').
    
    Returns:
        model: Loaded GlowTTS model.
        config: Loaded GlowTTSConfig.
        tokenizer: Loaded TTSTokenizer.
        ap: Loaded AudioProcessor.
    """
    load_dir = Path(load_dir)
    
    # 1. Load configuration
    config_path = load_dir / "config.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found at {config_path}")
    with open(config_path, "r", encoding="utf-8") as f:
        config_dict = json.load(f)
    
    # Reconstruct CharactersConfig
    characters_config = CharactersConfig(**config_dict["characters"])
    config_dict["characters"] = characters_config
    
    # Initialize GlowTTSConfig
    config = GlowTTSConfig(**{k: v for k, v in config_dict.items() if k != "characters"})
    config.characters = characters_config
    print(f"Configuration loaded from {config_path}")
    
    # 2. Load tokenizer
    tokenizer_path = load_dir / "tokenizer.json"
    if not tokenizer_path.exists():
        raise FileNotFoundError(f"Tokenizer file not found at {tokenizer_path}")
    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer_config = json.load(f)
    
    # Initialize tokenizer
    tokenizer, _ = TTSTokenizer.init_from_config(config)
    print(f"Tokenizer loaded from {tokenizer_path}")
    
    # 3. Load audio processor
    ap_path = load_dir / "audio_processor.json"
    if not ap_path.exists():
        raise FileNotFoundError(f"Audio processor file not found at {ap_path}")
    with open(ap_path, "r", encoding="utf-8") as f:
        ap_config = json.load(f)
    
    # Initialize AudioProcessor and apply saved settings
    ap = AudioProcessor.init_from_config(config)
    for key, value in ap_config.items():
        setattr(ap, key, value)
    print(f"Audio processor loaded from {ap_path}")
    
    # 4. Initialize model
    model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
    
    # 5. Load model state
    model_path = load_dir / "model_state.pth"
    if not model_path.exists():
        raise FileNotFoundError(f"Model state file not found at {model_path}")
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint["model"])
    model.to(device)
    model.eval()
    print(f"Model state loaded from {model_path}")
    
    return model, config, tokenizer, ap

## Inference from Load elements

In [37]:
import torch
import IPython

def infer_with_glow_tts(model, ap, tokenizer, text, output_wav_path="output.wav"):
    """
    Perform inference with the GlowTTS model to generate speech from text.
    
    Args:
        model: Loaded GlowTTS model.
        ap: Loaded AudioProcessor.
        tokenizer: Loaded TTSTokenizer.
        text: Input text to synthesize.
        output_wav_path: Path to save the generated WAV file.
    """
    # Tokenize input text
    text_inputs = tokenizer.text_to_ids(text)
    text_inputs = torch.LongTensor(text_inputs).unsqueeze(0).to(model.device)  # Shape: [1, seq_len]
    
    # Compute sequence lengths
    x_lengths = torch.LongTensor([text_inputs.size(1)]).to(model.device)  # Shape: [1]
    
    # Perform inference with aux_input
    with torch.no_grad():
        outputs = model.inference(x=text_inputs, aux_input={"x_lengths": x_lengths})
    
    # Extract mel-spectrogram
    mel = outputs["model_outputs"]
    if mel.dim() == 3:
        mel = mel.squeeze(0)
    
    # Convert mel-spectrogram to audio using Griffin-Lim
    audio = ap.griffin_lim(mel.T.cpu().numpy())
    
    # Save audio to WAV file
    ap.save_wav(audio, output_wav_path)
    print(f"Generated audio saved to {output_wav_path}")

# Usage
load_directory = os.path.join("./ckpts", "run-May-02-2025_07+04PM-0000000", "model_elements")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and components
model, config, tokenizer, ap = load_model_elements(load_directory, device=device)

# Perform inference
text = "Trong khi đó, tại bến tàu du lịch Nha Trang, hàng ngàn du khách chen nhau để đi đến các đảo trên vịnh Nha Trang, lực lượng cảnh sát đường thủy đã tăng cường quân số để quản lý, đảm bảo an toàn cho du khách."
out_path = "abc.wav"
infer_with_glow_tts(model, ap, tokenizer, text, out_path)

# Play audio
IPython.display.Audio(out_path)

Configuration loaded from ckpts/run-May-02-2025_07+04PM-0000000/model_elements/config.json
Tokenizer loaded from ckpts/run-May-02-2025_07+04PM-0000000/model_elements/tokenizer.json
Audio processor loaded from ckpts/run-May-02-2025_07+04PM-0000000/model_elements/audio_processor.json


  checkpoint = torch.load(model_path, map_location=device)


Model state loaded from ckpts/run-May-02-2025_07+04PM-0000000/model_elements/model_state.pth


AttributeError: 'AudioProcessor' object has no attribute 'griffin_lim'

In [None]:
%cd /kaggle/working/ckpts

In [None]:
!rm -rf tokenizer.pkl model_checkpoint.pth config.json audio_processor.pkl

In [None]:
!echo done

# Inference

In [None]:
!echo not_done

In [15]:
def get_vietnamese_chars():
    return (
        "aàáảãạăằắẳẵặâầấẩẫậbcdfđeèéẻẽẹêềếểễệghiìíỉĩịjklmnoòóỏõọôồốổỗộơờớởỡợpqrstuùúủũụưừứửữựvwxyỳýỷỹỵz"
        + "AÀÁẢÃẠĂẰẮẲẴẶÂẦẤẨẪẬBCDFĐEÈÉẺẼẸÊỀẾỂỄỆGHIÌÍỈĨỊJKLMNOÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢPQRSTUÙÚỨỬỮỰVWXYỲÝỶỸỴZ"
        + "0123456789"
    )

def get_characters_config():
    return CharactersConfig(
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank=None,
        characters=get_vietnamese_chars(),
        punctuations=".,!? ",
        phonemes=None,
        characters_class="TTS.tts.models.vits.VitsCharacters"
    )

def get_glow_tts_config(output_path="./ckpts", config_path=None):
    characters_config = get_characters_config()
    
    if config_path and os.path.exists(config_path):
        print("Load Config from existed directory")
        with open(config_path, "r") as f:
            config_dict = json.load(f)
        valid_keys = GlowTTSConfig.__init__.__code__.co_varnames
        filtered_config_dict = {k: v for k, v in config_dict.items() if k in valid_keys}
        config = GlowTTSConfig(**filtered_config_dict)
        config.characters = characters_config
        
        config.output_path = output_path
    else:
        print("Create new Config")
        config = GlowTTSConfig(
            batch_size=64,
            eval_batch_size=64,
            num_loader_workers=4,
            num_eval_loader_workers=4,
            run_eval=True,
            test_delay_epochs=-1,
            epochs=100,
            text_cleaner="multilingual_cleaners",
            use_phonemes=False,
            phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
            print_step=300,
            print_eval=False,
            mixed_precision=True,
            output_path=output_path,
            save_step=1000,
            eval_split_max_size=256,
            eval_split_size=0.1,
            characters=characters_config,
            save_n_checkpoints=1,
            test_sentences = ["Tôi đã mất khá nhiều thời gian để phát triển một giọng nói, và giờ đây khi đã có nó, tôi sẽ không im lặng.",
                       "Hãy là một giọng nói, không phải tiếng vọng.",
                       "Xin lỗi Dave. Tôi e là tôi không thể làm điều đó.",
                       "Chiếc bánh này tuyệt vời. Nó thật ngon và ẩm.",
                       "Trước ngày hai mươi hai tháng mười một, năm một nghìn chín trăm sáu mươi ba."]
        )
    
    return config

# Usage
output_path = "./ckpts"
run_folder_path = "run-May-02-2025_07+04PM-0000000"
config_path = os.path.join(output_path, run_folder_path, "config.json")
config = get_glow_tts_config(output_path, config_path)

# Initialize AudioProcessor and Tokenizer
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Defind Model
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# Find the latest checkpoint
checkpoint_dir = os.path.join(output_path, run_folder_path)
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "best_model.pth"))
if checkpoint_files:
    latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
    model.load_checkpoint(config, latest_checkpoint, eval=False)
else:
    print("No checkpoint found, starting training from scratch.")

# Initialize Trainer
trainer = Trainer(
    TrainerArgs(
        continue_path=checkpoint_dir if checkpoint_files else None,  # Resume from checkpoint directory
        restore_path=latest_checkpoint if checkpoint_files else None  # Restore model state
    ),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples
)



Load Config from existed directory
Resuming training from checkpoint: ./ckpts/run-May-02-2025_07+04PM-0000000/best_model.pth


fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 2
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=./ckpts/run-May-02-2025_07+04PM-0000000
 > Restoring from checkpoint_32000.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Restoring Scaler...
 > Model restored from step 32000

 > Model has 28623121 parameters


# Good Here

In [15]:
from TTS.api import TTS
import torch

# Define paths and input
check_point_folder = "ckpts/run-May-02-2025_07+04PM-0000000"
model_path = f"{check_point_folder}/best_model.pth"
config_path = f"{check_point_folder}/config.json"

# model_path = f"/kaggle/working/ckpts/model_checkpoint.pth"
# config_path = f"/kaggle/working/ckpts/config.json"

out_path = "bca.wav"
text = "Trong khi đó, tại bến tàu du lịch Nha Trang, hàng ngàn du khách chen nhau để đi đến các đảo trên vịnh Nha Trang, lực lượng cảnh sát đường thủy đã tăng cường quân số để quản lý, đảm bảo an toàn cho du khách."

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize TTS model
tts = TTS(model_path=model_path, config_path=config_path, progress_bar=True)

# Move model to the specified device
tts.to(device)

# Perform inference and save to file
tts.tts_to_file(text=text, file_path=out_path, speaker=None, language=None, split_sentences=False)

'bca.wav'

In [16]:
import IPython
IPython.display.Audio("bca.wav")

In [17]:
!rm -rf /kaggle/working/dataset

In [18]:
!cp -r ./ckpts/run-May-02-2025_07+04PM-0000000/best_model.pth ./ckpts/
!cp -r ./ckpts/run-May-02-2025_07+04PM-0000000/config.json ./ckpts/


In [18]:
!mv asr_outputwav demo_output/asr_output.wav

In [20]:
%cd ..

/kaggle/working


In [None]:
from huggingface_hub import HfApi, login

# Log in
login(token="<your_hf_token>")

# Upload a single file
api = HfApi()
# api.upload_file(
#     path_or_fileobj="/path/to/local/file.txt",
#     path_in_repo="file.txt",
#     repo_id="your-username/your-space-name",
#     repo_type="space"
# )

# Upload an entire folder
api.upload_folder(
    folder_path="ckpts",
    repo_id="danhtran2mind/Viet-Glow-TTS-finetuning",
    repo_type="model"
)

best_model_64201.pth:   0%|          | 0.00/344M [00:00<?, ?B/s]

checkpoint_64000.pth:   0%|          | 0.00/344M [00:00<?, ?B/s]

best_model_64201.pth:   0%|          | 0.00/344M [00:00<?, ?B/s]

best_model_64201.pth:   0%|          | 0.00/344M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1746505880.6523aa8279ee.31.0:   0%|          | 0.00/256M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/danhtran2mind/Viet-Glow-TTS-finetuning/commit/0a57cee67bf462ae5a5bd94e77e8397b4da487c2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0a57cee67bf462ae5a5bd94e77e8397b4da487c2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/danhtran2mind/ckpts', endpoint='https://huggingface.co', repo_type='model', repo_id='danhtran2mind/ckpts'), pr_revision=None, pr_num=None)

## Evaluation Step

# TensorflowBoard Plotting

In [None]:
!pip install tensorboard
!tensorboard --logdir=tts_train_dir