In [None]:
#https://github.com/ThomasLamsonFr/AITextGenerator

In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoModelWithLMHead, 
    AutoModelForCausalLM, 
    AutoConfig, 
    AutoTokenizer,
    GPT2Tokenizer, 
    AdamW,
    get_linear_schedule_with_warmup
)

In [2]:
pretrained_name_or_path = "gpt2"

#### Define Cuda

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cuda:0')
print('Device:',device)

Device: cuda:0


#### Cargamos el modelo pre-entrenado

In [4]:
model = AutoModelWithLMHead.from_pretrained(pretrained_name_or_path);



In [5]:
print('Datos del Modelo:\n=================')
print(' - Tipo modelo Base:', model.config.model_type)
print(' - Arquitectura:', model.config.architectures)
print(' - Posiciones (largo máximo de secuencia):', model.config.n_positions)
print(' - Tamaño dimensional interno:', model.config.n_embd)
print(' - Cabezales de Atención:', model.config.n_head)
print(' - Capas :', model.config.n_layer)
print(' - Tamaño de Vocabulario :', model.config.vocab_size)
print(' - Function de Activación :', model.config.activation_function)
#GELU: https://medium.com/@shoray.goel/gelu-gaussian-error-linear-unit-4ec59fb2e47c

Datos del Modelo:
 - Tipo modelo Base: gpt2
 - Arquitectura: ['GPT2LMHeadModel']
 - Posiciones (largo máximo de secuencia): 1024
 - Tamaño dimensional interno: 768
 - Cabezales de Atención: 12
 - Capas : 12
 - Tamaño de Vocabulario : 50257
 - Function de Activación : gelu_new


#### Cargamos nuestro tokenizador español

In [6]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_name_or_path);

In [7]:
print('Datos del Tokenizador:\n======================')
print(' - Tamaño del vocabulario:', tokenizer.vocab_size)
print(' - Token Inicio de Secuencia (token => id): {} => {}'.format(tokenizer.bos_token, tokenizer.bos_token_id))
print(' - Token Fin de Secuencia (token => id): {} => {}'.format(tokenizer.eos_token, tokenizer.eos_token_id))
print(' - Token de relleno (token => id): {} => {}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
print(' - Token fuera de vocabulario (token => id): {} => {}'.format(tokenizer.unk_token, tokenizer.unk_token))
print(' - Token mascara (token => id): {} => {}'.format(tokenizer.mask_token, tokenizer.mask_token))
print(' - Largo máximo: {}'.format(tokenizer.max_len))
# https://huggingface.co/transformers/main_classes/tokenizer.html

Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using mask_token, but it is not set yet.


Datos del Tokenizador:
 - Tamaño del vocabulario: 50257
 - Token Inicio de Secuencia (token => id): <|endoftext|> => 50256
 - Token Fin de Secuencia (token => id): <|endoftext|> => 50256
 - Token de relleno (token => id): None => None
 - Token fuera de vocabulario (token => id): <|endoftext|> => <|endoftext|>
 - Token mascara (token => id): None => None
 - Largo máximo: 1024




#### Un ejemplo del tokenizador en funcionamiento
Ubica las palabras en su representación vectorial ya aprendida, una palabra nueva la descompone utilizando tokens ya conocidos

In [8]:
def show_tokenizer_working(seq, tokenizer):
    toks = tokenizer.encode(seq)
    print ("secuencia:", seq)
    for tok in toks:
        print ( " - {} -> {}".format(tokenizer.decode([tok]).strip(),tok))
    print('\n')
show_tokenizer_working('Buenos días a todos',tokenizer)
show_tokenizer_working('yu8ausy',tokenizer)
show_tokenizer_working('123456',tokenizer)
show_tokenizer_working('3.141592',tokenizer)
show_tokenizer_working('π',tokenizer)
show_tokenizer_working('🤔',tokenizer)
#aunque el tokenizador no conoce el emoji, es capaz de codificarlo y decodificarlo
tokenizer.decode(tokenizer.encode('🤔'))

secuencia: Buenos días a todos
 - Bu -> 38374
 - enos -> 28380
 - d -> 288
 - í -> 8836
 - as -> 292
 - a -> 257
 - to -> 284
 - dos -> 37427


secuencia: yu8ausy
 - yu -> 24767
 - 8 -> 23
 - aus -> 8717
 - y -> 88


secuencia: 123456
 - 123 -> 10163
 - 456 -> 29228


secuencia: 3.141592
 - 3 -> 18
 - . -> 13
 - 14 -> 1415
 - 15 -> 1314
 - 92 -> 5892


secuencia: π
 - π -> 46582


secuencia: 🤔
 - � -> 8582
 - � -> 97
 - � -> 242




'🤔'

### Vamos a agregar al tokenizador nuestros tokens especiales nuevos

Cada tweet lo vamos a estructurar de la siguiente forma para entregarle a nuestro modelo cada ejemplo así:

|Coalición|Partido|Sentimiento|Entidades|Frases Clave| tweet |
|---      |---    |---        |---      |---           | ---| 
|[COALICION] chile vamos|[PARTIDO] udi |[SENTIMIENTO] positivo |[ENTIDADES] carabineros  |[FRASES] cuentan con el apoyo| [TWEET] ahora los carabineros se enfrentan...

In [9]:
tokenizer.add_special_tokens(
    {'bos_token': '[TWEET]',
     'additional_special_tokens': ['[COALICION]', '[PARTIDO]', '[SENTIMIENTO]', '[ENTIDADES]', '[HASHTAGS]', '[FRASES]']})

tokenizer.pad_token = tokenizer.eos_token


In [10]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50264, 768)

In [11]:
print('Datos del Modelo:\n=================')
print(' - Tipo modelo Base:', model.config.model_type)
print(' - Arquitectura:', model.config.architectures)
print(' - Posiciones (largo máximo de secuencia):', model.config.n_positions)
print(' - Tamaño dimensional interno:', model.config.n_embd)
print(' - Cabezales de Atención:', model.config.n_head)
print(' - Capas :', model.config.n_layer)
print(' - Tamaño de Vocabulario :', model.config.vocab_size)
print(' - Function de Activación :', model.config.activation_function)
model.to(device);

Datos del Modelo:
 - Tipo modelo Base: gpt2
 - Arquitectura: ['GPT2LMHeadModel']
 - Posiciones (largo máximo de secuencia): 1024
 - Tamaño dimensional interno: 768
 - Cabezales de Atención: 12
 - Capas : 12
 - Tamaño de Vocabulario : 50264
 - Function de Activación : gelu_new


## Tweets Dataset

In [12]:
df = pd.read_json('tweets_formatted.json', lines=True)
df.head(3)

Unnamed: 0,COALICION,PARTIDO,SENTIMIENTO,ENTIDADES,HASHTAGS,FRASES,TWEET
0,Chile Vamos,IND-GOB,NEUTRAL,CarolCBown s_villarrealb sebastianpinera,CuentaPública ChileenMarcha,,Ya estamos en el Congreso con los subses @Caro...
1,Chile Vamos,RN,NEGATIVE,,Araucanía CuentaPública,,"⭕ ""Combatir con máxima voluntad y firmeza, sie..."
2,Chile Vamos,RN,NEUTRAL,Presidente Ministerio de Agricultura y Aliment...,CuentaPública,#CuentaPública ANUNCIO Nuestro Presidente la c...,#CuentaPública\n📢ANUNCIO| Nuestro Presidente a...


In [13]:
!pip install tensorboard

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [14]:
from sklearn.model_selection import train_test_split
import os
from src.torch_loader import  DatasetFromPandas, VectorizeMode, VectorizeParagraph
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

from tqdm.notebook import tqdm, trange

import random
from torch.utils.tensorboard import SummaryWriter

train_df, test_df = train_test_split(df,test_size=0.1)
print(len(train_df), len(test_df))


GPT2_BLOCK_SIZE = model.config.n_positions


153275 17031


In [15]:
vectorizer = VectorizeParagraph(
    tokenizer=tokenizer,
    block_size=GPT2_BLOCK_SIZE,
    mode=VectorizeMode.TRAIN
)

train_ds = DatasetFromPandas(train_df, vectorizer)
eval_ds = DatasetFromPandas(test_df, vectorizer)

# TRAINING

In [16]:
#!pip install tensorboard

n_gpu = 1

train_batch_size = 4
per_gpu_train_batch_size = 4

eval_batch_size = 8
per_gpu_eval_batch_size = 8

gradient_accumulation_steps = 1
weight_decay = 0.0
learning_rate = 5e-5
warmup_steps = 0
adam_epsilon = 1e-8
max_grad_norm = 1
logging_steps= 2000
save_steps = 2000
output_dir = 'model_checkpoints_v3'
print_input=False

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
        
def collate(examples):
    all_inputs = [elt[0] for elt in examples]
    all_types = [elt[1] for elt in examples]
    all_labels = [elt[2] for elt in examples]

    padded_inputs = pad_sequence(
        all_inputs, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    padded_types = pad_sequence(
        all_types, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    padded_labels = pad_sequence(all_labels, batch_first=True, padding_value=-100)

    return padded_inputs, padded_types, padded_labels

In [17]:
def train(train_dataset, model, tokenizer, epochs):
    tb_writer = SummaryWriter()
    num_train_epochs = epochs
    train_sampler = RandomSampler(train_dataset)

    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=train_batch_size,
        collate_fn=collate,
    )
    
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    no_decay = ["bias", "LayerNorm.weight"]
    
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )
    
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    
    model.to(device)
    #Entrenar!
    
    print("***** Running training *****")
    print("  Num examples =", len(train_dataset))
    print("  Num Epochs =", num_train_epochs)
    print("  Instantaneous batch size per GPU =", per_gpu_train_batch_size)
    print("  Gradient Accumulation steps =", gradient_accumulation_steps)
    print("  Total optimization steps =", t_total)
    
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    
    tr_loss, logging_loss = 0.0, 0.0
    
    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))   
    
    model.zero_grad()
    
    train_iterator = trange(
        epochs_trained,
        int(num_train_epochs),
        desc="Epoch",
        disable=False,
    )
    
    set_seed(0)
    
    for ti in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        
        for step, batch in enumerate(epoch_iterator):
            
            input_ids, type_ids, labels = batch

            if print_input:
                print( "Examples contained in the batch that will be given as input in the model")
                
                for i in range(input_ids.shape[0]):
                    decoded_input = tokenizer.decode(input_ids[i, :].tolist(), skip_special_tokens=False)
                    print("Ex n° {} : {}".format(i, decoded_input)) 
                
            input_ids = input_ids.to(device)
            type_ids = type_ids.to(device)
            labels = labels.to(device)
            
            model.train()
            
            outputs = model(input_ids, labels=labels, token_type_ids=type_ids)
            loss = outputs[ 0]  # model outputs are always tuple in transformers (see doc)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            
            if (step + 1) % gradient_accumulation_steps == 0:

                torch.nn.utils.clip_grad_norm_( model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if ( logging_steps > 0 and global_step % logging_steps == 0):
                    results = evaluate(model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss",
                        (tr_loss - logging_loss) / logging_steps,
                        global_step,
                    )
                    logging_loss = tr_loss

                if (global_step % save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    train_output_dir = os.path.join(
                        output_dir, "{}-{}".format(checkpoint_prefix, global_step)
                    )
                    os.makedirs(train_output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(train_output_dir)
                    tokenizer.save_pretrained(train_output_dir)

                    #torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    print("Saving model checkpoint to %s", train_output_dir)


                    torch.save(
                        optimizer.state_dict(), os.path.join(train_output_dir, "optimizer.pt")
                    )
                    torch.save(
                        scheduler.state_dict(), os.path.join(train_output_dir, "scheduler.pt")
                    )
                    print(
                        "Saving optimizer and scheduler states to %s", train_output_dir
                    )
            
    tb_writer.close()
    return global_step, tr_loss / global_step

In [18]:
def evaluate(model, tokenizer, prefix=""):
    eval_output_dir = output_dir
    eval_dataset = eval_ds
    os.makedirs(eval_output_dir, exist_ok=True)

    eval_batch_size = per_gpu_eval_batch_size * n_gpu

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=eval_batch_size,
        collate_fn=collate,
    )

    # multi-gpu evaluate
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model.to(device)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples =", len(eval_dataset))
    print("  Batch size =",eval_batch_size)
    
    eval_loss = 0.0
    nb_eval_steps = 0
    
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, types, labels = batch
        dev = torch.device("cuda:0")
        
        inputs = inputs.to(dev)
        types  = types.to(dev)
        labels = labels.to(dev)

        with torch.no_grad():
            outputs = model(inputs, labels=labels, token_type_ids=types)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        print("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            text = "{} = {}".format(key, str(result[key]))
            print(text)
            writer.write(text)

    return result

In [None]:
%%time
global_step, tr_loss = train(train_ds, model, tokenizer, 3)

***** Running training *****
  Num examples = 153275
  Num Epochs = 3
  Instantaneous batch size per GPU = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 114957


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=38319.0, style=ProgressStyle(description_…

***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(7.6105)




Saving model checkpoint to %s model_checkpoints_v3/checkpoint-2000




Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-2000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(6.5305)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-4000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-4000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(5.9847)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-6000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-6000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(5.6638)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-8000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-8000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(5.4246)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-10000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-10000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(5.2211)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-12000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-12000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(5.0556)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-14000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-14000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.9020)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-16000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-16000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.7702)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-18000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-18000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.6802)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-20000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-20000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.5961)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-22000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-22000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.5415)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-24000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-24000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.4495)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-26000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-26000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.3845)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-28000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-28000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.3374)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-30000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-30000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.2949)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-32000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-32000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.2327)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-34000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-34000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.1934)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-36000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-36000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.1503)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-38000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-38000



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=38319.0, style=ProgressStyle(description_…

***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.1304)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-40000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-40000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.0832)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-42000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-42000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.0448)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-44000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-44000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.0179)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-46000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-46000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(4.0137)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-48000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-48000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.9629)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-50000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-50000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.9423)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-52000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-52000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.9245)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-54000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-54000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.9004)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-56000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-56000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.8886)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-58000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-58000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.8711)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-60000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-60000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.8465)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-62000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-62000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.8235)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-64000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-64000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.8114)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-66000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-66000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7948)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-68000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-68000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7725)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-70000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-70000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7496)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-72000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-72000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7477)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-74000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-74000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7243)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-76000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-76000



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=38319.0, style=ProgressStyle(description_…

***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7323)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-78000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-78000
***** Running evaluation  *****
  Num examples = 17031
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=2129.0, style=ProgressStyle(description_…


***** Eval results  *****
perplexity = tensor(3.7182)
Saving model checkpoint to %s model_checkpoints_v3/checkpoint-80000
Saving optimizer and scheduler states to %s model_checkpoints_v3/checkpoint-80000


In [21]:
print('fail')

fail
