In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
# os.walk ignores listing errors by default, so a missing directory simply prints nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory with the bare file name to show an absolute path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from Kaggle's secret store and expose it through the
# process environment, so the token value itself never appears in the notebook source.
_kaggle_secrets = UserSecretsClient()
os.environ["HUGGINGFACE_TOKEN"] = _kaggle_secrets.get_secret("HUGGINGFACE-TOKEN-TO-UPLOAD")

In [None]:
def install_lib(libname):
    """Install one pip requirement into the environment of the running kernel.

    Args:
        libname: A pip requirement specifier, e.g. ``"peft"`` or ``"peft==0.12.0"``.
    """
    print(f">>> {libname}")
    # Use the %pip magic rather than a shell `pip`: it installs into the
    # environment the current kernel is actually running in.
    get_ipython().run_line_magic("pip", f"install -qqq {libname}")


# Versions are pinned to the ones printed by the version-report cell of the recorded
# run, so that re-running the notebook uses the same library APIs.
# `backoff` is left unpinned because its version is not reported anywhere in the notebook.
libs = [
    "bitsandbytes==0.43.3",
    "peft==0.12.0",
    "transformers==4.44.0",
    "trl==0.9.6",
    "accelerate==0.33.0",
    "datasets==2.21.0",
    "backoff",
]

for lib in libs:
    install_lib(lib)

In [3]:
import logging
import warnings

# Suprimir avisos específicos de FutureWarning e UserWarning
warnings.filterwarnings("ignore", category=FutureWarning, message=".*TRANSFORMERS_CACHE.*")
warnings.filterwarnings("ignore", message=".*resume_download.*deprecated.*", category=FutureWarning)
warnings.filterwarnings("ignore", message=".*use_cache=True.*", category=UserWarning)
warnings.filterwarnings("ignore", message=".*use_reentrant parameter should be passed explicitly.*", category=UserWarning)
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly.")
warnings.filterwarnings("ignore", message="*torch.cpu.amp.autocast(args...)` is deprecated*")

# Configurar o nível de log para a biblioteca transformers
logging.getLogger("transformers.trainer").setLevel(logging.WARNING)
logging.getLogger("transformers.trainer_utils").setLevel(logging.WARNING)
logging.getLogger("transformers.training_args").setLevel(logging.WARNING)

In [4]:
import torch
import bitsandbytes
import peft
import accelerate
import datasets
import trl
import warnings
import transformers

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the versions of the libraries the training pipeline depends on.
print("torch version:", torch.__version__)
print("bitsandbytes version:", bitsandbytes.__version__)
print("peft version:", peft.__version__)
print("accelerate version:", accelerate.__version__)
print("datasets version:", datasets.__version__)
print("trl version:", trl.__version__)
print("transformers version:", transformers.__version__)

if torch.cuda.is_available():
    # These queries need an initialized CUDA context, so they are only issued when a
    # GPU is present; previously they ran unconditionally and crashed on CPU-only hosts.
    print(f"Device name: '{torch.cuda.get_device_name()}'")
    print("Device:", device)
    print(f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'")
    print("Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16.")
else:
    print("Device:", device)
    print("Não suporta bfloat16.")

torch version: 2.4.0
bitsandbytes version: 0.43.3
peft version: 0.12.0
accelerate version: 0.33.0
datasets version: 2.21.0
trl version: 0.9.6
transformers version: 4.44.0
Device name: 'Tesla T4'
Device: cuda
Device properties: '_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40)'
Suporta bfloat16.


In [5]:
import os
from random import randrange

import torch
import numpy as np
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset, Dataset


from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback,
    set_seed,
    pipeline,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)
from datasets import load_dataset, concatenate_datasets

In [6]:
class SaveCheckpointCallback(TrainerCallback):
    """Trainer callback that announces every checkpoint save on stdout."""

    def on_save(self, args, state, control, **kwargs):
        """Print the global step at which the save event fired."""
        step = state.global_step
        print(f"Saving checkpoint at step {step}")



class EarlyStoppingCallback(TrainerCallback):
    """Stop training when ``eval_loss`` stops improving.

    An evaluation counts as an improvement only when its ``eval_loss`` is lower than
    the best value seen so far by more than ``early_stopping_threshold``. After
    ``early_stopping_patience`` consecutive evaluations without such an improvement,
    the callback asks the trainer to stop.

    Args:
        early_stopping_patience: Number of consecutive non-improving evaluations
            tolerated before stopping.
        early_stopping_threshold: Minimum decrease of ``eval_loss`` (relative to the
            best value) required to count as an improvement.
    """

    def __init__(self, early_stopping_patience=3, early_stopping_threshold=0.02):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None  # best (lowest) eval_loss observed so far
        self.counter = 0  # consecutive evaluations without improvement

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        """Update the patience counter and request a stop once it is exhausted."""
        # `metrics` defaults to None, so guard before dereferencing it.
        if not metrics:
            return

        current_metric = metrics.get("eval_loss")  # Use the relevant metric for your task
        if current_metric is None:
            return

        improved = (
            self.best_metric is None
            or current_metric < self.best_metric - self.early_stopping_threshold
        )
        if improved:
            self.best_metric = current_metric
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.early_stopping_patience:
            control.should_training_stop = True
            print(f"Early stopping at step {state.global_step} with best eval_loss = {self.best_metric}")



class SaveMetricsCallback(TrainerCallback):
    """Persist trainer state and evaluation metrics under ``output_dir``.

    Two files are maintained:
      * ``state.json``   - the trainer state, rewritten at the end of every step.
      * ``metrics.json`` - one JSON record per line, one record per evaluated step.

    Metrics already present in ``metrics.json`` are loaded on construction, so new
    evaluations are appended to the existing history rather than replacing it.

    Args:
        output_dir: Directory that receives both files; created if missing.
    """

    def __init__(self, output_dir):
        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir
        self.output_path = os.path.join(output_dir, "metrics.json")
        self.state_path = os.path.join(output_dir, "state.json")
        print(f"Output directory initialized at {output_dir}")
        self.metrics = self.load_existing_metrics()

    def load_existing_metrics(self):
        """Return previously saved metric records, or an empty list if there are none."""
        if os.path.isfile(self.output_path):
            return pd.read_json(self.output_path, lines=True).to_dict('records')
        return []

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Write the current trainer state to ``state.json`` after every step."""
        state.save_to_json(self.state_path)

    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, metrics=None, **kwargs):
        """Record the metrics of this evaluation and rewrite ``metrics.json``."""
        if metrics:
            step = state.global_step
            _metrics = {"Step": step, **metrics}
            self.metrics.append(_metrics)
            # If the same step was evaluated more than once, keep only the latest record.
            metrics_df = pd.DataFrame(self.metrics).drop_duplicates(subset=['Step'], keep='last')
            metrics_df.to_json(self.output_path, orient="records", lines=True)

In [7]:
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the most recent ``checkpoint-*`` directory, or ``None``.

    Checkpoints are ranked by the training step encoded in their name
    (``checkpoint-<step>``), because creation/change times are unreliable once
    checkpoint folders have been copied, restored or touched. Entries whose suffix is
    not a plain number rank below every numbered checkpoint and are ordered among
    themselves by ``os.path.getctime``. Entries that are not directories are ignored.

    Args:
        checkpoint_dir: Directory that contains the ``checkpoint-*`` sub-directories.

    Returns:
        The path (``checkpoint_dir`` joined with the entry name) of the latest
        checkpoint, or ``None`` if the directory is missing or holds no checkpoint.
    """
    prefix = "checkpoint-"

    def _rank(path):
        suffix = os.path.basename(path)[len(prefix):]
        step = int(suffix) if (suffix.isascii() and suffix.isdigit()) else -1
        return (step, os.path.getctime(path))

    try:
        checkpoints = [
            os.path.join(checkpoint_dir, entry)
            for entry in os.listdir(checkpoint_dir)
            if entry.startswith(prefix) and os.path.isdir(os.path.join(checkpoint_dir, entry))
        ]
        if not checkpoints:
            return None
        return max(checkpoints, key=_rank)
    except FileNotFoundError:
        # The directory does not exist (or an entry vanished while being inspected).
        return None

In [8]:
def set_tokenizer(model_id):
    """Load the tokenizer published under ``model_id`` and make it pad on the right.

    Args:
        model_id: Identifier or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer with ``padding_side`` set to ``'right'``.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    tok.padding_side = 'right'
    return tok

In [9]:
# Debug-oriented runtime switches.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and is expected to
# slow training down - confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Model name used ONLY to derive the local output directory below. It was previously
# read without ever being defined at module level (NameError on a fresh kernel); the
# value reproduces the directory shown in the recorded run
# ("models/question-generator/microsoft-phi-3_5-mini-instruct").
model_id = "microsoft/Phi-3.5-mini-instruct"

LOCAL_MODELPATH = "models/question-generator/" + model_id.lower().replace("/","-").replace(".","_")
login(token=os.environ.get("HUGGINGFACE_TOKEN"))
set_seed(1234)

attn_implementation = "eager"
compute_dtype = torch.float16

# Quantization reduces the model's size and increases computational efficiency.
# BitsAndBytesConfig is used here to configure 4-bit (NF4, double-quantized) loading.
# NOTE(review): compute_dtype above is float16 while bnb_4bit_compute_dtype is
# "bfloat16" - confirm the mismatch is intentional.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="bfloat16",
        bnb_4bit_use_double_quant=True,
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Discovery

# Training Process

In [20]:
def load_datasets(eval_size=100, seed=None):
    """Load the question/answer datasets used for fine-tuning and evaluation.

    The training split of ``emdemor/1984-perguntas-e-respostas`` is concatenated with
    the training split of the synonyms dataset; the evaluation set is drawn from the
    base dataset's ``test`` split. Both sets are shuffled, rows with a missing
    ``pergunta`` or ``resposta`` are dropped, and the evaluation set is truncated.

    Args:
        eval_size: Maximum number of evaluation rows to keep. Fewer rows are returned
            when fewer survive the filter (previously a hard-coded 100 that raised in
            that case).
        seed: Optional seed forwarded to ``Dataset.shuffle``. ``None`` keeps the
            previous behaviour of shuffling without an explicit seed.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple with columns ``pergunta`` and
        ``resposta``.
    """
    print("Loading dataset")
    columns = ['pergunta', 'resposta']

    dataset_base = load_dataset("emdemor/1984-perguntas-e-respostas")
    train_dataset = dataset_base["train"].select_columns(columns)
    test_dataset = dataset_base["test"].select_columns(columns)

    # Extra training data: the same questions rephrased with synonyms.
    synonyms_dataset = load_dataset("emdemor/1984-perguntas-e-respostas-sinonimos")["train"].select_columns(columns)

    # Optional extra source, currently disabled ("small contexts"):
    # load_dataset("emdemor/small-contexts-1984")["train"].rename_columns({"input":"pergunta",  "output":"resposta"}).select_columns(['pergunta', 'resposta']),
    train_dataset = concatenate_datasets([train_dataset, synonyms_dataset])

    def has_question_and_answer(row):
        """Keep only rows where both the question and the answer are present."""
        return (row["pergunta"] is not None) and (row["resposta"] is not None)

    train_dataset = train_dataset.shuffle(seed=seed).filter(has_question_and_answer)
    test_dataset = test_dataset.shuffle(seed=seed).filter(has_question_and_answer)

    # Clamp so a small test split cannot trigger an out-of-range selection.
    test_dataset = test_dataset.select(range(min(eval_size, len(test_dataset))))

    return train_dataset, test_dataset

In [21]:
def load_model():
    """Load the quantized causal LM and its tokenizer for k-bit training.

    Relies on the notebook-level ``compute_dtype``, ``bnb_config`` and
    ``attn_implementation`` settings.

    Returns:
        A ``(model, tokenizer)`` tuple: the model after
        ``prepare_model_for_kbit_training`` and the tokenizer from ``set_tokenizer``.
    """
    # Alternative checkpoint kept for reference: "microsoft/Phi-3.5-mini-instruct"
    checkpoint_id = "emdemor/guru1984"

    print("Loading model")
    load_kwargs = dict(
        torch_dtype=compute_dtype,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation=attn_implementation,
    )
    quantized_model = AutoModelForCausalLM.from_pretrained(checkpoint_id, **load_kwargs)

    # Adapt the quantized model for k-bit training using PEFT's helper.
    trainable_model = prepare_model_for_kbit_training(quantized_model)

    return trainable_model, set_tokenizer(checkpoint_id)

In [22]:
import backoff

# Hugging Face Hub repository that receives the fine-tuned model.
repo_id = "emdemor/guru1984-v2"

@backoff.on_exception(backoff.expo,
                      Exception,
                      max_tries=8,
                      jitter=None)
def train():
    """Fine-tune the model with LoRA adapters via TRL's ``SFTTrainer``.

    Steps:
      1. Load the datasets and the quantized model/tokenizer.
      2. Render every question/answer pair with the tokenizer's chat template into a
         single ``text`` column.
      3. Train with periodic evaluation and checkpointing, then save the model.

    Any exception is printed and re-raised, so the ``backoff`` decorator retries the
    whole function with exponential waits (up to 8 tries, no jitter). If training
    fails after the trainer was created, the current model is pushed to ``repo_id``
    on a best-effort basis before re-raising.

    NOTE(review): ``trainer.train()`` is called without ``resume_from_checkpoint``,
    so each retry presumably restarts from step 0 - confirm this is intended.

    Raises:
        Exception: Whatever error caused the last attempt to fail.
    """

    try:
        train_dataset, test_dataset = load_datasets()
        model, tokenizer = load_model()

        print("Preprocessing dataset")
        def format_dataset_chatml(row):
            """Render one question/answer row as a chat-template formatted string."""
            messages = [
                {
                    "content": f"{row['pergunta']}",
                    "role": "user"
                },
                {
                    "content": f"{row['resposta']}",
                    "role": "assistant"
                }
            ]

            return {"text": tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)}

        train_dataset_chatml = train_dataset.map(format_dataset_chatml)
        test_dataset_chatml = test_dataset.map(format_dataset_chatml)
    except Exception as err:
        print(err)
        raise


    print("Starting training dataset")
    # Query once and derive both mixed-precision flags from the same answer.
    bf16_supported = torch.cuda.is_bf16_supported()

    sft_config = SFTConfig(
        seed=42,
        output_dir=LOCAL_MODELPATH,
        dataset_text_field="text",
        max_seq_length=2000,

        # Training hyperparameters
        learning_rate=5e-4,
        num_train_epochs=3,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",

        # Validation
        do_eval=True,
        eval_strategy="steps",
        eval_steps=50,

        # Checkpoints: save every 50 steps
        save_strategy="steps",
        save_steps=50,
        # save_strategy="epoch",  # alternative: save at the end of every epoch

        # Logging
        log_level="warning",
        logging_steps=50,

        # Mixed-precision dtype: bf16 when supported, fp16 otherwise
        fp16=not bf16_supported,
        bf16=bf16_supported,

        # Report to Weights & Biases? TensorBoard?
        report_to="none",

        push_to_hub=True,
        hub_model_id=repo_id
    )

    peft_config = LoraConfig(
            r=16,
            lora_alpha=1,
            lora_dropout=0.05,
            task_type=TaskType.CAUSAL_LM,
            target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    )

    # Pre-bind so the error handler can tell "trainer was never built" apart from
    # "training failed"; referencing an unbound name there used to mask the real error.
    trainer = None
    try:
        trainer = SFTTrainer(
            model,
            train_dataset=train_dataset_chatml,
            eval_dataset=test_dataset_chatml,
            args=sft_config,
            peft_config=peft_config,
            tokenizer=tokenizer,
            callbacks=[SaveCheckpointCallback(),
                       EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.00005),
                       SaveMetricsCallback(LOCAL_MODELPATH)],
        )

        trainer.train()

        trainer.save_model()
    except Exception as err:
        print(err)
        if trainer is not None:
            # Best-effort upload of whatever was trained so far; a failing upload must
            # not hide the original training error.
            try:
                trainer.model.push_to_hub(repo_id=repo_id, commit_description="Using more data")
            except Exception as push_err:
                print(f"Failed to push the partially trained model: {push_err}")
        raise

In [23]:
# Launch the fine-tuning run; `train` is wrapped with backoff, so failures are retried.
train()

Loading dataset


Filter:   0%|          | 0/10198 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Loading model


adapter_config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

Preprocessing dataset


Map:   0%|          | 0/10177 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Starting training dataset
Output directory initialized at models/question-generator/microsoft-phi-3_5-mini-instruct


Map:   0%|          | 0/10177 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
50,4.2186,3.861753
100,3.1772,2.593402
150,2.401,2.255957
200,2.1397,2.136943
250,2.0834,2.080466
300,2.055,2.056305
350,2.043,2.028634
400,2.0135,2.017705
450,1.9971,2.001974
500,1.9766,1.99144


Saving checkpoint at step 50


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 100


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 150


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 200


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 250


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 300


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 350


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 400


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 450


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 500


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 550


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 600


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 650


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 700


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 750


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 800


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 850


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 900


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 950


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1000


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1050


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1100


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1150


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1200


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1250


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1300


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1350


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1400


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1450


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1500


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1550


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1600


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1650


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1700


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1750


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1800


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1850


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1900


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 1950


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2000


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2050


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2100


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2150


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2200


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2250


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2300


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2350


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2400


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2450


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2500


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2550


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2600


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2650


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2700


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2750


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2800


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2850


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2900


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 2950


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3000


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3050


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3100


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3150


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3200


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3250


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3300


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3350


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3400


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3450


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3500


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3550


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3600


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3650


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3700


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3750


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3800


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Saving checkpoint at step 3819


In [16]:
# Reload the datasets on their own (outside `train`) to inspect them interactively.
train_dataset, test_dataset = load_datasets()
# Bare last expression: display the training dataset summary (features and row count).
train_dataset

Loading dataset


Filter:   0%|          | 0/10198 [00:00<?, ? examples/s]

Dataset({
    features: ['pergunta', 'resposta'],
    num_rows: 10177
})

In [19]:
# Peek at one training example: shuffle without an explicit seed, then take the first row.
train_dataset.shuffle()[0]

{'pergunta': 'Qual era a crença da pessoa sobre o mundo sexual fora do controle do Partido?',
 'resposta': 'Aceitar a ideia de um mundo próprio, fora do Partido, que devia ser destruído.'}