In [1]:
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Access dataset variables
dataset_train_path = os.getenv("DATASET_TRAIN_PATH")
dataset_test_path = os.getenv("DATASET_TEST_PATH")
dataset_unsupervised_path = os.getenv("DATASET_UNSUPERVISED_PATH")

# Access training variables
training_result = os.getenv("TRAINING_RESULT")
training_model = os.getenv("TRAINING_MODEL")

In [2]:
from datasets import load_dataset

# Carrega os arquivos JSON locais
dataset = load_dataset('json', data_files={
    'train': dataset_train_path,
    'test': dataset_test_path,
    'unsupervised': dataset_unsupervised_path
})

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 25000 examples [00:00, 364724.62 examples/s]
Generating test split: 25000 examples [00:00, 393503.26 examples/s]
Generating unsupervised split: 50000 examples [00:00, 418188.38 examples/s]


In [3]:
from transformers import BertTokenizer

# Load the tokenizer for a pretrained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Tokenizing function
def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_attention_mask=True
    )

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 25000/25000 [01:15<00:00, 330.59 examples/s]
Map: 100%|██████████| 25000/25000 [01:16<00:00, 325.02 examples/s]
Map: 100%|██████████| 50000/50000 [02:13<00:00, 374.05 examples/s]


In [5]:
from transformers import BertForSequenceClassification

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir=training_result,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)

In [7]:
# Split dataset into train and test sets
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

In [8]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)

In [9]:
# Train the model
trainer.train()

Step,Training Loss
500,0.4386
1000,0.3406
1500,0.2703
2000,0.1885


TrainOutput(global_step=2346, training_loss=0.2881857976889062, metrics={'train_runtime': 279.3355, 'train_samples_per_second': 268.494, 'train_steps_per_second': 8.399, 'total_flos': 2466666144000000.0, 'train_loss': 0.2881857976889062, 'epoch': 3.0})

In [10]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.44337308406829834, 'eval_runtime': 23.0499, 'eval_samples_per_second': 1084.602, 'eval_steps_per_second': 33.926, 'epoch': 3.0}


In [11]:
import os

# Create directory model
os.makedirs(training_model, exist_ok=True)

In [12]:
# Save model
trainer.save_model(training_model)

# Save tokenizer
tokenizer.save_pretrained(training_model)

('../model\\tokenizer_config.json',
 '../model\\special_tokens_map.json',
 '../model\\vocab.txt',
 '../model\\added_tokens.json')