In [24]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file into the process environment
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # os.getenv yields None for an unset variable. Stop here and name the
        # variable instead of letting None travel into load_dataset,
        # TrainingArguments or os.makedirs in the later cells.
        raise RuntimeError(
            f"Environment variable {name} is not set; define it in .env or in the shell."
        )
    return value


# Access dataset variables (one file path per dataset split)
dataset_train_path = _require_env("DATASET_TRAIN_PATH")
dataset_test_path = _require_env("DATASET_TEST_PATH")
dataset_unsupervised_path = _require_env("DATASET_UNSUPERVISED_PATH")

# Access training variables (Trainer output directory and final model directory)
training_result = _require_env("TRAINING_RESULT")
training_model = _require_env("TRAINING_MODEL")

In [25]:
from datasets import load_dataset

# Load the local JSON files, one file per named split
dataset = load_dataset(
    path='json',
    data_files=dict(
        train=dataset_train_path,
        test=dataset_test_path,
        unsupervised=dataset_unsupervised_path,
    ),
)

In [26]:
from transformers import BertTokenizer

# Instantiate the BERT tokenizer from the 'bert-base-uncased' pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [27]:
# Tokenizing function
def preprocess_function(examples, max_length=64):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text(s)
            to tokenize.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 64, the value that used to be hard-coded here. When
            calling through ``dataset.map``, override it with
            ``fn_kwargs={'max_length': ...}``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=True,
        # Pad to a fixed width so every encoded example has the same length.
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True,
    )

# Run the tokenizing function over the loaded dataset, one batch of examples per call
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

In [28]:
from transformers import BertForSequenceClassification

# Instantiate BERT for sequence classification from the pretrained checkpoint,
# configured for two output labels
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import torch
from transformers import TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir=training_result,  # directory taken from the TRAINING_RESULT environment variable
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # Request fp16 mixed precision only when a CUDA device is visible, so the
    # notebook still runs on machines without a GPU instead of hard-coding
    # fp16=True. NOTE(review): non-CUDA accelerators that support fp16 will
    # now train in full precision; widen this check if one must be supported.
    fp16=torch.cuda.is_available(),
)

In [30]:
# Pick the already-tokenized train and test splits by name
train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

In [31]:
from evaluate import load

# Load the four evaluation metrics, in the same order as their variable names
accuracy, f1, precision, recall = (
    load(metric_name) for metric_name in ("accuracy", "f1", "precision", "recall")
)

In [32]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, F1, precision and recall.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``, each holding the value reported by the matching
        module-level metric object.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)

    # Each metric reports its score under a key equal to its own name.
    scorers = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
    return {
        metric_name: scorer.compute(
            predictions=predicted_classes, references=labels
        )[metric_name]
        for metric_name, scorer in scorers.items()
    }

In [33]:
from transformers import Trainer

# Wire the model, training arguments, tokenizer, metric callback and the
# train/test splits into a single Trainer instance
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [34]:
# Train the model
# Uses the Trainer built from `training_args` and `train_dataset`. As the
# cell's last expression, the value returned by train() is shown as the
# cell output.
trainer.train()

Step,Training Loss
500,0.4397
1000,0.3365
1500,0.267
2000,0.1809


TrainOutput(global_step=2346, training_loss=0.28450608639664254, metrics={'train_runtime': 296.9367, 'train_samples_per_second': 252.579, 'train_steps_per_second': 7.901, 'total_flos': 2466666144000000.0, 'train_loss': 0.28450608639664254, 'epoch': 3.0})

In [35]:
# Evaluate the model
# No dataset is passed here, so evaluation runs on the `eval_dataset` the
# Trainer was constructed with. The result is kept in `results` and printed.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.4434332549571991, 'eval_accuracy': 0.84184, 'eval_f1': 0.8439251598642141, 'eval_precision': 0.8329437431821723, 'eval_recall': 0.8552, 'eval_runtime': 24.4776, 'eval_samples_per_second': 1021.344, 'eval_steps_per_second': 31.948, 'epoch': 3.0}


In [36]:
import os

# Create the model output directory (path comes from `training_model`).
# exist_ok=True keeps re-running this cell from failing when the directory
# already exists.
os.makedirs(training_model, exist_ok=True)

In [37]:
# Save model
# Written through the Trainer into the `training_model` directory.
trainer.save_model(training_model)

# Save tokenizer
# Stored in the same directory as the model. As the cell's last expression,
# the value returned by save_pretrained() is shown as the cell output.
tokenizer.save_pretrained(training_model)

('../model\\tokenizer_config.json',
 '../model\\special_tokens_map.json',
 '../model\\vocab.txt',
 '../model\\added_tokens.json')