In [None]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv (typically from a local .env file)
# so the path lookups below can read them from the process environment.
load_dotenv()

# Dataset file locations, read from the environment (None when a variable is unset)
dataset_train_path = os.environ.get("DATASET_TRAIN_PATH")
dataset_test_path = os.environ.get("DATASET_TEST_PATH")
dataset_unsupervised_path = os.environ.get("DATASET_UNSUPERVISED_PATH")

# Output locations: the trainer's output directory and the saved-model directory
training_result = os.environ.get("TRAINING_RESULT")
training_model = os.environ.get("TRAINING_MODEL")

In [None]:
from datasets import load_dataset

# Load the local JSON files, one named split per file
dataset = load_dataset(
    "json",
    data_files=dict(
        train=dataset_train_path,
        test=dataset_test_path,
        unsupervised=dataset_unsupervised_path,
    ),
)

In [None]:
from transformers import BertTokenizer

# Fetch the tokenizer belonging to the pretrained "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Tokenizing function
def preprocess_function(examples, max_length=64):
    """Tokenize the ``'text'`` field of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``'text'`` entry. The entry is handed to the
            tokenizer unchanged (this function is applied through
            ``dataset.map(..., batched=True)``).
        max_length: Length every sequence is truncated and padded to.
            Defaults to 64, the value that used to be hard-coded here. To use
            another length through ``map``, pass
            ``fn_kwargs={"max_length": ...}``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=True,
        # Fixed-size padding: every example comes out exactly max_length long.
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True
    )

# Tokenize every split, feeding the function whole batches at a time
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
from transformers import BertForSequenceClassification

# Load pretrained BERT for sequence classification, configured for two labels
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

In [None]:
import torch
from transformers import TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir=training_result,  # read from the TRAINING_RESULT environment variable
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16 mixed precision needs GPU support; an unconditional True makes this
    # cell fail on CPU-only machines. Enable it only when CUDA is available.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Select the already-tokenized train and test splits by name
train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

In [None]:
from transformers import Trainer

# Wire the model, tokenizer, hyperparameters and data splits into a Trainer
trainer = Trainer(
    # what gets trained, and the tokenizer that goes with it
    model=model,
    processing_class=tokenizer,
    # hyperparameters / output settings
    args=training_args,
    # data: fit on the train split, evaluate on the test split
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model: fine-tune on train_dataset using the settings in training_args.
# As the last expression of the cell, the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the model on the eval_dataset given to the Trainer (the test split)
# and print the returned metrics.
# NOTE(review): no compute_metrics was passed to the Trainer, so this presumably
# reports loss/runtime figures only, not accuracy — confirm.
results = trainer.evaluate()
print(results)

In [None]:
import os

# Create the directory for the saved model; exist_ok=True keeps re-runs from
# failing when the directory already exists.
# NOTE(review): training_model comes from the TRAINING_MODEL environment variable
# and is None when that variable is unset, in which case this call raises.
os.makedirs(training_model, exist_ok=True)

In [None]:
# Save the fine-tuned model into the training_model directory
trainer.save_model(training_model)

# Save the tokenizer into the same directory so both can be reloaded from one place.
# NOTE(review): the Trainer was given the tokenizer as processing_class, so
# save_model may already write it — confirm; this explicit save looks redundant but harmless.
tokenizer.save_pretrained(training_model)