In [None]:
from transformers import AutoTokenizer

# Fetch the tokenizer that ships with the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [None]:
from datasets import load_dataset

# Load the local IMDB splits (train / test / unsupervised).
# The paths point at .csv files, so the 'csv' builder is used here; the
# previous 'json' builder did not match the file extension.
# NOTE(review): if these files actually contain JSON despite the .csv
# extension, rename the files and switch the builder back to 'json'.
dataset = load_dataset('csv', data_files={
    'train': '../dataset/imdb_train.csv',
    'test': '../dataset/imdb_test.csv',
    'unsupervised': '../dataset/imdb_unsupervised.csv'
})

In [None]:
# Tokenizing function
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here on purpose: padding inside a batched ``map`` pads every example to
    the longest sequence of its map batch and stores that padding in the
    dataset. The Trainer in this notebook is given the tokenizer, which per
    the transformers documentation makes it pad each training batch
    dynamically to that batch's longest sequence instead.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; must
            provide a 'text' entry.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Run preprocess_function over the loaded dataset, feeding it batches of examples
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Instantiate the 'bert-base-uncased' checkpoint for sequence classification
# with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

In [None]:
from transformers import TrainingArguments

# Hyperparameters and output location for the fine-tuning run
training_args = TrainingArguments(
    # Where the Trainer writes its outputs
    output_dir="../results",
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
# Select the tokenized 'train' and 'test' splits used for fine-tuning and evaluation
train_dataset, test_dataset = (
    tokenized_dataset[split_name] for split_name in ("train", "test")
)

In [None]:
from transformers import Trainer

# Wire the model, tokenizer, data splits and training arguments into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args.
# The call is the cell's last expression, so its return value is displayed.
trainer.train()

In [None]:
# Evaluate the model on the eval_dataset given to the Trainer (the test split)
# and print whatever the evaluation returns
results = trainer.evaluate()
print(results)

In [None]:
import os

# Make sure the output directory exists before saving into it;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("../model", exist_ok=True)

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory
MODEL_DIR = "../model"
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)