# SPR 2026 - BERTimbau

**Config:** Add Input -> Models -> bertimbau-ptbr-complete (fabianofilho), Internet OFF, GPU T4 x2

In [None]:
import os, numpy as np, pandas as pd, torch, warnings
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
warnings.filterwarnings('ignore')

# Seed passed to NumPy, PyTorch, the train/validation split and the Trainer.
SEED = 42
# Directory holding the competition's train.csv and test.csv.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Tokenizer length cap (texts are truncated/padded to this many tokens).
MAX_LENGTH = 512
# Per-device training batch size; evaluation uses twice this value.
BATCH_SIZE = 8
# Number of training epochs.
EPOCHS = 3
# Learning rate handed to TrainingArguments.
LR = 2e-5

def find_model_path(base='/kaggle/input', max_depth=10):
    """Locate the first directory below *base* that contains a ``config.json``.

    The tree is walked depth-first. Each subdirectory is tested for a
    ``config.json`` file before it is descended into, and entries are visited
    in sorted order so the result does not depend on the order in which the
    filesystem happens to list them.

    Args:
        base: Root directory to search. *base* itself is never returned;
            only directories below it are candidates.
        max_depth: Deepest recursion level that is still scanned. Levels
            beyond it are skipped.

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matched or a directory could not be listed.
    """
    def search(d, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(d))
        except OSError:
            # Best effort: a missing or unreadable directory is skipped
            # instead of aborting the whole search.
            return None
        for item in entries:
            p = os.path.join(d, item)
            if not os.path.isdir(p):
                continue
            if os.path.isfile(os.path.join(p, 'config.json')):
                return p
            r = search(p, depth + 1)
            if r:
                return r
        return None

    return search(base)

# Resolve the model directory and stop immediately when none was found.
MODEL_PATH = find_model_path()
if not MODEL_PATH:
    raise FileNotFoundError('Modelo nao encontrado!')
print(f'Model: {MODEL_PATH}')

# Prefer CUDA when PyTorch reports it as available, otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the NumPy and PyTorch random number generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the train and test CSV files from DATA_DIR and report their shapes.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f'Train: {train_df.shape}, Test: {test_df.shape}')

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable, sized collection of texts. Each element is passed
            through ``str()`` before tokenization.
        labels: Optional indexable collection of integer class ids aligned
            with *texts*. When ``None``, items carry no ``'labels'`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=max_len, padding='max_length', return_tensors='pt')``.
            Its result must be indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at *idx* and return its tensors as a dict.

        The dict holds ``'input_ids'`` and ``'attention_mask'``, plus a
        ``torch.long`` ``'labels'`` tensor when labels were supplied.
        """
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # Drop only the leading dimension (assumed to be a batch axis of
        # size 1 for a single text). A bare squeeze() would also collapse a
        # sequence axis of length 1 into a 0-d tensor.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from MODEL_PATH, restricted to local files.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

# Correct columns: 'report' holds the text, 'target' holds the label.
# Hold out 10% for validation, stratified on the label column.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['report'].tolist(),
    train_df['target'].tolist(),
    test_size=0.1,
    stratify=train_df['target'],
    random_state=SEED,
)

train_ds = TextDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LENGTH
)
val_ds = TextDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LENGTH
)
print(f'Train: {len(train_ds)}, Val: {len(val_ds)}')

In [None]:
def compute_metrics(p):
    """Return the macro-averaged F1 score of an evaluation result.

    Predicted class ids are the arg-max of ``p.predictions`` along axis 1;
    they are scored against ``p.label_ids``. The score is returned under the
    key ``'f1_macro'``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

# Sequence-classification model with seven output labels, loaded from local files.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=7,
    local_files_only=True,
)

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# with the highest 'f1_macro'.
args = TrainingArguments(
    output_dir='/kaggle/working/model',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    report_to='none',
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Prediction and submission
test_ds = TextDataset(
    test_df['report'].tolist(), labels=None, tokenizer=tokenizer, max_len=MAX_LENGTH
)
# Predicted class id = arg-max of the prediction scores along axis 1.
preds = np.argmax(
    trainer.predict(test_ds).predictions,
    axis=1,
)

# Correct columns: 'ID' and 'target'.
submission = pd.DataFrame(
    {
        'ID': test_df['ID'],
        'target': preds,
    }
)
submission.to_csv('/kaggle/working/submission.csv', index=False)
print('CONCLUIDO!')
# Show how many test rows were assigned to each predicted class.
print(dict(submission['target'].value_counts().sort_index()))

# SPR 2026 - BERTimbau

**Config:** Add Input -> Models -> bertimbau-ptbr-complete (fabianofilho), Internet OFF, GPU T4 x2

In [None]:
import os, numpy as np, pandas as pd, torch, warnings
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
warnings.filterwarnings("ignore")

# Seed passed to NumPy, PyTorch, the train/validation split and the Trainer.
SEED = 42
# Directory holding the competition's train.csv and test.csv.
DATA_DIR = "/kaggle/input/spr-2026-mammography-report-classification"

# Tokenizer length cap (texts are truncated/padded to this many tokens).
MAX_LENGTH = 512
# Per-device training batch size; evaluation uses twice this value.
BATCH_SIZE = 8
# Number of training epochs.
EPOCHS = 3
# Learning rate handed to TrainingArguments.
LR = 2e-5

def find_model_path(base="/kaggle/input", max_depth=10):
    """Locate the first directory below *base* that contains a ``config.json``.

    The tree is walked depth-first. Each subdirectory is tested for a
    ``config.json`` file before it is descended into, and entries are visited
    in sorted order so the result does not depend on the order in which the
    filesystem happens to list them.

    Args:
        base: Root directory to search. *base* itself is never returned;
            only directories below it are candidates.
        max_depth: Deepest recursion level that is still scanned. Levels
            beyond it are skipped.

    Returns:
        The path of the first matching directory, or ``None`` when nothing
        matched or a directory could not be listed.
    """
    def search(d, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(d))
        except OSError:
            # Best effort: a missing or unreadable directory is skipped
            # instead of aborting the whole search.
            return None
        for item in entries:
            p = os.path.join(d, item)
            if not os.path.isdir(p):
                continue
            if os.path.isfile(os.path.join(p, "config.json")):
                return p
            r = search(p, depth + 1)
            if r:
                return r
        return None

    return search(base)

# Resolve the model directory and stop immediately when none was found.
MODEL_PATH = find_model_path()
if not MODEL_PATH:
    raise FileNotFoundError("Modelo nao encontrado!")
print(f"Model: {MODEL_PATH}")

# Prefer CUDA when PyTorch reports it as available, otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the NumPy and PyTorch random number generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the train and test CSV files from DATA_DIR and report their shapes.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable, sized collection of texts. Each element is passed
            through ``str()`` before tokenization.
        labels: Optional indexable collection of integer class ids aligned
            with *texts*. When ``None``, items carry no ``"labels"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=max_len, padding="max_length", return_tensors="pt")``.
            Its result must be indexable by ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at *idx* and return its tensors as a dict.

        The dict holds ``"input_ids"`` and ``"attention_mask"``, plus a
        ``torch.long`` ``"labels"`` tensor when labels were supplied.
        """
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        # Drop only the leading dimension (assumed to be a batch axis of
        # size 1 for a single text). A bare squeeze() would also collapse a
        # sequence axis of length 1 into a 0-d tensor.
        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Load the tokenizer from MODEL_PATH, restricted to local files.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

# Correct columns: "report" holds the text, "target" holds the label.
# Hold out 10% for validation, stratified on the label column.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["report"].tolist(),
    train_df["target"].tolist(),
    test_size=0.1,
    stratify=train_df["target"],
    random_state=SEED,
)

train_ds = TextDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=MAX_LENGTH
)
val_ds = TextDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=MAX_LENGTH
)
print(f"Train: {len(train_ds)}, Val: {len(val_ds)}")

In [None]:
def compute_metrics(p):
    """Return the macro-averaged F1 score of an evaluation result.

    Predicted class ids are the arg-max of ``p.predictions`` along axis 1;
    they are scored against ``p.label_ids``. The score is returned under the
    key ``"f1_macro"``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted, average="macro")
    return {"f1_macro": macro_f1}

# Sequence-classification model with seven output labels, loaded from local files.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=7,
    local_files_only=True,
)

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# with the highest "f1_macro".
args = TrainingArguments(
    output_dir="/kaggle/working/model",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Prediction and submission
test_ds = TextDataset(
    test_df["report"].tolist(), labels=None, tokenizer=tokenizer, max_len=MAX_LENGTH
)
# Predicted class id = arg-max of the prediction scores along axis 1.
preds = np.argmax(
    trainer.predict(test_ds).predictions,
    axis=1,
)

# Correct columns: "ID" and "target".
submission = pd.DataFrame(
    {
        "ID": test_df["ID"],
        "target": preds,
    }
)
submission.to_csv("/kaggle/working/submission.csv", index=False)
print("CONCLUIDO!")
# Show how many test rows were assigned to each predicted class.
print(dict(submission["target"].value_counts().sort_index()))